In [1]:
import pandas as pd
import selfies

In [2]:
# Read the SMILES-only CSV and discard its 'Unnamed: 0' column
# (presumably an index that was written out with the file -- confirm).
df = (
    pd.read_csv('np_dataset_smiles_only.csv')
    .drop('Unnamed: 0', axis=1)
)

In [3]:
# Preview the freshly loaded SMILES table.
df

Unnamed: 0,SMILES
0,CC[C@H](C)[C@@H](CO)NC(=O)/C=C/[C@@](C)(OC)[C@...
1,C=C(C)[C@@H]1C[C@@]2(C)C(=CC1=O)CC[C@@H](OC(=O...
2,COC1C(C(OC2OC(C(=O)O)=CC(O)C2O)C(N)=O)OC(n2ccc...
3,O=Nc1[nH]c(O)c(-c2ccccc2)c1-c1ccc(O)c([N+](=O)...
4,C[C@@]1(CO)CCC[C@]2(C)[C@H]3C[C@@]45OC4[C@H](O...
...,...
404316,O=C(NCCN=C(O)C(F)(F)C(F)(F)C(F)(F)F)C(F)(F)C(F...
404317,C=CC1=C(C)c2cc3[nH]c(c(C=CC(=O)O)c3C)c3c4nc(cc...
404318,CO[C]1[CH][C]([C]2[OH+][C]3[CH][C](O)[CH][C](O...
404319,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(N)=O)C...


In [4]:
# Convert SMILES to SELFIES
def smiles_to_selfies(smiles):
    """Encode a single SMILES string as SELFIES, or return None on failure.

    This is a best-effort wrapper around ``selfies.encoder`` meant to be
    applied element-wise to a column: an input the encoder cannot handle
    yields ``None`` instead of aborting the whole conversion, so failed
    rows can be inspected and dropped afterwards.

    Args:
        smiles: The SMILES value to encode.

    Returns:
        Whatever ``selfies.encoder(smiles)`` returns, or ``None`` if the
        encoder raised an exception.
    """
    try:
        return selfies.encoder(smiles)
    except Exception:
        # Deliberately ``Exception`` rather than a bare ``except:``: encoding
        # failures are still swallowed, but KeyboardInterrupt / SystemExit
        # propagate so a long-running conversion can actually be interrupted.
        return None
    
# Display rows whose SELFIES value is None
def display_na_selfies(df):
    """Print every row of ``df`` whose 'SELFIES' entry is missing.

    Args:
        df: DataFrame containing a 'SELFIES' column.

    Returns:
        None. The matching rows are written to standard output.
    """
    missing_mask = df['SELFIES'].isna()
    print(df.loc[missing_mask])

In [5]:
# Encode each SMILES entry and store the result in a new 'SELFIES' column;
# entries that fail to encode come back as None from smiles_to_selfies.
df['SELFIES'] = df['SMILES'].map(smiles_to_selfies)

In [6]:
# Preview the frame now that the 'SELFIES' column has been added.
df

Unnamed: 0,SMILES,SELFIES
0,CC[C@H](C)[C@@H](CO)NC(=O)/C=C/[C@@](C)(OC)[C@...,[C][C][C@H1][Branch1][C][C][C@@H1][Branch1][Ri...
1,C=C(C)[C@@H]1C[C@@]2(C)C(=CC1=O)CC[C@@H](OC(=O...,[C][=C][Branch1][C][C][C@@H1][C][C@@][Branch1]...
2,COC1C(C(OC2OC(C(=O)O)=CC(O)C2O)C(N)=O)OC(n2ccc...,[C][O][C][C][Branch2][Ring1][=C][C][Branch2][R...
3,O=Nc1[nH]c(O)c(-c2ccccc2)c1-c1ccc(O)c([N+](=O)...,[O][=N][C][NH1][C][Branch1][C][O][=C][Branch1]...
4,C[C@@]1(CO)CCC[C@]2(C)[C@H]3C[C@@]45OC4[C@H](O...,[C][C@@][Branch1][Ring1][C][O][C][C][C][C@][Br...
...,...,...
404316,O=C(NCCN=C(O)C(F)(F)C(F)(F)C(F)(F)F)C(F)(F)C(F...,[O][=C][Branch2][Ring1][#C][N][C][C][N][=C][Br...
404317,C=CC1=C(C)c2cc3[nH]c(c(C=CC(=O)O)c3C)c3c4nc(cc...,[C][=C][C][=C][Branch1][C][C][C][=C][C][NH1][C...
404318,CO[C]1[CH][C]([C]2[OH+][C]3[CH][C](O)[CH][C](O...,[C][O][CH0][CH1][CH0][Branch2][Branch1][Branch...
404319,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(N)=O)C...,[C][N][Branch1][C][C][C][=C][C][=C][Branch1][C...


In [7]:
# List the rows whose SMILES could not be encoded (SELFIES is None).
# In the recorded run these were three entries containing `[IH2]`.
display_na_selfies(df)

                      SMILES SELFIES
374915      Cc1ccccc1[IH2]=O    None
383573  Cc1cccc([IH2](O)O)c1    None
403266    Cc1ccc([IH2]=O)cc1    None


In [8]:
# Keep only the rows that have a SELFIES value (i.e. encoding did not fail)
df_cleaned = df.loc[df['SELFIES'].notna()]

In [9]:
# Preview the frame after removing rows without a SELFIES value.
df_cleaned

Unnamed: 0,SMILES,SELFIES
0,CC[C@H](C)[C@@H](CO)NC(=O)/C=C/[C@@](C)(OC)[C@...,[C][C][C@H1][Branch1][C][C][C@@H1][Branch1][Ri...
1,C=C(C)[C@@H]1C[C@@]2(C)C(=CC1=O)CC[C@@H](OC(=O...,[C][=C][Branch1][C][C][C@@H1][C][C@@][Branch1]...
2,COC1C(C(OC2OC(C(=O)O)=CC(O)C2O)C(N)=O)OC(n2ccc...,[C][O][C][C][Branch2][Ring1][=C][C][Branch2][R...
3,O=Nc1[nH]c(O)c(-c2ccccc2)c1-c1ccc(O)c([N+](=O)...,[O][=N][C][NH1][C][Branch1][C][O][=C][Branch1]...
4,C[C@@]1(CO)CCC[C@]2(C)[C@H]3C[C@@]45OC4[C@H](O...,[C][C@@][Branch1][Ring1][C][O][C][C][C][C@][Br...
...,...,...
404316,O=C(NCCN=C(O)C(F)(F)C(F)(F)C(F)(F)F)C(F)(F)C(F...,[O][=C][Branch2][Ring1][#C][N][C][C][N][=C][Br...
404317,C=CC1=C(C)c2cc3[nH]c(c(C=CC(=O)O)c3C)c3c4nc(cc...,[C][=C][C][=C][Branch1][C][C][C][=C][C][NH1][C...
404318,CO[C]1[CH][C]([C]2[OH+][C]3[CH][C](O)[CH][C](O...,[C][O][CH0][CH1][CH0][Branch2][Branch1][Branch...
404319,CN(C)c1ccc(O)c2c1CC1CC3C(N(C)C)C(=O)C(C(N)=O)C...,[C][N][Branch1][C][C][C][=C][C][=C][Branch1][C...


In [10]:
# Sanity check: no rows with a missing SELFIES value should remain,
# so this is expected to print an empty DataFrame.
display_na_selfies(df_cleaned)

Empty DataFrame
Columns: [SMILES, SELFIES]
Index: []


In [11]:
# Write the cleaned SMILES/SELFIES table to disk, omitting the row index
df_cleaned.to_csv(
    path_or_buf='np_dataset.csv',
    index=False,
)