In [6]:
import pandas as pd
import numpy as np
import os
import time
from rdkit import Chem
from fp_gen import KlekFPGenerator, MACCSFPGenerator, SubFPGenerator

In [7]:
def could_be_valid(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol is not None
    except:
        return False

In [8]:
def sparse_to_dense(sparse):
    return np.nonzero(sparse)[0].tolist()

In [12]:
generator = KlekFPGenerator(n_jobs=os.cpu_count())
data = pd.read_csv('../GRU_data/250k_smiles.csv', chunksize=5000, names=['smiles'])

for i, chunk in enumerate(data):
    print(chunk)
    chunk = chunk[chunk['smiles'].apply(could_be_valid)]
    mols = list(map(Chem.MolFromSmiles, chunk['smiles']))
    fps = generator.transform(mols)
    chunk['fps'] = list(map(sparse_to_dense, fps))
    if i == 0:
        chunk.to_csv('../GRU_data/250k_klek.csv', index=False)
    else: # append if already exists, otherwise without else firts chunk will be written twice
        with open('../GRU_data/250k_klek.csv', 'a') as f:
            chunk.to_csv(f, header=False, index=False)

                                                 smiles
0               CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1
1          C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1
2     N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...
3     CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...
4     N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...
...                                                 ...
4995  C[C@H](NS(=O)(=O)/C=C/c1ccccc1)c1ccc(C(F)(F)F)cc1
4996         Cc1cccc(O[C@@H](C)c2nnc(SCc3ccccc3C)n2C)c1
4997                     CCCCCNC(=O)CSc1cc(C)c2ccccc2n1
4998  Cc1cc(C(=O)CN2C(=O)N[C@@](C)(c3ccc(Cl)cc3)C2=O...
4999  O=C(NO[C@@H]1CCCCO1)[C@@H]1CCCN1C(=O)c1cc(Cl)c...

[5000 rows x 1 columns]
                                                 smiles
5000  CCOc1ccc(C(=O)N(Cc2ccc(Br)o2)[C@@H]2CCS(=O)(=O...
5001          CCOc1ccc(O[C@H](C)C(=O)Nc2ccc(C)cc2Br)cc1
5002       Cc1cnn(CCC(=O)N[C@H](C)c2ccc(-n3cccn3)cc2)c1
5003     O=C(Nc1ccccc1C(=O)NCc1csc(-c2ccccc2)n1)c1ccco1
5004            CCn1c(C

In [14]:
reload_data = pd.read_csv('../GRU_data/250k_klek.csv')
reload_data['fps'] = reload_data['fps'].apply(eval)
reload_data.head()

Unnamed: 0,smiles,fps
0,CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1,"[296, 503, 505, 668, 676, 838, 972, 986, 1153,..."
1,C[C@@H]1CC(Nc2cncc(-c3nncn3C)c2)C[C@@H](C)C1,"[0, 13, 296, 297, 298, 668, 676, 677, 679, 201..."
2,N#Cc1ccc(-c2ccc(O[C@@H](C(=O)N3CCCC3)c3ccccc3)...,"[0, 19, 32, 296, 340, 343, 604, 609, 620, 1152..."
3,CCOC(=O)[C@@H]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c...,"[0, 19, 222, 296, 297, 301, 304, 340, 343, 345..."
4,N#CC1=C(SCC(=O)Nc2cccc(Cl)c2)N=C([O-])[C@H](C#...,"[0, 17, 296, 340, 343, 345, 503, 505, 664, 676..."


In [22]:
X_data = pd.read_csv('../GRU_data/250k_selfies.csv')
X_data.head()

Unnamed: 0,selfies
0,[C][C][Branch1][C][C][Branch1][C][C][C][=C][C]...
1,[C][C@@H1][C][C][Branch2][Ring1][Ring2][N][C][...
2,[N][#C][C][=C][C][=C][Branch2][Ring2][Ring2][C...
3,[C][C][O][C][=Branch1][C][=O][C@@H1][C][C][C][...
4,[N][#C][C][=C][Branch2][Ring1][Ring1][S][C][C]...


In [23]:
assert len(reload_data) == len(X_data) # check if same length, otherwise something went wrong