In [76]:
import csv
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from tqdm.notebook import tqdm
import selfies as sf

In [77]:
klek1 = pd.read_csv('./GRU_data/chembl_klek.csv')
klek2 = pd.read_csv('./GRU_data/250k_klek.csv')
klek = pd.concat([klek1, klek2], ignore_index=True)

selfies1 = pd.read_csv('./GRU_data/chembl_selfies.csv')
selfies2 = pd.read_csv('./GRU_data/250k_selfies.csv')
selfies = pd.concat([selfies1, selfies2], ignore_index=True)


In [78]:
print('Chembl:')
print('fp size:', klek1.shape, 'selfies size:', selfies1.shape)
print('Zinc 250k:')
print('fp size:', klek2.shape, 'selfies size:', selfies2.shape)

Chembl:
fp size: (218984, 2) selfies size: (218984, 1)
Zinc 250k:
fp size: (183879, 2) selfies size: (183879, 1)


In [79]:
data = pd.concat([klek, selfies], axis=1)

In [80]:
data

Unnamed: 0,smiles,fps,selfies
0,Cc1ccc(C(=O)Nc2ccc(S(=O)(=O)N=C(N)N)cc2)cc1,"[668, 676, 838, 1012, 1023, 1145, 1192, 1317, ...",[C][C][=C][C][=C][Branch2][Ring1][=N][C][=Bran...
1,COc1ccc(/C=C/C(C)=O)c(OCC=C(C)C)c1,"[125, 132, 296, 646, 668, 1192, 1262, 1644, 25...",[C][O][C][=C][C][=C][Branch1][Branch2][/C][=C]...
2,O=C(O)/C=C\CCCCCn1ccnc1,"[125, 132, 135, 296, 337, 340, 343, 345, 346, ...",[O][=C][Branch1][C][O][/C][=C][\C][C][C][C][C]...
3,CC(C(=O)NO)N(Cc1ccccc1[N+](=O)[O-])C(=O)Nc1ccc...,"[0, 13, 19, 97, 101, 296, 609, 668, 671, 676, ...",[C][C][Branch1][#Branch1][C][=Branch1][C][=O][...
4,O=C(Cc1ccncc1)NCCNC1c2ccc(Cl)cc2CCc2cccnc21,"[0, 296, 340, 382, 386, 389, 466, 479, 482, 50...",[O][=C][Branch1][#Branch2][C][C][=C][C][=N][C]...
...,...,...,...
402858,CC1(C)CC[C@H](CNC(=O)Cn2ncc3ccccc3c2=O)c2ccccc21,"[0, 7, 33, 296, 297, 340, 466, 479, 482, 503, ...",[C][C][Branch1][C][C][C][C][C@H1][Branch2][Rin...
402859,Cn1ccnc1C(=O)c1ccc(NC(=O)C2CCN(C(=O)C(C)(C)C)C...,"[0, 19, 231, 296, 297, 301, 302, 340, 604, 609...",[C][N][C][=C][N][=C][Ring1][Branch1][C][=Branc...
402860,Cc1ccc(NC(=O)C(=O)N(C)Cc2ccccc2)c(C)c1,"[296, 565, 607, 609, 629, 668, 676, 838, 1013,...",[C][C][=C][C][=C][Branch2][Ring1][#Branch1][N]...
402861,Cc1cc(C(=O)Nc2ccc(OCC(N)=O)cc2)c(C)n1C1CC1,"[0, 227, 296, 503, 546, 646, 668, 676, 838, 10...",[C][C][=C][C][Branch2][Ring1][#Branch1][C][=Br...


In [81]:
data = data.drop_duplicates(subset=['smiles'])

In [82]:
data

Unnamed: 0,smiles,fps,selfies
0,Cc1ccc(C(=O)Nc2ccc(S(=O)(=O)N=C(N)N)cc2)cc1,"[668, 676, 838, 1012, 1023, 1145, 1192, 1317, ...",[C][C][=C][C][=C][Branch2][Ring1][=N][C][=Bran...
1,COc1ccc(/C=C/C(C)=O)c(OCC=C(C)C)c1,"[125, 132, 296, 646, 668, 1192, 1262, 1644, 25...",[C][O][C][=C][C][=C][Branch1][Branch2][/C][=C]...
2,O=C(O)/C=C\CCCCCn1ccnc1,"[125, 132, 135, 296, 337, 340, 343, 345, 346, ...",[O][=C][Branch1][C][O][/C][=C][\C][C][C][C][C]...
3,CC(C(=O)NO)N(Cc1ccccc1[N+](=O)[O-])C(=O)Nc1ccc...,"[0, 13, 19, 97, 101, 296, 609, 668, 671, 676, ...",[C][C][Branch1][#Branch1][C][=Branch1][C][=O][...
4,O=C(Cc1ccncc1)NCCNC1c2ccc(Cl)cc2CCc2cccnc21,"[0, 296, 340, 382, 386, 389, 466, 479, 482, 50...",[O][=C][Branch1][#Branch2][C][C][=C][C][=N][C]...
...,...,...,...
402858,CC1(C)CC[C@H](CNC(=O)Cn2ncc3ccccc3c2=O)c2ccccc21,"[0, 7, 33, 296, 297, 340, 466, 479, 482, 503, ...",[C][C][Branch1][C][C][C][C][C@H1][Branch2][Rin...
402859,Cn1ccnc1C(=O)c1ccc(NC(=O)C2CCN(C(=O)C(C)(C)C)C...,"[0, 19, 231, 296, 297, 301, 302, 340, 604, 609...",[C][N][C][=C][N][=C][Ring1][Branch1][C][=Branc...
402860,Cc1ccc(NC(=O)C(=O)N(C)Cc2ccccc2)c(C)c1,"[296, 565, 607, 609, 629, 668, 676, 838, 1013,...",[C][C][=C][C][=C][Branch2][Ring1][#Branch1][N]...
402861,Cc1cc(C(=O)Nc2ccc(OCC(N)=O)cc2)c(C)n1C1CC1,"[0, 227, 296, 503, 546, 646, 668, 676, 838, 10...",[C][C][=C][C][Branch2][Ring1][#Branch1][C][=Br...


In [83]:
out_df = pd.DataFrame()
out_df['smiles'] = data['smiles']
out_df['fps'] = data['fps']
out_df.to_csv('GRU_data/combined_klek.csv', sep=',', index=False)

In [84]:
data['selfies'].to_csv('GRU_data/combined_selfies.csv', sep=',', index=False)

# Smiles augmentation

In [117]:
data = data.reindex()
data.iloc[5001]

smiles                        CN1CN=C(NCc2ccccc2)c2[nH]cnc21
fps        [296, 466, 565, 607, 668, 676, 801, 1563, 2131...
selfies    [C][N][C][N][=C][Branch1][O][N][C][C][=C][C][=...
Name: 10001, dtype: object

In [135]:
import os

os.remove('./GRU_data/augmented.csv') 
for n, smile in enumerate(tqdm(data['smiles'])):
    break_out_flag = False
    smiles_batch = []
    selfies_batch = []
    temp_df = pd.DataFrame()
    
    for i in range(6):
        if i == 0:
            temp = smile
        else:
            temp = sme.randomize_smiles(smile)
        smiles_batch.append(temp)
        
    for j in range(6):
        if j == 0:
            selfies_batch.append(data.iloc[n][2])
        else:
            try:
                selfie = sf.encoder(smiles_batch[j])
                selfies_batch.append(selfie)
            except:
                break_out_flag = True
                break
    
    if break_out_flag == True:
        continue
    
    temp_df['smiles'] = smiles_batch
    temp_df['fps'] = [data.iloc[n][1]]*len(smiles_batch)
    temp_df['selfies'] = selfies_batch
    
    temp_df.to_csv('./GRU_data/augmented.csv', mode='a', index=False, header=False)

  0%|          | 0/397701 [00:00<?, ?it/s]