In [1]:
import random
import json
import pandas as pd
import selfies as sf
from tqdm import tqdm
from rdkit import Chem
import matplotlib.pyplot as plt

In [42]:
df_sas_selfies = pd.read_csv("data-bin-sas-selfies.csv", names=["ppl", "selfies"])
df_sas_selfies

Unnamed: 0,ppl,selfies
0,1.77,[C][=C][C][C][C][N][Ring1][Branch1][C][=Branch...
1,1.81,[C][N][C][=N][N][=C][Branch1][C][C][N][=C][Rin...
2,1.64,[C][C][=C][C][=Branch1][C][=O][NH1][C][Branch1...
3,1.61,[C][C][=Branch1][C][=O][C][C][=Branch1][C][=O]...
4,1.61,[C][C][Branch1][C][C][C][C][O][C][=Branch1][C]...
...,...,...
5118,1.72,[N][C][=C][C][=C][C][=C][NH1][C][Ring1][=Branc...
5119,1.62,[C][C][=C][Branch1][=Branch1][C][Branch1][C][N...
5120,1.61,[C][=C][Branch1][=N][C][C][=C][N][=C][Branch1]...
5121,1.73,[C][N][C][=C][C][=C][C][=C][Ring1][=Branch1][C...


In [43]:
def make_canonical_smiles(selfies):
    canon_smiles = None
    smiles = sf.decoder(selfies)

    try:
        canon_smiles = Chem.MolToSmiles(Chem.MolFromSmiles(smiles))
    except:
        pass  

    return canon_smiles

In [44]:
df_sas_selfies["smiles"] = df_sas_selfies["selfies"].apply(make_canonical_smiles)

In [45]:
df_sas_selfies

Unnamed: 0,ppl,selfies,smiles
0,1.77,[C][=C][C][C][C][N][Ring1][Branch1][C][=Branch...,C=C1CCCN1C(=O)OC(C)(C)CCN1CCCC1CNCn1cnc(=O)c2c...
1,1.81,[C][N][C][=N][N][=C][Branch1][C][C][N][=C][Rin...,CNc1nnc(C)nc1CCOCCC(C)=NNc1ncccc1NCN=C(NCCN)NC...
2,1.64,[C][C][=C][C][=Branch1][C][=O][NH1][C][Branch1...,Cc1cc(=O)[nH]c(NCCN)c1COCNC(=O)Nc1cccc(O)c1
3,1.61,[C][C][=Branch1][C][=O][C][C][=Branch1][C][=O]...,CC(=O)CC(=O)c1ccc(CO)[nH]1
4,1.61,[C][C][Branch1][C][C][C][C][O][C][=Branch1][C]...,COCCc1[nH]c(=O)[nH]c1CCCc1nncn1C(=O)OCCC(C)C
...,...,...,...
5118,1.72,[N][C][=C][C][=C][C][=C][NH1][C][Ring1][=Branc...,Nc1cc2ccc[nH]c-2cc1=O
5119,1.62,[C][C][=C][Branch1][=Branch1][C][Branch1][C][N...,Cc1c(C(N)=O)ncn1CCCNCC(C)n1ncnc1N1CC(N)C1
5120,1.61,[C][=C][Branch1][=N][C][C][=C][N][=C][Branch1]...,C=C(Cc1cnc(C)nc1)C(C)CCCCC(C)=C(C(=O)O)C(=O)CC
5121,1.73,[C][N][C][=C][C][=C][C][=C][Ring1][=Branch1][C...,CCC(=CC1OCC=Cc2c(NC)cccc21)c1cc(N)cc(Cl)c1N


In [50]:
with open("../Generations_aspirin_0.4/aspirin_unique_canonical_40M.csv", "r") as f:
    generation_aspirin = f.read().splitlines()
    
len(generation_aspirin)    

2978371

In [60]:
with open("../Generations_sas_3_selfies/sas_unique_canonical_40M.csv", "r") as f:
    generation_sas = f.read().splitlines()
    
len(generation_sas) 

1125025

In [52]:
gdb_df = pd.read_csv("/mnt/xtb/knarik/GDB13_every100_row.csv")
    
len(gdb_df)

9743527

In [53]:
gdb_df.head()

Unnamed: 0,id,name,sascore,aspirin_similarity
0,100,C=CC1CN1,4.2884,0.0294
1,200,NC1CC1O,3.0288,0.2162
2,300,C=CCC#CC,3.7369,0.0385
3,400,CCC1CCN1,3.0331,0.0455
4,500,C=NN=CCC,4.392,0.0312


### Creating non generated subsets: sas

In [91]:
sas_subset = gdb_df[gdb_df['sascore'] <= 3]["name"].values
len(sas_subset)

66692

In [92]:
set_sas_subset = set(sas_subset)
set_generation_sas = set(generation_sas)

In [93]:
not_generated_sas = set_sas_subset - set_generation_sas
len(not_generated_sas)

59571

In [94]:
not_generated_sas

{'NCc1ccnc(CNC=O)c1',
 'CCNc1ccoc1OCC(N)=O',
 'CCc1cnc(CO)n(O)c1=O',
 'CNc1c(C)cnc2[nH]ccc12',
 'Cc1cnc(N)c(O)c1C',
 'Nc1ccn(O)c1CC(=O)N1CC1',
 'CC(C)(C)Oc1c[nH]c(C(N)=O)c1',
 'C=C(CCCNC)OCC(=O)OC',
 'CN(CCCC1CC1)C(N)=S',
 'CN(CC#N)CCCCCO',
 'CC=C(C)CCNCCCCOC',
 'COc1cc2cccc(O)c2n1N',
 'CNCCOC(C(=O)O)=C(C)C',
 'Cc1c(N)cc(C=CO)cc1O',
 'NN=Cc1cc(N)cc(O)c(=O)c1',
 'N=C(N)c1ccc(Cl)cc1C(=N)N',
 'CC(=C(CCO)C(=O)O)N1CCC1',
 'Cc1ccccc1C=CC=NN',
 'CC(=O)Nc1nn(C)c(=O)nc1C',
 'C#Cc1c(C)cccc(NC)c1=O',
 'Cc1oc(=O)c(C#N)c2c1CCC2',
 'CCC(CC)=C(C)C(=O)CC1CC1',
 'CC(C)(O)c1sccc1C1CC1',
 'C=C(C=CC(=O)OCC)CCOC',
 'O=C(OC=NO)c1cccc(O)c1',
 'CC1C(=O)c2cccc(N)c2C1=O',
 'CCCNN=c1occcc1OC',
 'N=C(N)C=CCCN1CCCCC1',
 'CCC(C)C(=O)COC1CCCC1',
 'CC1(NCC2(N)CCCCC2)CC1',
 'CCCN1CC(C2=CC=CCC2)C1',
 'CCOCOCC(=O)NNC(C)=O',
 'Cc1ccc(CO)c2c(Cl)c[nH]c12',
 'CCOCCC1=CCCN(C)CC1',
 'COCCC1CCCCCCN1',
 'N#Cc1c(Cl)ccn1C(=O)C1CC1',
 'Cn1cc(CCN)c(CCCO)n1',
 'CC(CN)NC(=O)C1CCCC1',
 'C=CCCCCCOCC(=O)NN',
 'CCCOC(=O)Cc1cocc1N',
 'CC(

In [95]:
not_generated_sas = list(not_generated_sas)
random.shuffle(not_generated_sas)

In [127]:
with open(f"../data-bin/data-subsets/gen_sas_selfies.jsonl", "w") as f:
    for smi in tqdm(generation_sas):
        # Write
        selfies = sf.encoder(smi)
        new_line = {"text": selfies}
        json.dump(new_line, f)
        f.write("\n")

100%|███████████████████████████████| 1125025/1125025 [03:53<00:00, 4809.08it/s]


### Creating non generated subsets: aspirin

In [86]:
aspirin_subset = gdb_df[gdb_df['aspirin_similarity'] >= 0.4]["name"].values
len(aspirin_subset)

82760

In [87]:
set_aspirin_subset = set(aspirin_subset)
set_generation_aspirin = set(generation_aspirin)

In [88]:
not_generated_aspirin = set_aspirin_subset - set_generation_aspirin
len(not_generated_aspirin)

62897

In [89]:
not_generated_aspirin = list(not_generated_aspirin)
random.shuffle(not_generated_aspirin)

In [128]:
with open(f"../data-bin/data-subsets/gen_aspirin.jsonl", "w") as f:
    for smi in tqdm(generation_aspirin):
        # Write
        new_line = {"text": smi}
        json.dump(new_line, f)
        f.write("\n")

100%|█████████████████████████████| 2978371/2978371 [00:22<00:00, 132811.17it/s]


In [7]:
with open(f"../data-bin/data-subsets/gen_sas_selfies.jsonl", "r") as f:
    d = json.load(f)

JSONDecodeError: Extra data: line 2 column 1 (char 95)

In [18]:
data = []
with open(f"../data-bin/data-subsets/gen_sas_selfies.jsonl", "r") as f:
    for line in f:
        data.append(json.loads(line))

In [19]:
data[:1000]

[{'text': '[C][C][N][C][C][C][Branch1][C][C][=C][Branch1][Ring1][C][C][C][=Branch1][C][=O][O]'},
 {'text': '[C][C][=Branch1][C][=O][N][C][C][=C][C][=C][C][=Branch1][C][=O][C][Ring1][Branch2][=O]'},
 {'text': '[C][N][Branch1][C][C][C][C][N][Branch1][Ring2][C][C][=O][C][C][C][C][Ring1][Ring2]'},
 {'text': '[C][C][N][C][=C][C][Branch1][C][C][=C][C][Branch1][C][Cl][=C][Ring1][Branch2][C][C]'},
 {'text': '[C][C][=C][C][=C][Branch1][N][C][=Branch1][C][=O][C][C][C][N][Ring1][Ring2][C][O][Ring1][N]'},
 {'text': '[C][N][Branch1][Branch2][C][C][C][C][C][Ring1][Ring1][C][=Branch1][C][=N][C][C][=Branch1][C][=N][N]'},
 {'text': '[C][C][C][=N][C][Branch1][Ring1][N][C][=C][C][Branch1][C][O][=C][Ring1][=Branch2][C][C]'},
 {'text': '[C][C][C][=C][C][=Branch1][C][=O][N][=C][Branch1][Ring1][C][N][N][Ring1][=Branch2][C]'},
 {'text': '[C][N][C][C][C][C][C][=Branch1][C][=O][N][C][=Branch1][C][=O][N][C]'},
 {'text': '[C][C][C][C][C][Branch1][C][C][Branch1][=Branch2][C][N][C][C][=Branch1][C][=O][O][C][Ring1][

In [4]:
t = '{"text": "[C][C][N][C][C][C][Branch1][C][C][=C][Branch1][Ring1][C][C][C][=Branch1][C][=O][O]"}\n'

In [6]:
json.loads(t)['text']

'[C][C][N][C][C][C][Branch1][C][C][=C][Branch1][Ring1][C][C][C][=Branch1][C][=O][O]'