In [92]:
import pandas as pd
from helper.compound_querier import validate
from tqdm.autonotebook  import tqdm
import pickle
from difflib import SequenceMatcher
import numpy as np
import os

In [93]:
df = pd.read_pickle('./source/compounds/compounds.pkl')

In [94]:
df.head()

Unnamed: 0,smiles,inchi,inchi_key
0,C[Si](C)(C)OC[C@@H]1[C@H]([C@@H]([C@@H]([C@H](...,"InChI=1S/C54H130O16Si12/c1-71(2,3)55-37-41-47(...",AGCGTUNOUDOTHI-KRYNHHJKSA-N
1,C[Si](C)(C)OCC1C(C(C(C(O1)OC2=CC3=C(C(=C2)O[Si...,"InChI=1S/C50H92O14Si8/c1-65(2,3)53-32-42-44(61...",IADLJLVXUGSEBP-UHFFFAOYSA-N
2,CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3=CC4=C(C(=O)CC...,"InChI=1S/C52H98O15Si8/c1-34-45(62-70(9,10)11)4...",KGNFGWOIKRFMAR-UHFFFAOYSA-N
3,CC1C(C(C(C(O1)OC2C(C(C(OC2OC3=CC4=C(C(=O)CC(O4...,"InChI=1S/C51H96O14Si8/c1-34-44(61-69(11,12)13)...",KSAMZDLNUNJNIH-UHFFFAOYSA-N
4,CC1C(C(C(C(O1)OCC2C(C(C(C(O2)OC3=C(OC4=C(C3=O)...,"InChI=1S/C66H128O19Si11/c1-43-53(77-88(9,10)11...",XEPGSEKVGCRXJO-UHFFFAOYSA-N


In [95]:
df.isna().sum()

smiles       418
inchi        647
inchi_key      0
dtype: int64

In [96]:
for c in df.columns:
    print(c, df[c].nunique())

smiles 12520
inchi 12012
inchi_key 12948


In [97]:
len(df)

12949

In [98]:
smiles = pd.read_csv('./source/dataset/smiles.txt', delimiter='\t', header=None)
smiles.head()

Unnamed: 0,0,1
0,IPYJZDLWVOYHJP-UHFFFAOYSA-N,CC(C)(C)O[Si](C[Si](C1=CC=CC=C1)(C2=CC=CC=C2)O...
1,YFOLLJZSHSKLDF-UHFFFAOYSA-N,CC(C)CCCC(C)CCCC(C)CCCC(C)O[Si](C)(C)C
2,VMEFAYALEKTMDF-UHFFFAOYSA-N,CCC(N)P(=O)(O[Si](C)(C)C)O[Si](C)(C)C
3,DWGSZPGNJMZEMM-UHFFFAOYSA-N,CC=C=CCC(=O)OC1(CCCCO1)[SiH3]
4,GMVRQOAQHDVEKA-UHFFFAOYSA-N,C[Si](C)(C)OCCCCCCCCCO[Si](C)(C)C


In [99]:
smiles.columns = ['inchi_key', 'smiles']

In [100]:
inchi_keys = pd.read_csv('./source//dataset/inchi_keys.txt', delimiter='\t', header=None)

In [101]:
inchi_keys.head()

Unnamed: 0,0
0,MOBSUIKFQIWSJI-BXQJSQRVSA-N
1,MSRAOOKTXOTZPK-UHFFFAOYSA-N
2,BCATYCVSLYNDPX-UERWRGBPSA-N
3,XBKDHFNKZGURLX-UHFFFAOYSA-N
4,RASLARXBAZCUFF-NWSQWKLXSA-N


In [102]:
inchi_keys.columns = ['inchi_key']

In [103]:
df_inchi_keys = set(df['inchi_key'].values)
smiles_inchi_keys = set(smiles['inchi_key'].values)
all_inchi_keys = set(inchi_keys['inchi_key'].values)

In [104]:
len(smiles_inchi_keys - df_inchi_keys), len(df_inchi_keys - smiles_inchi_keys)

(0, 0)

In [105]:
len(all_inchi_keys - df_inchi_keys), len(df_inchi_keys - all_inchi_keys)

(0, 0)

In [106]:
len(all_inchi_keys - smiles_inchi_keys), len(smiles_inchi_keys - all_inchi_keys)

(0, 0)

In [107]:
df_smiles = set(df['smiles'].values)
smiles_smiles = set(smiles['smiles'].values)

In [108]:
len(smiles_smiles - df_smiles), len(df_smiles - smiles_smiles)

(1284, 1191)

In [109]:
in_df_not_in_smiles = df_smiles - smiles_smiles
len(in_df_not_in_smiles)

1191

In [110]:
TEST_ID = 0
if os.path.exists('./source/compounds/similarity.pkl'):
    similar_smiles_in_smiles = pickle.load(open('./source/compounds/similarity.pkl', 'rb'))
    if similar_smiles_in_smiles is not None and len(similar_smiles_in_smiles) == 3:
        print('Similarity matrix loaded')
        print(similar_smiles_in_smiles['row_smiles'][TEST_ID], '->', similar_smiles_in_smiles['column_smiles'][np.argmax(similar_smiles_in_smiles['similarity'][TEST_ID])])
    else:
        print('Similarity matrix not loaded')
else:
    print('Similarity matrix not loaded')

Similarity matrix loaded
c1cccc2nc(c3ccccc3c12)O[Si](C)(C)C -> CCCCN(CCCC)CCO[Si](C)(C)C


In [113]:
similarity = np.zeros((len(in_df_not_in_smiles), len(smiles_smiles)))
row_smiles = [x for x in list(in_df_not_in_smiles) if not pd.isna(x)]
column_smiles = [x for x in list(smiles_smiles) if not pd.isna(x)]
for idx_i, i in tqdm(enumerate(in_df_not_in_smiles), desc='Not in smiles but in df', total=len(in_df_not_in_smiles)):
    for idx_j, j in enumerate(smiles_smiles):
        if pd.isna(i) or pd.isna(j):
            continue
        score = SequenceMatcher(None, i.lower(), j.lower()).ratio()
        similarity[idx_i, idx_j] = score

pickle.dump({'similarity': similarity, 'row_smiles': row_smiles, 'column_smiles': column_smiles}, open('./source/compounds/similarity.pkl', 'wb'))

Not in smiles but in df:   0%|          | 0/1191 [00:00<?, ?it/s]

In [131]:
mean_max = np.mean(np.max(similarity, axis=1))
mean_max

0.8124235756961824

In [132]:
min_max = np.min(np.max(similarity, axis=1))
min_max

0.576271186440678

In [133]:

max_max = np.max(np.max(similarity, axis=1))
max_max

1.0

In [138]:
median_max = np.median(np.max(similarity, axis=1))
median_max

0.810126582278481

(773, 13030)

In [135]:
# drop columns that have nan in column_smiles
similarity = similarity[:, ~pd.isna(column_smiles)]
column_smiles = [x for x in column_smiles if not pd.isna(x)]
len(row_smiles), len(column_smiles)

(773, 13030)

In [136]:
for i in range(10):
    print(np.max(similarity[i]), row_smiles[i], column_smiles[np.argmax(similarity[i])])

0.75 c1(c(c(ccc1Cl)O[Si](C)(C)C)Cl)c1c(cccc1Cl)Cl CC1(CCC(=C1)O[Si](C)(C)C)CCC2=CCCCC2
0.8409090909090909 Cc1ccc(cc1)C=CC(=O)c2cc(F)ccc2O[Si](C)(C)C CC1CC(CC(C1)(C)C)OC(=O)C2=CC=CC=C2O[Si](C)(C)C
0.8157894736842105 C(=O)(c1c(ccc(c1)Cl)O[Si](C)(C)C)c1ccccc1 CC(=O)C1=C(C=CC(=C1)Cl)O[Si](C)(C)C
0.8478260869565217 C(=O)(/C=C/c1cc(c(cc1)O[Si](C)(C)C)OC)OCC COC(=O)/C=C/C1=CC(=C(C=C1)O[Si](C)(C)C)O[Si](C)(C)C
0.8852459016393442 c1(c(cccc1)C(CC)C)O[Si](C)(C)C CC1=C(CC2CC1C2(C)C)O[Si](C)(C)C
0.7951807228915663 c1(cc(=O)c2cc(ccc2o1)O[Si](C)(C)C)c1ccc(cc1)OC CC(=O)OC(C[Si](C)(C)C)C1=CC=C(C=C1)OC
0.7474747474747475 c1(c(c(=O)c2ccc(cc2o1)OC)O[Si](C)(C)C)c1c(cccc1)OC CCC(C(=O)C1=CC=C(C=C1)C(=O)O[Si](C)(C)C)N2CCCC2=O
0.8674698795180723 c1c(c(cc2ccccc12)N[Si](C)(C)C)O[Si](C)(C)C CC(CC1=CC=CC=C1)N([Si](C)(C)C)[Si](C)(C)C
0.7848101265822784 c1(c(cccc1)O[Si](C)(C)C)c1c(cc(cc1Cl)Cl)Cl CC1(CCCC(=C1)O[Si](C)(C)C)CCC2=CCCCC2
0.84375 c1(c(cc(cc1)C)Cc1ccccc1)O[Si](C)(C)C CC(C)(C)C1CCCCC1O[Si](C)(C)C
