In [1]:
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

In [2]:
# Load the candidate SMILES strings: one list entry per line of the input file.
# Lines are stored exactly as read, so each entry keeps its line terminator.
with open('accepted_smiles.txt') as smiles_file:
    SMILES = smiles_file.readlines()
print(len(SMILES))

2138


In [3]:
# Reference molecules that the library compounds are compared against below.
# Each one is parsed from a hard-coded SMILES string.
ATL_444 = Chem.MolFromSmiles('C#CCN1C=NC2=C1N=C(C#C[C@@]3(CCC[C@H](C3)C)O)N=C2N')
# Raw string: the backslashes here are SMILES directional-bond markers, not
# Python escapes. Without the r-prefix, "\C" and "\c" are invalid escape
# sequences that newer Python versions warn about.
istradefylline = Chem.MolFromSmiles(r'O=C2N(c1nc(n(c1C(=O)N2CC)C)\C=C\c3ccc(OC)c(OC)c3)CC')
preladenant = Chem.MolFromSmiles('COCCOc(cc4)ccc4N(CC3)CCN3CCn(c2nc1N)ncc2c(n5)n1nc5-c6occc6')
SCH_58261 = Chem.MolFromSmiles('c1ccccc1CCN1N=CC2=C1N=C(N)N3C2=NC(C4=CC=CO4)=N3')
SCH_442_416 = Chem.MolFromSmiles('n1n3c(nc4c(c3nc1c2occc2)cnn4CCCc5ccc(OC)cc5)N')
ZM_241_385 = Chem.MolFromSmiles('Oc1ccc(CCNc2nc3nc(c4ccco4)nn3c(N)n2)cc1')

In [4]:
# Some molecules could not be converted into the format needed later on, and
# they also failed sanitization. The valence errors are handled the blunt way:
# those entries are simply dropped, keeping only the SMILES strings RDKit parses.
mols = [
    smiles_line
    for smiles_line in SMILES
    if Chem.MolFromSmiles(smiles_line) is not None
]

RDKit ERROR: [00:50:06] Explicit valence for atom # 12 O, 3, is greater than permitted
[00:50:06] Explicit valence for atom # 12 O, 3, is greater than permitted
RDKit ERROR: [00:50:06] Explicit valence for atom # 13 O, 3, is greater than permitted
[00:50:06] Explicit valence for atom # 13 O, 3, is greater than permitted
RDKit ERROR: [00:50:06] Explicit valence for atom # 19 O, 3, is greater than permitted
[00:50:06] Explicit valence for atom # 19 O, 3, is greater than permitted
RDKit ERROR: [00:50:06] Explicit valence for atom # 19 O, 3, is greater than permitted
RDKit ERROR: [00:50:06] Explicit valence for atom # 20 O, 3, is greater than permitted
[00:50:06] Explicit valence for atom # 19 O, 3, is greater than permitted
[00:50:06] Explicit valence for atom # 20 O, 3, is greater than permitted
RDKit ERROR: [00:50:06] Explicit valence for atom # 13 O, 3, is greater than permitted
[00:50:06] Explicit valence for atom # 13 O, 3, is greater than permitted
RDKit ERROR: [00:50:06] Explicit v

In [5]:
# Parse every SMILES string that survived the filtering step into an RDKit molecule.
rdkit_mols = [Chem.MolFromSmiles(smiles_line) for smiles_line in mols]

# Number of input lines, then number of parsed molecules, one per output line.
print(len(SMILES), len(rdkit_mols), sep='\n')

2138
2127


In [6]:
# Tanimoto similarity between two molecules, based on Morgan fingerprints.

def tanimoto(reference_molecule, researched_molecule, radius=2, n_bits=1024):
    """Return the Tanimoto similarity of two molecules' Morgan fingerprints.

    Both molecules are encoded as Morgan bit-vector fingerprints and compared
    with RDKit's ``DataStructs.TanimotoSimilarity``, i.e. the number of bits
    set in both fingerprints divided by the number of bits set in either.

    Parameters
    ----------
    reference_molecule : rdkit.Chem.rdchem.Mol
        Molecule to compare against.
    researched_molecule : rdkit.Chem.rdchem.Mol
        Molecule being scored.
    radius : int, optional
        Morgan fingerprint radius (default 2).
    n_bits : int, optional
        Length of the fingerprint bit vector (default 1024).

    Returns
    -------
    float
        Similarity in the range [0, 1]. If neither fingerprint has any bit
        set, 0.0 is returned instead of dividing by zero.
    """
    fp = AllChem.GetMorganFingerprintAsBitVect(reference_molecule, radius, nBits=n_bits)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(researched_molecule, radius, nBits=n_bits)

    # Two empty fingerprints have an empty union; define their similarity
    # explicitly rather than relying on a 0/0 division.
    if fp.GetNumOnBits() == 0 and fp2.GetNumOnBits() == 0:
        return 0.0

    return DataStructs.TanimotoSimilarity(fp, fp2)

In [7]:
# Similarity of every library molecule to ATL_444, keyed by the molecule object.
ATL_444_similarity = {
    candidate: tanimoto(ATL_444, candidate) for candidate in rdkit_mols
}

In [8]:
# Distinct similarity scores against ATL_444. The values are read from the dict
# exactly once; also looping over the keys would only re-add the same values.
ATL_444_similarity_values = set(ATL_444_similarity.values())
ATL_444_similarity_values

{0.056179775280898875,
 0.05813953488372093,
 0.058823529411764705,
 0.06,
 0.061224489795918366,
 0.061855670103092786,
 0.0625,
 0.06315789473684211,
 0.06382978723404255,
 0.06451612903225806,
 0.06521739130434782,
 0.06593406593406594,
 0.0660377358490566,
 0.06666666666666667,
 0.06741573033707865,
 0.06818181818181818,
 0.06896551724137931,
 0.06930693069306931,
 0.06976744186046512,
 0.07,
 0.07058823529411765,
 0.0707070707070707,
 0.07142857142857142,
 0.07216494845360824,
 0.07291666666666667,
 0.07317073170731707,
 0.07368421052631578,
 0.07407407407407407,
 0.07446808510638298,
 0.07476635514018691,
 0.07526881720430108,
 0.07547169811320754,
 0.07608695652173914,
 0.0761904761904762,
 0.07692307692307693,
 0.07766990291262135,
 0.07777777777777778,
 0.0784313725490196,
 0.07865168539325842,
 0.07920792079207921,
 0.07954545454545454,
 0.08,
 0.08045977011494253,
 0.08080808080808081,
 0.08139534883720931,
 0.08163265306122448,
 0.08235294117647059,
 0.08247422680412371,
 0

In [9]:
# Similarity of every library molecule to istradefylline, keyed by the molecule object.
istradefylline_similarity = {
    candidate: tanimoto(istradefylline, candidate) for candidate in rdkit_mols
}

In [10]:
# Distinct similarity scores against istradefylline. The values are read from the
# dict exactly once; also looping over the keys would only re-add the same values.
istradefylline_similarity_values = set(istradefylline_similarity.values())
istradefylline_similarity_values

{0.043478260869565216,
 0.04878048780487805,
 0.04938271604938271,
 0.05555555555555555,
 0.05714285714285714,
 0.05813953488372093,
 0.058823529411764705,
 0.0594059405940594,
 0.05952380952380952,
 0.06,
 0.060240963855421686,
 0.06097560975609756,
 0.06172839506172839,
 0.06315789473684211,
 0.06329113924050633,
 0.06382978723404255,
 0.0641025641025641,
 0.06451612903225806,
 0.06493506493506493,
 0.06666666666666667,
 0.06741573033707865,
 0.06818181818181818,
 0.06896551724137931,
 0.06976744186046512,
 0.07058823529411765,
 0.07142857142857142,
 0.07228915662650602,
 0.07317073170731707,
 0.07368421052631578,
 0.07407407407407407,
 0.07446808510638298,
 0.075,
 0.07526881720430108,
 0.0759493670886076,
 0.07692307692307693,
 0.07766990291262135,
 0.07777777777777778,
 0.0784313725490196,
 0.07865168539325842,
 0.07894736842105263,
 0.07920792079207921,
 0.07954545454545454,
 0.08,
 0.08045977011494253,
 0.08080808080808081,
 0.08139534883720931,
 0.08163265306122448,
 0.08235294

In [11]:
# Similarity of every library molecule to preladenant, keyed by the molecule object.
preladenant_similarity = {
    candidate: tanimoto(preladenant, candidate) for candidate in rdkit_mols
}

In [12]:
# Distinct similarity scores against preladenant. The values are read from the
# dict exactly once; also looping over the keys would only re-add the same values.
preladenant_similarity_values = set(preladenant_similarity.values())
preladenant_similarity_values

{0.0784313725490196,
 0.08,
 0.08080808080808081,
 0.08163265306122448,
 0.08247422680412371,
 0.08333333333333333,
 0.08411214953271028,
 0.08421052631578947,
 0.08490566037735849,
 0.0851063829787234,
 0.08571428571428572,
 0.08737864077669903,
 0.08823529411764706,
 0.0891089108910891,
 0.09,
 0.09009009009009009,
 0.09090909090909091,
 0.09183673469387756,
 0.09259259259259259,
 0.09278350515463918,
 0.09345794392523364,
 0.09375,
 0.09401709401709402,
 0.09433962264150944,
 0.09473684210526316,
 0.09523809523809523,
 0.09574468085106383,
 0.09615384615384616,
 0.0967741935483871,
 0.0970873786407767,
 0.09803921568627451,
 0.09900990099009901,
 0.0990990990990991,
 0.1,
 0.10091743119266056,
 0.10101010101010101,
 0.10185185185185185,
 0.10204081632653061,
 0.102803738317757,
 0.10309278350515463,
 0.10377358490566038,
 0.10416666666666667,
 0.10434782608695652,
 0.10476190476190476,
 0.10526315789473684,
 0.10576923076923077,
 0.10619469026548672,
 0.10638297872340426,
 0.1067961

In [13]:
# Similarity of every library molecule to SCH_58261, keyed by the molecule object.
SCH_58261_similarity = {
    candidate: tanimoto(SCH_58261, candidate) for candidate in rdkit_mols
}

In [14]:
# Distinct similarity scores against SCH_58261. The values are read from the
# dict exactly once; also looping over the keys would only re-add the same values.
SCH_58261_similarity_values = set(SCH_58261_similarity.values())
SCH_58261_similarity_values

{0.04597701149425287,
 0.046511627906976744,
 0.04938271604938271,
 0.056179775280898875,
 0.056818181818181816,
 0.05747126436781609,
 0.06097560975609756,
 0.06172839506172839,
 0.06329113924050633,
 0.06451612903225806,
 0.06521739130434782,
 0.06593406593406594,
 0.06666666666666667,
 0.06741573033707865,
 0.06818181818181818,
 0.06896551724137931,
 0.06930693069306931,
 0.06976744186046512,
 0.07058823529411765,
 0.07142857142857142,
 0.07216494845360824,
 0.07228915662650602,
 0.07291666666666667,
 0.07317073170731707,
 0.07368421052631578,
 0.07407407407407407,
 0.07446808510638298,
 0.075,
 0.07526881720430108,
 0.0759493670886076,
 0.07608695652173914,
 0.07692307692307693,
 0.07777777777777778,
 0.07792207792207792,
 0.07865168539325842,
 0.07920792079207921,
 0.07954545454545454,
 0.08,
 0.08045977011494253,
 0.08080808080808081,
 0.08139534883720931,
 0.08163265306122448,
 0.08235294117647059,
 0.08247422680412371,
 0.08333333333333333,
 0.08421052631578947,
 0.084337349397

In [15]:
# Similarity of every library molecule to SCH_442_416, keyed by the molecule object.
SCH_442_416_similarity = {
    candidate: tanimoto(SCH_442_416, candidate) for candidate in rdkit_mols
}

In [16]:
# Distinct similarity scores against SCH_442_416. The values are read from the
# dict exactly once; also looping over the keys would only re-add the same values.
SCH_442_416_similarity_values = set(SCH_442_416_similarity.values())
SCH_442_416_similarity_values

{0.06451612903225806,
 0.06862745098039216,
 0.06896551724137931,
 0.06976744186046512,
 0.07058823529411765,
 0.07142857142857142,
 0.07216494845360824,
 0.07368421052631578,
 0.07547169811320754,
 0.07608695652173914,
 0.07692307692307693,
 0.07777777777777778,
 0.0784313725490196,
 0.07865168539325842,
 0.07954545454545454,
 0.08,
 0.08045977011494253,
 0.08080808080808081,
 0.08139534883720931,
 0.08163265306122448,
 0.08181818181818182,
 0.08235294117647059,
 0.08247422680412371,
 0.08256880733944955,
 0.08333333333333333,
 0.08421052631578947,
 0.08433734939759036,
 0.08490566037735849,
 0.0851063829787234,
 0.08536585365853659,
 0.08571428571428572,
 0.08602150537634409,
 0.08653846153846154,
 0.08695652173913043,
 0.08737864077669903,
 0.08791208791208792,
 0.08823529411764706,
 0.08888888888888889,
 0.0891089108910891,
 0.0898876404494382,
 0.09,
 0.09090909090909091,
 0.09183673469387756,
 0.09195402298850575,
 0.09259259259259259,
 0.09278350515463918,
 0.09302325581395349,


In [17]:
# Similarity of every library molecule to ZM_241_385, keyed by the molecule object.
ZM_241_385_similarity = {
    candidate: tanimoto(ZM_241_385, candidate) for candidate in rdkit_mols
}

In [18]:
# Distinct similarity scores against ZM_241_385. The values are read from the
# dict exactly once; also looping over the keys would only re-add the same values.
ZM_241_385_similarity_values = set(ZM_241_385_similarity.values())
ZM_241_385_similarity_values

{0.06172839506172839,
 0.06896551724137931,
 0.06976744186046512,
 0.07058823529411765,
 0.07228915662650602,
 0.07317073170731707,
 0.07407407407407407,
 0.07526881720430108,
 0.0759493670886076,
 0.07608695652173914,
 0.07692307692307693,
 0.07920792079207921,
 0.08045977011494253,
 0.08080808080808081,
 0.08139534883720931,
 0.08163265306122448,
 0.08235294117647059,
 0.08247422680412371,
 0.08333333333333333,
 0.08421052631578947,
 0.08433734939759036,
 0.0851063829787234,
 0.08536585365853659,
 0.08602150537634409,
 0.08641975308641975,
 0.08695652173913043,
 0.0875,
 0.08791208791208792,
 0.08860759493670886,
 0.08888888888888889,
 0.0891089108910891,
 0.08974358974358974,
 0.0898876404494382,
 0.09,
 0.09090909090909091,
 0.09183673469387756,
 0.09195402298850575,
 0.09210526315789473,
 0.09278350515463918,
 0.09302325581395349,
 0.09375,
 0.09411764705882353,
 0.09473684210526316,
 0.09523809523809523,
 0.09574468085106383,
 0.0963855421686747,
 0.0967741935483871,
 0.097560975

In [19]:
# Keep only the molecules whose similarity to SCH_442_416 is strictly above 0.25,
# then show how many passed.
SCH_442_416_filter = {
    candidate: score
    for candidate, score in SCH_442_416_similarity.items()
    if score > 0.25
}

len(SCH_442_416_filter)

11

In [20]:
# Keep only the molecules whose similarity to SCH_58261 is strictly above 0.25,
# then show how many passed.
SCH_58261_filter = {
    candidate: score
    for candidate, score in SCH_58261_similarity.items()
    if score > 0.25
}

len(SCH_58261_filter)

4

In [21]:
# Keep only the molecules whose similarity to preladenant is strictly above 0.25,
# then show how many passed.
preladenant_filter = {
    candidate: score
    for candidate, score in preladenant_similarity.items()
    if score > 0.25
}

len(preladenant_filter)

4

In [22]:
# Keep only the molecules whose similarity to istradefylline is strictly above 0.25,
# then show how many passed.
istradefylline_filter = {
    candidate: score
    for candidate, score in istradefylline_similarity.items()
    if score > 0.25
}

len(istradefylline_filter)

31

In [23]:
# Merge the four filtered dicts into one. A key present in several of them
# appears once, at its first position, with the value from the last dict merged.
tanimoto_dict = dict(istradefylline_filter)
tanimoto_dict.update(preladenant_filter)
tanimoto_dict.update(SCH_58261_filter)
tanimoto_dict.update(SCH_442_416_filter)

In [24]:
# Number of distinct keys in the merged dict.
len(tanimoto_dict)

47

In [25]:
# Display the merged mapping as the cell output.
tanimoto_dict

{<rdkit.Chem.rdchem.Mol at 0x7f1a566eec40>: 0.2676056338028169,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f1f40>: 0.25925925925925924,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f28e0>: 0.36507936507936506,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f43a0>: 0.2564102564102564,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f4580>: 0.2727272727272727,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f5040>: 0.2987012987012987,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f6460>: 0.2716049382716049,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f8b80>: 0.25333333333333335,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f8d60>: 0.25882352941176473,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f8e20>: 0.28378378378378377,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f9c40>: 0.273972602739726,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f9d00>: 0.2597402597402597,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566f9ee0>: 0.25882352941176473,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566fa460>: 0.2702702702702703,
 <rdkit.Chem.rdchem.Mol at 0x7f1a566fa8e0>: 0.25333333333333335,
 <rdkit.Chem.rdchem.Mol at 0x7f1a5

In [26]:
# Convert every key of the merged dict back to a SMILES string, in dict order.
tanimoto_smiles = [Chem.MolToSmiles(selected) for selected in tanimoto_dict]

In [27]:
# Display the list of SMILES strings as the cell output.
tanimoto_smiles

['COc1cccc(Cn2c(=O)n(Cc3ccccc3)c(=O)c3nccnc32)c1',
 'CCCCNC(=O)Cn1c(=O)n(Cc2ccc(OC)cc2)c(=O)c2c1c(C)nn2CC',
 'CCn1c(=O)n(Cc2ccc(OC)cc2)c(=O)c2nccnc21',
 'COc1ccc(C(=O)c2cn(CC(=O)c3cc(OC)ccc3OC)c3nc(C)ccc3c2=O)cc1',
 'CCn1nc(C)c2c1c(=O)n(Cc1ccc(F)cc1)c(=O)n2Cc1cccc(OC)c1',
 'CCn1cc(C(=O)NC(C)c2ccc(OC)c(OC)c2)c(=O)c2ccc(C)nc21',
 'COC(=O)c1cn(CC(=O)NCCc2ccc(OC)c(OC)c2)c2nc(C)ccc2c1=O',
 'COc1ccc(Cn2c(=O)c3nccnc3n(Cc3ccccc3F)c2=O)cc1',
 'COc1ccc(CCNC(=O)Cn2cc(C(=O)c3ccccc3)c(=O)c3ccc(C)nc32)cc1OC',
 'COc1ccc(Cn2c(=O)c3nccnc3n(Cc3cc(C)ccc3C)c2=O)cc1',
 'COc1ccc(Cn2c(=O)c3nccnc3n(Cc3ccccc3C)c2=O)cc1',
 'COc1ccc(Cn2c(=O)c3nccnc3n(Cc3ccc(F)cc3Cl)c2=O)cc1',
 'COc1ccc(CCNC(=O)Cn2cc(C(=O)c3ccc(C)cc3)c(=O)c3ccc(C)nc32)cc1OC',
 'CCn1nc(C)c2c1c(=O)n(CCc1ccccc1)c(=O)n2Cc1cccc(C)c1',
 'COc1cccc(Cn2c(=O)n(Cc3ccc(F)cc3)c(=O)c3nccnc32)c1',
 'COc1cccc(CNC(=O)Cn2c(=O)n(CCc3ccccc3)c(=O)c3nccnc32)c1OC',
 'COc1ccc(Cn2c(=O)c3nccnc3n(Cc3ccccc3Cl)c2=O)cc1',
 'CCOc1ccc(CNC(=O)Cn2nc(C)c3c(C)onc3c2=O)cc1OC',
 'CCn

In [28]:
# Write the selected SMILES strings to disk, one per line.
with open('tanimoto_smiles.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines("%s\n" % item for item in tanimoto_smiles)