In [2]:
from generationRNN.generate_molecules import GenerateMolecules
from affinityCNN.predict_affinity import AffinityPrediction
from rdkit import Chem
import numpy as np
from molCombiner.mol_comb import MolComb
import os

In [20]:
fasta = "MSRPPPTGKMPGAPETAPGDGAGASRQRKLEALIRDPRSPINVESLLDGLNSLVLDLDFPALRKNKNIDNFLNRYEKIVKKIRGLQMKAEDYDVVKVIGRGAFGEVQLVRHKASQKVYAMKLLSKFEMIKRSDSAFFWEERDIMAFANSPWVVQLFYAFQDDRYLYMVMEYMPGGDLVNLMSNYDVPEKWAKFYTAEVVLALDAIHSMGLIHRDVKPDNMLLDKHGHLKLADFGTCMKMDETGMVHCDTAVGTPDYISPEVLKSQGGDGFYGRECDWWSVGVFLYEMLVGDTPFYADSLVGTYSKIMDHKNSLCFPEDAEISKHAKNLICAFLTDREVRLGRNGVEEIRQHPFFKNDQWHWDNIRETAAPVVPELSSDIDSSNFDDIEDDKGDVETFPIPKAFVGNQLPFIGFTYYRENLLLSDSPSCRENDSIQSRKNEESQEIQKKLYTLEEHLSNEMQAKEELEQKCKSVNTRLEKTAKELEEEITLRKSVESALRQLEREKALLQHKNAEYQRKADHEADKKRNLENDVNSLKDQLEDL"
comb = MolComb(fasta)
fitness = np.vectorize(lambda mol: comb.get_affinity(mol))

In [5]:
with open('generationRNN/data/100k_SMILES.txt') as f:
    raw_text = f.read()
raw_data = np.array(raw_text.split("\n"))
smiles = []
for mol in raw_data:
    smiles.append(mol.replace('/', "").replace('@', "").replace('\\', "").replace('.', ""))
raw_data = smiles
k = 10
pop = np.array(raw_data[:k])

In [6]:
history = np.empty((1,2))
while True:
    print("Getting fitness")
    f = fitness(pop)
    bestf = np.array([f.min(), pop[f.argmin()]])
    history = np.append(history, bestf)
    
    print(history)
    print(f.mean())
    
    if (f.min() < 50):
        break
    
    p = (f.max() - f)**2 / ((f.max() - f)**2).sum()
    parents = np.random.choice(pop, size=(k,2), p=p)

    print("Combining molecules")
    pop = np.apply_along_axis(lambda row: comb.combine(row[0], row[1]), axis=1, arr=parents).flatten()

Getting fitness
['0.0' '1.0' '53.72521209716797'
 'CCOC(=O)[CH]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1']
540.8024612426758
Combining molecules
Predicting cddd representations...
Decoding predicted cddd representations...
['Cc1ccc(-c2nc(C=O)c(SC3=C(C=O)CCCCC3(C=O)C3CCCCCC3C#N)n2CC#N)cc1']
Predicting cddd representations...
Decoding predicted cddd representations...
['N#Cc1ccc(C(Oc2ccc(C3CCCCN3)cc2)c2ccccc2)cc1']
Predicting cddd representations...
Decoding predicted cddd representations...
['Cn1ccc(C(Oc2ccc(C=O)c(C(C=O)c3ccccc3)c2)c2ccc(C=O)cc2)c1']
Predicting cddd representations...
Decoding predicted cddd representations...
['CC(=CSc1nc(C#N)cs1)C1(C#N)CC(NC(C)=O)CCCC1C#N']
Predicting cddd representations...
Decoding predicted cddd representations...
['N#CC1=C(SCc2ccccc2)N(CC(N)=O)C2(CCCCC2)N=C1Cl']
Predicting cddd representations...
Decoding predicted cddd representations...
['Cc1ccc(-c2nc(C3CCCCNC3C)n3c2CCCCC3)cc1']
Predicting cddd representations...
Decoding predicted cddd represe

In [None]:
%%time
fitness(pop[0])

In [None]:
"C[CH]1CN(C(=O)c2cc(Br)cn2C)CC[CH]1[NH3]" in raw_data

In [7]:
%%time
f = fitness(pop)

CPU times: user 2.25 s, sys: 233 ms, total: 2.49 s
Wall time: 2.58 s


In [9]:
test_pop = raw_data[:4]

In [10]:
test_pop

['CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1',
 'C[CH]1CC(Nc2cncc(-c3nncn3C)c2)C[CH](C)C1',
 'N#Cc1ccc(-c2ccc(O[CH](C(=O)N3CCCC3)c3ccccc3)cc2)cc1',
 'CCOC(=O)[CH]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1']

In [16]:
from multiprocessing import Pool

In [14]:
test_pop

['CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1',
 'C[CH]1CC(Nc2cncc(-c3nncn3C)c2)C[CH](C)C1',
 'N#Cc1ccc(-c2ccc(O[CH](C(=O)N3CCCC3)c3ccccc3)cc2)cc1',
 'CCOC(=O)[CH]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1']

In [27]:
def test_func(mol):
    return(comb.get_affinity(mol))

In [30]:
p = Pool()
result = p.map(test_func, test_pop[0])
p.close()
p.join()

In [26]:
result

['CC(C)(C)c1ccc2occ(CC(=O)Nc3ccccc3F)c2c1',
 'C[CH]1CC(Nc2cncc(-c3nncn3C)c2)C[CH](C)C1',
 'N#Cc1ccc(-c2ccc(O[CH](C(=O)N3CCCC3)c3ccccc3)cc2)cc1',
 'CCOC(=O)[CH]1CCCN(C(=O)c2nc(-c3ccc(C)cc3)n3c2CCCCC3)C1']

In [22]:
%%time
fitness(raw_data[0])

CPU times: user 223 ms, sys: 61.4 ms, total: 284 ms
Wall time: 292 ms


310.0136413574219