In [1]:
#Reference Paper:
'''
[1]LINGO, an Efficient Holographic Text Based Method To Calculate Biophysical
Properties and Intermolecular Similarities
David Vidal, Michael Thormann, and Miquel Pons
Journal of Chemical Information and Modeling
2005,45(2),pp 386-393
http://pubs.acs.org/doi/abs/10.1021/ci0496797
[2]Lingos, finite state machines, and fast similarity searching
J. Andrew Grant, James A. Haigh, Barry T. Pickup et al.
Journal of Chemical Information and Modeling
2006,46(5),pp 1912-1918
http://pubs.acs.org/doi/abs/10.1021/ci6002152
'''

def similar(mols, q):
    """Return the LINGO-based Tanimoto similarity of two SMILES strings.

    Each string is cut into its overlapping substrings of length ``q`` (its
    "lingos"). For every distinct lingo found in either string, with ``NA``
    occurrences in the first string and ``NB`` in the second, the term
    ``1 - |NA - NB| / (NA + NB)`` is computed; the result is the mean of
    these terms over all distinct lingos (Tc of reference [1] listed at the
    top of this cell).

    Parameters
    ----------
    mols : sequence of str
        The strings to compare. Only ``mols[0]`` and ``mols[1]`` are used.
    q : int
        The length of the lingo.

    Returns
    -------
    float
        The similarity, between 0 and 1. ``0.0`` is returned when neither
        string contains a substring of length ``q`` (previously this case
        raised ``ZeroDivisionError``).
    """
    # Local import so the function stays usable on its own in a fresh cell.
    from collections import Counter

    def count_lingos(mol):
        # One pass per string instead of rescanning it for every lingo.
        return Counter(mol[i:i + q] for i in range(len(mol) - q + 1))

    counts_a = count_lingos(mols[0])
    counts_b = count_lingos(mols[1])

    # Distinct lingos in order of first appearance (first string, then the
    # ones only seen in the second). Keeping this order makes the floating
    # point sum below identical to the original list-based implementation.
    lingos = list(counts_a)
    lingos.extend(lingo for lingo in counts_b if lingo not in counts_a)

    # Both strings shorter than q: there is nothing to compare.
    if not lingos:
        return 0.0

    numerator = 0
    for lingo in lingos:
        na = counts_a[lingo]  # Counter returns 0 for a missing lingo
        nb = counts_b[lingo]
        numerator += 1 - abs(na - nb) / float(na + nb)

    # Alternative score from reference [2], kept for reference only:
    #   cnumerator = sum(min(NA, NB)) over all lingos
    #   Tab = float(cnumerator) / (len(mols[0]) + len(mols[1]) - 2*q + 2 - cnumerator)
    return numerator / float(len(lingos))  # Reference [1]


#Test
# Two sample SMILES strings used as a quick check of similar().
mols = [
    'CC(C)C=CCCCCC(=O)NCc1ccc(c(c1)OC)O',
    'COC1=C(C=CC(=C1)C=O)O',
]

# Compare the pair using lingos of length 4 and print the score.
print(similar(mols, 4))

0.044444444444444446


In [2]:
import pandas as pd

# Read the reference drug table as 'molList'; index_col=0 makes the first
# CSV column the row index.
molList = pd.read_csv('data/ABCB1.csv',index_col=0)
# Read the query drugs as 'molX'. No index_col is given, so pandas assigns
# its default integer row index.
molX = pd.read_csv('data/test_drug.txt')
#molList.head()
# Preview the first rows of the query drugs.
molX.head()


Unnamed: 0.1,Unnamed: 0,DrugID,Drug_Name,Canonical_SMILES,substrate
0,74,DR00182,Dasatinib,CC1=C(C(=CC=C1)Cl)NC(=O)C2=CN=C(S2)NC3=NC(=NC(...,1
1,235,DR00668,Nisoldipine,CC1=C(C(C(=C(N1)C)C(=O)OCC(C)C)C2=CC=CC=C2[N+]...,1
2,232,DR00658,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,1
3,394,DR01484,Apafant,Cc1nnc2CN=C(c3ccccc3Cl)c4cc(CCC(=O)N5CCOCC5)sc...,1
4,89,DR00220,Eliglustat tartrate,CCCCCCCC(=O)NC(CN1CCCC1)C(C2=CC3=C(C=C2)OCCO3)...,1


In [3]:
# Empty table declaring the four result columns. It is reassigned from the
# collected rows once the pairwise comparison loop has run.
results = pd.DataFrame(
    columns=['Drug', 'Score', 'GeneName', 'DrugX'],
)

In [4]:
 # Number of entries in the query table's Canonical_SMILES column.
 len(molX['Canonical_SMILES'])


20

In [5]:
# Score every reference drug in molList against every query drug in molX and
# collect one row per pair; the rows are turned into the 'results' data frame
# in a single step afterwards.
#
# The columns are iterated by position with zip() instead of being indexed
# with range(len(...)): molList[col][i] is a *label* lookup, so it only works
# while the frame's index happens to be 0..n-1 (molList takes its index from
# the first CSV column). Positional iteration gives the same row order and
# avoids five Series lookups per pair.
result_data = []
reference_rows = zip(
    molList['Drug_Name'],
    molList['Canonical_SMILES'],
    molList['Gene_name'],
)
for drug_name, drug_smiles, gene_name in reference_rows:
    for query_name, query_smiles in zip(molX['Drug_Name'], molX['Canonical_SMILES']):
        result_data.append([
            drug_name,
            # lingo length 4, query SMILES first (same argument order as before)
            similar([query_smiles, drug_smiles], 4),
            gene_name,
            query_name,
        ])

results = pd.DataFrame(result_data, columns=['Drug', 'Score', 'GeneName', 'DrugX'])

#show first 30 rows of the results


In [6]:
# Binarize the results: 1 when the score is strictly above the threshold,
# otherwise 0.
SIMILARITY_THRESHOLD = 0.6
# Vectorized comparison instead of a per-row lambda. Missing scores compare
# as False and therefore map to 0, exactly as the lambda did.
results['Similarity'] = (results['Score'] > SIMILARITY_THRESHOLD).astype('int64')
# Number of drug pairs above the threshold.
display(results['Similarity'].sum())

# Show only the pairs flagged as similar.
results[results['Similarity']==1]


17

Unnamed: 0,Drug,Score,GeneName,DrugX,Similarity
1480,Dasatinib,1.0,ABCB1,Dasatinib,1
1784,Eliglustat tartrate,1.0,ABCB1,Eliglustat tartrate,1
3181,Nifedipine,0.836601,ABCB1,Nisoldipine,1
3588,Cyclosporine,1.0,ABCB1,Cyclosporine,1
4026,Clonidine,1.0,ABCB1,Clonidine,1
4429,Dipyridamole,1.0,ABCB1,Dipyridamole,1
4642,Fluphenazine,1.0,ABCB1,Fluphenazine,1
4662,Trifluoperazine,0.703704,ABCB1,Fluphenazine,1
4701,Nisoldipine,1.0,ABCB1,Nisoldipine,1
5062,Perphenazine,0.72549,ABCB1,Fluphenazine,1


In [7]:
# Sanity check: compare a SMILES string stored in the data frame against a
# known-correct SMILES string.

# SMILES stored under index label 0 of the reference table.
stored_smiles = molList['Canonical_SMILES'][0]
# The SMILES string treated as correct for this check.
molCorrect = 'CC(C)C(=O)OC1=C(C=C(C=C1)CO)C(CCN(C(C)C)C(C)C)C2=CC=CC=C2.C(=CC(=O)O)C(=O)O'
# Similarity between the stored and the correct SMILES, using lingos of length 4.
score = similar([stored_smiles, molCorrect], 4)

score

0.635593220338983