In [1]:
#Reference Paper:
'''
[1]LINGO, an Efficient Holographic Text Based Method To Calculate Biophysical
Properties and Intermolecular Similarities
David Vidal, Michael Thormann ,and Miquel Pons
Journal of Chemical Information and Modeling
2005,45(2),pp 386-393
http://pubs.acs.org/doi/abs/10.1021/ci0496797
[2]Lingos, finite state machines, and fast similarity searching
J. Andrew Grant, James A. Haigh, Barry T. Pickup et al.
2006,46(5),pp 1912-1918
http://pubs.acs.org/doi/abs/10.1021/ci6002152
'''

def similar(mols,q):
    '''Return the LINGO Tanimoto similarity (Tc) of the first two strings in ``mols``.

    A lingo is a substring of length ``q``.  For every distinct lingo found in
    either string the term ``1 - |NA - NB| / (NA + NB)`` is accumulated, where
    ``NA`` and ``NB`` are the numbers of (possibly overlapping) occurrences of
    that lingo in ``mols[0]`` and ``mols[1]``.  Tc is the mean of those terms
    over all distinct lingos (Reference [1]).

    Parameters
    ----------
    mols : sequence of str
        The strings to compare.  Only ``mols[0]`` and ``mols[1]`` take part in
        the comparison; any further entries are ignored.
    q : int
        The length of the lingo; must be at least 1.

    Returns
    -------
    float
        Value between 0 and 1.  ``0.0`` is returned when neither string is
        long enough to contain a single lingo (previously a ZeroDivisionError).

    Raises
    ------
    ValueError
        If fewer than two strings are supplied or ``q`` is smaller than 1.
    '''
    # Function-scope import keeps this cell runnable on its own.
    from collections import Counter

    if len(mols) < 2:
        raise ValueError('similar() needs two molecules to compare')
    if q < 1:
        raise ValueError('q (the lingo length) must be at least 1')

    # Count each lingo once per molecule instead of rescanning both strings
    # for every lingo.  Counter keeps first-seen order, so the terms below are
    # summed in the same order as before and the float result is unchanged.
    counts_a = Counter(mols[0][i:i+q] for i in range(len(mols[0])-q+1))
    counts_b = Counter(mols[1][i:i+q] for i in range(len(mols[1])-q+1))

    # Distinct lingos: those of the first molecule, then the new ones of the second.
    lingos = list(counts_a) + [lingo for lingo in counts_b if lingo not in counts_a]
    if not lingos:
        # Both strings are shorter than q: there is nothing to compare.
        return 0.0

    numerator = 0
    for lingo in lingos:
        NA = counts_a[lingo]  # a missing key counts as 0 and is not inserted
        NB = counts_b[lingo]
        # NA + NB >= 1 here because every lingo comes from one of the two strings.
        numerator += 1 - abs(NA-NB) / float(NA+NB)

    # Reference [2] variant (kept for reference, not returned):
    #   cnumerator = sum of min(NA, NB) over all lingos
    #   Tab = float(cnumerator) / (len(mols[0])+len(mols[1]) - 2*q +2 -cnumerator)
    return numerator / float(len(lingos)) #Reference [1]


#Test: score one pair of SMILES strings using lingos of length 4
smiles_a = 'CC(C)C=CCCCCC(=O)NCc1ccc(c(c1)OC)O'
smiles_b = 'COC1=C(C=CC(=C1)C=O)O'
mols = [smiles_a, smiles_b]

print(similar(mols, 4))

0.044444444444444446


In [2]:
import pandas as pd

# Load the two input tables.
#   molList - read from 'data/fulldata.csv'; the first CSV column becomes the row index.
#             (Alternative input used earlier: pd.read_csv('data/ABCB1.csv',index_col=0))
#   molX    - read from 'data/test_drug.txt' with the default integer index.
molList = pd.read_csv(filepath_or_buffer='data/fulldata.csv', index_col=0)
molX = pd.read_csv(filepath_or_buffer='data/test_drug.txt')

# Preview the first rows of molList (use molX.head() to preview the other table).
molList.head()


Unnamed: 0,TransporterID,Gene_name,Protein_Name_generally,Synonyms,DrugID,Drug_Name,synonyms,final_smiles
0,DTD0001,ABCC1,Multidrug resistance-associated protein 1,ABC29; ABCC; ABCC1; ATP-binding cassette sub-f...,DR00015,Adefovir,((2-(6-Amino-9H-purin-9-yl)ethoxy)methyl)phosp...,C1=NC(=C2C(=N1)N(C=N2)CCOCP(=O)(O)O)N
1,DTD0002,ABCC2,Multidrug resistance-associated protein 2,ABC30; ABCC2; ATP-binding cassette sub-family ...,DR00015,Adefovir,((2-(6-Amino-9H-purin-9-yl)ethoxy)methyl)phosp...,C1=NC(=C2C(=N1)N(C=N2)CCOCP(=O)(O)O)N
2,DTD0004,ABCG2,Breast cancer resistance protein,ABC15; ABCG2; ABCP; ATP-binding cassette sub-f...,DR00015,Adefovir,((2-(6-Amino-9H-purin-9-yl)ethoxy)methyl)phosp...,C1=NC(=C2C(=N1)N(C=N2)CCOCP(=O)(O)O)N
3,DTD0015,ABCC4,Multidrug resistance-associated protein 4,ABCC4; ATP-binding cassette sub-family C membe...,DR00015,Adefovir,((2-(6-Amino-9H-purin-9-yl)ethoxy)methyl)phosp...,C1=NC(=C2C(=N1)N(C=N2)CCOCP(=O)(O)O)N
4,DTD0024,SLC22A6,Organic anion transporter 1,OAT1; PAH transporter; PAHT; ROAT1; Renal orga...,DR00015,Adefovir,((2-(6-Amino-9H-purin-9-yl)ethoxy)methyl)phosp...,C1=NC(=C2C(=N1)N(C=N2)CCOCP(=O)(O)O)N


In [3]:
result_data = []
# Score every row of molList against every row of molX with lingos of length 4.
# The columns are walked in parallel by position (zip) instead of being looked
# up with series[i]: molList's index comes from the CSV (index_col=0), so an
# integer label lookup only works while that index happens to be exactly 0..n-1.
# Row order is unchanged: molList rows in the outer loop, molX rows in the inner.
for drug_name, drug_smiles, gene_name in zip(
        molList['Drug_Name'], molList['final_smiles'], molList['Gene_name']):
    for query_name, query_smiles in zip(molX['Drug_Name'], molX['Canonical_SMILES']):
        result_data.append([
            drug_name,
            similar([query_smiles, drug_smiles], 4),
            gene_name,
            query_name
        ])

# One row per (molList row, molX row) pair.
results = pd.DataFrame(result_data, columns=['Drug', 'Score', 'GeneName', 'DrugX'])

#show first 30 rows of the results
results.head(30)


In [4]:
# Flag every pair whose score is strictly above the 0.6 threshold (1 = hit, 0 = no hit)
results['Similarity'] = results['Score'].gt(0.6).astype('int64')
# Number of pairs flagged as hits
display(results['Similarity'].sum())

# Show only the flagged pairs
results.loc[results['Similarity'].eq(1)]


57

Unnamed: 0,Drug,Score,GeneName,DrugX,Similarity
8988,Cyclosporine,0.97561,ABCC2,Cyclosporine,1
9008,Cyclosporine,0.97561,ABCB1,Cyclosporine,1
9028,Cyclosporine,0.97561,ABCG2,Cyclosporine,1
13620,Dasatinib,0.717647,ABCB1,Dasatinib,1
13640,Dasatinib,0.717647,ABCG2,Dasatinib,1
13660,Dasatinib,0.717647,SLCO1B1,Dasatinib,1
13680,Dasatinib,0.717647,SLCO1B3,Dasatinib,1
14204,Eliglustat tartrate,0.772069,ABCB1,Eliglustat tartrate,1
15721,Nifedipine,0.836601,ABCB1,Nisoldipine,1
16946,Clonidine,1.0,ABCB1,Clonidine,1


In [5]:
# Keep the flagged pairs, minus those where both sides carry the same drug name
results_unique = results[
    results['Similarity'].eq(1) & results['Drug'].ne(results['DrugX'])
]
results_unique

Unnamed: 0,Drug,Score,GeneName,DrugX,Similarity
15721,Nifedipine,0.836601,ABCB1,Nisoldipine,1
17922,Trifluoperazine,0.703704,ABCB1,Fluphenazine,1
18402,Perphenazine,0.72549,ABCB1,Fluphenazine,1
18422,Perphenazine,0.72549,SLC22A1,Fluphenazine,1
19039,Hydroxyurea,0.857143,ABCB1,Urea,1
19059,Hydroxyurea,0.857143,SLCO1B1,Urea,1
19079,Hydroxyurea,0.857143,SLCO1A2,Urea,1
19099,Hydroxyurea,0.857143,SLCO1B3,Urea,1
19161,Nimodipine,0.7,ABCB1,Nisoldipine,1
19862,Trans-flupentixol,0.849333,ABCB1,Fluphenazine,1


In [15]:
# Build molR in this cell so the notebook runs top to bottom on a fresh kernel:
# molList rows whose Drug_Name appears as a DrugX in results_unique ...
molR = molList[molList['Drug_Name'].isin(results_unique['DrugX'])]
# ... with duplicates in molR['final_smiles'] removed (the first occurrence is kept)
molR = molR.drop_duplicates(subset=['final_smiles'])

molR

Unnamed: 0,TransporterID,Gene_name,Protein_Name_generally,Synonyms,DrugID,Drug_Name,synonyms,final_smiles
895,DTD0003,ABCB1,P-glycoprotein 1,ABC20; ABCB1; ATP-binding cassette sub-family ...,DR00658,Fluphenazine,1-(2-Hydroxyethyl)-4-(3-(trifluoromethyl-10-ph...,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...
898,DTD0003,ABCB1,P-glycoprotein 1,ABC20; ABCB1; ATP-binding cassette sub-family ...,DR00668,Nisoldipine,"(+-)-Isobutyl methyl 1,4-dihydro-2,6-dimethyl-...",CC1=C(C(C(=C(N1)C)C(=O)OCC(C)C)C2=CC=CC=C2[N+]...
1314,DTD0006,SLC22A2,Organic cation transporter 2,OCT2; SLC22A2; Solute carrier family 22 member...,DR00106,Tetraethylammonium,4-04-00-00331 (Beilstein Handbook Reference); ...,CC[N+](CC)(CC)CC
1595,DTD0023,SLC22A5,Organic cation/carnitine transporter 2,CDSP; High-affinity sodium-dependent carnitine...,DR00255,L-carnitine,(-)-(R)-3-Hydroxy-4-(trimethylammonio)butyrate...,C[N+](C)(C)CC(CC(=O)[O-])O
1646,DTD0029,SLCO1A2,Organic anion transporting polypeptide 1A2,OATP-A; OATP1A2; Organic anion-transporting po...,DR00048,Urea,.,C(=O)(N)N


In [19]:
# molList rows whose Drug_Name occurs as a DrugX in results_unique,
# keeping only the first row for each distinct final_smiles value.
molR = (
    molList[molList['Drug_Name'].isin(results_unique['DrugX'])]
    .drop_duplicates(subset=['final_smiles'])
)

# Score every molList row against every molR row with lingos of length 4.
# Rows are taken by position; molList drives the outer loop, molR the inner one.
result_data_val = [
    [drug_name, similar([ref_smiles, drug_smiles], 4), gene_name, ref_name]
    for drug_name, drug_smiles, gene_name in zip(
        molList['Drug_Name'], molList['final_smiles'], molList['Gene_name'])
    for ref_name, ref_smiles in zip(molR['Drug_Name'], molR['final_smiles'])
]


In [21]:
# Tabulate the validation scores and flag pairs scoring strictly above 0.6
results_val = pd.DataFrame(result_data_val, columns=['Drug', 'Score', 'GeneName', 'DrugX'])
results_val['Similarity'] = results_val['Score'].gt(0.6).astype('int64')
# Number of flagged pairs
display(results_val['Similarity'].sum())

# Flagged pairs, excluding those where both sides carry the same drug name
results_unique_val = results_val[
    results_val['Similarity'].eq(1) & results_val['Drug'].ne(results_val['DrugX'])
]
results_unique_val


28

Unnamed: 0,Drug,Score,GeneName,DrugX,Similarity
3931,Nifedipine,0.836601,ABCB1,Nisoldipine,1
4480,Trifluoperazine,0.703704,ABCB1,Fluphenazine,1
4600,Perphenazine,0.72549,ABCB1,Fluphenazine,1
4605,Perphenazine,0.72549,SLC22A1,Fluphenazine,1
4759,Hydroxyurea,0.857143,ABCB1,Urea,1
4764,Hydroxyurea,0.857143,SLCO1B1,Urea,1
4769,Hydroxyurea,0.857143,SLCO1A2,Urea,1
4774,Hydroxyurea,0.857143,SLCO1B3,Urea,1
4791,Nimodipine,0.7,ABCB1,Nisoldipine,1
4965,Trans-flupentixol,0.849333,ABCB1,Fluphenazine,1
