In [26]:
import ast
import random

import pandas as pd
# Fix the RNG seed so the random test-set sampling further down is repeatable
random.seed(100)

# Drug and transporter tables downloaded from the varidt database (tab-separated text)
SMILES_TABLE = 'data/SMILES_and_InChI_for_the_drugs_transported_by_DTs.txt'
SYNONYMS_TABLE = 'data/Synonyms_of_DTs_and_their_corresponding_drugs.txt'

smiles = pd.read_csv(SMILES_TABLE, sep='\t')      # useful -> SMILES of the drugs
matching = pd.read_csv(SYNONYMS_TABLE, sep='\t')  # useful -> matching between transporter and drug

In [27]:
# Keep only the transporter/drug rows whose gene is ABCB1
is_abcb1 = matching['Gene_name'].eq('ABCB1')
ABCB1 = matching[is_abcb1]
ABCB1.shape

(433, 7)

In [28]:
# Attach each drug's canonical SMILES to the ABCB1 rows (joined on DrugID)
smiles_by_drug = smiles[['Canonical_SMILES','DrugID']]
ABCB1 = ABCB1.merge(smiles_by_drug, on="DrugID")
ABCB1.head(3)

Unnamed: 0,TransporterID,Gene_name,Protein_Name_generally,Synonyms,DrugID,Drug_Name,synonyms,Canonical_SMILES
0,DTD0003,ABCB1,P-glycoprotein 1,ABC20; ABCB1; ATP-binding cassette sub-family ...,DR00001,Fesoterodine fumarate,Toviaz (TN),CC(C)C(=O)OC1=C(C=C(C=C1)CO)C(CCN(C(C)C)C(C)C)...
1,DTD0003,ABCB1,P-glycoprotein 1,ABC20; ABCB1; ATP-binding cassette sub-family ...,DR00002,Paraquat,"1,1'-dimethyl-4,4' bipyridinium dichloride",C[N+]1=CC=C(C=C1)C2=CC=[N+](C=C2)C.[Cl-].[Cl-]
2,DTD0003,ABCB1,P-glycoprotein 1,ABC20; ABCB1; ATP-binding cassette sub-family ...,DR00008,Venetoclax,Venclexta,CC1(CCC(=C(C1)C2=CC=C(C=C2)Cl)CN3CCN(CC3)C4=CC...


In [29]:
# Create the held-out test dataframe: 10 ABCB1 substrates + 10 drugs not listed for ABCB1
col_testdf=['DrugID', 'Drug_Name','Canonical_SMILES','substrate'] #Substrate: 1 if is substrate, 0 if not
feature_cols = col_testdf[0:3]

# Positive examples: 10 randomly chosen rows of ABCB1 (positions, not index labels)
idx_pos_test = random.sample(range(0, ABCB1.shape[0]),10)
test_df = ABCB1.iloc[idx_pos_test][feature_cols].copy()
test_df['substrate']=1

#Get drugs that are supposedly not ABCB1 targets
# (computed before the positives are removed from ABCB1, so positives cannot be re-drawn as negatives)
notABCB1 = smiles[~smiles['DrugID'].isin(ABCB1['DrugID'])]
idx_neg_test = random.sample(range(0, notABCB1.shape[0]),10)
testneg_df = notABCB1.iloc[idx_neg_test][feature_cols].copy()
testneg_df['substrate']=0
test_df = pd.concat(objs=[test_df,testneg_df], axis=0)

# Drop the held-out positives from the ABCB1 dataset.
# Filtering on DrugID (instead of passing positions to `drop(index=...)`) stays correct
# when the index is not a fresh 0..n-1 range and also removes any repeated row of the same drug.
held_out_ids = test_df.loc[test_df['substrate']==1, 'DrugID']
ABCB1 = ABCB1[~ABCB1['DrugID'].isin(held_out_ids)]

# Verify no test molecule is left in the dataset.
# `isin` compares DrugID *values*; `x in series` would test the index labels instead.
leaked = test_df['DrugID'].isin(ABCB1['DrugID'])
assert not leaked.any(), "test drugs still present in ABCB1"
display(~leaked) #Test molecule not in database anymore
test_df.head(13)

74     True
235    True
232    True
394    True
89     True
361    True
201    True
374    True
179    True
221    True
500    True
795    True
92     True
526    True
99     True
74     True
728    True
443    True
243    True
44     True
Name: DrugID, dtype: bool

Unnamed: 0,DrugID,Drug_Name,Canonical_SMILES,substrate
74,DR00182,Dasatinib,CC1=C(C(=CC=C1)Cl)NC(=O)C2=CN=C(S2)NC3=NC(=NC(...,1
235,DR00668,Nisoldipine,CC1=C(C(C(=C(N1)C)C(=O)OCC(C)C)C2=CC=CC=C2[N+]...,1
232,DR00658,Fluphenazine,C1CN(CCN1CCCN2C3=CC=CC=C3SC4=C2C=C(C=C4)C(F)(F...,1
394,DR01484,Apafant,Cc1nnc2CN=C(c3ccccc3Cl)c4cc(CCC(=O)N5CCOCC5)sc...,1
89,DR00220,Eliglustat tartrate,CCCCCCCC(=O)NC(CN1CCCC1)C(C2=CC3=C(C=C2)OCCO3)...,1
361,DR01399,Mithramycin,CC1C(C(CC(O1)OC2CC(OC(C2O)C)OC3=CC4=CC5=C(C(=O...,1
201,DR00521,Clonidine,C1CN=C(N1)NC2=C(C=CC=C2Cl)Cl,1
374,DR01424,Droloxifene,C(=C(\c1ccccc1)/CC)(/c1cc(O)ccc1)\c1ccc(cc1)OC...,1
179,DR00448,Cyclosporine,CCC1C(=O)N(CC(=O)N(C(C(=O)NC(C(=O)N(C(C(=O)NC(...,1
221,DR00613,Dipyridamole,C1CCN(CC1)C2=NC(=NC3=C2N=C(N=C3N4CCCCC4)N(CCO)...,1


In [6]:
# True marks a test drug whose DrugID is still present in ABCB1 (expected: all False).
# `isin` compares against the DrugID values; `x in series` would look at the index labels.
display(test_df['DrugID'].isin(ABCB1['DrugID']))

74     False
235    False
232    False
394    False
89     False
361    False
201    False
374    False
179    False
221    False
Name: DrugID, dtype: bool

In [30]:
#Get good SMILES from the PubChem API (single-drug smoke test)
import pubchempy as pcp
# `.iloc[0]` takes the first matching row by position; a plain `[0]` is a label lookup
# and only works while the matching row happens to carry index label 0.
name = smiles.loc[smiles['DrugID']=="DR00001", 'Drug_Name'].iloc[0]
c= pcp.get_compounds(name,'name')
c=c[0]  # keep the first compound returned for that name
#gotsmile= requests.get("https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchi/"+inchi+"/property/smiles/JSON")

In [31]:
# Reduce the compound to a plain dict holding only the two requested properties
wanted_properties = ['inchi','canonical_smiles']
c = c.to_dict(properties=wanted_properties)
c['canonical_smiles']

'CC(C)C(=O)OC1=C(C=C(C=C1)CO)C(CCN(C(C)C)C(C)C)C2=CC=CC=C2.C(=CC(=O)O)C(=O)O'

In [33]:
#Retrieve good smiles from pubchem database from the name, some SMILES are not correct in the  varidt database
#Need to look for: is there only one chemical compound found with the name? Is there something found for it?

def _lookup_pubchem_smiles(name):
    """Query PubChem by compound name and summarise the answer.

    Returns a pair ``(smiles_value, multiple_flag)``:
      * exactly one compound  -> (its canonical SMILES string, 0)
      * several compounds     -> (list of their canonical SMILES, 1)
      * no compound           -> ('NaN', 'NaN'), after printing a warning
    """
    compounds = pcp.get_compounds(name,'name')
    if len(compounds)==1:
        props = compounds[0].to_dict(properties=['canonical_smiles'])
        return props['canonical_smiles'], 0
    if len(compounds)>1:
        candidates = [
            comp.to_dict(properties=['canonical_smiles'])['canonical_smiles']
            for comp in compounds
        ]
        return candidates, 1
    print('Problem drug '+name+' has no answers')
    return 'NaN', 'NaN'


new_smiles=[]
multiple_ans=[]
for name in smiles['Drug_Name']:
    found, flag = _lookup_pubchem_smiles(name)
    new_smiles.append(found)
    multiple_ans.append(flag)

# Plain column assignment appends the columns at the end like `insert` did, but does not
# raise "cannot insert ..., already exists" when this cell is run a second time.
smiles['new_smiles']=new_smiles
smiles['multiple_ans']=multiple_ans

Problem drug Creatine ALS-08 has no answers
Problem drug Zinc salts has no answers
Problem drug Vasopressin has no answers
Problem drug Arsenite has no answers
Problem drug Epipodophyllotoxins has no answers
Problem drug Carboxydichlorofluorescein has no answers
Problem drug Beta-lactam antibiotics has no answers
Problem drug Antiparasitics has no answers
Problem drug Amino acids has no answers
Problem drug Peptides has no answers
Problem drug Sodium phosphate dibasic/Sodium phosphate monobasic has no answers


  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:
  if self.charge is not 0:


KeyboardInterrupt: 

In [232]:
#export txt files
# NOTE(review): both exports are commented out, presumably so the cached files are not
# overwritten on re-run — confirm. Writing smiles with index=False would not create the
# 'Unnamed: 0' column that the reload cell drops afterwards.

#smiles.to_csv("data/smiles.txt",index=False,header=True)
#test_df.to_csv('data/test_drug.txt')

In [233]:
# Reload the cached table that carries the PubChem lookups (new_smiles / multiple_ans)
smiles=pd.read_csv('data/smiles.txt')
# 'Unnamed: 0' only exists when the file was written with its row index;
# errors='ignore' keeps the cell working for a file exported with index=False as well.
smiles=smiles.drop(columns=['Unnamed: 0'], errors='ignore')

In [234]:
def _first_component(raw):
    """Return the part before the first '.' of the first candidate SMILES.

    `raw` is either a list of SMILES or the text form of such a list
    (what a list column turns into after a CSV round-trip). Returns None
    when no candidate is available.
    """
    candidates = raw
    if isinstance(raw, str):
        try:
            candidates = ast.literal_eval(raw)
        except (ValueError, SyntaxError):
            candidates = [raw]  # not a list literal: treat the text as a single SMILES
    if isinstance(candidates, str):
        candidates = [candidates]
    if not candidates:
        return None
    return str(candidates[0]).split(sep='.')[0]


def _choose_final_smiles(drug):
    """Pick the SMILES to keep for one row of `smiles`."""
    flag = drug['multiple_ans']
    # No PubChem answer: the 'NaN' marker comes back from read_csv as a real NaN,
    # so test with pd.isna (the string comparison covers the in-memory, pre-CSV case).
    if pd.isna(flag) or flag=='NaN':
        return drug['Canonical_SMILES']
    if flag==0:
        return drug['new_smiles']
    #If multiple answers from pubChem: take whatever is before the dot (main body of the compound)
    smol = _first_component(drug['new_smiles'])
    return drug['Canonical_SMILES'] if smol is None else smol


# NOTE(review): the next cell's `dropna()` removes rows with NaN in any column, so rows
# that fall back to Canonical_SMILES are still dropped there unless it is narrowed.
final_smiles=[_choose_final_smiles(smiles.iloc[i]) for i in range(smiles.shape[0])]
        
    

In [235]:
# Store the chosen SMILES next to the originals, then discard every row
# that still has a missing value in any column
smiles['final_smiles'] = final_smiles
smiles.dropna(axis=0, how='any', inplace=True)
smiles

Unnamed: 0,DrugID,Drug_Name,InChI,Canonical_SMILES,new_smiles,multiple_ans,final_smiles
0,DR00001,Fesoterodine fumarate,InChI=1S/C26H37NO3/c1-18(2)26(29)30-25-13-12-2...,CC(C)C(=O)OC1=C(C=C(C=C1)CO)C(CCN(C(C)C)C(C)C)...,CC(C)C(=O)OC1=C(C=C(C=C1)CO)C(CCN(C(C)C)C(C)C)...,0.0,CC(C)C(=O)OC1=C(C=C(C=C1)CO)C(CCN(C(C)C)C(C)C)...
1,DR00002,Paraquat,InChI=1S/C12H14N2.2ClH/c1-13-7-3-11(4-8-13)12-...,C[N+]1=CC=C(C=C1)C2=CC=[N+](C=C2)C.[Cl-].[Cl-],C[N+]1=CC=C(C=C1)C2=CC=[N+](C=C2)C,0.0,C[N+]1=CC=C(C=C1)C2=CC=[N+](C=C2)C
2,DR00003,Lisdexamfetamine,InChI=1S/C15H25N3O/c1-12(11-13-7-3-2-4-8-13)18...,CC(CC1=CC=CC=C1)NC(=O)C(CCCCN)N,CC(CC1=CC=CC=C1)NC(=O)C(CCCCN)N,0.0,CC(CC1=CC=CC=C1)NC(=O)C(CCCCN)N
3,DR00004,Gaboxadol,InChI=1S/C6H8N2O2/c9-6-4-1-2-7-3-5(4)10-8-6/h7...,C1CNCC2=C1C(=O)NO2,C1CNCC2=C1C(=O)NO2,0.0,C1CNCC2=C1C(=O)NO2
4,DR00005,L-glutamic acid,"InChI=1S/C5H9NO4/c6-3(5(9)10)1-2-4(7)8/h3H,1-2...",C(CC(=O)O)C(C(=O)O)N,C(CC(=O)O)C(C(=O)O)N,0.0,C(CC(=O)O)C(C(=O)O)N
...,...,...,...,...,...,...,...
879,DR01713,S-licarbazepine,InChI=1S/C15H14N2O2/c16-15(19)17-12-7-3-1-5-10...,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O,0.0,C1C(C2=CC=CC=C2N(C3=CC=CC=C31)C(=O)N)O
880,DR01715,Parabis,InChI=1S/C13H10Cl2O2/c14-10-1-3-12(16)8(6-10)5...,C1=CC(=C(C=C1Cl)CC2=C(C=CC(=C2)Cl)O)O,C1=CC(=C(C=C1Cl)CC2=C(C=CC(=C2)Cl)O)O,0.0,C1=CC(=C(C=C1Cl)CC2=C(C=CC(=C2)Cl)O)O
881,DR01717,Saphris,InChI=1S/C17H16ClNO.C4H4O4/c1-19-9-14-12-4-2-3...,CN1CC2C(C1)C3=C(C=CC(=C3)Cl)OC4=CC=CC=C24.C(=C...,CN1CC2C(C1)C3=C(C=CC(=C3)Cl)OC4=CC=CC=C24.C(=C...,0.0,CN1CC2C(C1)C3=C(C=CC(=C3)Cl)OC4=CC=CC=C24.C(=C...
883,DR01722,Gepirone,InChI=1S/C19H29N5O2/c1-19(2)14-16(25)24(17(26)...,CC1(CC(=O)N(C(=O)C1)CCCCN2CCN(CC2)C3=NC=CC=N3)C,CC1(CC(=O)N(C(=O)C1)CCCCN2CCN(CC2)C3=NC=CC=N3)C,0.0,CC1(CC(=O)N(C(=O)C1)CCCCN2CCN(CC2)C3=NC=CC=N3)C


## Making dataframe for all transporters

In [223]:
# Attach the curated SMILES to every transporter/drug pair (default inner join on DrugID).
# Dropping a pre-existing `final_smiles` first keeps the cell re-runnable: without it a
# second run would produce `final_smiles_x` / `final_smiles_y` instead of one column.
matching=pd.merge(
    left=matching.drop(columns=['final_smiles'], errors='ignore'),
    right=smiles[['DrugID','final_smiles']],
    on='DrugID',
)

In [226]:
# Persist the merged transporter/drug table as CSV
FULLDATA_PATH = "data/fulldata.csv"
matching.to_csv(FULLDATA_PATH)