In [1]:
# import library

# https://www.rdkit.org/
#https://github.com/rdkit/rdkit
from rdkit.Chem import AllChem, Descriptors3D
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.ML.Descriptors import MoleculeDescriptors

# https://pandas.pydata.org
import pandas as pd

# https://numpy.org/doc/stable/release.html
import numpy as np

#https://github.com/mordred-descriptor/mordred
from mordred import Calculator, descriptors

from rdkit.Chem import DataStructs

In [2]:
#import smile dataset
df_smile = pd.read_csv("../../dataset/dataset_Smile_nonsaids_19_36row.csv")
df_filtereddataset = pd.read_csv("../3 - Cleaning & Transforming Data/1 Dataset_Filtered.csv")
df_filtereddataset_drugdropdupe = df_filtereddataset['Drug'].drop_duplicates(inplace=False)
df_filtereddataset_drugdropdupe = pd.DataFrame(df_filtereddataset_drugdropdupe.reset_index(drop=True))
merged = df_filtereddataset_drugdropdupe.merge(df_smile.iloc[:,[0,1]], left_on='Drug', right_on='DRUG_NAME', how='left')



In [3]:
# selecting allopurinol in Drug columns and replace its SMILES to OC1=NC=NC2=C1C=NN2
#merged.loc[merged['Drug'] == 'allopurinol', 'SMILES'] = 'OC1=NC=NC2=C1C=NN2C'


In [4]:
merged



Unnamed: 0,Drug,DRUG_NAME,SMILES
0,carbamazepine,carbamazepine,C1=CC=C2C(=C1)C=CC3=CC=CC=C3N2C(=O)N
1,lamotrigine,lamotrigine,C1=CC(=C(C(=C1)Cl)Cl)C2=C(N=C(N=N2)N)N
2,allopurinol,allopurinol,C1=NNC2=C1C(=O)NC=N2
3,nevirapine,nevirapine,CC1=C2C(=NC=C1)N(C3=C(C=CC=N3)C(=O)N2)C4CC4
4,phenytoin,phenytoin,C1=CC=C(C=C1)C2(C(=O)NC(=O)N2)C3=CC=CC=C3
5,valproic acid,valproic acid,CCCC(CCC)C(=O)O
6,phenobarbital,phenobarbital,CCC1(C(=O)NC(=O)NC1=O)C2=CC=CC=C2
7,paracetamol,paracetamol,CC(=O)NC1=CC=C(C=C1)O
8,sulfamethoxazole,sulfamethoxazole,CC1=CC(=NO1)NS(=O)(=O)C2=CC=C(C=C2)N
9,oxicam NSAIDs,oxicam NSAIDs,CN1C(=C(C2=CC=CC=C2S1(=O)=O)O)C(=O)NC3=CC=CC=N3


 ## MOGRAN
 

Note: #//### Molecular Graph (https://www.blopig.com/blog/2022/02/how-to-turn-a-smiles-string-into-a-molecular-graph-for-pytorch-geometric/) #https://github.com/iwatobipen/playground/blob/master/MAP4FP_test.ipynb

In [5]:
from rdkit import Chem
from rdkit.Chem import RDKFingerprint
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem

def RDkit_Morgan(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 

    # Morgan Fingerprint
    Mol_descriptors =[]
    for mol in mols:
        fingerprint_morgan = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2)
        fingerprint_morgan_np = np.array(fingerprint_morgan)
        Mol_descriptors.append(fingerprint_morgan_np)
    return Mol_descriptors

Mol_descriptors = RDkit_Morgan(merged['SMILES'])
Mol_descriptors = pd.DataFrame(Mol_descriptors)
Mol_descriptors = pd.concat([merged['DRUG_NAME'],Mol_descriptors], axis=1)
Mol_descriptors

Unnamed: 0,DRUG_NAME,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,carbamazepine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,lamotrigine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,allopurinol,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,nevirapine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,phenytoin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,valproic acid,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,phenobarbital,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,paracetamol,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,sulfamethoxazole,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,oxicam NSAIDs,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [6]:
Mol_descriptors.nunique().value_counts()

1     1822
2      226
14       1
dtype: int64

In [7]:
Mol_descriptors.to_csv('drug_descriptors_dataset_morgan.csv',index=False)

### Dropped column that contain same values

In [8]:
#check if any column got the same value in all rows in final_dataset.iloc[:,5:]
#if yes, then drop that column
dropped_columns = []
dropped_dataset = Mol_descriptors.copy()
for col_name in Mol_descriptors.iloc[:,5:].columns.tolist():
    if Mol_descriptors[col_name].nunique() == 1:
        dropped_dataset = dropped_dataset.drop(columns=col_name)
        dropped_columns.append(col_name)
print(f"Done, removed {len(dropped_columns)} columns: {dropped_columns}")

dropped_dataset.to_csv('drug_descriptors_dataset_morgan_dropped.csv', index=None)
# final_dataset.to_csv('2 Final_Dataset.csv')

Done, removed 1819 columns: [5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 24, 25, 26, 27, 28, 29, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 75, 76, 77, 78, 79, 81, 82, 83, 84, 85, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 124, 125, 126, 127, 128, 129, 131, 132, 133, 134, 135, 136, 137, 138, 139, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 205, 206, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,

In [9]:
dropped_dataset

Unnamed: 0,DRUG_NAME,0,1,2,3,4,11,20,23,30,...,1922,1948,1956,1964,1974,1977,1985,2012,2039,2043
0,carbamazepine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,lamotrigine,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,allopurinol,0,0,0,0,0,1,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3,nevirapine,0,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
4,phenytoin,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
5,valproic acid,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,phenobarbital,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,paracetamol,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,sulfamethoxazole,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,oxicam NSAIDs,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [12]:
# fp1 = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(merged['SMILES'][0]), radius=2)
# fp2 = AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(merged['SMILES'][1]), radius=2)
# DataStructs.TanimotoSimilarity(fp1, fp2, returnDistance=1)

## RDKit Descriptors

In [13]:
def RDkit_descriptors(smiles):
    mols = [Chem.MolFromSmiles(i) for i in smiles] 
    calc = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    desc_names = calc.GetDescriptorNames()
    
    Mol_descriptors =[]
    for mol in mols:
        # add hydrogens to molecules
        mol=Chem.AddHs(mol)
        # Calculate all 200 descriptors for each molecule
        descriptors = calc.CalcDescriptors(mol)
        Mol_descriptors.append(descriptors)
    return Mol_descriptors,desc_names 

# Function call
Mol_descriptors,desc_names = RDkit_descriptors(merged['SMILES'])

In [14]:
df_withdescriptors = pd.DataFrame(Mol_descriptors, columns=desc_names)

In [15]:
df_withdescriptors = pd.concat([merged['DRUG_NAME'],df_withdescriptors], axis=1)


In [16]:
df_withdescriptors

Unnamed: 0,DRUG_NAME,MaxEStateIndex,MinEStateIndex,MaxAbsEStateIndex,MinAbsEStateIndex,qed,MolWt,HeavyAtomMolWt,ExactMolWt,NumValenceElectrons,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,carbamazepine,12.739529,-1.532351,12.739529,0.357593,0.748363,236.274,224.178,236.094963,88,...,0,0,0,0,0,0,0,0,0,1
1,lamotrigine,7.915334,-0.518197,7.915334,0.030546,0.81381,256.096,249.04,255.007851,82,...,0,0,0,0,0,0,0,0,0,0
2,allopurinol,11.342037,-0.85787,11.342037,0.172685,0.519237,136.114,132.082,136.038511,50,...,0,0,0,0,0,0,0,0,0,0
3,nevirapine,13.213574,-3.21146,13.213574,0.069433,0.861716,266.304,252.192,266.116761,100,...,0,0,0,0,0,0,0,0,0,0
4,phenytoin,13.156878,-3.097454,13.156878,0.204208,0.800195,252.273,240.177,252.089878,94,...,0,0,0,0,0,0,0,0,0,1
5,valproic acid,11.896426,-4.419097,11.896426,2.505787,0.642372,144.214,128.086,144.11503,60,...,0,0,0,0,0,0,0,0,0,0
6,phenobarbital,12.880175,-3.984632,12.880175,0.654072,0.736884,232.239,220.143,232.084792,88,...,0,0,0,0,0,0,0,0,0,1
7,paracetamol,11.46504,-3.157138,11.46504,0.206873,0.595026,151.165,142.093,151.063329,58,...,0,0,0,0,0,0,0,0,0,0
8,sulfamethoxazole,12.689804,-5.177639,12.689804,0.224447,0.804737,253.283,242.195,253.052112,90,...,0,1,0,0,0,0,0,0,0,0
9,oxicam NSAIDs,13.340427,-5.48246,13.340427,0.35005,0.870184,331.353,318.249,331.062677,118,...,0,1,0,0,0,0,0,0,0,0


In [17]:
df_withdescriptors.to_csv('drug_descriptors_dataset.csv', index=False)

## Mordred


In [18]:
molecules = [Chem.MolFromSmiles(mol) for mol in merged['SMILES']]

In [19]:
calc = Calculator(descriptors,ignore_3D=False)

In [20]:
cols = list(calc.pandas(molecules).columns)

100%|██████████| 14/14 [00:02<00:00,  5.55it/s]


In [21]:
dat = np.zeros([len(molecules),len(cols)])
for mol in range(len(molecules)):
    tmp_mds = calc(molecules[mol])
    mds = np.zeros([1,len(tmp_mds)])
    for i in range(len(tmp_mds)):
        mds[0,i] = float(tmp_mds[i])
    dat[mol,:] = mds
MDs = pd.DataFrame(columns=cols,data=dat)
MDs

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,14.158715,11.786952,0.0,0.0,23.668756,2.461791,4.856592,23.668756,1.314931,3.837566,...,9.872048,59.456223,236.094963,7.869832,521.0,33.0,96.0,115.0,5.166667,3.972222
1,12.296614,10.788834,0.0,0.0,19.944236,2.401756,4.803512,19.944236,1.246515,3.697257,...,9.701677,48.568511,255.007851,11.087298,417.0,25.0,82.0,96.0,6.166667,3.527778
2,7.806684,7.343579,0.0,0.0,13.098358,2.369838,4.63395,13.098358,1.309836,3.261311,...,9.161465,53.745115,136.038511,9.717036,105.0,12.0,52.0,61.0,2.833333,2.222222
3,16.199155,13.295586,0.0,0.0,26.927238,2.535864,4.933973,26.927238,1.346362,3.994834,...,10.161882,73.059238,266.116761,7.826964,676.0,39.0,114.0,141.0,5.388889,4.277778
4,14.883193,13.073243,0.0,0.0,25.111327,2.549743,5.01664,25.111327,1.321649,3.892953,...,10.077147,67.20697,252.089878,8.131932,617.0,32.0,102.0,124.0,5.506944,4.208333
5,6.542301,7.03709,1.0,0.0,11.25272,2.135779,4.271558,11.25272,1.125272,3.124531,...,8.313607,37.579015,144.11503,5.542886,131.0,10.0,38.0,39.0,5.222222,2.611111
6,12.871263,11.71668,0.0,0.0,21.134139,2.510533,5.021066,21.134139,1.243185,3.7587,...,10.012207,50.356197,232.084792,8.002924,458.0,31.0,88.0,107.0,6.506944,3.875
7,8.106344,7.427049,0.0,0.0,13.221782,2.219936,4.439872,13.221782,1.20198,3.285958,...,8.795279,40.344003,151.063329,7.553166,166.0,11.0,50.0,53.0,4.583333,2.5
8,13.202929,11.709699,0.0,0.0,20.712705,2.383498,4.742568,20.712705,1.218394,3.758938,...,9.66377,63.242131,253.052112,9.037575,535.0,22.0,88.0,100.0,6.506944,3.625
9,17.998256,15.004272,0.0,0.0,29.232339,2.54792,5.09584,29.232339,1.270971,4.073006,...,10.315299,57.982106,331.062677,9.196185,1116.0,42.0,124.0,150.0,8.340278,4.972222


In [22]:
MDs = pd.concat([merged['DRUG_NAME'],MDs], axis=1)
MDs_checkmissing = MDs.copy()

In [23]:
Col_containing_all_missing = MDs_checkmissing.isna().sum(axis=0)[(MDs_checkmissing.isna().sum(axis=0) != 0)].sort_values(ascending=False)[MDs_checkmissing.isna().sum(axis=0)[(MDs_checkmissing.isna().sum(axis=0) != 0)].sort_values(ascending=False) == 14].index.tolist()

In [24]:
MDs.drop(columns=Col_containing_all_missing, inplace=True)

In [25]:
MDs.isna().any().sum()

303

In [26]:
MDs.fillna(0, inplace=True)

In [27]:
MDs.isna().any().sum()

0

In [28]:
MDs

Unnamed: 0,DRUG_NAME,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,carbamazepine,14.158715,11.786952,0.0,0.0,23.668756,2.461791,4.856592,23.668756,1.314931,...,9.872048,59.456223,236.094963,7.869832,521.0,33.0,96.0,115.0,5.166667,3.972222
1,lamotrigine,12.296614,10.788834,0.0,0.0,19.944236,2.401756,4.803512,19.944236,1.246515,...,9.701677,48.568511,255.007851,11.087298,417.0,25.0,82.0,96.0,6.166667,3.527778
2,allopurinol,7.806684,7.343579,0.0,0.0,13.098358,2.369838,4.63395,13.098358,1.309836,...,9.161465,53.745115,136.038511,9.717036,105.0,12.0,52.0,61.0,2.833333,2.222222
3,nevirapine,16.199155,13.295586,0.0,0.0,26.927238,2.535864,4.933973,26.927238,1.346362,...,10.161882,73.059238,266.116761,7.826964,676.0,39.0,114.0,141.0,5.388889,4.277778
4,phenytoin,14.883193,13.073243,0.0,0.0,25.111327,2.549743,5.01664,25.111327,1.321649,...,10.077147,67.20697,252.089878,8.131932,617.0,32.0,102.0,124.0,5.506944,4.208333
5,valproic acid,6.542301,7.03709,1.0,0.0,11.25272,2.135779,4.271558,11.25272,1.125272,...,8.313607,37.579015,144.11503,5.542886,131.0,10.0,38.0,39.0,5.222222,2.611111
6,phenobarbital,12.871263,11.71668,0.0,0.0,21.134139,2.510533,5.021066,21.134139,1.243185,...,10.012207,50.356197,232.084792,8.002924,458.0,31.0,88.0,107.0,6.506944,3.875
7,paracetamol,8.106344,7.427049,0.0,0.0,13.221782,2.219936,4.439872,13.221782,1.20198,...,8.795279,40.344003,151.063329,7.553166,166.0,11.0,50.0,53.0,4.583333,2.5
8,sulfamethoxazole,13.202929,11.709699,0.0,0.0,20.712705,2.383498,4.742568,20.712705,1.218394,...,9.66377,63.242131,253.052112,9.037575,535.0,22.0,88.0,100.0,6.506944,3.625
9,oxicam NSAIDs,17.998256,15.004272,0.0,0.0,29.232339,2.54792,5.09584,29.232339,1.270971,...,10.315299,57.982106,331.062677,9.196185,1116.0,42.0,124.0,150.0,8.340278,4.972222


In [29]:
MDs.to_csv('drug_descriptors_dataset_mordred.csv', index=False)