In [28]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, PandasTools
from mordred import Calculator, descriptors
from Mold2_pywrapper import Mold2
from padelpy import from_smiles
from tqdm import tqdm

# Defining Fucntions

In [2]:
def save_df(df, name):
    df.to_csv(f"../../data/fda_approved_with_descriptors/{name}.csv", index=False)

# Importing Database

In [3]:
approved_drugs = pd.read_csv("../../data/fda_approved_drugs.csv")
approved_drugs.head()

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,natural_product_flag,oral,parenteral,topical
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,False,True,False,False
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,False,True,False,False
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,False,True,False,False
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,False,True,False,True
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,False,False,True,False


In [4]:
# adding column with mol
PandasTools.AddMoleculeColumnToFrame(approved_drugs, smilesCol='clean_smiles', molCol='ROMol')

# RDKit

## Fingerprint

### RDKit Fingerprint

In [5]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_fingerprint_rdkit = approved_drugs.copy()

In [6]:
# Function to generate RDKit fingerprints
def generate_fingerprint_rdkit(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fingerprint = Chem.RDKFingerprint(mol)
    return fingerprint

In [7]:
# Generate fingerprints and create new columns for each bit
fingerprints = approved_drugs_fingerprint_rdkit['clean_smiles'].apply(generate_fingerprint_rdkit)
num_bits = fingerprints.iloc[0].GetNumBits()

for i in range(num_bits):
    col_name = f'Bit_{i}'
    approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))

  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fing

In [8]:
approved_drugs_fingerprint_rdkit.columns

Index(['name', 'chembl_id', 'clean_smiles', 'first_approval_year',
       'indication_class', 'molecule_type', 'withdrawn_flag',
       'therapeutic_flag', 'polymer_flag', 'inorganic_flag',
       ...
       'Bit_2038', 'Bit_2039', 'Bit_2040', 'Bit_2041', 'Bit_2042', 'Bit_2043',
       'Bit_2044', 'Bit_2045', 'Bit_2046', 'Bit_2047'],
      dtype='object', length=2062)

In [9]:
# saving new dataset
save_df(approved_drugs_fingerprint_rdkit, "fda_approved_fingerprint_rdkit")

### Morgan Fingerprint

In [10]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_fingerprint_morgan = approved_drugs.copy()

In [11]:
# Function to generate Morgan fingerprints
def generate_fingerprint_morgan(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2)  # Change radius as per your requirement
    return fingerprint

In [12]:
# Generate fingerprints and create new columns for each bit
fingerprints = approved_drugs_fingerprint_morgan['clean_smiles'].apply(generate_fingerprint_morgan)
num_bits = fingerprints.iloc[0].GetNumBits()

for i in range(num_bits):
    col_name = f'Bit_{i}'
    approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))

  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved

In [13]:
approved_drugs_fingerprint_morgan.columns

Index(['name', 'chembl_id', 'clean_smiles', 'first_approval_year',
       'indication_class', 'molecule_type', 'withdrawn_flag',
       'therapeutic_flag', 'polymer_flag', 'inorganic_flag',
       ...
       'Bit_2038', 'Bit_2039', 'Bit_2040', 'Bit_2041', 'Bit_2042', 'Bit_2043',
       'Bit_2044', 'Bit_2045', 'Bit_2046', 'Bit_2047'],
      dtype='object', length=2062)

In [14]:
# saving new dataset
save_df(approved_drugs_fingerprint_morgan, "fda_approved_fingerprint_morgan")

## Descriptors

In [15]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_rdkit_descriptors = approved_drugs.copy()

In [16]:
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)

    descriptors = {}
    for descriptor_name, descriptor_function in Descriptors.descList:
        try:
            descriptors[descriptor_name] = descriptor_function(mol)
        except:
            descriptors[descriptor_name] = None

    return pd.Series(descriptors)

In [17]:
rdkit_descriptors = approved_drugs_rdkit_descriptors['clean_smiles'].apply(calculate_descriptors)
approved_drugs_rdkit_descriptors = pd.concat([approved_drugs_rdkit_descriptors, rdkit_descriptors], axis=1)
approved_drugs_rdkit_descriptors.head()

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


As we can see below, 108 fields in the descriptors are 'None'. These correspond to 9 molecules, that had 12 descriptors return None. I will print them out below

In [18]:
# checking if there are molecules with None in the descriptor
rdkit_descriptors.isnull().sum().sum()

108

In [19]:
approved_drugs_rdkit_descriptors[approved_drugs_rdkit_descriptors.iloc[:, 24].isnull()].iloc[:, 0:3]

Unnamed: 0,name,chembl_id,clean_smiles
491,ACETARSONE,CHEMBL1330792,CC(=O)Nc1cc([As](=O)(O)O)ccc1O
1015,BISMUTH SUBSALICYLATE,CHEMBL1120,O=C1O[Bi](O)Oc2ccccc21
1131,THIMEROSAL,CHEMBL508338,CC[Hg]Sc1ccccc1C(=O)O
1176,AUROTHIOGLUCOSE,CHEMBL2354773,OC[C@H]1O[C@H](S[Au])[C@H](O)[C@@H](O)[C@@H]1O
1193,BISMUTH SUBGALLATE,CHEMBL1592101,O=C(O)c1cc(O)c2c(c1)O[Bi](O)O2
1551,MERSALYL,CHEMBL1200943,COC(CNC(=O)c1ccccc1OCC(=O)O)C[Hg]O
1799,SODIUM STIBOGLUCONATE,CHEMBL3991035,O=C(O)[C@@H]1O[Sb]2(O)(O[Sb]34(O)O[C@H]([C@H](...
1809,MERBROMIN,CHEMBL3833381,O=C(O)c1ccccc1-c1c2cc(Br)c(=O)cc-2oc2c([Hg]O)c...
1873,HYDRARGAPHEN,CHEMBL3833364,O=S(=O)(O[Hg]c1ccccc1)c1cc2ccccc2cc1Cc1cc2cccc...


In [20]:
approved_drugs_rdkit_descriptors[approved_drugs_rdkit_descriptors.iloc[:, 24].isnull()].iloc[:, 24:28]

Unnamed: 0,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge
491,,,,
1015,,,,
1131,,,,
1176,,,,
1193,,,,
1551,,,,
1799,,,,
1809,,,,
1873,,,,


In [21]:
approved_drugs_rdkit_descriptors[approved_drugs_rdkit_descriptors.iloc[:, 24].isnull()].iloc[:, 31:39]

Unnamed: 0,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW
491,,,,,,,,
1015,,,,,,,,
1131,,,,,,,,
1176,,,,,,,,
1193,,,,,,,,
1551,,,,,,,,
1799,,,,,,,,
1809,,,,,,,,
1873,,,,,,,,


I do not know why these molecules could not have their descriptors calculated. I will leave them in the dataset for now, but they will have to be dealt with in order to use machine learning models in the future.

In [22]:
# saving new dataset
save_df(approved_drugs_rdkit_descriptors, "fda_approved_rdkit")

# Mordred

In [23]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_mordred = approved_drugs.copy()

In [31]:
# initializing object for mordred descriptors calculation, for only 2d descriptors, and both 2d and 3d descriptors
calc_2d = Calculator(descriptors, ignore_3D=True)
calc_2d_3d = Calculator(descriptors, ignore_3D=False)

In [32]:
mordred_descriptors_2d = calc_2d.pandas(approved_drugs_mordred["ROMol"])

 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 1731/1895 [00:59<00:15, 10.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 1810/1895 [01:07<00:21,  3.87it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊   | 1862/1895 [01:16<00:06,  5.39it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 1868/1895 [01:18<00:06,  3.96it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1895/1895 [01:25<00:00, 22.15it/s]


In [39]:
mordred_descriptors_2d.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,2.44949,2.44949,0,3,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,6.188264,24.179697,59.048347,6.560927,9,0,12.0,9.0,3.111111,1.0
1,3.047207,3.305183,0,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,6.834109,27.25413,75.032028,7.503203,18,2,16.0,14.0,3.361111,1.333333
2,3.047207,3.305183,0,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,6.834109,27.25413,76.027277,8.447475,18,2,16.0,14.0,3.361111,1.333333
3,2.12132,2.3401,0,1,4.472136,1.618034,3.236068,4.472136,1.118034,2.155909,...,5.509388,22.328143,77.02992,7.00272,10,1,10.0,8.0,2.5,1.25
4,2.44949,2.44949,0,0,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,6.188264,24.179697,78.013936,7.801394,9,0,12.0,9.0,3.111111,1.0


In [35]:
mordred_descriptors_2d_3d = calc_2d_3d.pandas(approved_drugs_mordred["ROMol"])

 91%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▉                | 1731/1895 [00:59<00:14, 11.37it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 95%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▊          | 1791/1895 [01:04<00:11,  8.71it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 96%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▋        | 1810/1895 [01:07<00:30,  2.83it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 98%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▍   | 1859/1895 [01:15<00:06,  5.76it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


 99%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎  | 1868/1895 [01:17<00:04,  6.68it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1895/1895 [01:25<00:00, 22.26it/s]


In [40]:
mordred_descriptors_2d_3d.head()

Unnamed: 0,ABC,ABCGG,nAcid,nBase,SpAbs_A,SpMax_A,SpDiam_A,SpAD_A,SpMAD_A,LogEE_A,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,2.44949,2.44949,0,3,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,6.188264,24.179697,59.048347,6.560927,9,0,12.0,9.0,3.111111,1.0
1,3.047207,3.305183,0,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,6.834109,27.25413,75.032028,7.503203,18,2,16.0,14.0,3.361111,1.333333
2,3.047207,3.305183,0,0,5.226252,1.847759,3.695518,5.226252,1.04525,2.408576,...,6.834109,27.25413,76.027277,8.447475,18,2,16.0,14.0,3.361111,1.333333
3,2.12132,2.3401,0,1,4.472136,1.618034,3.236068,4.472136,1.118034,2.155909,...,5.509388,22.328143,77.02992,7.00272,10,1,10.0,8.0,2.5,1.25
4,2.44949,2.44949,0,0,3.464102,1.732051,3.464102,3.464102,0.866025,2.178059,...,6.188264,24.179697,78.013936,7.801394,9,0,12.0,9.0,3.111111,1.0


In [41]:
# merging the 2d descriptors to a dataset
approved_drugs_mordred_2d = pd.concat([approved_drugs_mordred, mordred_descriptors_2d], axis=1)
approved_drugs_mordred_2d.head()

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,...,6.188264,24.179697,59.048347,6.560927,9,0,12.0,9.0,3.111111,1.0
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,...,6.834109,27.25413,75.032028,7.503203,18,2,16.0,14.0,3.361111,1.333333
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,...,6.834109,27.25413,76.027277,8.447475,18,2,16.0,14.0,3.361111,1.333333
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,...,5.509388,22.328143,77.02992,7.00272,10,1,10.0,8.0,2.5,1.25
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,...,6.188264,24.179697,78.013936,7.801394,9,0,12.0,9.0,3.111111,1.0


In [42]:
# merging the 2d and 3d descriptors to a dataset
approved_drugs_mordred_2d_3d = pd.concat([approved_drugs_mordred, mordred_descriptors_2d_3d], axis=1)
approved_drugs_mordred_2d_3d.head()

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,...,SRW10,TSRW10,MW,AMW,WPath,WPol,Zagreb1,Zagreb2,mZagreb1,mZagreb2
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,...,6.188264,24.179697,59.048347,6.560927,9,0,12.0,9.0,3.111111,1.0
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,...,6.834109,27.25413,75.032028,7.503203,18,2,16.0,14.0,3.361111,1.333333
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,...,6.834109,27.25413,76.027277,8.447475,18,2,16.0,14.0,3.361111,1.333333
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,...,5.509388,22.328143,77.02992,7.00272,10,1,10.0,8.0,2.5,1.25
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,...,6.188264,24.179697,78.013936,7.801394,9,0,12.0,9.0,3.111111,1.0


In [43]:
# saving new datasets
save_df(approved_drugs_mordred_2d, "fda_approved_mordred_2d")
save_df(approved_drugs_mordred_2d_3d, "fda_approved_mordred_2d_and_3d")

# Mold2

In [63]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_mold2 = approved_drugs.copy()

In [67]:
# initializing mold 2 object
mold2 = Mold2()

In [68]:
mold2_descriptors = mold2.calculate(approved_drugs_mold2["ROMol"])

Mold2 calculates a large and diverse set of molecular descriptors encoding two-
dimensional chemical structure information. Comparative analysis of Mold2 descriptors
with those calculated from commercial software on several published datasets
demonstrated that Mold2 descriptors convey sufficient structural information. In addition,
better models were generated using Mold2 descriptors than the compared commercial
software packages. This publicly available software is developed by the Center for
Bioinformatics, which is led by Dr. Weida Tong, at the National Center for Toxicological
Research (NCTR).
    
Mold2 is a product designed and produced by the National Center for Toxicological
Research (NCTR).  FDA and NCTR retain ownership of this product.

Please address any questions or suggestions to Dr. Huixiao Hong, National Center for Toxicological
Research, at 870-543-7296 or Huixiao.Hong@fda.hhs.gov.

###################################

Should you publish results based on the Mold² desc

In [69]:
mold2_descriptors.head()

Unnamed: 0,D001,D002,D003,D004,D005,D006,D007,D008,D009,D010,...,D768,D769,D770,D771,D772,D773,D774,D775,D776,D777
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,3.49381,0.0,-1.45757
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.32193,0.523826,0.0,-0.816018
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2.32193,1.81453,0.0,-1.4534
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.83248,0.0,-0.172449
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.430677,0.0,-0.172449


In [None]:
# merging the descriptors to the dataset
approved_drugs_mold2 = pd.concat([approved_drugs_mold2, mold2_descriptors], axis=1)
approved_drugs_mold2.head()

In [71]:
# saving new dataset
save_df(approved_drugs_mold2, "fda_approved_mold2")

# PaDEL

In [26]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_padel = approved_drugs.copy()

In [27]:
smiles = approved_drugs_padel["clean_smiles"].tolist()

In [29]:
# calculating the descriptors and saving to a list
# molecules with descriptors that can't be calculated are saved in the list error
correct_descriptors = []
error = []
for smi in tqdm(smiles):
    try:
        desc = from_smiles(smi)
        desc["clean_smiles"] = smi
        correct_descriptors.append(desc)
    except:
        error.append(smi)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1895/1895 [2:19:08<00:00,  4.41s/it]


In [30]:
len(correct_descriptors)

1887

In [31]:
len(error)

8

In [32]:
error

['CCC[N+]12[C@H]3C[C@@]45c6ccccc6N(C)[C@H]4[C@@H]1C[C@@H]([C@H](CC)[C@H]2O)[C@H]3[C@H]5O',
 'COC(=O)[C@@H]1CC2=CC(=O)CC[C@]2(C)[C@@]23O[C@@H]2C[C@@]2(C)[C@@H](CC[C@@]24CCC(=O)O4)[C@H]13',
 'CO[C@@]12CC[C@@]3(C[C@@H]1[C@](C)(O)C(C)(C)C)[C@H]1Cc4ccc(O)c5c4[C@@]3(CCN1CC1CC1)[C@H]2O5',
 'CC(=O)O[C@H]1C[C@H](O[C@H]2[C@@H](O)C[C@H](O[C@H]3[C@@H](O)C[C@H](O[C@H]4CC[C@]5(C)[C@H]6C[C@@H](O)[C@]7(C)[C@@H](C8=CC(=O)OC8)CC[C@]7(O)[C@@H]6CC[C@@H]5C4)O[C@@H]3C)O[C@@H]2C)O[C@H](C)[C@H]1O[C@@H]1O[C@H](CO)[C@@H](O)[C@H](O)[C@H]1O',
 'COC(=O)N[C@H](C(=O)N1CCC[C@H]1c1nc2cc([C@H]3CC[C@H](c4cc5nc([C@@H]6CCCN6C(=O)[C@@H](NC(=O)OC)[C@@H](C)OC)[nH]c5cc4F)N3c3cc(F)c(N4CCC(c5ccc(F)cc5)CC4)c(F)c3)c(F)cc2[nH]1)[C@@H](C)OC',
 'CCCCCOc1ccc(-c2ccc(-c3ccc(C(=O)N[C@H]4C[C@@H](O)[C@@H](O)NC(=O)[C@@H]5[C@@H](O)[C@@H](C)CN5C(=O)[C@H]([C@@H](C)O)NC(=O)[C@H]([C@H](O)[C@@H](O)c5ccc(O)cc5)NC(=O)[C@@H]5C[C@@H](O)CN5C(=O)[C@H]([C@@H](C)O)NC4=O)cc3)cc2)cc1',
 'CN[C@H](CC(C)C)C(=O)N[C@H]1C(=O)N[C@@H](CC(N)=O)C(=O)N[C@H]2C(=O)N[C

We can see here that these 8 molecules had errors when calculating the PaDEL descriptors. I will add the descriptors as NaN in the pandas DataFrame.

In [33]:
descriptors_df = pd.DataFrame.from_records(correct_descriptors)

In [44]:
approved_drugs_padel = pd.merge(approved_drugs_padel, descriptors_df, on="clean_smiles", how="outer")

In [47]:
print(approved_drugs_padel.shape)
approved_drugs_padel.head()

(1895, 1890)


Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,...,P1s,P2s,E1s,E2s,E3s,Ts,As,Vs,Ks,Ds
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,...,0.5845817433510382,0.3749362313434469,0.5715024765805244,0.5124390804896523,0.3067387428562952,2.7337797789655407,1.9283565476591928,4.843418654905854,0.4392769620417278,1.390680299926472
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,...,0.7142160951225184,0.2296440610540655,0.5858994371721172,0.5007418965111342,0.2210701270223774,3.3687697995775965,2.462689742456657,6.183481745158208,0.5713241426837775,1.3077114607056288
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,...,0.7435205264641558,0.2388275646059327,0.5802844597648864,0.4911280064589335,0.5837587847426486,3.1888554948892534,1.982036434220474,5.2725340243267045,0.6152807896962338,1.6551712509664689
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,...,0.7186811733561231,0.1949261819574725,0.9967286251880472,0.5160858685812225,0.3488457535520902,3.60603945096066,2.8480145866097053,7.021565495926363,0.5780217600341847,1.86166024732136
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,...,,,,,,,,,,


In [48]:
# saving new dataset
save_df(approved_drugs_padel, "fda_approved_padel")