In [52]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors

# Importing Notebook

In [2]:
approved_drugs = pd.read_csv("../../data/fda_approved_drugs.csv")
approved_drugs.head()

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,natural_product_flag,oral,parenteral,topical
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,False,True,False,False
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,False,True,False,False
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,False,True,False,False
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,False,True,False,True
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,False,False,True,False


In [40]:
def save_df(df, name):
    df.to_csv(f"../../data/fda_approved_with_descriptors/{name}.csv", index=False)

# RDKit

## Fingerprint

### RDKit Fingerprint

In [34]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_fingerprint_rdkit = approved_drugs.copy()

In [35]:
# Function to generate RDKit fingerprints
def generate_fingerprint_rdkit(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fingerprint = Chem.RDKFingerprint(mol)
    return fingerprint

In [36]:
# Generate fingerprints and create new columns for each bit
fingerprints = approved_drugs_fingerprint_rdkit['clean_smiles'].apply(generate_fingerprint_rdkit)
num_bits = fingerprints.iloc[0].GetNumBits()

for i in range(num_bits):
    col_name = f'Bit_{i}'
    approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))

  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_rdkit[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fing

In [38]:
approved_drugs_fingerprint_rdkit.columns

Index(['name', 'chembl_id', 'clean_smiles', 'first_approval_year',
       'indication_class', 'molecule_type', 'withdrawn_flag',
       'therapeutic_flag', 'polymer_flag', 'inorganic_flag',
       ...
       'Bit_2038', 'Bit_2039', 'Bit_2040', 'Bit_2041', 'Bit_2042', 'Bit_2043',
       'Bit_2044', 'Bit_2045', 'Bit_2046', 'Bit_2047'],
      dtype='object', length=2062)

In [43]:
# saving new dataset
save_df(approved_drugs_fingerprint_rdkit, "fda_approved_fingerprint_rdkit")

### Morgan Fingerprint

In [44]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_fingerprint_morgan = approved_drugs.copy()

In [48]:
# Function to generate Morgan fingerprints
def generate_fingerprint_morgan(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2)  # Change radius as per your requirement
    return fingerprint

In [49]:
# Generate fingerprints and create new columns for each bit
fingerprints = approved_drugs_fingerprint_morgan['clean_smiles'].apply(generate_fingerprint_morgan)
num_bits = fingerprints.iloc[0].GetNumBits()

for i in range(num_bits):
    col_name = f'Bit_{i}'
    approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))

  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved_drugs_fingerprint_morgan[col_name] = fingerprints.apply(lambda x: x.GetBit(i))
  approved

In [50]:
approved_drugs_fingerprint_morgan.columns

Index(['name', 'chembl_id', 'clean_smiles', 'first_approval_year',
       'indication_class', 'molecule_type', 'withdrawn_flag',
       'therapeutic_flag', 'polymer_flag', 'inorganic_flag',
       ...
       'Bit_2038', 'Bit_2039', 'Bit_2040', 'Bit_2041', 'Bit_2042', 'Bit_2043',
       'Bit_2044', 'Bit_2045', 'Bit_2046', 'Bit_2047'],
      dtype='object', length=2062)

In [51]:
# saving new dataset
save_df(approved_drugs_fingerprint_morgan, "fda_approved_fingerprint_morgan")

## Descriptors

In [53]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_rdkit_descriptors = approved_drugs.copy()

In [59]:
def calculate_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)

    descriptors = {}
    for descriptor_name, descriptor_function in Descriptors.descList:
        try:
            descriptors[descriptor_name] = descriptor_function(mol)
        except:
            descriptors[descriptor_name] = None

    return pd.Series(descriptors)

In [107]:
rdkit_descriptors = approved_drugs_rdkit_descriptors['clean_smiles'].apply(calculate_descriptors)
approved_drugs_rdkit_descriptors = pd.concat([approved_drugs_rdkit_descriptors, rdkit_descriptors], axis=1)
approved_drugs_rdkit_descriptors.head()

Unnamed: 0,name,chembl_id,clean_smiles,first_approval_year,indication_class,molecule_type,withdrawn_flag,therapeutic_flag,polymer_flag,inorganic_flag,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,GUANIDINE HYDROCHLORIDE,CHEMBL1200728,N=C(N)N,1939,,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ACETOHYDROXAMIC ACID,CHEMBL734,CC(=O)NO,1983,Enzyme Inhibitor (urease),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,HYDROXYUREA,CHEMBL467,NC(=O)NO,1967,Antineoplastic,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,CYSTEAMINE,CHEMBL602,NCCS,1994,CYSTEAMINE HYDROCHLORIDE,Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,DIMETHYL SULFOXIDE,CHEMBL504,C[S+](C)[O-],1978,Anti-Inflammatory (topical),Small molecule,False,True,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


As we can see below, 108 fields in the descriptors are 'None'. These correspond to 9 molecules, that had 12 descriptors return None. I will print them out below

In [136]:
# checking if there are molecules with None in the descriptor
rdkit_descriptors.isnull().sum().sum()

108

In [132]:
approved_drugs_rdkit_descriptors[approved_drugs_rdkit_descriptors.iloc[:, 24].isnull()].iloc[:, 0:3]

Unnamed: 0,name,chembl_id,clean_smiles
491,ACETARSONE,CHEMBL1330792,CC(=O)Nc1cc([As](=O)(O)O)ccc1O
1015,BISMUTH SUBSALICYLATE,CHEMBL1120,O=C1O[Bi](O)Oc2ccccc21
1131,THIMEROSAL,CHEMBL508338,CC[Hg]Sc1ccccc1C(=O)O
1176,AUROTHIOGLUCOSE,CHEMBL2354773,OC[C@H]1O[C@H](S[Au])[C@H](O)[C@@H](O)[C@@H]1O
1193,BISMUTH SUBGALLATE,CHEMBL1592101,O=C(O)c1cc(O)c2c(c1)O[Bi](O)O2
1551,MERSALYL,CHEMBL1200943,COC(CNC(=O)c1ccccc1OCC(=O)O)C[Hg]O
1799,SODIUM STIBOGLUCONATE,CHEMBL3991035,O=C(O)[C@@H]1O[Sb]2(O)(O[Sb]34(O)O[C@H]([C@H](...
1809,MERBROMIN,CHEMBL3833381,O=C(O)c1ccccc1-c1c2cc(Br)c(=O)cc-2oc2c([Hg]O)c...
1873,HYDRARGAPHEN,CHEMBL3833364,O=S(=O)(O[Hg]c1ccccc1)c1cc2ccccc2cc1Cc1cc2cccc...


In [122]:
approved_drugs_rdkit_descriptors[approved_drugs_rdkit_descriptors.iloc[:, 24].isnull()].iloc[:, 24:28]

Unnamed: 0,MaxPartialCharge,MinPartialCharge,MaxAbsPartialCharge,MinAbsPartialCharge
491,,,,
1015,,,,
1131,,,,
1176,,,,
1193,,,,
1551,,,,
1799,,,,
1809,,,,
1873,,,,


In [125]:
approved_drugs_rdkit_descriptors[approved_drugs_rdkit_descriptors.iloc[:, 24].isnull()].iloc[:, 31:39]

Unnamed: 0,BCUT2D_MWHI,BCUT2D_MWLOW,BCUT2D_CHGHI,BCUT2D_CHGLO,BCUT2D_LOGPHI,BCUT2D_LOGPLOW,BCUT2D_MRHI,BCUT2D_MRLOW
491,,,,,,,,
1015,,,,,,,,
1131,,,,,,,,
1176,,,,,,,,
1193,,,,,,,,
1551,,,,,,,,
1799,,,,,,,,
1809,,,,,,,,
1873,,,,,,,,


I do not know why these molecules could not have their descriptors calculated. I will leave them in the dataset for now, but they will have to be dealt with in order to use machine learning models in the future.

In [137]:
# saving new dataset
save_df(approved_drugs_rdkit_descriptors, "fda_approved_rdkit")

# Mordred

In [138]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_mordred = approved_drugs.copy()

In [139]:
# saving new dataset
save_df(approved_drugs_mordred, "fda_approved_mordred")

# PyL3DMD

In [140]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_pyl3md = approved_drugs.copy()

In [141]:
# saving new dataset
save_df(approved_drugs_pyl3md, "fda_approved_pyl3dmd")

# Mold2

In [142]:
# Creating a copy of the dataframe to calculate the fingerprints
approved_drugs_mold2 = approved_drugs.copy()

In [143]:
# saving new dataset
save_df(approved_drugs_mold2, "fda_approved_mold2")