<a href="https://colab.research.google.com/github/brooke57/BrainTumorImageClassification/blob/main/Drug_Discovery_EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **TKR_FLT3 Drug Discovery EDA**

In [None]:
! pip install rdkit-pypi

Collecting rdkit-pypi
  Downloading rdkit_pypi-2021.9.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (20.6 MB)
[K     |████████████████████████████████| 20.6 MB 2.9 MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2021.9.4


In [None]:
# Import necessary libraries
import pandas as pd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski

In [None]:
# Mount Google Drive at /content/drive (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the preprocessed bioactivity table from the Drive data folder.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/data/preprocessed_TKR_bioactivity_data.csv')

In [None]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,mol_id,canon_smiles,std_value,Bioactivity
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,128.0,active
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,220.0,active
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,8790.0,inactive
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,1910.0,inactive
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,30000.0,inactive


In [None]:
def lipinski(smiles, verbose=False):
  """Compute Lipinski descriptors for a collection of SMILES strings.

  For each entry the molecular weight, LogP, and the numbers of hydrogen-bond
  donors and acceptors are computed with RDKit.

  Parameters
  ----------
  smiles : iterable of str
      SMILES strings, e.g. a pandas Series or a list.
  verbose : bool, default False
      If True, print how many entries were missing or could not be parsed.

  Returns
  -------
  pandas.DataFrame
      One row per input entry, in input order, with a default integer index
      and the columns 'MolWt', 'MolLogP', 'NumHDonors', 'NumHAcceptors'.
      Entries that are not strings or that RDKit cannot parse yield a row of
      NaN, so the result always has the same length as the input.
  """
  columns = ['MolWt', 'MolLogP', 'NumHDonors', 'NumHAcceptors']
  missing = float('nan')

  rows = []
  n_invalid = 0
  for elem in smiles:
    # MolFromSmiles returns None (it does not raise) for an unparsable SMILES,
    # and non-string values such as NaN from an empty cell are not valid input,
    # so both cases are recorded as a NaN row to keep the output aligned.
    mol = Chem.MolFromSmiles(elem) if isinstance(elem, str) else None
    if mol is None:
      n_invalid += 1
      rows.append((missing, missing, missing, missing))
      continue

    rows.append((
      Descriptors.MolWt(mol),
      Descriptors.MolLogP(mol),
      Lipinski.NumHDonors(mol),
      Lipinski.NumHAcceptors(mol),
    ))

  if verbose and n_invalid:
    print('lipinski: {} of {} entries could not be parsed; their rows are NaN'.format(n_invalid, len(rows)))

  return pd.DataFrame(rows, columns=columns)

In [None]:
# Compute the four Lipinski descriptors for every SMILES string in the dataset
# and preview the first rows of the result.
lipinski_df = lipinski(df['canon_smiles'])
lipinski_df.head()

Unnamed: 0,MolWt,MolLogP,NumHDonors,NumHAcceptors
0,576.742,5.2805,1,8
1,562.715,5.0345,1,8
2,543.672,4.50748,1,8
3,543.672,4.36498,1,8
4,493.615,4.59032,2,7


In [None]:
# Combine the original dataframe with the Lipinski descriptors dataframe column-wise.
# pd.concat(axis=1) aligns rows on index labels rather than on position, so confirm
# both frames share the same index first; otherwise rows would be silently
# mismatched and padded with NaN.
assert df.index.equals(lipinski_df.index), "df and lipinski_df have different indexes; align them before concatenating"
cmbd_df = pd.concat([df, lipinski_df], axis=1)
cmbd_df

Unnamed: 0,mol_id,canon_smiles,std_value,Bioactivity,MolWt,MolLogP,NumHDonors,NumHAcceptors
0,CHEMBL330863,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,128.0,active,576.742,5.28050,1,8
1,CHEMBL124660,COc1cc2c(N3CCN(C(=O)Nc4ccc(OC(C)C)cc4)CC3)ncnc...,220.0,active,562.715,5.03450,1,8
2,CHEMBL126699,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,8790.0,inactive,543.672,4.50748,1,8
3,CHEMBL445636,COc1cc2c(N3CCN(C(=O)Nc4ccc(C#N)cc4)CC3)ncnc2cc...,1910.0,inactive,543.672,4.36498,1,8
4,CHEMBL941,Cc1ccc(NC(=O)c2ccc(CN3CCN(C)CC3)cc2)cc1Nc1nccc...,30000.0,inactive,493.615,4.59032,2,7
...,...,...,...,...,...,...,...,...
2839,CHEMBL1971943,Cc1c[nH]c2nccc(Oc3c(F)cc(Nc4cc(Cl)nc(N)n4)cc3F...,303.0,active,402.792,4.71102,3,6
2840,CHEMBL4088216,CN1C(=O)[C@@H](N2CCc3cn(Cc4ccccc4)nc3C2=O)COc2...,1000.0,active,402.454,2.35370,0,5
2841,CHEMBL4531334,CN1CCN(C(=O)C(C)(C)c2ccc(C(=O)Nc3cn4cc(-c5ccnc...,668.0,active,482.588,3.70020,1,6
2842,CHEMBL4549667,CN1C(=O)[C@@H](N2CCc3c(nn(Cc4ccccc4)c3Br)C2=O)...,1000.0,active,481.350,3.11620,0,5
