# Chemistry Department Project 2021 - Neural Network For Predicting The Antioxidant Activity & Cytotoxicity Of Compounds

- Processing PubChem Bioassay CSVs

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

## Custom Functions

In [2]:
def minmaxmedian(dataframe):
  """Summarise the activity values of every compound in `dataframe`.

  Rows are grouped by the `cid` column and the median, maximum and minimum
  of the `acvalue` column are computed for each group.

  Parameters
  ----------
  dataframe : pandas.DataFrame
    Must contain the columns `cid` and `acvalue`.

  Returns
  -------
  tuple of (dict, dict, dict)
    `(median, max, min)` -- note that the maximum comes before the minimum.
    Each dict maps `str(cid)` to the corresponding statistic, with keys in
    order of first appearance of the CID in `dataframe`.

  Notes
  -----
  Rows whose `cid` is missing are ignored (groupby drops NaN keys). A NaN in
  `acvalue` propagates into that compound's statistics, as it does for the
  plain NumPy reductions used here.
  """
  ic_fifty_median = dict()
  ic_fifty_max = dict()
  ic_fifty_min = dict()

  # One pass over the frame instead of re-filtering it for every CID;
  # sort=False keeps the CIDs in order of first appearance.
  for cid, values in dataframe.groupby('cid', sort=False)['acvalue']:
    acvalues = values.to_numpy()
    ic_fifty_median[str(cid)] = np.median(acvalues)
    ic_fifty_max[str(cid)] = np.max(acvalues)
    ic_fifty_min[str(cid)] = np.min(acvalues)

  return ic_fifty_median, ic_fifty_max, ic_fifty_min

## Combining multiple CSVs into single CSV

In [3]:
# Folder scanned when building the combined table: every entry in it is read as a CSV.
directory = "/content/drive/MyDrive/Current-Work/ML-Chemistry-Department-2021/PubChem-Bioactivity_CSVs"

In [4]:
# Collect one frame per file and concatenate once at the end. Calling
# pd.concat inside the loop re-copies every accumulated row on each iteration
# and starts from an empty placeholder frame.
bioactivity_columns = ['acvalue', 'cid', 'pmid', 'acname', 'aidname']
bioactivity_frames = []

# os.listdir returns entries in arbitrary order; sorting makes the row order
# of the combined table reproducible between runs.
for csv_name in sorted(os.listdir(directory)):

  # File path
  csv_path = os.path.join(directory, csv_name)

  csv_file = pd.read_csv(csv_path)

  # Keep only the columns used by the rest of the notebook
  csv_file_subset = csv_file[bioactivity_columns]

  bioactivity_frames.append(csv_file_subset)

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# the directory holds no files (the result the accumulating loop used to give).
if bioactivity_frames:
  combine_bioactivity = pd.concat(bioactivity_frames, axis=0)
else:
  combine_bioactivity = pd.DataFrame()

In [5]:
# Keep only the rows that carry an activity value
combine_bioactivity = combine_bioactivity.loc[combine_bioactivity['acvalue'].notna()]

In [6]:
combine_bioactivity.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4311 entries, 0 to 25
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   acvalue  4311 non-null   float64
 1   cid      4311 non-null   int64  
 2   pmid     2178 non-null   float64
 3   acname   4311 non-null   object 
 4   aidname  4311 non-null   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 202.1+ KB


In [7]:
# Persist the combined table. index=False leaves the row index out of the
# sheet; it is not unique here because each source file keeps its own numbering.
combine_bioactivity.to_excel("all_comp_bioactivity_combine.xlsx", index=False)

In [8]:
! cp -r /content/all_comp_bioactivity_combine.xlsx /content/drive/MyDrive/Current-Work/ML-Chemistry-Department-2021/

In [9]:
combine_bioactivity

Unnamed: 0,acvalue,cid,pmid,acname,aidname
0,1.0000,162350,,Potency,qHTS for Stage-Specific Inhibitors of Vaccinia...
1,4.4668,162350,,Potency,"qHTS for Agonist of gsp, the Etiologic Mutatio..."
2,19.9526,162350,,Potency,qHTS Assay for Inhibitors of Tyrosyl-DNA Phosp...
3,29.0929,162350,,Potency,qHTS for induction of synthetic lethality in t...
4,50.0000,162350,23206866.0,IC50,Cytotoxicity against human KATO III cells asse...
...,...,...,...,...,...
21,147.0000,10143,11141105.0,IC50,Antiamnesic activity against Entamoeba histoly...
22,200.0000,10143,29103873.0,IC50,Antiproliferative activity against human MGC80...
23,200.0000,10143,29103873.0,IC50,Antiproliferative activity against human HeLa ...
24,200.0000,10143,29103873.0,IC50,Antiproliferative activity against human HepG2...


In [10]:
# https://pandas.pydata.org/docs/reference/api/pandas.Series.str.contains.html

# The keywords are plain text, so they are matched literally (regex=False).
# na=False treats a missing assay name as "no match"; without it a single NaN
# in 'aidname' makes the mask non-boolean and .loc raises.

# Selecting entries which have cytotoxicity
cytotoxicity_bioactivity = combine_bioactivity.loc[
    combine_bioactivity['aidname'].str.contains("cytotoxicity", case=False, na=False, regex=False)
]

# Selecting entries which have antioxidant
antioxidant_bioactivity = combine_bioactivity.loc[
    combine_bioactivity['aidname'].str.contains("antioxidant", case=False, na=False, regex=False)
]

In [11]:
# Restrict both keyword subsets to rows whose activity type is exactly 'IC50'
cytotoxicity_bioactivity_IC_fifty = cytotoxicity_bioactivity.loc[
    cytotoxicity_bioactivity['acname'] == 'IC50'
]
antioxidant_bioactivity_IC_fifty = antioxidant_bioactivity.loc[
    antioxidant_bioactivity['acname'] == 'IC50'
]

In [12]:
cytotoxicity_bioactivity_IC_fifty.head()

Unnamed: 0,acvalue,cid,pmid,acname,aidname
4,50.0,162350,23206866.0,IC50,Cytotoxicity against human KATO III cells asse...
5,50.0,162350,23206866.0,IC50,Cytotoxicity against human HL60 cells assessed...
6,50.0,162350,23206866.0,IC50,Cytotoxicity against human U937 cells assessed...
7,50.0,162350,23206866.0,IC50,Cytotoxicity against human THP1 cells assessed...
8,50.0,162350,23206866.0,IC50,Cytotoxicity against human Jurkat cells assess...


In [13]:
len(cytotoxicity_bioactivity_IC_fifty.cid.value_counts())

16

In [14]:
antioxidant_bioactivity_IC_fifty.head()

Unnamed: 0,acvalue,cid,pmid,acname,aidname
18,1.59,72276,9461655.0,IC50,Antioxidant activity assessed as superoxide-sc...
63,15.0,72276,28006914.0,IC50,Antioxidant activity assessed as DPPH radical ...
67,18.3,72276,10425115.0,IC50,Antioxidant activity assessed as superoxide ra...
91,49.0,72276,14640528.0,IC50,Antioxidant activity assessed as DPPH radical ...
115,119.8,72276,15380193.0,IC50,In vitro antioxidant effect by the inhibition ...


In [15]:
len(antioxidant_bioactivity_IC_fifty.cid.value_counts())

11

In [16]:
# Save the IC50-only subsets; index=False omits the row index from the files.
antioxidant_bioactivity_IC_fifty.to_csv("antioxidant_bioactivity_IC_fifty.csv", index=False)
cytotoxicity_bioactivity_IC_fifty.to_csv("cytotoxicity_bioactivity_IC_fifty.csv", index=False)

In [17]:
! cp -r /content/cytotoxicity_bioactivity_IC_fifty.csv /content/drive/MyDrive/Current-Work/ML-Chemistry-Department-2021/
! cp -r /content/antioxidant_bioactivity_IC_fifty.csv /content/drive/MyDrive/Current-Work/ML-Chemistry-Department-2021/

In [18]:
# minmaxmedian returns its three dicts as (median, max, min), so the targets
# are unpacked in that order; each dict is keyed by str(cid).
cytotoxicity_ic_fifty_median, cytotoxicity_ic_fifty_max, cytotoxicity_ic_fifty_min = minmaxmedian(cytotoxicity_bioactivity_IC_fifty)

antioxidant_ic_fifty_median, antioxidant_ic_fifty_max, antioxidant_ic_fifty_min = minmaxmedian(antioxidant_bioactivity_IC_fifty)

In [19]:
# Spreadsheet of compound descriptors; final_data looks rows up in it by
# the 'PubChem_CID' column.
path = "/content/drive/MyDrive/Current-Work/ML-Chemistry-Department-2021/Excel-Files/compounds_1D_descriptors.xlsx"

descriptors = pd.read_excel(path)
descriptors.head()

Unnamed: 0,PubChem_CID,CompName,Heavy_Atom,Rotat_Bond,Ele_Charge,Mol_Mass,H_Donor,H_Acceptor,XLogP,Can_SMILE,Iso_SMILE,Cac_Fingerprint
0,162350,Apigenin-6-C-glucoside,31,3,0,432.105652,7,10,0.2,C1=CC(=CC=C1C2=CC(=O)C3=C(O2)C=C(C(=C3O)C4C(C(...,C1=CC(=CC=C1C2=CC(=O)C3=C(O2)C=C(C(=C3O)[C@H]4...,1110000001111000001111000000000000000000000000...
1,10085878,Argentinine,22,4,0,295.157227,1,3,4.3,CN(C)CCC1=CC(=C(C2=C1C=CC3=CC=CC=C32)OC)O,CN(C)CCC1=CC(=C(C2=C1C=CC3=CC=CC=C32)OC)O,1110000001111010001100000000000000000000000000...
2,73160,Catechin,21,1,0,290.079041,5,6,0.4,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,C1[C@H]([C@@H](OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=...,1100000001110000001110000000000000000000000000...
3,72276,Epicatechin,21,1,0,290.079041,5,6,0.4,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,C1[C@H]([C@H](OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C...,1100000001110000001110000000000000000000000000...
4,114776,Homoorientin,32,3,0,448.100555,8,11,-0.2,C1=CC(=C(C=C1C2=CC(=O)C3=C(O2)C=C(C(=C3O)C4C(C...,C1=CC(=C(C=C1C2=CC(=O)C3=C(O2)C=C(C(=C3O)[C@H]...,1110000001111000001111000000000000000000000000...


In [20]:
def final_data(ic_fifty_min, ic_fifty_max, ic_fifty_med, descriptors):
  """Join per-compound IC50 statistics with their molecular descriptors.

  For every key of `ic_fifty_min`, the first row of `descriptors` whose
  `PubChem_CID` equals `int(key)` is combined with that compound's minimum,
  maximum and median IC50 value.

  Parameters
  ----------
  ic_fifty_min, ic_fifty_max, ic_fifty_med : dict
    Statistics keyed by the CID as a string. The keys of `ic_fifty_min`
    drive the iteration and must also be present in the other two dicts.
  descriptors : pandas.DataFrame
    Must provide the columns PubChem_CID, CompName, Can_SMILE, Heavy_Atom,
    Rotat_Bond, Mol_Mass, H_Donor, H_Acceptor, XLogP and Cac_Fingerprint.

  Returns
  -------
  pandas.DataFrame
    One row per key, in key order. CID, compound name and fingerprint are
    stored as strings.

  Raises
  ------
  IndexError
    If `descriptors` has no row for one of the CIDs.
  KeyError
    If a key is missing from `ic_fifty_max` or `ic_fifty_med`.
  """
  columns_names = ['PubChem_CID', 'Compound', 'SMILES', 'Heavy_Atom',
                   'Rotat_Bond', 'Mol_Mass', 'H_Donor', 'H_Acceptor',
                   'XLogP', 'Fingerprint', 'IC50_min', 'IC50_max',
                   'IC50_med']
  records = []

  for key in ic_fifty_min:
    # Descriptor rows for this compound; only the first one is used.
    match = descriptors.loc[descriptors['PubChem_CID'] == int(key)]

    records.append({
        'PubChem_CID': str(match.PubChem_CID.values[0]),
        'Compound': str(match.CompName.values[0]),
        'SMILES': match.Can_SMILE.values[0],
        'Heavy_Atom': match.Heavy_Atom.values[0],
        'Rotat_Bond': match.Rotat_Bond.values[0],
        'Mol_Mass': match.Mol_Mass.values[0],
        'H_Donor': match.H_Donor.values[0],
        'H_Acceptor': match.H_Acceptor.values[0],
        'XLogP': match.XLogP.values[0],
        'Fingerprint': str(match.Cac_Fingerprint.values[0]),
        'IC50_min': ic_fifty_min[key],
        'IC50_max': ic_fifty_max[key],
        'IC50_med': ic_fifty_med[key],
    })

  return pd.DataFrame(records, columns=columns_names)

In [21]:
# final_data takes the statistics as (min, max, median), followed by the descriptor table.
antioxidant_final = final_data(antioxidant_ic_fifty_min, antioxidant_ic_fifty_max, antioxidant_ic_fifty_median, descriptors)
antioxidant_final

Unnamed: 0,PubChem_CID,Compound,SMILES,Heavy_Atom,Rotat_Bond,Mol_Mass,H_Donor,H_Acceptor,XLogP,Fingerprint,IC50_min,IC50_max,IC50_med
0,72276,Epicatechin,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,21,1,290.079041,5,6,0.4,1100000001110000001110000000000000000000000000...,1.59,25000.0,33.65
1,5280863,Kaempferol,C1=CC(=CC=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O,21,1,286.047729,4,6,1.9,1100000001110000001110000000000000000000000000...,0.84,19000.0,89.7
2,5280343,Quercetin,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,22,1,302.042664,5,7,1.5,1100000001110000001110000000000000000000000000...,0.00011,8500.0,6.0
3,5280804,Quercetin 3-O-glucoside,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)OC4...,33,4,464.09549,8,12,0.4,1110000001111000001111000000000000000000000000...,11.7,75.3,32.2
4,637760,Chalcone,C1=CC=C(C=C1)C=CC(=O)C2=CC=CC=C2,16,3,208.088821,0,1,3.1,1100000001110000001000000000000000000000000000...,70.9,3139260.0,98.09
5,689043,Caffeic acid,C1=CC(=C(C=C1C=CC(=O)O)O)O,13,2,180.042252,3,4,1.2,1100000001110000001110000000000000000000000000...,0.1,287.0,18.1
6,3469,Gentisic acid,C1=CC(=C(C=C1O)C(=O)O)O,11,1,154.026611,3,4,1.6,1000000001100000001110000000000000000000000000...,4.26,4.26,4.26
7,7428,4-O-methyl gallate,COC(=O)C1=CC(=C(C(=C1)O)O)O,13,2,184.03717,3,5,0.9,1100000001110000001110000000000000000000000000...,16.49,16.49,16.49
8,370,Gallic acid,C1=C(C=C(C(=C1O)O)O)C(=O)O,12,1,170.02153,4,5,0.7,1000000001100000001110000000000000000000000000...,0.53,100000.0,13.33
9,14985,Vitamin E (tocopherols),CC1=C(C2=C(CCC(O2)(C)CCCC(C)CCCC(C)CCCC(C)C)C(...,31,12,430.381073,1,2,10.7,1111000001111000001100000000000000000000000000...,0.8,350.0,22.8


In [22]:
# final_data takes the statistics as (min, max, median), followed by the descriptor table.
cytotoxicity_final = final_data(cytotoxicity_ic_fifty_min, cytotoxicity_ic_fifty_max, cytotoxicity_ic_fifty_median, descriptors)
cytotoxicity_final

Unnamed: 0,PubChem_CID,Compound,SMILES,Heavy_Atom,Rotat_Bond,Mol_Mass,H_Donor,H_Acceptor,XLogP,Fingerprint,IC50_min,IC50_max,IC50_med
0,162350,Apigenin-6-C-glucoside,C1=CC(=CC=C1C2=CC(=O)C3=C(O2)C=C(C(=C3O)C4C(C(...,31,3,432.105652,7,10,0.2,1110000001111000001111000000000000000000000000...,50.0,50.0,50.0
1,72276,Epicatechin,C1C(C(OC2=CC(=CC(=C21)O)O)C3=CC(=C(C=C3)O)O)O,21,1,290.079041,5,6,0.4,1100000001110000001110000000000000000000000000...,25.0,184.1,25.0
2,5280863,Kaempferol,C1=CC(=CC=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O,21,1,286.047729,4,6,1.9,1100000001110000001110000000000000000000000000...,10.0,400.0,53.685
3,5280343,Quercetin,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)O)O)O,22,1,302.042664,5,7,1.5,1100000001110000001110000000000000000000000000...,2.5,177.5,23.0
4,5280804,Quercetin 3-O-glucoside,C1=CC(=C(C=C1C2=C(C(=O)C3=C(C=C(C=C3O2)O)O)OC4...,33,4,464.09549,8,12,0.4,1110000001111000001111000000000000000000000000...,10.0,10.0,10.0
5,160597,Anonaine,C1CNC2CC3=CC=CC=C3C4=C2C1=CC5=C4OCO5,20,0,265.110291,1,3,2.8,1100000001111010001100000000000000000000000000...,8.6,28.9,18.6
6,637760,Chalcone,C1=CC=C(C=C1)C=CC(=O)C2=CC=CC=C2,16,3,208.088821,0,1,3.1,1100000001110000001000000000000000000000000000...,6.4,100.0,24.95
7,689043,Caffeic acid,C1=CC(=C(C=C1C=CC(=O)O)O)O,13,2,180.042252,3,4,1.2,1100000001110000001110000000000000000000000000...,27.16,700.0,500.0
8,3469,Gentisic acid,C1=CC(=C(C=C1O)C(=O)O)O,11,1,154.026611,3,4,1.6,1000000001100000001110000000000000000000000000...,256.0,15000.0,7628.0
9,7428,4-O-methyl gallate,COC(=O)C1=CC(=C(C(=C1)O)O)O,13,2,184.03717,3,5,0.9,1100000001110000001110000000000000000000000000...,267.2,267.2,267.2


In [23]:
# Save the descriptor + IC50 tables; index=False omits the row index from the files.
cytotoxicity_final.to_csv("cytotoxicity_final.csv", index=False)
antioxidant_final.to_csv("antioxidant_final.csv", index=False)

In [24]:
! cp -r /content/cytotoxicity_final.csv /content/drive/MyDrive/Current-Work/ML-Chemistry-Department-2021/
! cp -r /content/antioxidant_final.csv /content/drive/MyDrive/Current-Work/ML-Chemistry-Department-2021/