In [1]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem.Fingerprints import FingerprintMols

import urllib
from urllib.request import urlopen
from bs4 import BeautifulSoup

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Hide warning messages in cell output.
# NOTE(review): filterwarnings("ignore") with no category or module silences
# every warning for the rest of the session, including deprecation warnings
# from the libraries used below. Consider narrowing it to specific categories.
import warnings
warnings.filterwarnings("ignore")

# Load the molecular-properties table; the trailing expression displays its
# (rows, columns) shape as a quick sanity check.
properties_path = 'NCGC_Molecular_Properties_CSV'
data = pd.read_csv(properties_path)
data.shape

(328, 17)

In [2]:
# Build a binary 'target' column from 'Activity Summary':
#   1 -> the summary is exactly 'active antagonist' or 'active agonist'
#   0 -> anything else (including missing values)
ACTIVE_LABELS = ('active antagonist', 'active agonist')

activity_list = data['Activity Summary'].tolist()
target_list = [1 if label in ACTIVE_LABELS else 0 for label in activity_list]

# Attach the labels to the table; later cells read data['target'].
data['target'] = target_list

# MACCS Keys

In [3]:
#Making list of isomericSMILES identifiers
iSMILES = data['IsomericSMILES']
iSMILES[0]

'C1=CC=C(C=C1)C[C@@H](C(=O)O)N'

In [4]:
# Parse every isomeric SMILES string into an RDKit molecule, in row order.
ms = [Chem.MolFromSmiles(smiles) for smiles in iSMILES]

# MolFromSmiles signals a parse failure by returning None rather than raising.
# Stop here with the offending row positions instead of letting the fingerprint
# cells below fail with an opaque argument error.
unparsed_rows = [position for position, mol in enumerate(ms) if mol is None]
if unparsed_rows:
    raise ValueError(
        "Could not parse SMILES at row position(s): " + str(unparsed_rows)
    )

In [5]:
# MACCS fingerprint for every molecule, stored as a '0'/'1' string.
# The first character of each bit string is dropped
# (presumably RDKit's unused bit 0, leaving the 166 MACCS keys -- TODO confirm).
maccs_fps = []
for mol in ms:
    maccs_bits = MACCSkeys.GenMACCSKeys(mol).ToBitString()
    maccs_fps.append(maccs_bits[1:])

# Topological Fingerprint (Daylight Analogue)

In [6]:
from rdkit.Chem import rdmolops

# RDKit topological fingerprint for every molecule as a 2048-character
# '0'/'1' string, built with minPath=1 and maxPath=7.
top_fps = []
for mol in ms:
    topological = rdmolops.RDKFingerprint(mol, fpSize=2048, minPath=1, maxPath=7)
    top_fps.append(topological.ToBitString())

# Morgan Fingerprint (ECFP)

In [7]:
from rdkit.Chem import AllChem

# Morgan fingerprint (radius 4, folded to 1024 bits) for every molecule,
# stored as a '0'/'1' string.
# NOTE(review): radius 4 looks like an ECFP8-style diameter; ECFP4 is usually
# radius 2 -- confirm this is intended.
ecfp_fps = []
for mol in ms:
    morgan_bits = AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=1024)
    ecfp_fps.append(morgan_bits.ToBitString())

# Morgan Fingerprint (FCFP)

In [8]:
# Feature-based Morgan fingerprint (useFeatures=True, radius 4, 1024 bits)
# for every molecule, stored as a '0'/'1' string.
fcfp_fps = []
for mol in ms:
    feature_bits = AllChem.GetMorganFingerprintAsBitVect(mol, 4, nBits=1024, useFeatures=True)
    fcfp_fps.append(feature_bits.ToBitString())

# PubChem FP Decoding

In [9]:
# PubChem compound IDs for every row, converted to plain ints so no decimal
# point ends up in the request URL.
CID_list = [int(cid) for cid in data['PUBCHEM_CID']]

# Comma-separated CID list with no spaces, e.g. '6140,7321,7389'.
CID_str = ','.join(str(cid) for cid in CID_list)

In [10]:
#Getting the Pubchem Fingerprints for each CID

# One PUG REST request covers every CID; the response is requested as plain text.
url1 = ('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/cid/' + CID_str + '/property/Fingerprint2D/TXT')

# Open the URL in a context manager so the HTTP response is always closed,
# even if parsing raises.
with urlopen(url1) as html1:
    soup1 = BeautifulSoup(html1, 'lxml')

# Split the response text on whitespace: one token per encoded fingerprint.
pub_fp = soup1.get_text().split()

# Every requested CID must have a fingerprint, otherwise the fingerprints
# cannot be lined up with the rows of data in the cells below.
if len(pub_fp) != len(CID_list):
    raise ValueError(
        "PubChem returned " + str(len(pub_fp)) + " fingerprints for "
        + str(len(CID_list)) + " CIDs"
    )

In [11]:
#Decoding Fingerprints
from base64 import b64decode

def PCFP_BitString(pcfp_base64) :
    """Decode a base64 fingerprint into a string of '0'/'1' characters.

    Each decoded byte is written as eight binary digits, most significant bit
    first. Only characters 32 through 912 of that string (881 characters) are
    returned; a shorter decoded payload yields a shorter result.
    NOTE(review): this presumably skips a 32-bit header and keeps the 881
    PubChem substructure bits -- confirm against the PubChem fingerprint spec.

    Parameters:
        pcfp_base64: base64-encoded fingerprint (str or bytes).

    Returns:
        str: the selected slice of the binary representation.
    """
    decoded_bytes = b64decode(pcfp_base64)
    all_bits = "".join(format(byte, "08b") for byte in decoded_bytes)
    return all_bits[32:913]

# Decode every fetched fingerprint into its bit string, keeping the order of pub_fp.
pub_fp_decoded = [PCFP_BitString(encoded) for encoded in pub_fp]

# Final DataFrame (Tox21)

In [12]:
# One row per compound: its PubChem CID, each fingerprint as a bit string,
# and the 0/1 activity label.
fps_df = pd.DataFrame({
    'Name': data['PUBCHEM_CID'],
    'MACCS': maccs_fps,
    'Topological': top_fps,
    'Morgan ECFP': ecfp_fps,
    'Morgan FCFP': fcfp_fps,
    'Pubchem FP': pub_fp_decoded,
    'Activity': data['target'],
})
fps_df.head()

Unnamed: 0,Name,MACCS,Topological,Morgan ECFP,Morgan FCFP,Pubchem FP,Activity
0,6140,0000000000000000000000000000000000000000000000...,0000000010100000010000000000000000000000000000...,0100000000000000000000000000000010000000001000...,1110100000000000000100001000000010000000000000...,1100000001110010001100000000000000000000000000...,0
1,7321,0000000000000000000000010000000000000000000000...,0000000000000000010000000000000001000000000000...,0000000000000000000000000000000000000000000000...,0010100000000000100101000000000000000000000000...,1000000001100011001110000000000000000000000000...,0
2,7389,0010000000000000000000000000000000000000001100...,0000000000001000000000000000000000000000000000...,0001000000000000000000000001000000000000000000...,1110100100000000000100000000000000000000000000...,1100000001100010001100000000000000000000000000...,0
3,8030,0000000000000000000000000000000000010000000000...,0000000000000000000000000000000000000000000000...,0000000000000010000000000000000000000000000000...,0000101000000000000000000000000000000000000010...,1000000001100000000000000000000001000000000000...,0
4,8247,0000000000000000000000000000000000000000000000...,1001000000000000000000000000000000000000000000...,0000000000000000000000000000000001000000000010...,1111100000000000000000000000000000000000000000...,1100000001110011001000000000000000000000000000...,1


In [13]:
# Write the combined fingerprint table as CSV, without the row index.
# The output file name has no '.csv' extension.
# NOTE(review): the fingerprint columns are long strings of digits; when this
# file is read back they will likely need dtype=str to avoid being parsed as
# numbers -- confirm in whatever code consumes it.
fps_df.to_csv('Fingerprints NCGC', index = False)


# Formatting Input Files

In [14]:
# Formatting - one model-input CSV per fingerprint type
#
# Every fingerprint list gets the same treatment: each bit string is split into
# one 0/1 integer column per bit, 'Name' and 'Activity' columns are put in
# front, and the resulting table is written to its own CSV.

def make_fingerprint_input(names, activities, bitstrings, prefix):
    """Expand per-compound bit strings into a wide, all-integer DataFrame.

    The table is built once from a list of rows rather than grown row by row
    with DataFrame.append, which is quadratic and no longer exists in
    pandas 2.x.

    Parameters
    ----------
    names : sequence
        Compound identifiers, one per fingerprint ('Name' column).
    activities : sequence
        Activity labels, one per fingerprint ('Activity' column).
    bitstrings : sequence of str
        Fingerprints as strings of '0'/'1' characters, all the same length.
    prefix : str
        Prefix for the bit columns, which are named prefix + '1',
        prefix + '2', ... in bit order.

    Returns
    -------
    pandas.DataFrame
        One row per compound with flat columns
        ['Name', 'Activity', prefix + '1', ...], every column cast to int.

    Raises
    ------
    ValueError
        If the three inputs differ in length, if the bit strings are not all
        the same length, or if a bit character is not a digit.
    """
    names = list(names)
    activities = list(activities)
    bitstrings = [str(fp) for fp in bitstrings]

    if not (len(names) == len(activities) == len(bitstrings)):
        raise ValueError(
            "names, activities and bitstrings must have the same length; got "
            + str(len(names)) + ", " + str(len(activities)) + ", " + str(len(bitstrings))
        )

    n_bits = len(bitstrings[0]) if bitstrings else 0
    ragged = [pos for pos, fp in enumerate(bitstrings) if len(fp) != n_bits]
    if ragged:
        raise ValueError(
            prefix + " fingerprints at row position(s) " + str(ragged)
            + " do not have " + str(n_bits) + " bits"
        )

    columns = ['Name', 'Activity'] + [prefix + str(k + 1) for k in range(n_bits)]
    rows = [
        [name, activity] + [int(bit) for bit in fp]
        for name, activity, fp in zip(names, activities, bitstrings)
    ]

    # astype(int) removes the decimals from identifiers that were read as floats.
    return pd.DataFrame(rows, columns=columns).astype(int)


# Identifiers and labels shared by all five input tables.
CID_list = fps_df['Name'].tolist()
act_list = fps_df['Activity'].tolist()

# Formatting - MACCS
input_ncgc_maccs = make_fingerprint_input(CID_list, act_list, maccs_fps, 'MACCS')
maccs_names = input_ncgc_maccs.columns.tolist()
input_ncgc_maccs.to_csv('input_ncgc_maccs.csv', index = False)

# Formatting - Topological Fingerprint
input_ncgc_top = make_fingerprint_input(CID_list, act_list, top_fps, 'TOP')
top_names = input_ncgc_top.columns.tolist()
input_ncgc_top.to_csv('input_ncgc_top.csv', index = False)

# Formatting - Morgan (ecfp)
input_ncgc_ecfp = make_fingerprint_input(CID_list, act_list, ecfp_fps, 'ECFP')
ecfp_names = input_ncgc_ecfp.columns.tolist()
input_ncgc_ecfp.to_csv('input_ncgc_ecfp.csv', index = False)

# Formatting - Morgan (fcfp)
input_ncgc_fcfp = make_fingerprint_input(CID_list, act_list, fcfp_fps, 'FCFP')
fcfp_names = input_ncgc_fcfp.columns.tolist()
input_ncgc_fcfp.to_csv('input_ncgc_fcfp.csv', index = False)

# Formatting - PubchemFP
input_ncgc_pub = make_fingerprint_input(CID_list, act_list, pub_fp_decoded, 'PubFP')
pub_fp_names = input_ncgc_pub.columns.tolist()
input_ncgc_pub.to_csv('input_ncgc_pub.csv', index = False)

