# Data Collection and Pre-Processing

### Installing Chembl API

In [None]:
!pip install chembl_webresource_client

### Importing Libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from chembl_webresource_client.new_client import new_client


from chembl_structure_pipeline import standardizer
from rdkit import Chem
from rdkit import rdBase
from rdkit.Chem.Draw import IPythonConsole

rdBase.rdkitVersion

from rdkit.Chem import Draw
from rdkit.Chem import MolStandardize


## Functions to be used in this project

In [None]:
def print_columns(dataframe):
    """
    Print every column label of dataframe, one label per line.

    dataframe must expose a ``columns`` attribute (e.g. a Pandas DataFrame).
    Nothing is printed when there are no columns. Returns None.
    """
    labels = [str(label) for label in dataframe.columns]
    if labels:
        # A single print call; the output is the same as printing label by label.
        print("\n".join(labels))

In [None]:
def dupe_removal(dataframe, interest_column, bioactivity_column, max_std=2):
    """
    Drop, in place, every group of duplicated rows whose bioactivity barely varies.

    Rows of dataframe that share the same value in interest_column form a
    duplicate group. For each group the standard deviation of
    bioactivity_column is computed; when it is <= max_std, ALL rows of that
    group are removed from dataframe (no representative row is kept).
    Groups with a larger deviation and rows that are not duplicated are left
    untouched.

    interest_column and bioactivity_column must be in the dataframe and must be a string,
    dataframe must be a Pandas Dataframe object,
    max_std is the largest standard deviation for which a group is removed (default 2).

    The dataframe is modified in place; the function returns None.
    """
    key_values = dataframe[interest_column]

    # keep=False marks every member of a duplicate group, not only the repeats
    duplicated_rows = dataframe[key_values.duplicated(keep=False)]
    if duplicated_rows.empty:
        return

    # One standard deviation per duplicated key, in order of first appearance
    spread = duplicated_rows.groupby(interest_column, sort=False)[bioactivity_column].std()

    # A NaN deviation compares False here, so such groups are kept
    low_spread_keys = spread[spread <= max_std].index

    # Operate on the dataframe that was passed in, using the caller's column name
    rows_to_drop = dataframe[key_values.isin(low_spread_keys)].index
    dataframe.drop(rows_to_drop, inplace=True)

In [None]:
def mol_to_block(dataframe):
    """
    Convert each SMILES in dataframe.canonical_smiles into a Mol block string.

    Smiles column in the dataframe must be named canonical_smiles

    Returns a list of Mol block strings. A SMILES that cannot be converted is
    printed and skipped, so the returned list can be shorter than the
    dataframe and is then no longer aligned with its rows.
    """
    mols = list()
    for smi in dataframe.canonical_smiles:
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None
                print(smi)
                continue
            mols.append(Chem.MolToMolBlock(mol))
        except Exception:
            # Best effort: report the offending SMILES and carry on.
            # Exception (not a bare except) keeps Ctrl-C working.
            print(smi)
    return mols

In [None]:
def mol_standardizer(dataframe):
    """
    Standardize every molecule of the dataframe with the ChEMBL Structure Pipeline.

    The SMILES in dataframe.canonical_smiles are first converted to Mol blocks
    with mol_to_block, then each block goes through
    standardizer.standardize_molblock.

    Returns a list of standardized Mol block strings. A block that fails to
    standardize is printed and skipped, so the result can be shorter than the
    input.
    """
    std_block = list()
    for block in mol_to_block(dataframe):
        try:
            std_block.append(standardizer.standardize_molblock(block))
        except Exception:
            # Best effort: report the offending block and carry on.
            # Exception (not a bare except) keeps Ctrl-C working.
            print(block)
    return std_block

In [None]:
def block_to_smiles(block_list):
    """
    Convert each Mol block string in block_list into a SMILES string.

    Returns a list of SMILES. A block that cannot be converted is printed and
    skipped, so the result can be shorter than block_list.
    """
    SMILE_list = []
    for block in block_list:
        try:
            mol = Chem.MolFromMolBlock(block)
            if mol is None:
                # RDKit signals an unparsable Mol block by returning None
                print(block)
                continue
            SMILE_list.append(Chem.MolToSmiles(mol))
        except Exception:
            # Best effort: report the offending block and carry on.
            # Exception (not a bare except) keeps Ctrl-C working.
            print(block)
    return SMILE_list

In [None]:
def remove_tautomers(smile_list):
    """
    Replace each SMILES in smile_list by its canonical tautomer SMILES.

    Uses MolStandardize.canonicalize_tautomer_smiles. Returns a list of
    SMILES; an entry that fails is printed and skipped, so the result can be
    shorter than smile_list.
    """
    smile_tau = list()
    for smi in smile_list:
        try:
            smile_tau.append(MolStandardize.canonicalize_tautomer_smiles(smi))
        except Exception:
            # Report the SMILES that failed. The handler previously printed an
            # undefined name, which raised NameError instead of reporting.
            print(smi)
    return smile_tau

In [None]:
def remove_salts(smile_list):
    """
    Strip salt fragments from each SMILES in smile_list.

    Each SMILES is parsed with RDKit, passed through a SaltRemover built with
    its default salt definitions, and written back as SMILES. Returns a list
    of SMILES; an entry that fails is printed and skipped, so the result can
    be shorter than smile_list.
    """
    # Local import: StripMol is a method of a SaltRemover instance, and the
    # file's top-level imports do not bring that class into scope.
    from rdkit.Chem.SaltRemover import SaltRemover

    remover = SaltRemover()
    smile_salt = list()
    for smi in smile_list:
        try:
            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # RDKit signals an unparsable SMILES by returning None
                print(smi)
                continue
            stripped = remover.StripMol(mol)
            smile_salt.append(Chem.MolToSmiles(stripped))
        except Exception:
            # Best effort: report the offending SMILES and carry on.
            print(smi)
    return smile_salt

In [None]:
def bioactivity_class(dataframe, standard_value):
    """
    Label every row of the dataframe as "active", "inactive" or "intermediate".

    standard_value is the name of the column holding the numeric bioactivity
    values to classify. Must be a string.
    (presumably ChEMBL standard values in nM - confirm against the caller)

    A value >= 10000 is "inactive", a value <= 1000 is "active" and anything
    else (including NaN) is "intermediate".

    Returns a list of labels, one per row, in row order.
    """
    def label_for(value):
        if value >= 10000:
            return "inactive"
        if value <= 1000:
            return "active"
        return "intermediate"

    return [label_for(value) for value in dataframe[standard_value]]

In [None]:
def pIC50molar(dataframe):
    """
    Convert the standard_value column to pIC50, in place.

    Every value is multiplied by 10**-9 (nM to M) and replaced by the negative
    base-10 logarithm of the result; the column is then renamed from
    standard_value to pIC50. The dataframe is modified in place and nothing
    is returned.
    """
    nanomolar_to_molar = 10**-9  # Converts nM to M

    negative_logs = [
        -np.log10(value * nanomolar_to_molar)
        for value in dataframe.standard_value
    ]

    dataframe['standard_value'] = negative_logs
    dataframe.rename(columns={'standard_value': 'pIC50'}, inplace=True)

## Data Collection - CHEMbl API

In [None]:
# Search the ChEMBL target resource for entries matching 'InhA' and load the
# search result into a dataframe so the hits can be inspected.
target = new_client.target
target_query = target.search('InhA')
targets = pd.DataFrame.from_dict(target_query)

In [None]:
targets

### Target Selection

In [None]:
# Take the target_chembl_id stored under index label 0, i.e. the first search
# hit if `targets` still has its default 0-based index.
# NOTE(review): check the table above to confirm this row is the intended target.
selected_target = targets.target_chembl_id[0] 
selected_target

### Filtering by Activity

In [None]:
# Query the ChEMBL activity resource, restricted to the selected target and to
# records whose standard_type is "IC50".
activity = new_client.activity
res = activity.filter(target_chembl_id=selected_target).filter(standard_type="IC50")

In [None]:
# Load the activity query result into the dataframe used by the rest of the notebook
df_activity = pd.DataFrame.from_dict(res)

In [None]:
df_activity.head(10)

## Data Collection - Other Way

Download the data for the target to be analyzed from the ChEMBL web platform as a .csv file. 
This project uses the enoyl-ACP reductase (InhA).

In [None]:
#df_activity = pd.read_csv('inha.csv', sep = ';')
#df_activity.head()

By default, pandas does not display all the data of a large dataframe, but its display settings can be changed

In [None]:
# None removes the display limit, so every column and every row is shown
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows', None)

Column Index objects will appear a few times in this project. Their default printout can be confusing to look at, so a helper that prints one column name per line is used instead

In [None]:
print_columns(df_activity)

## Data Pre-Processing

### Removal of usual Null and Non Numerical values in interest columns

In [None]:
# number of rows in df_activity before pre-processing
before_pre = df_activity.shape[0]
before_pre

In [None]:
# Drop, in place, every row whose assay_type is 'F'
# (presumably ChEMBL's code for functional assays - confirm against the ChEMBL schema)
df_activity.drop(df_activity[df_activity['assay_type'] == 'F'].index, inplace=True)

In [None]:
# Drop rows whose standard_relation is '>' or '<': their standard_value is a
# bound rather than an exact measurement.
# inplace=True applies the change to df_activity itself instead of returning a copy
df_activity.drop(df_activity[df_activity['standard_relation'] == '>'].index, inplace=True)
df_activity.drop(df_activity[df_activity['standard_relation'] == '<'].index, inplace=True)

In [None]:
# Drop rows missing a value in any of the three columns the analysis relies on
df_activity.dropna(subset = ['standard_value'], inplace = True)
df_activity.dropna(subset = ['standard_units'], inplace = True)
df_activity.dropna(subset = ['canonical_smiles'], inplace = True)
# Drop columns that are empty in every remaining row
df_activity.dropna(axis=1, how='all', inplace = True)
# Cast to float so the values can be compared and log-transformed later;
# raises if a remaining value cannot be converted to a number
df_activity['standard_value'] = df_activity['standard_value'].astype(float)

In [None]:
after_pre = df_activity.shape[0]
after_pre

In [None]:
print(f'The number of non numerical and null values removed is {before_pre - after_pre}')

### Removal of non-relevant Duplicates

Groups of duplicated SMILES whose bioactivity standard deviation is greater than 2 are kept in the dataset; groups with a standard deviation of 2 or lower are removed entirely

In [None]:
dupe_removal(df_activity, 'canonical_smiles', 'standard_value')

In [None]:
print(f'{after_pre - df_activity.shape[0]} duplicates were removed in this process')

## Pre- Processing

### ChEMBL Structure Pipeline - Standardizer

ChEMBL Structure Pipeline Package can be used to standardize the SMILES from the dataset. 

Info about this package can be found at https://github.com/chembl/ChEMBL_Structure_Pipeline
    
This package requires RDKit to be installed.

The standardization process follows these steps:
    
    1. Standardize unknown stereochemistry (Handled by the RDKit Mol file parser) i)Fix wiggly bonds on sp3 carbons - sets atoms and bonds marked as unknown stereo to no stereo ii)Fix wiggly bonds on double bonds – set double bond to crossed bond
    
    2. Clears S Group data from the mol file
    
    3. Kekulize the structure
    
    4. Remove H atoms 
    
    5. Normalization: 
        i) Fix hypervalent nitro groups 
        
        ii) Fix KO to K+ O- and NaO to Na+ O- (Also add Li+ to this) 
        
        iii)Correct amides with N=COH 
        
        iv) Standardise sulphoxides to charge separated form 
        
        v) Standardize diazonium N (atom :2 here: [:1]-[N;X2:2]#[N;X1:3]>>[:1]) to N+ 
        
        vi) Ensure quaternary N is charged 
        
        vii)Ensure trivalent O ([*:1]=[O;X2;v3;+0:2]-[#6:3]) is charged 
        
        viii)Ensure trivalent S ([O:1]=[S;D2;+0:2]-[#6:3]) is charged 
        
        ix) Ensure halogen with no neighbors ([F,Cl,Br,I;X0;+0:1]) is charged
    
    6. The molecule is neutralized, if possible. See the page on neutralization rules for more details.
    
    7. Remove stereo from tartrate to simplify salt matching
    
    8. Normalise (straighten) triple bonds and allenes

In [None]:
!git clone https://github.com/chembl/ChEMBL_Structure_Pipeline.git
!pip install ./ChEMBL_Structure_Pipeline

To use the Standardizer function, the Molecules must be in Mol_block format

In [None]:
blocks_std = mol_standardizer(df_activity)

In [None]:
std_smiles = block_to_smiles(blocks_std)

### Removal of Tautomers and Salts

Impurities can be removed and SMILES can be normalized manually using the RDKit library. The `remove_tautomers` and `remove_salts` functions defined earlier are examples: the first canonicalizes tautomers and the second strips salts. Other RDKit functions are described in the RDKit documentation, and further clean-up functions can be written in the same way as these two.

In [None]:
#no_tautomers = remove_tautomers(std_smiles)

In [None]:
#no_salt = remove_salts(no_tautomers)

### Integrating the new canonical_smiles column in the base dataset

In [None]:
# Overwrite the SMILES column with the standardized SMILES, matched by position.
# NOTE(review): std_smiles must have exactly one entry per row of df_activity.
# The conversion helpers skip molecules that fail, and a shorter list makes this
# assignment raise a length-mismatch ValueError.
df_activity['canonical_smiles'] = std_smiles

## Set bioactivity class as inactive, active and intermediate

In [None]:
bioactivity = bioactivity_class(df_activity, 'standard_value')

In [None]:
df_activity['bioactivity_class'] = bioactivity
df_activity.bioactivity_class.value_counts()

## Normalize IC50 values to pIC50

In [None]:
df_activity.head()

In [None]:
pIC50molar(df_activity)

In [None]:
df_activity.head()

In [None]:
df_activity.pIC50.describe()

## pIC50 Distribution graph

In [None]:
# One marker trace per bioactivity class. Both axes use pIC50, so every point
# lies on the diagonal and the classes show up as coloured segments of it.
# The comprehension keeps its loop variables local, so the notebook globals
# `activity` (ChEMBL client) and `bioactivity` (class list) are not overwritten.
traces = [
    go.Scatter(x = class_rows.pIC50, y = class_rows.pIC50, name = class_name, mode='markers')
    for class_name, class_rows in df_activity.groupby('bioactivity_class')
]

fig = go.Figure(data=traces)

fig.update_layout(
    title={
        'text': "pIC50 by Bioactivity",
        'y':0.9,
        'x':0.5,
        'xanchor': 'center',
        'yanchor': 'top'},
    xaxis_title="pIC50",
    yaxis_title="pIC50",
    legend_title="Bioactivity Class",
    font=dict(
        family="Courier New, monospace",
        size=18,
        color="RebeccaPurple"
    )
)


fig.show()

In [None]:
df_activity.to_csv('df_processed_complete.csv')

In [None]:
# Keep only the four columns of interest. DataFrame.filter silently ignores any
# listed label that is not present in df_activity.
df_interest = df_activity.filter(['molecule_chembl_id','canonical_smiles','bioactivity_class', 'pIC50'], axis=1)

In [None]:
df_interest.to_csv('df_interest_columns.csv')

### Removal of molecules classified as intermediate 

In [None]:
# Keep only rows whose bioactivity_class is not 'intermediate'
df_interest = df_interest[df_interest.bioactivity_class != 'intermediate']

In [None]:
df_interest.to_csv('df_interest_columns_clean.csv')