# ChEMBL - Fetching Data

First, we import the ChEMBL webresource library and then pandas to make the conversion into a dataframe

In [1]:
from chembl_webresource_client.new_client import new_client
import pandas as pd
import re

We can use the `.only()` method to filter the information we want to retrieve:
* indication_class
* first_approval
* inorganic_flag
* molecule_structures
* molecule_type
* natural_product
* oral
* parenteral
* polymer_flag
* pref_name
* prodrug
* therapeutic_flag
* topical
* withdrawn_flag

In [2]:
molecule = new_client.molecule
approved_drugs = molecule.filter(max_phase=4).order_by('molecule_properties__mw_freebase').only(
    'indication_class',
    'first_approval',
    'inorganic_flag',
    'molecule_structures',
    'molecule_type',
    'natural_product',
    'oral',
    'parenteral',
    'polymer_flag',
    'pref_name',
    'prodrug',
    'therapeutic_flag',
    'topical',
    'withdrawn_flag'
)
approved_drugs_df = pd.DataFrame.from_dict(approved_drugs)
approved_drugs_df.head(5)

Unnamed: 0,first_approval,indication_class,inorganic_flag,molecule_structures,molecule_type,natural_product,oral,parenteral,polymer_flag,pref_name,prodrug,therapeutic_flag,topical,withdrawn_flag
0,2015.0,"Gases, Diluent for",1,"{'canonical_smiles': '[He]', 'molfile': '  ...",Small molecule,0,False,False,0,HELIUM,0,False,True,False
1,2007.0,Pharmaceutic Aid (solvent and source of ammoni...,0,"{'canonical_smiles': 'N', 'molfile': '  RD...",Small molecule,0,False,True,0,"AMMONIA SOLUTION, STRONG",0,False,False,False
2,2007.0,Radioactive Agent; Diagnostic Aid (cardiac ima...,0,"{'canonical_smiles': '[13NH3]', 'molfile': '  ...",Small molecule,0,False,True,0,AMMONIA N 13,0,False,False,False
3,2011.0,"Diagnostic Aid (radioactive, vascular disorder...",0,"{'canonical_smiles': 'O', 'molfile': '  RD...",Small molecule,0,False,False,0,WATER,0,False,True,False
4,2015.0,Pharmaceutic Aid (air displacement),0,"{'canonical_smiles': 'N#N', 'molfile': '  ...",Small molecule,0,False,False,0,NITROGEN,0,False,True,False


## Pre-cleaning

In this step, we drop some missing and irrelevant stuff and reorder stuff.

Extracting the *canonical_smiles* from the `molecule_structures` variable. We use a lambda function to use the `if/else` condition and apply not found for the absent SMILES.

In [3]:
# Generating a SMILES column from the 'molecule_structures' variable
approved_drugs_df['SMILES'] = approved_drugs_df['molecule_structures'].apply(lambda x: x['canonical_smiles'] if x != None else 'not found')

Next we do the following pre-cleaning steps:
* Dropping the `molecule_structures` variable as the SMILES are already stored in the `SMILES` variable created in the previous steps;
* Filter only the *Small molecules* on the `molecule_type` variable;
* Remove strictly *inorganic* and *polymers*;
* Remove SMILES not found;
* Remove radioactive, gases and diluent;
* Only `therapeutic_flag` == True;
* Remove structures without approval date;

In [4]:
# Filtering the structures:

# Dopping the 'molecule_structure' variable because we only want canonical_smiles
approved_drugs_df.drop('molecule_structures',axis=1, inplace=True)

# We want only 'Small molecule', don't want inorganic or polymer stuff
approved_drugs_df = approved_drugs_df.loc[approved_drugs_df['molecule_type'] == 'Small molecule']
approved_drugs_df = approved_drugs_df.loc[approved_drugs_df['inorganic_flag'] == 0]
approved_drugs_df = approved_drugs_df.loc[approved_drugs_df['polymer_flag'] == 0]

# We don't want SMILES not found
approved_drugs_df = approved_drugs_df.loc[approved_drugs_df['SMILES'] != 'not found']

# We don't want indication_class containing the word "gases" or "diluent" or "radioactive"
approved_drugs_df = approved_drugs_df[~approved_drugs_df['indication_class'].str.contains("radioactive|gases|diluent", flags=re.IGNORECASE, regex=True, na=False)]

# with terapeutic_flag == True
approved_drugs_df = approved_drugs_df[approved_drugs_df['therapeutic_flag'] == True]

# Remove the undated first approvals
approved_drugs_df = approved_drugs_df[~approved_drugs_df['first_approval'].isna()].reset_index(drop=True)

Now we can do some tweaking into our dataframe:
* **Rounding** the number of approval date (just for aesthetic)
* **Reordering** the dataframe into a more logic variable sequence

In [5]:
#Rounding the datetime
approved_drugs_df['first_approval'] = approved_drugs_df['first_approval'].apply(int)

# Reordering the dataframe
approved_drugs_df = approved_drugs_df[['pref_name','SMILES','first_approval','polymer_flag','indication_class','withdrawn_flag','inorganic_flag','polymer_flag','therapeutic_flag','natural_product','oral', 'parenteral', 'topical']]

print("\n", approved_drugs_df.columns, "\n")
print(f"The dataset has the shape {approved_drugs_df.shape}")
approved_drugs_df


 Index(['pref_name', 'SMILES', 'first_approval', 'polymer_flag',
       'indication_class', 'withdrawn_flag', 'inorganic_flag', 'polymer_flag',
       'therapeutic_flag', 'natural_product', 'oral', 'parenteral', 'topical'],
      dtype='object') 

The dataset has the shape (2396, 13)


Unnamed: 0,pref_name,SMILES,first_approval,polymer_flag,indication_class,withdrawn_flag,inorganic_flag,polymer_flag.1,therapeutic_flag,natural_product,oral,parenteral,topical
0,NITRIC OXIDE,[N]=O,1999,0,,False,0,0,True,0,False,False,True
1,HYDROGEN PEROXIDE,OO,2017,0,"Anti-Infective, Topical",False,0,0,True,0,False,False,True
2,GUANIDINE,N=C(N)N,1939,0,,False,0,0,True,0,True,False,False
3,GUANIDINE HYDROCHLORIDE,Cl.N=C(N)N,1939,0,,False,0,0,True,0,True,False,False
4,LITHIUM CARBONATE,O=C([O-])[O-].[Li+].[Li+],1970,0,Antimanic,False,0,0,True,0,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2391,TELAVANCIN,CCCCCCCCCCNCCN[C@@]1(C)C[C@H](O[C@H]2[C@H](Oc3...,2009,0,,False,0,0,True,1,False,True,False
2392,TELAVANCIN HYDROCHLORIDE,CCCCCCCCCCNCCN[C@@]1(C)C[C@H](O[C@H]2[C@H](Oc3...,2009,0,,False,0,0,True,1,False,True,False
2393,ERGOLOID,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,1953,0,Cognition Adjuvant,False,0,0,True,1,True,False,True
2394,ERGOLOID MESYLATES,CC(C)C[C@H]1C(=O)N2CCC[C@H]2[C@]2(O)O[C@](NC(=...,1953,0,Cognition Adjuvant,False,0,0,True,1,True,False,True


Now we're left with the following problems:
* Extremely small structures such as Nitric Oxide, Hydrogen Peroxide that cannot be used to derive any rules. (Maybe this can be solved by applying some MW filter)
* Repeated SMILES (salt and conjugated) - Maybe this can be solved by keeping the largest fragment and dropping x*plicated SMILES.
* Extremely large structures such as OMEGA-3-Carboxylic Acids (Apply some MW filter? Maybe)


## Save for further analysis