# ChEMBL - Fetching Data

First, we import the ChEMBL web resource client to query the database, and pandas to convert the results into a DataFrame.

In [1]:
# Importing the libraries
from pathlib import Path

import pandas as pd
from chembl_webresource_client.new_client import new_client

In [2]:
# Query ChEMBL for every molecule that reached max_phase == 4.
# The ordering key is the `mw_freebase` entry of `molecule_properties`
# (presumably the free-base molecular weight -- confirm against the ChEMBL schema).
molecule = new_client.molecule
approved_drugs = (
    molecule
    .filter(max_phase=4)
    .order_by('molecule_properties__mw_freebase')
)

# Materialise the result set into a list of records and build the DataFrame
# with the plain constructor. `DataFrame.from_dict` is meant for dict input;
# the result set is an iterable of per-molecule records, so the constructor
# is the right entry point and `list(...)` makes the download step explicit.
# NOTE(review): this is where the whole result set is fetched, so the cell
# may be slow -- consider caching the frame on disk for re-runs.
approved_drugs_df = pd.DataFrame(list(approved_drugs))

# Peek at the first rows
approved_drugs_df.head()

Unnamed: 0,atc_classifications,availability_type,biotherapeutic,black_box_warning,chebi_par_id,chirality,cross_references,dosed_ingredient,first_approval,first_in_class,...,topical,usan_stem,usan_stem_definition,usan_substem,usan_year,withdrawn_class,withdrawn_country,withdrawn_flag,withdrawn_reason,withdrawn_year
0,[V03AN03],1,,0,30217.0,2,[],True,2015.0,0,...,True,-ium,quaternary ammonium derivatives,-ium,,,,False,,
1,[],1,,0,16134.0,2,"[{'xref_id': 'ammonia%20n-13', 'xref_name': 'a...",False,2007.0,0,...,False,,,,1990.0,,,False,,
2,[],1,,0,,2,"[{'xref_id': 'ammonia%20n-13', 'xref_name': 'a...",True,2007.0,0,...,False,,,,1990.0,,,False,,
3,[],2,,0,15377.0,2,"[{'xref_id': 'purified%20water', 'xref_name': ...",True,2011.0,0,...,True,deu-,deuterated compounds,deu-,1963.0,,,False,,
4,[V03AN04],1,,0,17997.0,2,[],True,2015.0,0,...,True,,,,,,,False,,


The next thing we want to do is to retain only the SMILES strings from the `molecule_structures` column, whose entries are dictionaries (probably an artifact of the JSON response format). We need to do this at this step because it was not possible during pre-cleaning: once the data has been written to a `.csv` file, the `molecule_structures` entries seem to lose their properties as dictionary objects, and I don't know how to solve that yet.

In [9]:
# Placeholder stored in the SMILES column when a molecule has no usable structure.
SMILES_NOT_FOUND = 'not found'


def extract_canonical_smiles(structures):
    """Return the canonical SMILES held in one `molecule_structures` cell.

    Parameters
    ----------
    structures : dict or None
        One cell of the `molecule_structures` column. Expected to be a dict
        with a 'canonical_smiles' key, or None when the molecule has no
        structure record.

    Returns
    -------
    The value stored under 'canonical_smiles', or `SMILES_NOT_FOUND` when the
    cell is not a dict (None, NaN, ...) or the key is absent.
    """
    # Guard on the type rather than `!= None`: a NaN cell would pass a
    # None-comparison and then fail on subscripting.
    if not isinstance(structures, dict):
        return SMILES_NOT_FOUND
    return structures.get('canonical_smiles', SMILES_NOT_FOUND)


approved_drugs_df['SMILES'] = approved_drugs_df['molecule_structures'].apply(extract_canonical_smiles)

## Saving the data

In [14]:
# Take a look at the shape and the variables gathered
print(approved_drugs_df.shape, approved_drugs_df.columns)

# Saving the output.
# Create the target directory first so the cell also works on a fresh
# checkout where `data/` does not exist yet (to_csv does not create it).
output_path = Path('data/RAW_approved_drugs.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# NOTE: CSV stores nested values (lists / dicts such as `molecule_structures`)
# as plain text, so they come back as strings when the file is read again.
approved_drugs_df.to_csv(output_path, index=False)

(4121, 39) Index(['atc_classifications', 'availability_type', 'biotherapeutic',
       'dosed_ingredient', 'first_approval', 'first_in_class', 'helm_notation',
       'indication_class', 'inorganic_flag', 'max_phase', 'molecule_chembl_id',
       'molecule_hierarchy', 'molecule_properties', 'molecule_structures',
       'molecule_synonyms', 'molecule_type', 'natural_product', 'oral',
       'parenteral', 'polymer_flag', 'pref_name', 'prodrug', 'structure_type',
       'therapeutic_flag', 'topical', 'usan_stem', 'usan_stem_definition',
       'usan_substem', 'usan_year', 'withdrawn_class', 'withdrawn_country',
       'withdrawn_flag', 'withdrawn_reason', 'withdrawn_year', 'SMILES'],
      dtype='object')
