# ChEMBL - Fetching Data

First, we import the ChEMBL web resource client, RDKit's `PandasTools`, pandas (to convert the results into a dataframe) and `re` (for regex flags).

In [1]:
from chembl_webresource_client.new_client import new_client
from rdkit.Chem import PandasTools
import pandas as pd
import re

We can use the `.only()` method to filter the information we want to retrieve:
* indication_class
* first_approval
* inorganic_flag
* molecule_structures
* molecule_type
* natural_product
* oral
* parenteral
* polymer_flag
* pref_name
* prodrug
* therapeutic_flag
* topical
* withdrawn_flag

In [2]:
# Handle on the ChEMBL "molecule" endpoint
molecule = new_client.molecule

# Approved drugs only (max_phase=4), ordered by free-base molecular weight,
# restricted to the fields listed in the markdown cell above.
approved_drugs = (
    molecule
    .filter(max_phase=4)
    .order_by('molecule_properties__mw_freebase')
    .only(
        'indication_class',
        'first_approval',
        'inorganic_flag',
        'molecule_structures',
        'molecule_type',
        'natural_product',
        'oral',
        'parenteral',
        'polymer_flag',
        'pref_name',
        'prodrug',
        'therapeutic_flag',
        'topical',
        'withdrawn_flag',
    )
)

In [6]:
# Build a DataFrame from the query results and preview the first five rows
approved_drugs_df = pd.DataFrame(approved_drugs)
approved_drugs_df.head()

KeyboardInterrupt: 

In [7]:
# Small single-molecule query: look up the preferred name 'aspirin'
# using the `iexact` lookup on `pref_name`.
molecule = new_client.molecule
mols = molecule.filter(pref_name__iexact="aspirin")
mols

HttpApplicationError: Error for url https://www.ebi.ac.uk/chembl/api/data/molecule.json, server response: <!doctype html>
<html lang="en" class="vf-no-js">
  <head>
    <script>
// Detect if JS is on and swap vf-no-js for vf-js on the html element
(function(H){H.className=H.className.replace(/\bvf-no-js\b/,'vf-js')})(document.documentElement);
</script>

    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <!-- <link rel="stylesheet" media="all" href="/css/styles.css?" /> -->
    <title>Error: 500 | EMBL’s European Bionformatics Institute</title>



    <link rel="icon" type="image/x-icon"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/favicon.ico" />
<link rel="icon" type="image/png"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/favicon-32x32.png" />
<link rel="icon" type="image/png" sizes="192×192"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/android-chrome-192x192.png" />
<!-- Android (192px) -->
<link rel="apple-touch-icon-precomposed" sizes="114x114"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/apple-icon-114x114.png" />
<!-- For iPhone 4 Retina display (114px) -->
<link rel="apple-touch-icon-precomposed" sizes="72x72"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/apple-icon-72x72.png" />
<!-- For iPad (72px) -->
<link rel="apple-touch-icon-precomposed" sizes="144x144"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/apple-icon-144x144.png" />
<!-- For iPad retinat (144px) -->
<link rel="apple-touch-icon-precomposed"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/apple-icon-57x57.png" />
<!-- For iPhone (57px) -->
<link rel="mask-icon"
  href="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/safari-pinned-tab.svg"
  color="#ffffff" /> <!-- Safari icon for pinned tab -->
<meta name="msapplication-TileColor" content="#2b5797" /> <!-- MS Icons -->
<meta name="msapplication-TileImage"
  content="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/images/logos/EMBL-EBI/favicons/mstile-144x144.png" />






    <!-- Search indexing optimisations -->
    <meta class="swiftype" name="what" data-type="string" content="none" />
    <meta class="swiftype" name="where" data-type="string" content="EMBL-EBI" />


    <!-- Descriptive meta -->
    <meta name="title" content="Error: 500">
    <meta name="author" content="European Bioinformatics Institute">
    <meta name="robots" content="index, follow">
    <meta name="keywords" content="">
    <meta name="description" content="">

    <!-- Open Graph / Facebook -->
    <meta property="og:type" content="website">
    <meta property="og:url" content="https://www.ebi.ac.uk/info/error-pages/500-standalone/">
    <meta property="og:title" content="Error: 500">
    <meta property="og:description" content="">


    <!-- Twitter -->
    <meta property="twitter:card" content="summary_large_image">
    <meta property="og:url" content="https://www.ebi.ac.uk/info/error-pages/500-standalone/">
    <meta property="twitter:title" content="Error: 500">
    <meta property="twitter:description" content="">


    <!-- Content descriptors -->
    <meta name="embl:who" content="EMBL-EBI Web Dev">
    <meta name="embl:where" content="EMBL-EBI">
    <meta name="embl:what" content="none">
    <meta name="embl:active" content="where">

    <!-- Content role -->
    <meta name="embl:utility" content="10">
    <meta name="embl:reach" content="0">

    <!-- Page infromation -->
    <meta name="embl:maintainer" content="EMBL-EBI Web Dev">
    <meta name="embl:last-review" content="2021.04.01">
    <meta name="embl:review-cycle" content="365">
    <meta name="embl:expiry" content="never">

    <!-- analytics -->
    <meta name="vf:page-type" content="404;dimension1">

    <!-- CSS only -->
<link rel="stylesheet" href="https://assets.emblstatic.net/vf/v2.5.7/css/styles.css">
<!-- JS -->
<script src="https://assets.emblstatic.net/vf/v2.5.7/scripts/scripts.js"></script>
<head>
  <body class="vf-body vf-stack vf-stack--400">
    <style>head, title, link, meta, style, script {--vf-stack-margin--custom: 0; }</style>

    <!-- See the EBI Header Footer docs: https://stable.visual-framework.dev/components/ebi-header-footer -->

    <link rel="stylesheet" href="https://assets.emblstatic.net/vf/v2.4.5/assets/ebi-header-footer/ebi-header-footer.css" type="text/css" media="all">
    <header id="masthead-black-bar" class="clearfix masthead-black-bar | ebi-header-footer vf-content vf-u-fullbleed"></header>




<style>
  .embl-grid {
    margin-bottom: 48px;
  }
</style>

<section class="vf-intro" id="500">

  <div><!-- empty --></div>

  <div class="vf-stack">

  <h1 class="vf-intro__heading ">Error: 500</h1>
<p class="vf-lede">There was a technical error.</p>


<p class="vf-intro__text">Something has gone wrong with our web server when attempting to make this page.</p><p class="vf-intro__text">Unfortunately, the service you are trying to access is currently unavailable. <br>Please try again later.</p>
  </div>
</section>


<section class="embl-grid embl-grid--has-centered-content">
  <div></div>
 <section>
      <form id="ebi_search" action="/ebisearch/search.ebi" class="vf-form vf-form--search vf-form--search--mini | vf-sidebar vf-sidebar--end">
        <div class="vf-sidebar__inner" style="flex-wrap: nowrap;">
          <div class="vf-form__item">
            <label class="vf-form__label vf-u-sr-only | vf-search__label" for="searchitem">Search</label>
            <input name="query" type="search" placeholder="Find a gene, protein or chemical" id="searchitem" class="vf-form__input" required="" spellcheck="false" data-ms-editor="true">
            <input name="requestFrom" id="requestFrom" type="hidden" class="vf-form__input" value="ebi_index">
          </div>
          <div class="vf-form__item">
            <select name="db" id="db" tabindex="1" class="vf-form__select" style="max-width: 150px">
              <option value="allebi">All</option>
              <optgroup label="Science search">
                <option value="genomes">Genomes &amp; metagenomes</option>
                <option value="nucleotideSequences">Nucleotide sequences</option>
                <option value="proteinSequences">Protein sequences</option>
                <option value="smallMolecules">Small molecules</option>
                <option value="geneExpression">Gene expression</option>
                <option value="geneDiseaseAssociations">Gene-Disease Associations</option>
                <option value="diseases">Diseases</option>
                <option value="molecularInteractions">Molecular interactions</option>
                <option value="reactionsPathways">Reactions &amp; pathways</option>
                <option value="proteinFamilies">Protein families</option>
                <option value="literature">Literature</option>
                <option value="ontologies">Samples &amp; ontologies</option>
              </optgroup>
              <optgroup label="Search web content">
                <option value="ebiweb_people">EMBL-EBI People</option>
                <option value="ebiweb">EMBL-EBI web</option>
                <!-- <option value="ebiweb">EMBL web</option> -->
              </optgroup>
            </select>
          </div>


          <button type="submit" class="vf-search__button | vf-button vf-button--primary">
            <span class="vf-button__text">Search</span>
          </button>
        </div>
      </form>
      <p class="vf-text-body--5 vf-u-margin__bottom--0">
        Example searches: <a class="vf-link" href="/ebisearch/search.ebi?db=allebi&amp;requestFrom=ebi_index&amp;query=blast">blast</a>
        <a class="vf-link" href="/ebisearch/search.ebi?db=allebi&amp;query=keratin&amp;requestFrom=ebi_index">keratin</a>
        <a class="vf-link" href="/ebisearch/search.ebi?db=allebi&amp;query=bfl1&amp;requestFrom=ebi_index">bfl1</a>
        | <a class="vf-link" href="https://www.ebi.ac.uk/ebisearch/overview.ebi/about">About EBI Search</a>
      </p>
    </section>
</section>

<section class="embl-grid">
  <div></div>
  <div class="vf-content">
    <h3>Need assistance?</h3>
    <a class="vf-button vf-button--primary" href="https://www.ebi.ac.uk/support/error">Contact our support team</a>
  </div>
</section>

    <!-- embl global footer -->


<!-- embl-ebi global footer -->
<link rel="import" href="https://www.embl.org/api/v1/pattern.html?filter-content-type=article&filter-id=106902&pattern=node-body&source=contenthub" data-target="self" data-embl-js-content-hub-loader>

    <script src="https://assets.emblstatic.net/vf/v2.4.9/scripts/scripts.js"></script>
<!--
  When using legacy EBI 1.x JS, we disable the old cookie banner.
  https://stable.visual-framework.dev/components/ebi-header-footer/
  -->
<div class="vf-u-display-none" data-protection-message-disable="true"></div>

<!-- IE11 polyfill JS -->
<script nomodule crossorigin="anonymous" src="https://polyfill.io/v3/polyfill.min.js?flags=gated&features=default"></script>
<!-- <script src="/scripts/scripts.js?"></script> -->
<script defer="defer" src="https://ebi.emblstatic.net/web_guidelines/EBI-Framework/v1.4/js/script.js"></script>
<link rel="stylesheet" href="//ebi.emblstatic.net/web_guidelines/EBI-Icon-fonts/v1.3/fonts.css" type="text/css" media="all" />

<!-- Google Analytics -->
<script>
window.ga=window.ga||function(){(ga.q=ga.q||[]).push(arguments)};ga.l=+new Date;
ga('create', 'UA-629242-1', 'auto');
</script>
<script async src='https://www.google-analytics.com/analytics.js'></script>
<!-- End Google Analytics -->

<script type="text/javascript">
  document.addEventListener("DOMContentLoaded", function(event) {

        //- Code to execute when only the HTML document is loaded.
        //- This doesn't wait for stylesheets,
        // images, and subframes to finish loading.
  });
</script>

  </body>
</html>


## Pre-cleaning

In this step, we drop rows with missing or irrelevant data and reorder the columns.

First, we extract the *canonical_smiles* from the `molecule_structures` column. A lambda function with an `if/else` condition assigns the placeholder `'not found'` to molecules that have no SMILES.

In [None]:
# Generating a SMILES column from the 'molecule_structures' variable.
# Each entry is expected to be a dict holding 'canonical_smiles'; anything else
# (None, NaN) or an empty/missing SMILES gets the 'not found' placeholder, which
# the filtering step removes afterwards.
approved_drugs_df['SMILES'] = approved_drugs_df['molecule_structures'].apply(
    lambda structures: (
        (structures.get('canonical_smiles') or 'not found')
        if isinstance(structures, dict)
        else 'not found'
    )
)

Next we do the following pre-cleaning steps:
* Dropping the `molecule_structures` variable as the SMILES are already stored in the `SMILES` variable created in the previous steps;
* Filter only the *Small molecules* on the `molecule_type` variable;
* Remove strictly *inorganic* and *polymers*;
* Remove SMILES not found;
* Remove radioactive, gases and diluent;
* Only `therapeutic_flag` == True;
* Remove structures without approval date;

In [None]:
# Filtering the structures.
# The whole pipeline is one chain that rebinds `approved_drugs_df` (no in-place
# mutation), and the column drop tolerates an already-dropped column, so this
# cell can be re-run without raising a KeyError.
approved_drugs_df = (
    approved_drugs_df
    # Dropping the 'molecule_structures' variable because we only want the
    # canonical SMILES, already stored in the 'SMILES' column
    .drop(columns='molecule_structures', errors='ignore')
    # We want only 'Small molecule', don't want inorganic or polymer stuff
    .loc[lambda df: df['molecule_type'] == 'Small molecule']
    .loc[lambda df: df['inorganic_flag'] == 0]
    .loc[lambda df: df['polymer_flag'] == 0]
    # We don't want SMILES not found
    .loc[lambda df: df['SMILES'] != 'not found']
    # We don't want indication_class containing the word "gases" or "diluent"
    # or "radioactive" (missing indication classes are kept: na=False)
    .loc[lambda df: ~df['indication_class'].str.contains(
        "radioactive|gases|diluent", flags=re.IGNORECASE, regex=True, na=False)]
    # Only rows with therapeutic_flag == True (element-wise comparison, so
    # `== True` is intentional here)
    .loc[lambda df: df['therapeutic_flag'] == True]
    # Remove the undated first approvals
    .loc[lambda df: ~df['first_approval'].isna()]
    .reset_index(drop=True)
)

Now we can make some tweaks to our dataframe:
* **Converting** the approval year to an integer (purely aesthetic)
* **Reordering** the dataframe columns into a more logical sequence

In [None]:
# Converting the approval year to an integer (purely cosmetic)
approved_drugs_df['first_approval'] = approved_drugs_df['first_approval'].apply(int)

# Reordering the dataframe.
# Each column is listed exactly once: a repeated label (previously
# 'polymer_flag') creates duplicate columns, and then
# approved_drugs_df['polymer_flag'] returns a DataFrame instead of a Series.
approved_drugs_df = approved_drugs_df[[
    'pref_name',
    'SMILES',
    'first_approval',
    'indication_class',
    'withdrawn_flag',
    'inorganic_flag',
    'polymer_flag',
    'therapeutic_flag',
    'natural_product',
    'oral',
    'parenteral',
    'topical',
]]

print("\n", approved_drugs_df.columns, "\n")
print(f"The dataset has the shape {approved_drugs_df.shape}")
approved_drugs_df

Now we're left with the following problems:
* Extremely small structures such as Nitric Oxide, Hydrogen Peroxide that cannot be used to derive any rules. (Maybe this can be solved by applying some MW filter)
* Repeated SMILES (salt and conjugated) - Maybe this can be solved by keeping the largest fragment and dropping duplicated SMILES.
* Extremely large structures such as OMEGA-3-Carboxylic Acids (Apply some MW filter? Maybe)


To solve the first problem (extremely small structures), we first build an RDKit molecule column (`mol_format`) from the SMILES.

In [None]:
# Add a 'mol_format' column holding the molecules built from the 'SMILES' column
PandasTools.AddMoleculeColumnToFrame(
    approved_drugs_df,
    smilesCol='SMILES',
    molCol='mol_format',
)