# Libraries

In [10]:
import pandas as pd
from IPython.display import HTML
from rdkit import Chem
from rdkit.Chem import PandasTools, Descriptors, rdmolops
from rdkit.Chem.MolStandardize.rdMolStandardize import StandardizeSmiles
import session_info

import matplotlib.pyplot as plt
import seaborn as sns

# Reading data

After stripping salts, neutralizing the structures, generating canonical SMILES, and dropping duplicates using both the `SMILES` and the `InChI` (the latter generated by Open Babel), we can explore basic information about the dataset. The first step is to read the cleaned data produced by `05_removing_repeated_molecules.ipynb`.

In [2]:
# Load the cleaned, de-duplicated dataset written by the previous notebook.
approved_drugs = pd.read_csv(
    filepath_or_buffer='../../data/pre_cleaned_datasets/pre_cleaned_unique.csv'
)

In [4]:
# Quick sanity check of the loaded frame: print its (rows, columns) shape,
# then leave the per-column dtypes as the last expression so the notebook
# displays them as the cell output.
print(approved_drugs.shape)
approved_drugs.dtypes

(1895, 14)


name                    object
chembl_id               object
clean_smiles            object
first_approval_year     object
indication_class        object
molecule_type           object
withdrawn_flag            bool
therapeutic_flag          bool
polymer_flag              bool
inorganic_flag            bool
natural_product_flag      bool
oral                      bool
parenteral                bool
topical                   bool
dtype: object

In [27]:
def count_bool_values(df):
    """
    Tabulate the value counts of every column of ``df`` side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be counted (typically boolean flags).

    Returns
    -------
    pandas.DataFrame
        One column per column of ``df`` and one row per distinct value
        found in any of them. Each cell holds how many times that value
        occurs in that column; a value that never occurs in a column is
        reported as 0.
    """
    counts = {column: df[column].value_counts() for column in df.columns}
    # Aligning the per-column counts on a shared index leaves NaN wherever a
    # column lacks a value (e.g. a flag that is always False), which would also
    # turn every count into a float. Report those as 0 and keep integer counts.
    return pd.DataFrame(counts).fillna(0).astype("int64")

In [57]:
# Isolate the boolean flag columns, then tabulate their value counts per column.
bool_df = approved_drugs.select_dtypes(include='bool')
bool_count_df = count_bool_values(df=bool_df)

In [None]:
sns.set_context('notebook')

# Boolean flag columns, listed in the order they fill the 2x4 grid (row by row).
flag_columns = [
    'withdrawn_flag', 'therapeutic_flag', 'polymer_flag', 'inorganic_flag',
    'natural_product_flag', 'oral', 'parenteral', 'topical',
]

fig, axes = plt.subplots(2, 4, figsize=(15, 5), sharey=True)

# One count plot per flag. axes.flat walks the grid in row-major order, so each
# column is drawn on the same panel as when the calls were written out by hand.
for flag, ax in zip(flag_columns, axes.flat):
    sns.countplot(x=flag, data=approved_drugs, ax=ax)

plt.tight_layout()