# Dataset report: Zhu rat

## _Oral acute toxicity_


> Zhu, Hao, et al. “Quantitative structure−activity relationship modeling of rat acute toxicity by oral exposure.” Chemical research in toxicology 22.12 (2009): 1913-1921.

In [1]:
import altair as alt
alt.data_transformers.disable_max_rows()
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

import cytoxnet.dataprep.io

In [2]:
# Load the packaged 'zhu_rat_LD50' dataset through cytoxnet's loader, then
# display pandas' summary statistics for it as the cell output.
dataset = cytoxnet.dataprep.io.load_data('zhu_rat_LD50')
dataset.describe()

Unnamed: 0,rat_LD50
count,7385.0
mean,-2.5444
std,0.958667
min,-10.207
25%,-3.035
50%,-2.368
75%,-1.856
max,0.343


In [3]:
# Count distinct SMILES strings rather than rows: len(dataset) would report
# the row count, which overstates "unique molecules" whenever the same SMILES
# string appears more than once. This is uniqueness of the raw strings only;
# different spellings of the same molecule are still counted separately.
print('Number of unique molecules: ', dataset['smiles'].nunique())

Number of unique molecules:  7385


In [None]:
# Create a compound codex under ./data_zhu/ and register the loaded dataset
# in it, keyed on the 'smiles' column, without requesting new featurizers.
# NOTE(review): this cell has no execution count, yet the following cell reads
# ./data_zhu/compounds.csv — presumably written here; confirm it must be run
# first on a fresh checkout.
# NOTE(review): `dataframes` receives a single frame while `names` is a list,
# and the two paths differ by a trailing slash — confirm the API accepts both.
cytoxnet.dataprep.io.create_compound_codex('./data_zhu/')
data = cytoxnet.dataprep.io.add_datasets(
                 dataframes=dataset,
                 names=['zhu_rat_LD50'],
                 id_col='smiles',
                 db_path='./data_zhu',
                 new_featurizers=None)

In [4]:
# Read the compounds table from the ./data_zhu directory and report how many
# rows it holds.
compounds_csv = './data_zhu/compounds.csv'
compounds = pd.read_csv(compounds_csv)
n_compounds = len(compounds)
print('Number of unique molecules after SMILES canonicalization: ', n_compounds)

Number of unique molecules after SMILES canonicalization:  7342


### Targets present

#### <span style='color:blue'>__The range of targets seems to be quite wide (log units of mg/L)__</span>

In [5]:
# Keep only the extreme values from the summary statistics to show the span
# of the target column.
summary_stats = dataset.describe()
summary_stats.loc[['min', 'max']]

Unnamed: 0,rat_LD50
min,-10.207
max,0.343


In [6]:
# Step-style area histogram of the rat_LD50 target: values are binned on the
# x axis (at most 100 bins) and the y axis shows the unstacked row count.
ld50_bins = alt.X('rat_LD50:Q', bin=alt.Bin(maxbins=100))
row_count = alt.Y('count()', stack=None)
histogram_base = alt.Chart(dataset).mark_area(opacity=0.7, interpolate='step')
histogram_base.encode(ld50_bins, row_count)

#### <span style='color:blue'>__The dataset is heavily imbalanced towards the toxic side__</span>

### Molecule space

In [7]:
!pip install --quiet umap-learn hdbscan

distutils: /opt/anaconda3/envs/cytoxnet/include/python3.7m/UNKNOWN
sysconfig: /opt/anaconda3/envs/cytoxnet/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m
distutils: /opt/anaconda3/envs/cytoxnet/include/python3.7m/UNKNOWN
sysconfig: /opt/anaconda3/envs/cytoxnet/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m


In [8]:
import rdkit.Chem.AllChem
import umap.umap_ as umap

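Compute a Morgan fingerprint bit vector (radius 2, 2048 bits) for each SMILES string; these are the descriptors used for the UMAP mapping.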

In [9]:
def _morgan_bitvect(smiles):
    """Return the Morgan fingerprint bit vector for one SMILES string.

    The SMILES string is parsed with RDKit and fingerprinted with radius 2
    and 2048 bits.
    """
    molecule = rdkit.Chem.MolFromSmiles(smiles)
    return rdkit.Chem.AllChem.GetMorganFingerprintAsBitVect(
        molecule, radius=2, nBits=2048
    )


# Store one fingerprint per row in a new 'descriptor' column.
dataset['descriptor'] = dataset['smiles'].apply(_morgan_bitvect)

Project the fingerprint descriptors into two dimensions with UMAP

In [10]:
%%time
umap_model = umap.UMAP(metric = "jaccard",
                      n_neighbors = 25,
                      n_components = 2,
                      low_memory = False,
                      min_dist = 0.001)
X_umap = umap_model.fit_transform(np.vstack(dataset['descriptor'].values))
dataset["UMAP_0"], dataset["UMAP_1"] = X_umap[:,0], X_umap[:,1]

  "inverse_transform will be unavailable".format(self.metric)
CPU times: user 54.1 s, sys: 625 ms, total: 54.8 s
Wall time: 33.2 s


Are there any clusters?

In [11]:
# Scatter plot of the two UMAP coordinates, one circle per row; only the
# coordinate columns are handed to the chart.
umap_coords = dataset[['UMAP_0', 'UMAP_1']]
alt.Chart(umap_coords).mark_circle(size=60).encode(x='UMAP_0', y='UMAP_1')

### Do any clusters in UMAP space seem to exhibit high toxicity?

In [13]:
# Same UMAP scatter plot, with each circle coloured by its quantitative
# rat_LD50 value.
umap_with_target = dataset[['UMAP_0', 'UMAP_1', 'rat_LD50']]
toxicity_scatter = alt.Chart(umap_with_target).mark_circle(size=60)
toxicity_scatter.encode(x='UMAP_0', y='UMAP_1', color='rat_LD50:Q')