## Code to analyze a cleaned and prepared toxicity dataset
Functionality includes:
- PCA on featurized input and visualization of component space
- total unique molecules
- toxicity metric range

Input is a deepchem dataset object.

In [23]:
import altair
import deepchem
import numpy as np
import pandas
import rdkit
import sklearn.decomposition

In [2]:
# Temporary stand-in for the real incoming dataset: fetch the BACE regression
# benchmark through deepchem's MolNet loader, with no splitter supplied.
(task, dataset, transformer) = deepchem.molnet.load_bace_regression(
    splitter=None,
)

In [5]:
# Grab the feature matrix, then map the stored targets back to their
# un-normalized scale using the first transformer.
X = dataset[0].X
y = transformer[0].untransform(dataset[0].y)

In [21]:
# Wrap the target values in a single-column frame named after the metric.
y_dataframe = pandas.DataFrame(y, columns=["pIC50"])

### Tox target range: pIC50

In [41]:
# Report the smallest and largest target value.
pic50_min, pic50_max = y.min(), y.max()
print('pIC50 min, max: {}, {}'.format(pic50_min, pic50_max))

pIC50 min, max: 2.5445460999999994, 10.522879


In [22]:
# Histogram of the pIC50 targets: values are binned along the x axis and the
# number of rows in each bin sets the bar height.
altair.Chart(y_dataframe).mark_bar().encode(
    altair.X("pIC50", bin=True),
    y='count()',
)

### PCA on X data


In [24]:
# Two-component PCA. The seed is pinned because scikit-learn may select a
# randomized SVD solver depending on the input size, which would otherwise
# let the component scores vary from run to run.
pca = sklearn.decomposition.PCA(n_components=2, random_state=0)

In [26]:
# Fit the PCA on the feature matrix and keep the projected scores.
X_pcs = pca.fit_transform(X)

In [29]:
# Name the two projected columns so they can be referenced when plotting.
pc_dataframe = pandas.DataFrame(X_pcs, columns=["PC1", "PC2"])

In [32]:
# Stacked kernel-density estimate of the PC1 and PC2 scores.
# PCA scores are centred on zero, so the sampling window must include the
# negative side; it is derived from the data rather than hard-coded.
# One shared extent plus a fixed step count keeps the sample points of both
# groups aligned, which the stacked areas rely on.
pc_columns = ['PC1', 'PC2']
pc_scores = pc_dataframe[pc_columns].to_numpy()
# Plain Python floats keep the chart specification JSON-serializable.
pc_extent = [float(pc_scores.min()), float(pc_scores.max())]

altair.Chart(pc_dataframe).transform_fold(
    pc_columns,
    as_=['Measurement_type', 'value']
).transform_density(
    density='value',
    bandwidth=0.3,
    groupby=['Measurement_type'],
    extent=pc_extent,
    counts=True,
    steps=200
).mark_area().encode(
    altair.X('value:Q'),
    altair.Y('density:Q', stack='zero'),
    altair.Color('Measurement_type:N')
).properties(width=400, height=100)