In [1]:
# import cytoxnet.models.models
import cytoxnet.models.models
import cytoxnet.dataprep.io
import cytoxnet.dataprep.featurize
import cytoxnet.dataprep.dataprep
import importlib

# ToxModel demonstration

## Get and prepare data

Get data

In [2]:
# Load the raw dataset through cytoxnet's `load_zhu_rat` loader and show its
# summary statistics as the cell output.
load_dataset = cytoxnet.dataprep.io.load_zhu_rat
data_raw = load_dataset()
data_raw.describe()

Unnamed: 0,LD50
count,7342.0
mean,2.542693
std,0.958225
min,-0.343
25%,1.85425
50%,2.367
75%,3.03275
max,10.207


Add features

In [3]:
# Stage 1: turn the 'smiles' strings into molecule objects (the recorded output
# shows the resulting `Mol` column).
data_mol = cytoxnet.dataprep.featurize.molstr_to_Mol(data_raw, 'smiles')

# Stage 2: featurize with add_features' default method (the recorded output
# shows a `CircularFingerprint` column being added).
data = cytoxnet.dataprep.featurize.add_features(data_mol)

# Display the featurized frame as the cell output.
data

Unnamed: 0,smiles,LD50,Mol,CircularFingerprint
0,[O-][N+](=Nc1ccccc1)c1ccccc1,2.505,<rdkit.Chem.rdchem.Mol object at 0x1c3964850>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,BrC(Br)Br,2.343,<rdkit.Chem.rdchem.Mol object at 0x1c39648a0>,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,C=CBr,2.330,<rdkit.Chem.rdchem.Mol object at 0x1c3964800>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,Brc1ccc(-c2ccc(Br)c(Br)c2Br)c(Br)c1Br,1.465,<rdkit.Chem.rdchem.Mol object at 0x1c39648f0>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,S=C=Nc1ccc(Br)cc1,2.729,<rdkit.Chem.rdchem.Mol object at 0x1c3964940>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
...,...,...,...,...
7380,O=C=Nc1ccccc1C(F)(F)F,1.427,<rdkit.Chem.rdchem.Mol object at 0x1c485e440>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7381,Nc1ccc(OC(F)(F)C(F)F)c(N)c1,2.321,<rdkit.Chem.rdchem.Mol object at 0x1c485e490>,"[[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7382,CC(=O)OCCN(CCC#N)c1ccccc1,2.050,<rdkit.Chem.rdchem.Mol object at 0x1c485e4e0>,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
7383,CC1=CC(=C2C(=O)c3ccccc3C2=O)C=CN1CCN1CCCCC1,2.951,<rdkit.Chem.rdchem.Mol object at 0x1c485e530>,"[[0.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


Convert to a dataset, normalize the target, and split

In [4]:
# Package the fingerprint features (X) and the LD50 target (y) into a dataset.
dataset_raw = cytoxnet.dataprep.dataprep.convert_to_dataset(
    data,
    X_col=['CircularFingerprint'],
    y_col='LD50',
)

# Normalize only the target ('y'). The returned transformers are kept because
# they are passed to ToxModel later in the notebook.
dataset, transformers = cytoxnet.dataprep.dataprep.data_transformation(
    dataset_raw,
    transformations=['NormalizationTransformer'],
    to_transform=['y'],
)

# NOTE(review): no seed is passed here, so the split may differ between runs —
# confirm whether data_splitting accepts one.
train, test = cytoxnet.dataprep.dataprep.data_splitting(
    dataset,
    split_type='train_test_split',
)

## Get help on available models, or on a specific model

In [5]:
# With no argument, help() prints the list of available model names with a
# one-line description of each (see the output below).
cytoxnet.models.models.ToxModel.help()

AVAILABLE MODELS:
GPR:  (sklearn) Gaussian Process Regressor. Accepts vector features.
GPC:  (sklearn) Gaussian Process Classifier. Accepts vector features.
GraphCNN:  (deepchem) Graph Convolutional Neural Network. Accepts graph features.
LASSO:  (sklearn) Least Absolute Shrinkage and Selection Operator. Accepts vector features


In [6]:
# With a model name, help() prints the documentation of the class behind that
# name (here sklearn's Lasso, per the output below).
cytoxnet.models.models.ToxModel.help('LASSO')

Tox model:  LASSO
Help on class Lasso in module sklearn.linear_model._coordinate_descent:

class Lasso(ElasticNet)
 |  Linear Model trained with L1 prior as regularizer (aka the Lasso)
 |  
 |  The optimization objective for Lasso is::
 |  
 |      (1 / (2 * n_samples)) * ||y - Xw||^2_2 + alpha * ||w||_1
 |  
 |  Technically the Lasso model is optimizing the same objective function as
 |  the Elastic Net with ``l1_ratio=1.0`` (no L2 penalty).
 |  
 |  Read more in the :ref:`User Guide <lasso>`.
 |  
 |  Parameters
 |  ----------
 |  alpha : float, default=1.0
 |      Constant that multiplies the L1 term. Defaults to 1.0.
 |      ``alpha = 0`` is equivalent to an ordinary least square, solved
 |      by the :class:`LinearRegression` object. For numerical
 |      reasons, using ``alpha = 0`` with the ``Lasso`` object is not advised.
 |      Given this, you should use the :class:`LinearRegression` object.
 |  
 |  fit_intercept : bool, default=True
 |      Whether to calculate the interce

## Initialize, train, and evaluate a model: LASSO

In [7]:
# Build a LASSO-backed ToxModel, passing it the transformers returned by
# data_transformation.
tox_model = cytoxnet.models.models.ToxModel(
    'LASSO',
    transformers=transformers,
)



In [8]:
# Train the LASSO model on the training split.
tox_model.fit(train)

In [9]:
# Score the LASSO model on the held-out split with R².
# untransform=True presumably reverses the y-normalization before scoring —
# confirm against ToxModel.evaluate.
# NOTE(review): the recorded R² is ~0, i.e. no better than predicting the mean.
# Lasso's default alpha=1.0 may be shrinking every coefficient to zero on the
# normalized target — confirm and consider tuning alpha.
tox_model.evaluate(
    test,
    metrics=['r2_score'],
    untransform=True,
)

{'metric-1': -1.741784181374939e-05}

In [10]:
# Draw the 'pair_predict' visualization for the test split (presumably
# predicted vs. true values — confirm), with untransform=True as in evaluate.
tox_model.visualize(
    'pair_predict',
    test,
    untransform=True,
)

(1469, 1)


## Try a GPR model

In [11]:
# Replace the previous model with a Gaussian Process Regressor, using the same
# target transformers.
tox_model = cytoxnet.models.models.ToxModel(
    'GPR',
    transformers=transformers,
)



In [12]:
# Train the GPR model on the same training split used for LASSO.
tox_model.fit(train)

In [13]:
# Score the GPR model on the held-out split with R², using the same settings
# as the LASSO evaluation so the two numbers are comparable.
tox_model.evaluate(
    test,
    metrics=['r2_score'],
    untransform=True,
)

{'metric-1': 0.07630623183814522}

In [14]:
# Draw the 'pair_predict' visualization for the GPR model on the test split,
# with the same arguments as the LASSO plot.
tox_model.visualize(
    'pair_predict',
    test,
    untransform=True,
)

(1469, 1)


## Can a graph do better?
This requires a new featurization: GraphCNN accepts graph features rather than the vector fingerprints used above.

In [15]:
# Add graph features next to the existing columns; they are read from the
# 'ConvMolFeaturizer' column below.
data = cytoxnet.dataprep.featurize.add_features(
    data,
    method='ConvMolFeaturizer',
)

# Rebuild the dataset with the graph features as X and LD50 as y.
dataset_raw = cytoxnet.dataprep.dataprep.convert_to_dataset(
    data,
    X_col=['ConvMolFeaturizer'],
    y_col='LD50',
)

# Normalize the target again and keep the new transformers for the graph model.
dataset, transformers = cytoxnet.dataprep.dataprep.data_transformation(
    dataset_raw,
    transformations=['NormalizationTransformer'],
    to_transform=['y'],
)

# NOTE(review): this is a fresh split with no seed passed, so the test set may
# not match the one used for LASSO/GPR — confirm.
train, test = cytoxnet.dataprep.dataprep.data_splitting(
    dataset,
    split_type='train_test_split',
)

In [16]:
# Build a graph convolutional model in regression mode, passing it the
# transformers fitted on the graph-featurized dataset.
tox_model = cytoxnet.models.models.ToxModel(
    'GraphCNN',
    mode='regression',
    transformers=transformers,
)



In [17]:
# Number of training epochs for the graph model, named so it is easy to tune.
N_EPOCHS = 100

# Train on the graph-featurized training split. The call is left as the last
# expression so its return value is still shown as the cell output.
tox_model.fit(train, nb_epoch=N_EPOCHS)

  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." %

0.061882643699645995

In [19]:
# Score the graph model on its held-out split with R², using the same settings
# as the LASSO and GPR evaluations.
tox_model.evaluate(
    test,
    metrics=['r2_score'],
    untransform=True,
)

{'metric-1': 0.48793162916074184}

In [20]:
# Draw the 'pair_predict' visualization for the graph model on the test split.
# untransform=True is passed here as it is for the LASSO and GPR plots and for
# the evaluate call above; without it this plot would presumably be drawn in
# normalized target units and not be comparable with the earlier ones.
tox_model.visualize('pair_predict', test, untransform=True)

(1469, 1)
