# Multitask learning capability evaluation

Random forests performed well both as classifiers and as regressors, with the descripor based features (Mordred and RDKit) performing the best. Can a multitask model with imputation or a graph neural net do better?

In [115]:
import deepchem as dc
import numpy as np
import pandas as pd
import optuna
from functools import reduce

import cytoxnet.dataprep.io as io
import cytoxnet.dataprep.dataprep as dataprep
import cytoxnet.dataprep.featurize as feat
from cytoxnet.models.models import ToxModel
import cytoxnet.models.opt as opt

## Contents
This notebook contains en evaluation of the optimized models for regression tasks. The overall goal was to leverage all of our datasets (see data report section) in order to produce the best possible regressor of toxicity in a single test microbe, in this case algea. R2 score is used to determine the best regressor. All targets were normalized before training.

Models tested:
- RFC w/ RDKitDescriptors baseline: train on algea data alone
- RFC w/ RDKitDescriptor imputed with mean: train on all species (originally sparse)
- RFC w/ RDKitDescriptor imputed with iterative imputer: train on all species (originally sparse)
- GCNN w/ 0.0 wighted sparse data: train on all species (sparse)

## Create the datasets to use
Multitask learning would have the most benefit for small target datasets, so we will use the smallest in the package (Lunghini algea data) as the ultimate goal

In [2]:
## !!!!!!temporary until database query works
fish = io.load_data('../database/fish.csv', cols=['smiles', 'fish_LC50'])
daphnia = io.load_data('../database/daphnia.csv', cols=['smiles', 'daphnia_EC50'])
algea = io.load_data('../database/algea.csv', cols=['smiles', 'algea_EC50'])
rat  = io.load_data('../database/rat.csv', cols=['smiles', 'rat_LD50'])
ecoli  = io.load_data('../database/ecoli.csv', cols=['smiles', 'ecoli_MIC'])

raw = reduce(
    lambda x, y: pd.merge(x, y, how='outer', on = 'smiles'),
    [fish, daphnia, algea, rat, ecoli]
)
multitask_names = [
    'fish_LC50',
    'daphnia_EC50',
    'algea_EC50',
    'rat_LD50',
    'ecoli_MIC'
]

In [3]:
raw.describe()

Unnamed: 0,fish_LC50,daphnia_EC50,algea_EC50,rat_LD50,ecoli_MIC
count,2211.0,2143.0,1444.0,7393.0,5271.0
mean,2.156074,1.523104,2.457666,-2.544144,2.840188
std,2.710465,2.795524,2.350359,0.958268,2.364505
min,-8.947976,-10.724468,-7.836625,-10.207,-11.042922
25%,0.569557,0.066566,1.162368,-3.035,1.832581
50%,2.225704,1.916923,2.70805,-2.367,3.465736
75%,3.947383,3.50255,4.033795,-1.856,4.158883
max,10.537415,10.126631,9.118225,0.343,9.433484


Add features

In [4]:
data_f = feat.add_features(raw, method='RDKitDescriptors', codex='../database/compounds.csv')
data_f = feat.add_features(data_f, method='ConvMolFeaturizer')

identify an independant algea test set by index

In [5]:
algea_only = data_f[~data_f.isna()['algea_EC50']]
algea_index = algea_only.index
test_index = algea_only.sample(frac=.2, random_state=0).index
baseline_index = algea_only.drop(index=test_index).index

## Baseline model
As a baseline we are using random forest regressor, which we know is capable for single tasks.

In [6]:
# create the dataset
baseline = dataprep.convert_to_dataset(
    data_f,
    X_col='RDKitDescriptors',
    y_col=[
        'algea_EC50'
    ]
).select(np.isin(data_f.index, algea_index))

In [7]:
# normalize it
baseline_normed, baseline_transformations = dataprep.data_transformation(
    baseline, transformations = ['NormalizationTransformer'],
    to_transform = ['y']
)
# split out dev and test
baseline_test = baseline_normed.select(np.isin(baseline_normed.ids, test_index))
baseline_dev = baseline_normed.select(np.isin(baseline_normed.ids, baseline_index))

retrieve hyperparmeter optimization for the single target RFR task

In [116]:
baseline_study = optuna.load_study(
    study_name='opt',
    storage="sqlite:///regression/baseline_r.db"
)

In [117]:
baseline_results = baseline_study.trials_dataframe()
baseline_results

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_criterion,params_max_depth,params_max_features,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.500253,2021-05-31 13:37:23.411071,2021-05-31 13:37:30.351528,0 days 00:00:06.940457,mse,45.0,sqrt,2,3,240,COMPLETE
1,1,0.460924,2021-05-31 13:37:23.422516,2021-05-31 13:37:29.048199,0 days 00:00:05.625683,mae,25.0,log2,4,10,45,COMPLETE
2,2,0.456827,2021-05-31 13:37:23.423541,2021-05-31 13:37:29.626895,0 days 00:00:06.203354,mae,10.0,sqrt,6,9,45,COMPLETE
3,3,0.442814,2021-05-31 13:37:23.421895,2021-05-31 13:37:26.154383,0 days 00:00:02.732488,mse,25.0,log2,8,9,115,COMPLETE
4,4,0.469300,2021-05-31 13:37:23.419832,2021-05-31 13:37:28.476418,0 days 00:00:05.056586,mse,,sqrt,5,2,240,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
275,275,0.502344,2021-05-31 13:53:52.347262,2021-05-31 13:54:31.631919,0 days 00:00:39.284657,mse,25.0,auto,1,3,235,COMPLETE
276,276,0.513555,2021-05-31 13:54:01.507108,2021-05-31 13:54:35.183699,0 days 00:00:33.676591,mse,25.0,auto,1,3,220,COMPLETE
277,277,0.507484,2021-05-31 13:54:08.009641,2021-05-31 13:54:40.931743,0 days 00:00:32.922102,mse,25.0,auto,1,3,215,COMPLETE
278,278,0.512268,2021-05-31 13:54:11.530253,2021-05-31 13:54:51.621045,0 days 00:00:40.090792,mse,,auto,1,3,265,COMPLETE


In [118]:
baseline_params = baseline_study.best_params

In [119]:
baseline_params

{'criterion': 'mse',
 'max_depth': 30,
 'max_features': 'auto',
 'min_samples_leaf': 2,
 'min_samples_split': 10,
 'n_estimators': 295}

In [120]:
baseline_study.best_value

0.5239161864231404

Train the baseline model with the best found hyperparameters

In [11]:
baseline_model = ToxModel('RFR', **baseline_params, transformers=baseline_transformations)



In [12]:
baseline_model.fit(baseline_dev)

In [13]:
baseline_model.evaluate(baseline_test, ['r2_score', 'mean_squared_error'], untransform=True)

{'metric-1': 0.47476547216653187, 'metric-2': 3.1036000934245056}

In [14]:
baseline_model.visualize('pair_predict', baseline_test, untransform=True)

(289, 1)


> The R2 score for the baseline model is 0.475. We can see that it begins to approximate the general trend of the data.

## Evaluate the multitask models
Random forests require imputation for sparse datasets, try a few methods: mean imputation, interpolation, and RFR interpolation. Additionally attempt graphs with weights.

In [15]:
import sklearn.impute

#### Impute by mean

In [16]:
mean = data_f.copy()

Impute and prepare the dataset.

In [17]:
mean[multitask_names] = sklearn.impute.SimpleImputer().fit_transform(
    mean[multitask_names].values
)

In [18]:
mean_set = dataprep.convert_to_dataset(
    mean,
    X_col='RDKitDescriptors',
    y_col=multitask_names
)

In [19]:
mean_normed, mean_transformations = dataprep.data_transformation(
    mean_set, transformations = ['NormalizationTransformer'],
    to_transform = ['y']
)

In [20]:
mean_test = mean_normed.select(np.isin(mean_normed.ids, test_index))
mean_dev = mean_normed.select(~np.isin(mean_normed.ids, test_index))

retreieve the best founf hyperparameters for the mean imputation multitask model

In [121]:
mean_study = optuna.load_study(
    study_name='opt',
    storage="sqlite:///regression/mean_r.db"
)

In [122]:
mean_results = mean_study.trials_dataframe()
mean_results

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_criterion,params_max_depth,params_max_features,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,0.091309,2021-05-31 13:53:04.706416,2021-05-31 13:53:37.498294,0 days 00:00:32.791878,mse,10.0,log2,7,9,285,COMPLETE
1,1,0.107531,2021-05-31 13:53:05.211800,2021-05-31 13:59:27.752639,0 days 00:06:22.540839,mse,40.0,auto,10,4,100,COMPLETE
2,2,,2021-05-31 13:53:05.281844,NaT,NaT,mae,40.0,auto,1,2,185,RUNNING
3,3,,2021-05-31 13:53:06.063741,NaT,NaT,mae,30.0,auto,1,6,115,RUNNING
4,4,0.010081,2021-05-31 13:53:06.571703,2021-05-31 15:52:29.080669,0 days 01:59:22.508966,mae,45.0,log2,7,10,60,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
156,156,0.110644,2021-06-01 10:01:57.375369,2021-06-01 10:02:44.406137,0 days 00:00:47.030768,mse,30.0,log2,2,5,270,COMPLETE
157,157,0.112760,2021-06-01 10:02:44.478812,2021-06-01 10:03:34.006382,0 days 00:00:49.527570,mse,35.0,log2,2,5,280,COMPLETE
158,158,0.114479,2021-06-01 10:03:34.078182,2021-06-01 10:03:57.157306,0 days 00:00:23.079124,mse,45.0,log2,1,5,125,COMPLETE
159,159,0.058624,2021-06-01 12:58:29.869573,2021-06-01 12:58:44.511353,0 days 00:00:14.641780,mse,5.0,log2,2,5,265,COMPLETE


In [123]:
mean_params = mean_study.best_params

In [124]:
mean_params

{'criterion': 'mse',
 'max_depth': 35,
 'max_features': 'sqrt',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 290}

In [125]:
mean_study.best_value

0.12062151445354567

Train the mean model on the sparse dev set with the best parameters

In [61]:
mean_model = ToxModel('RFR', transformers=mean_transformations, tasks=multitask_names, **mean_params)

In [62]:
mean_model.fit(mean_dev)

In [63]:
mean_model.evaluate(mean_test, ['r2_score', 'mean_squared_error'], untransform=True, per_task_metrics=True)

({'metric-1': 0.13842907558838624, 'metric-2': 1.775168257153327},
 {'metric-1': [0.2615655532780672,
   0.24723378351391145,
   0.17459800099849931,
   0.09105566606856175,
   -0.0823076259171085],
  'metric-2': [1.840111855056604,
   1.9263504749483085,
   4.877283547562308,
   0.18903362546722,
   0.04306178273219551]})

In [64]:
mean_model.visualize('pair_predict', mean_test, untransform=True, task='algea_EC50')

(289, 5)


> The R2 score for imputing the mean is 0.174, very close to no predicitve power. We can see that imputing by mean heavily imbalances the model to predict that mean.

#### Impute by interpolation

Retrieve best hyperparameters for this model type.

In [126]:
inter_study = optuna.load_study(
    study_name='opt',
    storage="sqlite:///regression/inter_r.db"
)

In [127]:
inter_results = inter_study.trials_dataframe()
inter_results

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_criterion,params_max_depth,params_max_features,params_min_samples_leaf,params_min_samples_split,params_n_estimators,state
0,0,-0.085670,2021-05-31 14:07:30.882501,2021-05-31 14:14:59.106237,0 days 00:07:28.223736,mae,20.0,log2,1,4,10,COMPLETE
1,1,0.042944,2021-05-31 14:07:30.897549,2021-05-31 14:08:56.449587,0 days 00:01:25.552038,mse,40.0,sqrt,8,6,250,COMPLETE
2,2,0.029570,2021-05-31 14:07:30.889808,2021-05-31 14:07:51.467335,0 days 00:00:20.577527,mse,35.0,sqrt,8,4,55,COMPLETE
3,3,0.039443,2021-05-31 14:07:30.887960,2021-05-31 14:07:56.905302,0 days 00:00:26.017342,mse,30.0,sqrt,8,6,70,COMPLETE
4,4,0.007352,2021-05-31 14:07:30.891874,2021-05-31 15:32:21.428170,0 days 01:24:50.536296,mae,40.0,sqrt,2,10,50,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...
177,177,0.037795,2021-05-31 22:11:46.865662,2021-05-31 22:12:48.968330,0 days 00:01:02.102668,mse,,sqrt,6,5,200,COMPLETE
178,178,0.038936,2021-05-31 22:12:49.164093,2021-05-31 22:13:19.289493,0 days 00:00:30.125400,mse,,log2,6,5,180,COMPLETE
179,179,0.039864,2021-05-31 22:13:19.371942,2021-05-31 22:14:20.728808,0 days 00:01:01.356866,mse,,sqrt,6,4,195,COMPLETE
180,180,0.037274,2021-05-31 22:14:20.813608,2021-05-31 22:15:28.090393,0 days 00:01:07.276785,mse,,sqrt,5,5,205,COMPLETE


In [128]:
inter_params = inter_study.best_params

In [129]:
inter_params

{'criterion': 'mse',
 'max_depth': None,
 'max_features': 'sqrt',
 'min_samples_leaf': 6,
 'min_samples_split': 5,
 'n_estimators': 195}

In [130]:
inter_study.best_value

0.05035201551830746

Impute and prepare the data, requiring sklearns experimental imputer.

In [68]:
from sklearn.experimental import enable_iterative_imputer

In [69]:
iterpute = data_f.copy()

In [70]:
iterpute[multitask_names] = sklearn.impute.IterativeImputer(random_state=0).fit_transform(
    iterpute[multitask_names].values
)

In [71]:
iterpute_set = dataprep.convert_to_dataset(
    iterpute,
    X_col='RDKitDescriptors',
    y_col=multitask_names
)

In [72]:
iterpute_normed, iterpute_transformations = dataprep.data_transformation(
    iterpute_set, transformations = ['NormalizationTransformer'],
    to_transform = ['y']
)

In [73]:
iterpute_test = iterpute_normed.select(np.isin(iterpute_normed.ids, test_index))
iterpute_dev = iterpute_normed.select(~np.isin(iterpute_normed.ids, test_index))

Train the multitask model on dev data with interpolated targets, and the best identified hyperparameters.

In [74]:
iterpute_model = ToxModel('RFR', transformers=iterpute_transformations, tasks=multitask_names, **inter_params)

In [75]:
iterpute_model.fit(iterpute_dev)

In [76]:
iterpute_model.evaluate(iterpute_test, ['r2_score', 'mean_squared_error'], untransform=True, per_task_metrics=True)

({'metric-1': 0.15226522035710457, 'metric-2': 19.770376451814908},
 {'metric-1': [0.2603352147912059,
   0.2187505546101537,
   0.13451231431425736,
   0.11682170407590065,
   0.03090631399400523],
  'metric-2': [1.8529771454203363,
   1.9992343983708825,
   5.114149051152437,
   0.20859387205720248,
   89.67692779207368]})

In [77]:
iterpute_model.visualize('pair_predict', iterpute_test, untransform=True, task='algea_EC50')

(289, 5)


> Interpolating sparse targets does not produce a good model in this case, with an R2 of about 0.134. It seems to be predicting about the same thing. This likely comes about from the interpolation algorithm between targets landing on the same few datapoints due to limited data overlap.

### Try a Graph multitask

In this case, instead of imputation, the neural network architectures can accept a weight matrix in the same shape as the targets, thus sparse data can be masked out. 

First retrieve the hyperparameters found best for this task.

In [131]:
graph_study = optuna.load_study(
    study_name='opt',
    storage="sqlite:///regression/graph_r.db"
)

In [132]:
graph_results = graph_study.trials_dataframe()
graph_results



Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_batch_size,params_dense_layer_size,params_dropout,params_graph_conv_layers,params_number_atom_features,system_attrs_fail_reason,state
0,0,,2021-05-31 14:45:26.986072,2021-05-31 14:45:27.304583,0 days 00:00:00.318511,100,244,0.147496,"[64, 64, 64]",125,Trial 0 failed because of the following error:...,FAIL
1,1,,2021-05-31 15:14:44.429585,NaT,NaT,300,316,0.186952,"[64, 64]",125,,RUNNING
2,2,,2021-05-31 15:15:00.287964,NaT,NaT,225,232,0.282771,[32],50,,RUNNING
3,3,,2021-05-31 15:15:12.146448,NaT,NaT,275,232,0.097572,"[128, 128]",125,,RUNNING
4,4,,2021-05-31 15:15:12.608466,NaT,NaT,75,232,0.211999,"[128, 128]",75,,RUNNING
...,...,...,...,...,...,...,...,...,...,...,...,...
305,305,0.399048,2021-06-01 12:20:35.679742,2021-06-01 12:43:48.598872,0 days 00:23:12.919130,175,124,0.012163,"[128, 128, 128]",25,,COMPLETE
306,306,0.421521,2021-06-01 12:21:40.520381,2021-06-01 12:44:57.102188,0 days 00:23:16.581807,200,124,0.038449,"[128, 128, 128]",25,,COMPLETE
307,307,0.398852,2021-06-01 12:22:57.766000,2021-06-01 12:49:31.085121,0 days 00:26:33.319121,175,304,0.039311,"[128, 128, 128]",50,,COMPLETE
308,308,0.383483,2021-06-01 12:49:31.200158,2021-06-01 12:56:53.597258,0 days 00:07:22.397100,200,88,0.035785,"[32, 32, 32]",25,,COMPLETE


In [133]:
graph_params = graph_study.best_params



In [134]:
graph_params

{'batch_size': 275,
 'dense_layer_size': 88,
 'dropout': 0.05190727030105664,
 'graph_conv_layers': [128],
 'number_atom_features': 50}

In [135]:
graph_study.best_value

0.4424701576497789

Prepare data - instead of imputing we use target weights.

In [84]:
graph = data_f.copy()

In [85]:
graph_set = dataprep.convert_to_dataset(
    graph,
    X_col='ConvMolFeaturizer',
    y_col=multitask_names
)

In [86]:
graph_set = dataprep.handle_sparsity(graph_set)

In [101]:
graph_normed, graph_transformations = dataprep.data_transformation(
    graph_set, transformations = ['NormalizationTransformer'],
    to_transform = ['y']
)

In [102]:
graph_test = graph_normed.select(np.isin(graph_normed.ids, test_index))
graph_dev = graph_normed.select(~np.isin(graph_normed.ids, test_index))

Train the model on the masked dev set with the best found parameters.

In [110]:
graph_model = ToxModel(
    'GraphCNN',
    tasks=multitask_names,
    transformers=graph_transformations,
    mode='regression',
    **graph_params
)

In [111]:
graph_model.fit(graph_dev, nb_epoch=50)

  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)
  "shape. This may consume a large amount of memory." % value)


0.1273652172088623

In [112]:
graph_model.evaluate(
    graph_test,
    ['r2_score', 'mean_squared_error'],
    untransform=True,
    use_sample_weights=True,
    per_task_metrics=True
)



({'metric-1': 0.3054362290319699, 'metric-2': 2.1853971249374617},
 {'metric-1': [0.6140133929584416,
   0.3729284076043322,
   0.40590122560194764,
   0.13433811899512782,
   0.0],
  'metric-2': [1.9683428198423218,
   2.6484041358684483,
   3.5105175193466955,
   0.3959358819891938,
   2.403785267640648]})

In [114]:
graph_model.visualize('pair_predict', graph_test, untransform=True, task='algea_EC50')

(289, 5)


> The graph multitask model did nominally better than the RFR multask models, but still has a poorer fit than the baseline with R2 of 0.405.