# Multitask learning capability evaluation

Random forests performed well both as classifiers and as regressors, with the descriptor-based features (Mordred and RDKit) performing the best. Can a multitask graph neural network do better?

In [1]:
import deepchem as dc
import numpy as np
import pandas as pd
from functools import reduce

import cytoxnet.dataprep.io as io
import cytoxnet.dataprep.dataprep as dataprep
import cytoxnet.dataprep.featurize as feat
from cytoxnet.models.models import ToxModel

## Create the datasets to use
Multitask learning should offer the most benefit for small target datasets, so we use the smallest dataset in the package (the Lunghini algae data) as the ultimate target.

In [2]:
# TODO: temporary — load each species table from CSV until the database query works.
# Every table keeps only the SMILES string and that species' toxicity endpoint.
fish = io.load_data('../database/fish.csv', cols=['smiles', 'fish_LC50'])
daphnia = io.load_data('../database/daphnia.csv', cols=['smiles', 'daphnia_EC50'])
algea = io.load_data('../database/algea.csv', cols=['smiles', 'algea_EC50'])
rat = io.load_data('../database/rat.csv', cols=['smiles', 'rat_LD50'])
ecoli = io.load_data('../database/ecoli.csv', cols=['smiles', 'ecoli_MIC'])

# Outer-join the tables one after another on `smiles`, so a compound measured
# for only some species is kept, with NaN in the endpoints it lacks.
raw = fish
for _species_table in [daphnia, algea, rat, ecoli]:
    raw = pd.merge(raw, _species_table, how='outer', on='smiles')

In [3]:
# Summary statistics for each endpoint column of the merged table; the `count`
# row shows how many compounds carry a (non-missing) label for each task.
raw.describe()

Unnamed: 0,fish_LC50,daphnia_EC50,algea_EC50,rat_LD50,ecoli_MIC
count,2211.0,2143.0,1444.0,7393.0,5271.0
mean,2.156074,1.523104,2.457666,-2.544144,2.840188
std,2.710465,2.795524,2.350359,0.958268,2.364505
min,-8.947976,-10.724468,-7.836625,-10.207,-11.042922
25%,0.569557,0.066566,1.162368,-3.035,1.832581
50%,2.225704,1.916923,2.70805,-2.367,3.465736
75%,3.947383,3.50255,4.033795,-1.856,4.158883
max,10.537415,10.126631,9.118225,0.343,9.433484


Add features

In [4]:
# Two feature columns are attached to the merged table: 'RDKitDescriptors'
# (the X column of the baseline dataset) and 'ConvMolFeaturizer' (the X column
# of the multitask dataset).
with_descriptors = feat.add_features(
    raw,
    method='RDKitDescriptors',
    codex='../database/compounds.csv',
)
data_f = feat.add_features(with_descriptors, method='ConvMolFeaturizer')

Identify an independent algae test set by index

In [5]:
# Fixed seed so the held-out algae test set is the same on every run. Without
# it, each kernel restart draws a different 20% sample and the metrics reported
# further down cannot be reproduced.
SPLIT_SEED = 42

# Rows that actually carry an algae label; the column is NaN for compounds that
# entered the table only through the other species' data (outer merge).
algea_only = data_f[data_f['algea_EC50'].notna()]
algea_index = algea_only.index

# Hold out 20% of the labelled algae rows as the independent test set ...
test_index = algea_only.sample(frac=.2, random_state=SPLIT_SEED).index
# ... and keep the remaining labelled algae rows for developing the baseline.
baseline_index = algea_only.drop(index=test_index).index

Convert to dataset

In [7]:
# Multitask set: graph-convolution features as X, all five endpoints as y.
multitask_targets = [
    'fish_LC50',
    'daphnia_EC50',
    'algea_EC50',
    'rat_LD50',
    'ecoli_MIC',
]
multi = dataprep.convert_to_dataset(
    data_f, X_col='ConvMolFeaturizer', y_col=multitask_targets
)

# Baseline set: RDKit descriptors as X and the algae endpoint only, then
# restricted to the rows that have an algae label.
has_algea_label = np.isin(data_f.index, algea_index)
baseline_all_rows = dataprep.convert_to_dataset(
    data_f, X_col='RDKitDescriptors', y_col=['algea_EC50']
)
baseline = baseline_all_rows.select(has_algea_label)

Handle sparsity in the multitask set

In [11]:
# Re-bind `multi` to the output of cytoxnet's sparsity handling.
# NOTE(review): presumably this deals with the missing labels left by the outer
# merge (most compounds lack most endpoints) — confirm against
# dataprep.handle_sparsity.
# The input name is overwritten, so re-running this cell on its own applies the
# step a second time; re-run the "Convert to dataset" cell first.
multi = dataprep.handle_sparsity(multi)

## Do some data normalization

In [15]:
# NOTE(review): both datasets are normalized here, before the test rows are
# split off, so the normalization presumably also sees the held-out algae
# samples — confirm whether that leakage is acceptable for this comparison.
normalizer = 'NormalizationTransformer'

# Multitask set: only the targets are normalized.
multi_normed, multi_transformations = dataprep.data_transformation(
    multi,
    transformations=[normalizer],
    to_transform=['y'],
)

# Baseline set: one normalizer for the targets and one for the features.
baseline_normed, baseline_transformations = dataprep.data_transformation(
    baseline,
    transformations=[normalizer, normalizer],
    to_transform=['y', 'X'],
)

  X = np.nan_to_num((X - self.X_means) / self.X_stds)


## Split out the independent algae test set

In [16]:
# Row masks are built by matching dataset ids against the DataFrame index labels
# chosen earlier.
# NOTE(review): this assumes the dataset `ids` are the original DataFrame index
# values — confirm against dataprep.convert_to_dataset.
baseline_is_test = np.isin(baseline_normed.ids, test_index)
baseline_is_dev = np.isin(baseline_normed.ids, baseline_index)
baseline_test = baseline_normed.select(baseline_is_test)
baseline_dev = baseline_normed.select(baseline_is_dev)

# The multitask model is tested on the same held-out ids and develops on
# everything else.
multi_is_test = np.isin(multi_normed.ids, test_index)
multi_test = multi_normed.select(multi_is_test)
multi_dev = multi_normed.select(~multi_is_test)

## Evaluate the baseline model

In [21]:
# Baseline model of type 'RFR', given the transformers fitted on the baseline set.
# NOTE(review): 'RFR' is presumably a random forest regressor, and the
# transformers are presumably what the later `untransform=True` calls use to
# undo the normalization — confirm against ToxModel. No seed is passed here, so
# scores may vary between runs.
baseline_model = ToxModel('RFR', transformers=baseline_transformations)



In [22]:
# Fit on the labelled algae rows that were not held out for testing.
baseline_model.fit(baseline_dev)

In [23]:
# Score the baseline on the held-out algae rows.
# NOTE(review): the result keys 'metric-1' / 'metric-2' presumably follow the
# order of the metric list (r2_score, then mean_squared_error), and
# `untransform=True` presumably reports them in un-normalized target units — confirm.
baseline_model.evaluate(baseline_test, ['r2_score', 'mean_squared_error'], untransform=True)

{'metric-1': 0.5479543691265629, 'metric-2': 2.3504213042801925}

In [24]:
# 'pair_predict' plot for the baseline on the held-out algae rows.
# NOTE(review): presumably a predicted-vs-true plot in un-normalized units — confirm.
baseline_model.visualize('pair_predict', baseline_test, untransform=True)

(289, 1)


## Evaluate the multitask model

In [44]:
# Endpoints passed as `tasks`, listed in the same order as the y columns used
# to build the multitask dataset.
multitask_endpoints = [
    'fish_LC50',
    'daphnia_EC50',
    'algea_EC50',
    'rat_LD50',
    'ecoli_MIC',
]

# 'GraphCNN' regression model, given the transformers fitted on the multitask set.
multi_model = ToxModel(
    'GraphCNN',
    mode='regression',
    tasks=multitask_endpoints,
    transformers=multi_transformations,
)

In [45]:
%%capture
# The cell magic above captures (hides) the training output; it must stay on the first line.
# NOTE(review): the epoch count is hard-coded and no random seed is set in the
# visible cells, so the scores below may differ between runs.
multi_model.fit(multi_dev, nb_epoch=100)

In [46]:
# Score the multitask model on the held-out algae ids. The recorded result is a
# pair: overall scores, then per-task lists (requested with `per_task_metrics`).
# NOTE(review): the per-task lists presumably follow the order of the model's
# `tasks`, which would put the algae endpoint third — confirm.
multitask_metrics = ['r2_score', 'mean_squared_error']
multi_model.evaluate(
    multi_test,
    multitask_metrics,
    per_task_metrics=True,
    use_sample_weights=True,
    untransform=True,
)



({'metric-1': 0.21468953815602995, 'metric-2': 2.1285650059230443},
 {'metric-1': [0.4999209312388323,
   0.5011698553064243,
   0.3185089269995476,
   -0.2461520227646543,
   0.0],
  'metric-2': [1.801557403793445,
   2.2051613938481838,
   3.5434279799631514,
   0.343726877393006,
   2.748951374617436]})

In [48]:
# 'pair_predict' plot for the algae task only, on the same held-out ids as the
# baseline plot, so the two figures can be compared.
multi_model.visualize('pair_predict', multi_test, untransform=True, task='algea_EC50')

(289, 5)
