# Load Modules

In [1]:
from __future__ import print_function
from __future__ import division
from __future__ import unicode_literals

In [2]:
import warnings
warnings.simplefilter(action='ignore')

In [3]:
import os

# Root directory against which every relative path in this notebook is
# resolved.  It can be overridden with the DRUG_EMBEDDING_ROOT environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original hard-coded location is used.
project_root = os.environ.get('DRUG_EMBEDDING_ROOT',
                              '/home/yuke/PythonProject/DrugEmbedding/')
os.chdir(project_root)

# Location of the Tox21 data, relative to the project root.
data_dir = './data/tox21'

In [4]:
import numpy as np
import deepchem as dc
import json
from tqdm import tnrange

import decode
from sklearn.ensemble import RandomForestClassifier



In [5]:
from deepchem.feat.base_classes import Featurizer
class ATCDrugEmbedding(Featurizer):
    """Featurizer that represents a molecule by the mean from ``decode.smiles2mean``.

    Parameters
    ----------
    configs
        Configuration object forwarded unchanged to ``decode.smiles2mean``.
    model
        Model object forwarded unchanged to ``decode.smiles2mean``.
    """

    def __init__(self, configs, model):
        self.model = model
        self.configs = configs

    def _featurize(self, mol):
        """Return the embedding of ``mol`` as the result of ``.numpy()``.

        The molecule is serialised to a SMILES string with RDKit and handed to
        ``decode.smiles2mean``; the first element of the returned pair is
        squeezed, moved to CPU, detached and converted with ``.numpy()``
        (presumably a torch tensor -- confirm against ``decode``).
        """
        # RDKit is only needed at featurization time, so it is imported here.
        from rdkit import Chem

        smiles = Chem.MolToSmiles(mol)
        # The second element of the pair is not needed for featurization.
        latent_mean, _unused = decode.smiles2mean(self.configs, smiles, self.model)
        flattened = latent_mean.squeeze()
        on_cpu = flattened.cpu()
        return on_cpu.detach().numpy()

In [6]:
def model_builder(model_dir, n_estimators=500, random_state=None):
    """Build a class-balanced random forest wrapped as a DeepChem model.

    Parameters
    ----------
    model_dir
        Directory handed to ``dc.models.SklearnModel``.
    n_estimators : int, optional
        Number of trees in the forest.  Defaults to 500, the value that was
        previously hard-coded.
    random_state : optional
        Seed forwarded to ``RandomForestClassifier``.  The default ``None``
        keeps the previous, unseeded behaviour; pass an integer (for example
        through ``functools.partial``) to make the fit reproducible.

    Returns
    -------
    The ``dc.models.SklearnModel`` wrapping the configured classifier.
    """
    sklearn_model = RandomForestClassifier(
        class_weight="balanced",
        n_estimators=n_estimators,
        random_state=random_state)
    return dc.models.SklearnModel(sklearn_model, model_dir)

# Lorentz Drug Embedding

In [7]:
# Locate the experiment's configuration file and the checkpoint to restore.
exp_dir = './experiments/KDD_SEED/kdd_l64_s2'
checkpoint = 'checkpoint_epoch120.model'
config_path = os.path.join(exp_dir, 'configs.json')
checkpoint_path = os.path.join(exp_dir, checkpoint)

# The `with` statement closes the file when the block exits, so no explicit
# close() call is needed afterwards.
with open(config_path, 'r') as fp:
    configs = json.load(fp)

# Record the chosen checkpoint in the configuration before loading the model.
configs['checkpoint'] = checkpoint
model = decode.load_model(configs)

# Display the effective configuration as the cell output.
configs

{'data_dir': './data/fda_drugs',
 'data_file': 'smiles_set_clean.smi',
 'fda_file': 'all_drugs.smi',
 'vocab_file': 'char_set_clean.pkl',
 'atc_sim_file': 'drugs_sp_all.csv',
 'checkpoint_dir': './experiments/KDD_SEED',
 'experiment_name': 'kdd_l64_s2',
 'task': 'vae + atc',
 'limit': 0,
 'batch_size': 128,
 'epochs': 100,
 'max_sequence_length': 120,
 'learning_rate': 0.0003,
 'max_norm': 1000000000000.0,
 'wd': 0.0,
 'manifold_type': 'Lorentz',
 'prior_type': 'Standard',
 'num_centroids': 0,
 'bidirectional': False,
 'num_layers': 1,
 'hidden_size': 512,
 'latent_size': 64,
 'word_dropout_rate': 0.2,
 'anneal_function': 'logistic',
 'k': 0.51,
 'x0': 29.0,
 'C': 1.0,
 'num_workers': 4,
 'logging_steps': 1,
 'save_per_epochs': 5,
 'new_training': False,
 'new_annealing': False,
 'checkpoint': 'checkpoint_epoch120.model',
 'trained_epochs': 65,
 'alpha': 0.0,
 'beta': 0.015625,
 'gamma': 0.0,
 'delta': 11.0,
 'nneg': 11,
 'fda_prop': 0.2}

# Create 5-Fold Random Splits

In [18]:
# Accumulators filled by the fold cells below: task names, dataset splits and
# transformers, one entry per split.
tox21_tasks_lst, tox21_datasets_lst, transformers_lst = [], [], []

## fold 1

In [19]:
# Fold 1: load Tox21 with the drug-embedding featurizer and a random split,
# then record the returned tasks, datasets and transformers.
ATCFeaturizer = ATCDrugEmbedding(configs, model)
load_kwargs = dict(
    featurizer=ATCFeaturizer,
    split='random',
    data_dir=data_dir,
    reload=False,
)
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21(**load_kwargs)
for accumulator, loaded in zip(
        (tox21_tasks_lst, tox21_datasets_lst, transformers_lst),
        (tox21_tasks, tox21_datasets, transformers)):
    accumulator.append(loaded)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./data/tox21/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 26.038 s
TIMING: dataset construction took 26.174 s
Loading dataset from disk.
TIMING: dataset construction took 0.106 s
Loading dataset from disk.
TIMING: dataset construction took 0.034 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.
TIMING: dataset construction took 0.101 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.
TIMING: dataset construction took 0.017 s
Loading dataset from disk.


## fold 2

In [20]:
# Fold 2: load Tox21 with the drug-embedding featurizer and a random split,
# then record the returned tasks, datasets and transformers.
ATCFeaturizer = ATCDrugEmbedding(configs, model)
load_kwargs = dict(
    featurizer=ATCFeaturizer,
    split='random',
    data_dir=data_dir,
    reload=False,
)
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21(**load_kwargs)
for accumulator, loaded in zip(
        (tox21_tasks_lst, tox21_datasets_lst, transformers_lst),
        (tox21_tasks, tox21_datasets, transformers)):
    accumulator.append(loaded)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./data/tox21/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 26.048 s
TIMING: dataset construction took 26.185 s
Loading dataset from disk.
TIMING: dataset construction took 0.107 s
Loading dataset from disk.
TIMING: dataset construction took 0.034 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.
TIMING: dataset construction took 0.101 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.


## fold 3

In [21]:
# Fold 3: load Tox21 with the drug-embedding featurizer and a random split,
# then record the returned tasks, datasets and transformers.
ATCFeaturizer = ATCDrugEmbedding(configs, model)
load_kwargs = dict(
    featurizer=ATCFeaturizer,
    split='random',
    data_dir=data_dir,
    reload=False,
)
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21(**load_kwargs)
for accumulator, loaded in zip(
        (tox21_tasks_lst, tox21_datasets_lst, transformers_lst),
        (tox21_tasks, tox21_datasets, transformers)):
    accumulator.append(loaded)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./data/tox21/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 26.076 s
TIMING: dataset construction took 26.209 s
Loading dataset from disk.
TIMING: dataset construction took 0.106 s
Loading dataset from disk.
TIMING: dataset construction took 0.034 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.
TIMING: dataset construction took 0.101 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.


## fold 4

In [22]:
# Fold 4: load Tox21 with the drug-embedding featurizer and a random split,
# then record the returned tasks, datasets and transformers.
ATCFeaturizer = ATCDrugEmbedding(configs, model)
load_kwargs = dict(
    featurizer=ATCFeaturizer,
    split='random',
    data_dir=data_dir,
    reload=False,
)
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21(**load_kwargs)
for accumulator, loaded in zip(
        (tox21_tasks_lst, tox21_datasets_lst, transformers_lst),
        (tox21_tasks, tox21_datasets, transformers)):
    accumulator.append(loaded)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./data/tox21/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 26.121 s
TIMING: dataset construction took 26.254 s
Loading dataset from disk.
TIMING: dataset construction took 0.106 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.
TIMING: dataset construction took 0.034 s
Loading dataset from disk.
TIMING: dataset construction took 0.101 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.


## fold 5

In [23]:
# Fold 5: load Tox21 with the drug-embedding featurizer and a random split,
# then record the returned tasks, datasets and transformers.
ATCFeaturizer = ATCDrugEmbedding(configs, model)
load_kwargs = dict(
    featurizer=ATCFeaturizer,
    split='random',
    data_dir=data_dir,
    reload=False,
)
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21(**load_kwargs)
for accumulator, loaded in zip(
        (tox21_tasks_lst, tox21_datasets_lst, transformers_lst),
        (tox21_tasks, tox21_datasets, transformers)):
    accumulator.append(loaded)

Loading raw samples now.
shard_size: 8192
About to start loading CSV from ./data/tox21/tox21.csv.gz
Loading shard 1 of size 8192.
Featurizing sample 0
Featurizing sample 1000
Featurizing sample 2000
Featurizing sample 3000
Featurizing sample 4000
Featurizing sample 5000
Featurizing sample 6000
Featurizing sample 7000
TIMING: featurizing shard 0 took 25.875 s
TIMING: dataset construction took 26.008 s
Loading dataset from disk.
TIMING: dataset construction took 0.106 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.
TIMING: dataset construction took 0.033 s
Loading dataset from disk.
TIMING: dataset construction took 0.102 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.
TIMING: dataset construction took 0.016 s
Loading dataset from disk.


# Fit Model

In [24]:
# Per-fold mean ROC-AUC scores, appended to by the training loop.
train_scores_lst, valid_scores_lst, test_scores_lst = [], [], []

In [25]:
# The metric is the same for every fold, so it is built once outside the loop.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)

# Iterate over however many splits were collected instead of a hard-coded 5.
for i in tnrange(len(tox21_datasets_lst)):
    tox21_datasets = tox21_datasets_lst[i]
    train_dataset, valid_dataset, test_dataset = tox21_datasets
    # Feature dimensionality of this fold (not used further in this cell).
    n_features = train_dataset.X.shape[1]

    # Fit models
    # Bound to `multitask_model` rather than `model` so the embedding model
    # loaded with decode.load_model (and passed to ATCDrugEmbedding) is not
    # overwritten; otherwise re-running a fold cell after this loop would
    # featurize with the classifier instead of the embedding model.
    multitask_model = dc.models.SingletaskToMultitask(tox21_tasks_lst[i], model_builder)

    # Fit trained model
    print("About to fit model")
    multitask_model.fit(train_dataset)
    multitask_model.save()

    print("Evaluating model")
    train_scores = multitask_model.evaluate(train_dataset, [metric], transformers_lst[i])
    valid_scores = multitask_model.evaluate(valid_dataset, [metric], transformers_lst[i])
    test_scores = multitask_model.evaluate(test_dataset, [metric], transformers_lst[i])

    print("Train scores")
    print(train_scores)
    train_scores_lst.append(train_scores['mean-roc_auc_score'])

    print("Validation scores")
    print(valid_scores)
    valid_scores_lst.append(valid_scores['mean-roc_auc_score'])

    print("Test scores")
    print(test_scores)
    test_scores_lst.append(test_scores['mean-roc_auc_score'])

HBox(children=(IntProgress(value=0, max=5), HTML(value='')))

About to initialize singletask to multitask model
Initializing directory for task NR-AR
Initializing directory for task NR-AR-LBD
Initializing directory for task NR-AhR
Initializing directory for task NR-Aromatase
Initializing directory for task NR-ER
Initializing directory for task NR-ER-LBD
Initializing directory for task NR-PPAR-gamma
Initializing directory for task SR-ARE
Initializing directory for task SR-ATAD5
Initializing directory for task SR-HSE
Initializing directory for task SR-MMP
Initializing directory for task SR-p53
About to fit model
About to create task-specific datasets
Splitting multitask dataset into singletask datasets
TIMING: dataset construction took 0.023 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.
TIMING: dataset construction took 0.003 s
Loading dataset from disk.
TIMING:

	Task NR-AhR
	Task NR-Aromatase
	Task NR-ER
	Task NR-ER-LBD
	Task NR-PPAR-gamma
	Task SR-ARE
	Task SR-ATAD5
	Task SR-HSE
	Task SR-MMP
	Task SR-p53
Dataset for task NR-AR has shape ((5811, 65), (5811, 1), (5811, 1), (5811,))
Dataset for task NR-AR-LBD has shape ((5397, 65), (5397, 1), (5397, 1), (5397,))
Dataset for task NR-AhR has shape ((5248, 65), (5248, 1), (5248, 1), (5248,))
Dataset for task NR-Aromatase has shape ((4670, 65), (4670, 1), (4670, 1), (4670,))
Dataset for task NR-ER has shape ((4950, 65), (4950, 1), (4950, 1), (4950,))
Dataset for task NR-ER-LBD has shape ((5573, 65), (5573, 1), (5573, 1), (5573,))
Dataset for task NR-PPAR-gamma has shape ((5155, 65), (5155, 1), (5155, 1), (5155,))
Dataset for task SR-ARE has shape ((4645, 65), (4645, 1), (4645, 1), (4645,))
Dataset for task SR-ATAD5 has shape ((5660, 65), (5660, 1), (5660, 1), (5660,))
Dataset for task SR-HSE has shape ((5169, 65), (5169, 1), (5169, 1), (5169,))
Dataset for task SR-MMP has shape ((4647, 65), (4647, 

Fitting model for task NR-AR-LBD
Fitting model for task NR-AhR
Fitting model for task NR-Aromatase
Fitting model for task NR-ER
Fitting model for task NR-ER-LBD
Fitting model for task NR-PPAR-gamma
Fitting model for task SR-ARE
Fitting model for task SR-ATAD5
Fitting model for task SR-HSE
Fitting model for task SR-MMP
Fitting model for task SR-p53
Evaluating model
computed_metrics: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]
computed_metrics: [0.7871944182142483, 0.7680155210643015, 0.8026440231917431, 0.7294133771929825, 0.6897588010303646, 0.7093112735667564, 0.7843667196608373, 0.7753088924963925, 0.8105515587529976, 0.7117117117117118, 0.8253319713993872, 0.8316956412194507]
computed_metrics: [0.8462487153134635, 0.8819307627357162, 0.8704520089285714, 0.843953899628026, 0.7556179775280899, 0.8472399254828906, 0.80789313904068, 0.7860526315789473, 0.8639725658956428, 0.682402881399537, 0.8524919769554963, 0.8179147241647241]
Train scores
{'mean-roc_auc_score': 1.0}