# Go-ZT 10-fold cross validation
### This notebook contains the code needed to run a random 10-fold cross-validation of a regression-trained DNN using [0,1,2]-encoded summary toxicity matrices and chemical structural data.
#### See http://biorxiv.org/lookup/doi/10.1101/2020.10.02.322917 for details

By Adrian J Green, PhD

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os
# os.environ["CUDA_VISIBLE_DEVICES"]="0"           # Only one GPU will be seen

# tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.utils import multi_gpu_model

# Drop any Keras state left over from a previous run of this notebook.
tf.keras.backend.clear_session()

print("tensorflow version",tf.__version__,". Executing eagerly?",tf.executing_eagerly())

# Keep the GPU footprint small: let TensorFlow grow its memory allocation on
# demand instead of reserving it up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # The memory-growth setting has to be identical on every GPU.
        for device in gpus:
            tf.config.experimental.set_memory_growth(device, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Raised when memory growth is changed after the GPUs were initialized;
        # report it and carry on with the existing configuration.
        print(e)

# Always report the GPU count, including the case where none were found.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print("\nNumber of GPUs: ", len(visible_gpus))

tensorflow version 2.1.0 . Executing eagerly? True
1 Physical GPUs, 1 Logical GPUs

Number of GPUs:  1


In [2]:
# standard python
import numpy as np
from sklearn.model_selection import ShuffleSplit
import pathlib
import collections
import warnings
import timeit

# plotting, especially for jupyter notebooks
import matplotlib
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
from IPython.display import Image

# pandas
import pandas as pd

# local routines
from chemdataprep import load_PDBs
from toxmathandler_AG import load_tmats

# NN build routines
from NNbuild_train_vis import init_generator

# NN train routines
from NNbuild_train_vis import write_training_file

# Performance evaluation routines
from gen_AggE import calc_AggE, display_conf_matrix

In [3]:
# Global data options
#
# Every tunable for this run is defined in this cell: network/view parameters,
# the data encoding, which toxicity-matrix slices to use, and all file paths.

### PDB options

# cGAN & views parameters, by position:
# [Gfeatures,Gbaselayers,Glayers,Dfeatures,Dbaselayers,Dlayers,carbonbased, setNatoms, views, ClassLabels]
parameters = [279, 3, 11, 18, 0, 0, False, 82, 126, None] # Go-ZT

## Option to base views on carbon or not. (Safe even if some have no carbon.)
# Setting True will make the data smaller in memory and everything run faster.
carbonbased = parameters[6]

## Option for truncating the length of views.
# Truncating will make the data and NN smaller and things run faster.
# It makes sense if we believe that looking at all neighborhoods of some size
# gives sufficient understanding of the chemical.
# setNatoms = None # use max number in data
setNatoms = parameters[7] # truncate to this number

# Class labels for cGAN training: None or int. Kept as an explicit assignment
# so the value can be overridden here without editing the list above.
parameters[9] = None

views = parameters[8]

# Name of the data encoding. Defined exactly once: it selects the
# concentrations below and is embedded in every model file name.
dataType = '(0,1)_18x1'

## Which of the available concentrations to use: all six for the 18x6
## encoding, otherwise only index 5.
if '(0,1)_18x6' in dataType:
    concentrations = [0,1,2,3,4,5]
else:
    concentrations = [5]

## Which of the available endpoints to use
endpoints = list(range(4, 22)) # use all

# Model file names
genpath = 'AG-model-GT-'+dataType+'.h5'
discpath = 'AG-model-DT-'+dataType+'.h5'
genPrefix = 'AG-model-regGT-'+dataType

# File locations. Only `projectroot` needs editing to run on another machine.
# Directory strings keep their trailing '/' — NOTE(review): the loaders appear
# to append sub-directory names directly to these strings; confirm before
# switching to pathlib.
projectroot = '/home2/ajgreen4/Read-Across_w_GAN/'
datapath = projectroot + 'DataFiles/(0,1,2)_encoding/'

# Training, validation and complete compound sets
trpath = datapath + 'Tox21_training_compounds/'
valpath = datapath + 'Tox21_validation_compounds/'
allpath = datapath + 'tox21_all_compounds/'

modelpath = projectroot + 'Models/'

# Data Preparation

* We may want to include more information, such as charge.

In [4]:
# Load the structural data for every compound found under `allpath`.
# NOTE(review): the meaning of each returned value is defined by
# chemdataprep.load_PDBs — confirm there; ws and vs are the model inputs used
# by the later cells.
ws, vs, Natoms, Nviews, chemnames, Vshape = load_PDBs(
    allpath,
    setNatoms=setNatoms,
    setNviews=views,
    carbonbased=carbonbased,
)

1003 pdb files found at /home2/ajgreen4/Read-Across_w_GAN/DataFiles/(0,1,2)_encoding/tox21_all_compounds/PDBs/
Species occurring = {'F', 'B', 'P', 'C', 'BR', 'S', 'N', 'O', 'SI', 'I', 'CL', 'H', 'AS'}
Setting all views to Natoms= 82
126 views needed, but setting to 126
Maximum views used = 126
Data tensor (w,v) shapes= (1003, 126) (1003, 126, 410)


In [5]:
# Outputs/labels: the toxicity matrix of each chemical is the training target.
print("Using toxicity matrix as labels")

### Toxicity matrix options
## Which of the available concentrations to use: all six for the 18x6
## encoding, otherwise only index 5.
concentrations = [0,1,2,3,4,5] if '(0,1)_18x6' in genPrefix else [5]
## Which of the available endpoints to use
endpoints = list(range(4, 22)) # use all

toxicity, rows, cols, fish = load_tmats(
    allpath,
    chemnames,
    concentration_indexes=concentrations,
    endpoint_indexes=endpoints,
    transform=True,
    verbose=1,
)

# Legend labels for plotting. From here on `endpoints` and `concentrations`
# hold 0-based positions within the loaded matrix, no longer the indexes that
# were passed to load_tmats above.
endpoints = list(range(len(rows)))
concentrations = list(range(len(cols)))
print("Using", len(concentrations), "concentrations")
print("Using", len(endpoints), "endpoints")
legend = [rows, cols, endpoints, concentrations]

Using toxicity matrix as labels
Transforming encoding to [no effect,effect or NA (dead)]=[0,1].
Number of chemicals= 1003
Using concentrations ['64 uM']
Using endpoints: ['MORT', 'YSE_', 'AXIS', 'EYE_', 'SNOU', 'JAW_', 'OTIC', 'PE__', 'BRAI', 'SOMI', 'PFIN', 'CFIN', 'PIG_', 'CIRC', 'TRUN', 'SWIM', 'NC__', 'TR__']
Toxicity vector length Ntoxicity= 18
Using 1 concentrations
Using 18 endpoints


# Neural Network code

### Fitting generator with conventional regression

This tests whether the generator (G) has enough capacity to fit the true labels, how quickly it can learn, etc.

In [6]:
# Loss, optimizer, model and data-splitting objects for the regression run.

# Mean squared error loss and SGD-with-momentum optimizer.
# NOTE(review): neither object is passed to init_generator or used by any
# other cell shown in this notebook — confirm whether they are still needed.
Gloss_function = tf.keras.losses.MeanSquaredError()
generator_optimizer = tf.keras.optimizers.SGD(
    learning_rate=0.01,
    momentum=0.9,
)

# Build the regression generator (RegGen) from the inputs, labels and the
# parameter list.
generator = init_generator([ws, vs], toxicity, parameters)

# Ten random 80/20 train/test partitions of the chemicals, reproducible via
# the fixed random_state. ShuffleSplit draws every partition independently,
# so test sets of different splits may overlap.
ss = ShuffleSplit(
    n_splits=10,
    test_size=0.2,
    random_state=0,
)

In [7]:
%%time
# Train and evaluate one freshly initialized model per ShuffleSplit partition
# ("fold"), printing the metrics and recording them in the summary file.

# Settings shared by every fold.
N_FOLDS = ss.get_n_splits()   # fold total comes from the splitter, not a literal
BATCH_SIZE = 201
EPOCHS = 75
SUMMARY_FILE = '/home2/ajgreen4/Read-Across_w_GAN/output/test.xlsx'

for k, (train_index, test_index) in enumerate(ss.split(ws), start=1):
    start_time = timeit.default_timer()
    print("Running fold : ", k, " of", N_FOLDS)

    # Start every fold from a newly initialized RegGen. Doing this at the top
    # of the iteration (rather than after the previous fold) guarantees that
    # re-running this cell after an interrupted or failed run never continues
    # training a partially fitted model, and that no unused model is built
    # after the last fold.
    generator = init_generator([ws, vs], toxicity, parameters)

    X_train = [ws[train_index], vs[train_index]]
    Y_train = toxicity[train_index]

    X_test = [ws[test_index], vs[test_index]]
    Y_test = toxicity[test_index]

    # fit
    # The held-out data is passed as validation_data for monitoring only; no
    # callbacks are given, so it does not alter the training run.
    history = generator.fit(X_train,Y_train,
                  epochs=EPOCHS,batch_size=BATCH_SIZE,verbose=0,
                  validation_data=(X_test, Y_test))
    training_loss = round(generator.evaluate(X_train,Y_train,verbose=0), 4)
    validation_loss = round(generator.evaluate(X_test,Y_test,verbose=0), 4)

    gen_lab = generator.predict(X_test)
    chemnames_test = [chemnames[i] for i in test_index]

    # Calculate chemical activity - ignoring warnings due to potential division by zero.
    # NOTE(review): category=Warning silences every warning raised inside this
    # block, not only division-by-zero ones — confirm that is intended.
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=Warning)
        [gen_activity_table, tox_activity_table, gen_AggE, tox_AggE] = calc_AggE(Y_test, chemnames_test,
                                                                                 gen_lab, fish[test_index],
                                                                                 endpoints, concentrations,
                                                                                 verbose=0)

        metrics = display_conf_matrix(gen_activity_table, tox_activity_table, Gmodelname='Go-ZT')

    # metrics[0..2] are reported under the labels Kappa, AUROC and SE.
    print("Validation Dataset")
    print('    Kappa: ', metrics[0], '  AUROC: ', metrics[1], '    SE', metrics[2])
    print("Train loss:", training_loss, " Validation loss:", validation_loss)

    model_ID = "AG-model-GT-"+dataType+"-Kappa-"+str(metrics[0])+"-"+str(k)+"-fold.h5"

    # Record this fold's configuration and scores.
    # NOTE(review): presumably adds a row to SUMMARY_FILE — confirm in
    # NNbuild_train_vis.write_training_file.
    summary_file_df = write_training_file(parameters,[model_ID, concentrations, ws.shape[1], training_loss,
                                                      validation_loss], metrics,
                                          SUMMARY_FILE)

    # Determine time taken to run fold
    elapsed = timeit.default_timer() - start_time
    print("Time taken for fold:", round(elapsed, 1), "\n")

Running fold :  1  of 10
Validation Dataset
    Kappa:  0.598   AUROC:  0.754     SE 52.6
Train loss: 0.0221  Validation loss: 0.0207
Time taken for fold: 6.8 

Running fold :  2  of 10
Validation Dataset
    Kappa:  0.463   AUROC:  0.6909     SE 41.2
Train loss: 0.0223  Validation loss: 0.02
Time taken for fold: 7.1 

Running fold :  3  of 10
Validation Dataset
    Kappa:  0.392   AUROC:  0.6735     SE 40.6
Train loss: 0.0195  Validation loss: 0.0311
Time taken for fold: 7.4 

Running fold :  4  of 10
Validation Dataset
    Kappa:  0.425   AUROC:  0.6645     SE 34.1
Train loss: 0.0222  Validation loss: 0.0204
Time taken for fold: 6.7 

Running fold :  5  of 10
Validation Dataset
    Kappa:  0.499   AUROC:  0.7056     SE 43.6
Train loss: 0.0201  Validation loss: 0.0287
Time taken for fold: 7.3 

Running fold :  6  of 10
Validation Dataset
    Kappa:  0.594   AUROC:  0.7506     SE 51.9
Train loss: 0.0227  Validation loss: 0.0182
Time taken for fold: 6.9 

Running fold :  7  of 10
Valida

In [None]:
# Terminate the kernel process immediately with exit status 0. os._exit skips
# cleanup handlers and buffer flushing, so nothing can run after this cell.
# NOTE(review): presumably used to release the GPU once the run is finished —
# confirm; the notebook front end will report the kernel as dead.
os._exit(0)