# Deep MicroBiome

Aug. 14. 2019
@ Youngwon (youngwon08@gmail.com)

In [1]:
import os
import json
import numpy as np
import pandas as pd
import copy
import logging
import sys

import keras.backend as k
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

os.environ['CUDA_VISIBLE_DEVICES']=''

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
from deepbiome.deepbiome import *

In [3]:
# TensorFlow 1.x only: create a session whose GPU memory grows on demand and
# register it with the Keras backend. Skipped when the TF version string starts with '2'.
running_tf2 = tf.__version__.startswith('2')
if not running_tf2:
    gpu_options = tf.GPUOptions(allow_growth=True)
    config = tf.ConfigProto(gpu_options=gpu_options)
    session = tf.Session(config=config)
    k.set_session(session)

## Pick Models

In [4]:
# --- Notebook-wide settings -------------------------------------------------
save = False
# kfold=1000
kfold = 20

# Configuration keys that are logged for the selected model further down.
network_model_keys = ['optimizer', 'lr', 'decay']
architecture_keys = [
    'weight_decay',
    'weight_l1_penalty',
    # 'weight_l2_penalty',
    'tree_thrd',
    'weight_initial',
    'batch_normalization',
    'drop_out',
]
network_training_keys = ['batch_size', 'epochs']

# Root logger at DEBUG level; each record shows logger name, level and source location.
log_format = '[%(name)-8s|%(levelname)s|%(filename)s:%(lineno)s] %(message)s'
logging.basicConfig(format=log_format, level=logging.DEBUG)
log = logging.getLogger()

In [5]:
#######################################################################
# Simulation scenario to analyse. Change this single value (one of the keys of
# `scenario_num_classes`) instead of commenting/uncommenting whole blocks.
scenario = 's0'

# `num_classes` for every simulation scenario. Downstream, 0 selects the
# loss/correlation metrics, 1 the binary-classification metrics and any other
# value the categorical metrics.
scenario_num_classes = {
    's0': 0,
    's1': 0,
    's2': 1,
    's3': 3,
    's4': 0,
    's5': 0,
}

# (directory suffix, display name) of each model trained for a scenario.
model_variants = [
    ('deep', 'DNN'),
    ('deep_l1', 'DNN+l1'),
    ('deepbiome', 'DeepBiome'),
]

# Raises KeyError early if `scenario` is not one of the known scenarios.
num_classes = scenario_num_classes[scenario]

filenames = 'simulation_%s.Rmd' % scenario
models = ['simulation_%s/simulation_%s_%s' % (scenario, scenario, suffix)
          for suffix, label in model_variants]
models_aka = [label for suffix, label in model_variants]
########################################################################

In [6]:
# Read the path and network configuration of every candidate model, keyed by model directory.
model_network_info = {}
model_path_info = {}
for model_path in models:
    path_cfg_file = '%s/config/path_info.cfg' % model_path
    network_cfg_file = '%s/config/network_info.cfg' % model_path

    config_data = configuration.Configurator(path_cfg_file, log, verbose=False)
    config_data.set_config_map(config_data.get_section_map())
    config_network = configuration.Configurator(network_cfg_file, log, verbose=False)
    config_network.set_config_map(config_network.get_section_map())

    model_path_info[model_path] = config_data.get_config_map()
    model_network_info[model_path] = config_network.get_config_map()

# Metric names, and the positions (into `y_names`) of the metrics that are reported,
# depend on `num_classes`.
if num_classes == 0:
    y_names = ['loss', 'correlation_coefficient']
    measure_index = np.array([0, 1])
elif num_classes == 1:
    y_names = ['loss', 'binary_accuracy', 'sensitivity', 'specificity', 'gmeasure', 'auc']
    measure_index = np.array([2, 3, 4, 1, 5])
else:
    y_names = ['loss', 'categorical_accuracy', 'precision', 'recall', 'f1', 'auc']
    measure_index = np.array([1, 2, 3, 4, 5])

## Accuracy

In [7]:
results = []


def _mean_sd_cells(evaluation):
    """Format the per-column mean and standard deviation of `evaluation` as '&'-joined table cells."""
    column_means = np.mean(evaluation, axis=0)
    column_stds = np.std(evaluation, axis=0)
    return '&'.join(['%7.3f & %7.3f' % (mean, std) for mean, std in zip(column_means, column_stds)])


# Header row. Index with the array itself: wrapping it in another list
# (`[[measure_index]]`) is interpreted as a 2-D index by recent NumPy releases and
# would yield a single row of names instead of the individual names.
selected_names = np.array(y_names)[measure_index]
header_cells = '& '.join(['%7s &   (sd) ' % name for name in selected_names])
# Backslashes are escaped explicitly; the emitted text is unchanged.
# NOTE(review): this prints three backslashes before \hline - confirm whether `\\ \hline` was intended.
print('%10s & %s \\\\\\ \\hline' % ('model', header_cells))

# One row per model.
# NOTE(review): each row holds the test columns followed by the train columns,
# while the header names only one set of columns - confirm the intended layout.
for model, aka in zip(models, models_aka):
    train_evaluation = np.load('%s/train_eval.npy' % model)[:, measure_index]
    train_res = _mean_sd_cells(train_evaluation)
    test_evaluation = np.load('%s/test_eval.npy' % model)[:, measure_index]
    test_res = _mean_sd_cells(test_evaluation)
    print('%10s & %s & %s \\\\' % (aka, test_res, train_res))

     model &    loss &   (sd) & correlation_coefficient &   (sd)  \\\ \hline
       DNN &   0.076 &   0.040&    nan &     nan &   0.032 &   0.034&    nan &     nan \\
    DNN+l1 &   0.075 &   0.040&    nan &     nan &   0.034 &   0.039&    nan &     nan \\
 DeepBiome &   0.068 &   0.033&  0.881 &   0.069 &   0.042 &   0.031&  0.933 &   0.050 \\


# Choose Model

In [8]:
# Index into `models` / `models_aka` of the model inspected in the rest of the notebook.
num = 2
model_path = models[num]
model_aka = models_aka[num]

config_data = configuration.Configurator('%s/config/path_info.cfg' % model_path, log, verbose=False)
config_data.set_config_map(config_data.get_section_map())
config_network = configuration.Configurator('%s/config/network_info.cfg' % model_path, log, verbose=False)
config_network.set_config_map(config_network.get_section_map())

path_info = config_data.get_config_map()
network_info = config_network.get_config_map()

# Drop the first two '/'-separated components from every configured data path.
# NOTE(review): presumably this re-bases the paths onto the notebook's directory - confirm.
data_info = path_info['data_info']
for path_key in ('data_path', 'tree_info_path', 'idx_path'):
    data_info[path_key] = '/'.join(data_info[path_key].split('/')[2:])

# These entries are optional: skip them when missing or unset, but do not hide other errors.
for path_key in ('count_list_path', 'count_path'):
    try:
        data_info[path_key] = '/'.join(data_info[path_key].split('/')[2:])
    except (KeyError, AttributeError):
        log.debug('%22s : not configured, left unchanged' % path_key)

path_info['model_info']['model_dir'] = './%s/%s' % (model_path, path_info['model_info']['model_dir'])

log.info('%22s : %s' % ('model', model_path))
log.info('%22s : %s' % ('model_aka', model_aka))
# The loop variable is deliberately not called `k`: that name is the Keras backend
# module imported at the top of the notebook and must not be overwritten.
for key in architecture_keys:
    log.info('%22s : %s' % (key, network_info['architecture_info'].get(key, None)))
for key in network_model_keys:
    log.info('%22s : %s' % (key, network_info['model_info'].get(key, None)))
for key in network_training_keys:
    log.info('%22s : %s' % (key, network_info['training_info'].get(key, None)))

[root    |INFO|<ipython-input-8-6e33d2ded9d6>:23]                  model : simulation_s0/simulation_s0_deepbiome
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:24]              model_aka : DeepBiome
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:26]           weight_decay : phylogenetic_tree
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:26]      weight_l1_penalty : None
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:26]              tree_thrd : None
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:26]         weight_initial : glorot_uniform
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:26]    batch_normalization : False
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:26]               drop_out : 0
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:28]              optimizer : adam
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:28]                     lr : 0.01
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:28]                  decay : 0.0001
[root    |INFO|<ipython-input-8-6e33d2ded9d6>:30]          

In [9]:
# Test-set metrics of the selected model: one row per fold, one column per entry of `y_names`.
evaluation = np.load('%s/test_eval.npy' % model_path)
log.info('\t %s' % ' '.join(y_names))

# Plain loop instead of a throw-away list comprehension bound to `_`
# (`_` is IPython's "last output" variable and should not be reassigned).
for fold, fold_scores in enumerate(evaluation):
    log.info('%d fold : %s' % (fold, fold_scores))
log.info('Mean   : %s' % np.mean(evaluation, axis=0))
log.info('Std   : %s' % np.std(evaluation, axis=0))

# LaTeX variant of the same table, kept for reference:
# for i, line in enumerate(evaluation): print('%d fold & %s \\tabularnewline' % (i, ' & '.join(['%.3f'% v for v in line])))
# print('Mean & %s \\tabularnewline' % (' & '.join(['%.3f'% v for v in np.mean(evaluation, axis=0)])))
# print('Sd & %s \\tabularnewline' % (' & '.join(['%.3f'% v for v in np.std(evaluation, axis=0)])))

[root    |INFO|<ipython-input-9-65b4b58a34e1>:2] 	 loss correlation_coefficient
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 0 fold : [0.0415751  0.91549832]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 1 fold : [0.03397216 0.96357918]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 2 fold : [0.08029224 0.74072057]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 3 fold : [0.03342853 0.85693628]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 4 fold : [0.08329882 0.84511006]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 5 fold : [0.01878441 0.95271671]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 6 fold : [0.01926193 0.9383561 ]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 7 fold : [0.13012013 0.82163298]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 8 fold : [0.08140648 0.89850581]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 9 fold : [0.07683571 0.89352709]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 10 fold : [0.11202967 0.83080298]
[root    |INFO|<i

[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 98 fold : [0.05098707 0.91272604]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 99 fold : [0.1258373  0.86110395]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:5] Mean   : [0.06765603 0.88100186]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:6] Std   : [0.03344868 0.06911001]


## Weight estimation of DeepBiome

We identify the largest weight estimates of the neurons in the two hidden layers; by doing this, we can identify the strongest phylogenetic connections. We compute the True Positive Rate (``TPR``, sensitivity), the True Negative Rate (``TNR``, specificity), and their geometric mean (i.e., ``g-Measure``). The false discovery rate (FDR) would be ``FDR = 1-TPR`` in our case.

In [10]:
# Weight file `weight_0.h5` of the selected model (presumably the first fold - TODO confirm).
weight_path = '/'.join([model_path, 'weight', 'weight_0.h5'])
trained_weight_list = deepbiome_get_trained_weight(
    log,
    network_info,
    path_info,
    num_classes,
    weight_path,
)

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


In [11]:
# Display the trained weights returned by deepbiome_get_trained_weight (a list of arrays).
trained_weight_list

[array([[-6.4633179e+00, -1.7676303e-02, -4.4930972e-02, ...,
         -7.2648464e-04,  1.2723205e-03,  3.3756655e-03],
        [ 2.2536314e-01, -6.7797213e+00,  2.5495121e-01, ...,
         -1.4391678e-04,  2.4994607e-03, -7.4193828e-02],
        [ 7.6148964e-02,  3.6209986e-02, -4.9121647e+00, ...,
          4.0628164e-04,  2.1666840e-03,  1.5494954e-03],
        ...,
        [ 4.4576552e-02, -1.5883645e-01,  1.2201204e-02, ...,
         -1.7851256e-02, -1.3783018e-03,  5.6677483e-02],
        [ 7.1924105e-02,  5.0403979e-02,  1.2307404e-02, ...,
         -8.7823148e-04, -4.3808103e-02, -5.1569410e-02],
        [ 6.1635148e-02,  9.8791108e-02, -1.3903435e-04, ...,
          8.6387742e-04, -1.1290456e-03,  3.5447127e-01]], dtype=float32),
 array([[-4.20841575e-03,  2.43677869e-02,  2.10193847e-03,
         -1.06796624e-05, -3.73406988e-03,  9.65888146e-03,
         -8.38983990e-03,  3.48460092e-03, -7.49634055e-04,
          2.24612979e-03, -2.30743065e-02,  5.47053702e-02,
         -

### Performance

In [12]:
# Disabled one-off script, kept for provenance: it stacks the per-fold arrays from
# tw_1.npy ... tw_4.npy and saves the result as true_weight_list.npy.
# tw_1 = np.load('%s/tw_1.npy' % path_info['data_info']['data_path'])
# tw_2 = np.load('%s/tw_2.npy' % path_info['data_info']['data_path'])
# tw_3 = np.load('%s/tw_3.npy' % path_info['data_info']['data_path'])
# tw_4 = np.load('%s/tw_4.npy' % path_info['data_info']['data_path'])
# true_tree_weight_list = []
# for fold in range(kfold):
#     true_tree_weight_list.append(np.array([tw_1[fold],tw_2[fold],tw_3[fold],tw_4[fold]]))
# true_tree_weight_list = np.array(true_tree_weight_list)
# np.save('../deepbiome/tests/data/true_weight_list.npy', true_tree_weight_list)

In [13]:
# Load the reference ("true") tree weights that the taxa-selection performance is scored against.
# NOTE(review): allow_pickle=True makes np.load unpickle object arrays, which can run
# arbitrary code from a malicious file - only load files from a trusted source.
true_tree_weight_list = np.load('../deepbiome/tests/data/true_weight_list.npy', allow_pickle=True)

In [14]:
# Taxa-selection performance of the selected model against the true weights, over `kfold` folds.
summary = deepbiome_taxa_selection_performance(
    log,
    network_info,
    path_info,
    num_classes,
    true_tree_weight_list,
    number_of_fold=kfold,
)
# Put the model's display name into the first cell (row 0, column 0) of the table.
summary.iloc[0, 0] = model_aka

In [15]:
# Display the taxa-selection summary table.
summary

Unnamed: 0,Model,PhyloTree,No. true taxa,No. total taxa,Sensitivity_mean,Sensitivity_std,Specificity_mean,Specificity_std,Gmeasure_mean,Gmeasure_std,Accuracy_mean,Accuracy_std
0,DeepBiome,Genus,31,48,0.940323,0.042398,0.643647,0.082888,0.776014,0.048775,0.648438,0.081424
1,,Family,23,40,0.954348,0.032171,0.795652,0.066477,0.870505,0.040072,0.79962,0.064841
2,,Order,9,23,0.977778,0.044444,0.829974,0.061995,0.899869,0.038632,0.833376,0.060475
3,,Class,7,17,0.935714,0.095565,0.833219,0.04712,0.881226,0.050006,0.837908,0.044598


In [16]:
# Print the summary as '&'-separated rows (LaTeX tabular style): the first four columns
# are formatted as text/integers, every remaining column as a 3-decimal float.
metric_columns = ' & '.join(summary.columns[4:])
print('%7s & %7s & %12s & %s' % ('Model', 'PhyloTree', 'True (Total)', metric_columns))
print('---------------------------------------------------------------------------------------------------------------')
for row_idx in range(summary.shape[0]):
    row = summary.iloc[row_idx]
    leading_cells = '%10s & %7s & %7d (%d) & ' % tuple(row.iloc[:4])
    metric_cells = ' &'.join(['%6.3f' % val for val in row.iloc[4:]])
    print(leading_cells + metric_cells + ' \\\\')

# if save: 
#     # filenametexa = '.'.join(["%s_select_texa_1" % filename.split('.')[0], filename.split('.')[1]])
#     colname = ['Tree','True (Total)','Selected','Sensitivity','Specificity','gMeasure','Accuracy']
#     with open('%s/%s' % (analysis_dir, filename), mode='a') as f:
#     #     f.write('---\ntitle: "%s texa selection ver.1"\noutput: html_document\n---\n\n' % filename.split('.')[0])
#         f.write('\n## Texa Selection Preformance (ver 1): %s\n\n' % model_aka)
#         f.write('| %s |\n' % ('|'.join([v for v in colname])))
#         f.write('|'+'---|'*len(colname)+'\n')
#         for value in values:
#             f.write('| %s |\n' % ('|'.join(value)))

  Model & PhyloTree & True (Total) & Sensitivity_mean & Sensitivity_std & Specificity_mean & Specificity_std & Gmeasure_mean & Gmeasure_std & Accuracy_mean & Accuracy_std
---------------------------------------------------------------------------------------------------------------
 DeepBiome &   Genus &      31 (48) &  0.940 & 0.042 & 0.644 & 0.083 & 0.776 & 0.049 & 0.648 & 0.081 \\
           &  Family &      23 (40) &  0.954 & 0.032 & 0.796 & 0.066 & 0.871 & 0.040 & 0.800 & 0.065 \\
           &   Order &       9 (23) &  0.978 & 0.044 & 0.830 & 0.062 & 0.900 & 0.039 & 0.833 & 0.060 \\
           &   Class &       7 (17) &  0.936 & 0.096 & 0.833 & 0.047 & 0.881 & 0.050 & 0.838 & 0.045 \\
