# Deep MicroBiome

Aug. 14. 2019
@ Youngwon (youngwon08@gmail.com)

In [1]:
import os
import json
import numpy as np
import pandas as pd
import copy
import logging
import sys

import keras.backend as k
import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

os.environ['CUDA_VISIBLE_DEVICES']='0'

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from deepbiome.deepbiome import *

In [3]:
config = tf.ConfigProto()
config.gpu_options.allow_growth=True
#config.gpu_options.per_process_gpu_memory_fraction = 0.4
k.set_session(tf.Session(config=config))

logging.basicConfig(format = '[%(name)-8s|%(levelname)s|%(filename)s:%(lineno)s] %(message)s',
                    level=logging.DEBUG)
log = logging.getLogger()

## Pick Models

In [4]:
save = False
# kfold=1000
kfold=10
network_model_keys = ['optimizer','lr','decay']
architecture_keys = ['weight_decay', 'weight_l1_penalty', #'weight_l2_penalty',
                     'tree_thrd', 'weight_initial',
                     'batch_normalization','drop_out']
network_training_keys = ['batch_size','epochs']

In [5]:
#######################################################################
filenames = 'simulation_s0.Rmd'
models = [
          'simulation_s0/simulation_s0_deep',
          'simulation_s0/simulation_s0_deep_l1',
          'simulation_s0/simulation_s0_deepbiome',
         ]

models_aka = [
          'DNN',
          'DNN+l1',
          'DeepBiome',
         ]
num_classes = 0
########################################################################
# filenames = 'simulation_s1.Rmd'
# models = [
#           'simulation_s1/simulation_s1_deep',
#           'simulation_s1/simulation_s1_deep_l1',
#           'simulation_s1/simulation_s1_deepbiome',
#          ]

# models_aka = [
#           'DNN',
#           'DNN+l1',
#           'DeepBiome',
#          ]
# num_classes = 0
# ########################################################################
# filenames = 'simulation_s2.Rmd'
# models = [
#           'simulation_s2/simulation_s2_deep',
#           'simulation_s2/simulation_s2_deep_l1',
#           'simulation_s2/simulation_s2_deepbiome',
#          ]

# models_aka = [
#           'DNN',
#           'DNN+l1',
#           'DeepBiome',
#          ]
# num_classes = 1
# #######################################################################
# filenames = 'simulation_s3.Rmd'
# models = [
#           'simulation_s3/simulation_s3_deep',
#           'simulation_s3/simulation_s3_deep_l1',
#           'simulation_s3/simulation_s3_deepbiome',
#          ]

# models_aka = [
#           'DNN',
#           'DNN+l1',
#           'DeepBiome',
#          ]
# num_classes = 3
# # ########################################################################
# filenames = 'simulation_s4.Rmd'
# models = [
#           'simulation_s4/simulation_s4_deep',
#           'simulation_s4/simulation_s4_deep_l1',
#           'simulation_s4/simulation_s4_deepbiome',
#          ]

# models_aka = [
#           'DNN',
#           'DNN+l1',
#           'DeepBiome',
#          ]
# num_classes = 0
########################################################################
# filenames = 'simulation_s5.Rmd'
# models = [
#           'simulation_s5/simulation_s5_deep',
#           'simulation_s5/simulation_s5_deep_l1',
#           'simulation_s5/simulation_s5_deepbiome',
#          ]

# models_aka = [
#           'DNN',
#           'DNN+l1',
#           'DeepBiome',
#          ]
# num_classes = 0
########################################################################

In [6]:
model_network_info = {}
model_path_info = {}
for model_path in models:
    config_data = configuration.Configurator('%s/config/path_info.cfg' % model_path, log, verbose=False)
    config_data.set_config_map(config_data.get_section_map())
    config_network = configuration.Configurator('%s/config/network_info.cfg' % model_path, log, verbose=False)
    config_network.set_config_map(config_network.get_section_map())

    model_path_info[model_path] = config_data.get_config_map()
    model_network_info[model_path] = config_network.get_config_map()
    
if num_classes == 0: y_names = ['loss','correlation_coefficient']
elif num_classes==1: y_names = ['loss','binary_accuracy','sensitivity','specificity','gmeasure', 'auc']
else: y_names=['loss','categorical_accuracy','precision','recall','f1', 'auc']

if num_classes == 0: measure_index = np.array([0,1])
elif num_classes==1: measure_index = np.array([2,3,4,1,5])
else: measure_index = np.array([1,2,3,4,5])

## Accuracy

In [7]:
results = []
# log.info('%20s & %s' % ('model', '& '.join(['%s ' % name for name in np.array(y_names)[[measure_index]]])))
print('%20s & %s \\\\\ \hline' % ('model', '& '.join(['%s ' % name for name in np.array(y_names)[[measure_index]]])))
# for model, aka in zip(models, models_aka):
#     evaluation = np.load('%s/eval.npy' % model)
#     log.info('%20s: %s' % (aka, ''.join(['%10.4f (%10.4f)'%(mean, std) for mean, std in zip(np.mean(evaluation, axis=0),np.std(evaluation, axis=0))])))
#     results.append(np.vstack([np.mean(evaluation, axis=0),np.std(evaluation, axis=0)]).transpose())
for model, aka in zip(models, models_aka):
    train_evaluation = np.load('%s/train_eval.npy' % model)[:,measure_index]
    train_res = '&'.join(['%10.3f & %10.3f'%(mean, std) for mean, std in zip(np.mean(train_evaluation, axis=0),np.std(train_evaluation, axis=0))])
    test_evaluation = np.load('%s/test_eval.npy' % model)[:,measure_index]
    test_res = '&'.join(['%10.3f & %10.3f'%(mean, std) for mean, std in zip(np.mean(test_evaluation, axis=0),np.std(test_evaluation, axis=0))])
#     log.info('%s & %s & %s \\\\' % (aka, train_res, test_res))
    print('%s & %s & %s \\\\' % (aka, test_res, train_res))
#     results.append(np.vstack([np.mean(evaluation, axis=0),np.std(evaluation, axis=0)]).transpose())

               model & loss & correlation_coefficient  \\\ \hline
DNN &      0.071 &      0.036&     0.721 &      0.152 &      0.044 &      0.038&     0.859 &      0.132 \\
DNN+l1 &      0.058 &      0.039&     0.800 &      0.131 &      0.024 &      0.028&     0.934 &      0.073 \\
DeepBiome &      0.089 &      0.056&     0.671 &      0.176 &      0.064 &      0.037&     0.801 &      0.149 \\


# Choose Model

In [8]:
num=1
model_path = models[num]
model_aka = models_aka[num]

config_data = configuration.Configurator('%s/config/path_info.cfg' % model_path, log, verbose=False)
config_data.set_config_map(config_data.get_section_map())
config_network = configuration.Configurator('%s/config/network_info.cfg' % model_path, log, verbose=False)
config_network.set_config_map(config_network.get_section_map())

path_info = config_data.get_config_map()
network_info = config_network.get_config_map()

path_info['data_info']['data_path'] = '/'.join(path_info['data_info']['data_path'].split('/')[2:])
path_info['data_info']['tree_info_path'] = '/'.join(path_info['data_info']['tree_info_path'].split('/')[2:])
try: path_info['data_info']['count_list_path'] = '/'.join(path_info['data_info']['count_list_path'].split('/')[2:])
except: pass
try: path_info['data_info']['count_path'] = '/'.join(path_info['data_info']['count_path'].split('/')[2:])
except: pass
path_info['data_info']['idx_path'] = '/'.join(path_info['data_info']['idx_path'].split('/')[2:])
try: path_info['data_info']['disease_weight_path'] = '/'.join(path_info['data_info']['disease_weight_path'].split('/')[2:])
except: pass

log.info('%22s : %s' % ('model', model_path))
log.info('%22s : %s' % ('model_aka', model_aka))
for k in architecture_keys:
    log.info('%22s : %s' % (k, network_info['architecture_info'].get(k, None)))
for k in network_model_keys:
    log.info('%22s : %s' % (k, network_info['model_info'].get(k, None)))
for k in network_training_keys:
    log.info('%22s : %s' % (k, network_info['training_info'].get(k, None)))

[root    |INFO|<ipython-input-8-3ecb0a3edfce>:23]                  model : simulation_s0/simulation_s0_deep_l1
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:24]              model_aka : DNN+l1
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:26]           weight_decay : None
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:26]      weight_l1_penalty : 0.01
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:26]              tree_thrd : None
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:26]         weight_initial : glorot_uniform
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:26]    batch_normalization : False
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:26]               drop_out : 0
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:28]              optimizer : adam
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:28]                     lr : 0.01
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:28]                  decay : 0.
[root    |INFO|<ipython-input-8-3ecb0a3edfce>:30]             batch_size : 200
[r

In [9]:
evaluation = np.load('%s/test_eval.npy' % model_path)
log.info('\t %s'%' '.join(['%s' % name for name in y_names]))

_ = [log.info('%d fold : %s' % (i,line)) for i, line in enumerate(evaluation)]
log.info('Mean   : %s' % np.mean(evaluation, axis=0))
log.info('Std   : %s' % np.std(evaluation, axis=0))

# _ = [print('%d fold & %s \\tabularnewline' % (i, ' & '.join(['%.3f'% v for v in line]))) for i, line in enumerate(evaluation)]
# print('Mean & %s \\tabularnewline' % (' & '.join(['%.3f'% v for v in np.mean(evaluation, axis=0)])))
# print('Sd & %s \\tabularnewline' % (' & '.join(['%.3f'% v for v in np.std(evaluation, axis=0)])))

[root    |INFO|<ipython-input-9-65b4b58a34e1>:2] 	 loss correlation_coefficient
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 0 fold : [0.0747062  0.70466697]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 1 fold : [0.03213821 0.93062192]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 2 fold : [0.12762214 0.54031467]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 3 fold : [0.0121876  0.89720994]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 4 fold : [0.03184074 0.88832039]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 5 fold : [0.01334911 0.93398625]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 6 fold : [0.06008307 0.67137593]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 7 fold : [0.12404269 0.68642902]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 8 fold : [0.03922935 0.90568215]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:4] 9 fold : [0.06156569 0.83701855]
[root    |INFO|<ipython-input-9-65b4b58a34e1>:5] Mean   : [0.05767648 0.79956258]
[root    |INFO|<ip

## Weight estimation of DeepBiom

We identify the largest weight estimatio of neurons in two hidden layers; by doing this, we can identify the strongest phylogenetic connections. We compute the True Positive Rate (``TPR``, sensitivity), True Negative Rate (``TNR``, specificity), and their geometric mean (i.e., ``g-Measure``). The false discovery rate (FDR) would be ``FDR = 1-TPR`` in our case.

In [10]:
def texa_selection_accuracy(tree_weight_list, true_tree_weight_list):
    accuracy_list = []
    for i in range(len(true_tree_weight_list)):
        tree_tw = true_tree_weight_list[i].astype(np.int32)
        tree_w = np.zeros_like(tree_tw, dtype=np.int32)
        tree_w_abs = np.abs(tree_weight_list[i])
        for row, maxcol in enumerate(np.argmax(tree_w_abs, axis=1)):
            tree_w[row,maxcol] = tree_w_abs[row,maxcol]
#         tree_w = (tree_w > 1e-2).astype(np.int32)
        tree_w = (tree_w > 0).astype(np.int32)
        num_selected_texa = np.sum(np.sum(tree_w, axis=1)>0)
        sensitivity, specificity, gmeasure, accuracy = loss_and_metric.metric_texa_test(tree_tw.flatten(), tree_w.flatten())
        accuracy_list.append([num_selected_texa, sensitivity, specificity, gmeasure, accuracy])
    return accuracy_list

def texa_selection_accuracy_2(tree_weight_list, true_tree_weight_list):
    accuracy_list = []
    for i in range(len(true_tree_weight_list)):
        tree_tw = true_tree_weight_list[i].astype(np.int32)
        tree_w = np.zeros_like(tree_tw, dtype=np.int32)
        tree_w_abs = np.abs(tree_weight_list[i])
        tree_w = (tree_w_abs>1e-2).astype(np.int32)
#         for row in range(tree_w_abs.shape[0]):
# #             tree_w[row,:] = (tree_w_abs[row,:]> 0).astype(np.int32)
#             tree_w[row,:] = (tree_w_abs[row,:]> 1e-2).astype(np.int32)
        num_selected_texa = np.sum(np.sum(tree_w, axis=1)>0)
        sensitivity, specificity, gmeasure, accuracy = loss_and_metric.metric_texa_test(tree_tw.flatten(), tree_w.flatten())
        accuracy_list.append([num_selected_texa, sensitivity, specificity, gmeasure, accuracy])
    return accuracy_list

### Accuracy

In [11]:
model_weight_path = './%s/%s' % (model_path, path_info['model_info']['weight'])

network_class = getattr(build_network, network_info['model_info']['network_class'].strip()) 
# network = network_class(network_info, path_info['data_info'], log, fold=0, num_classes=max(1,num_classes))
network = network_class(network_info, path_info['data_info'], log, fold=0, num_classes=num_classes)
network.model_compile()

[root    |INFO|build_network.py:508] ------------------------------------------------------------------------------------------
[root    |INFO|build_network.py:509] Read phylogenetic tree information from data/genus48/genus48_dic.csv
[root    |INFO|build_network.py:513] Phylogenetic tree level list: ['Genus', 'Family', 'Order', 'Class', 'Phylum']
[root    |INFO|build_network.py:514] ------------------------------------------------------------------------------------------
[root    |INFO|build_network.py:519]      Genus: 48
[root    |INFO|build_network.py:519]     Family: 40
[root    |INFO|build_network.py:519]      Order: 23
[root    |INFO|build_network.py:519]      Class: 17
[root    |INFO|build_network.py:519]     Phylum: 9
[root    |INFO|build_network.py:522] ------------------------------------------------------------------------------------------
[root    |INFO|build_network.py:523] Phylogenetic_tree_dict info: ['Family', 'Phylum', 'Class', 'Genus', 'Number', 'Order']
[root    |IN

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.
[root    |INFO|build_network.py:636] ------------------------------------------------------------------------------------------


Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input (InputLayer)           (None, 48)                0         
_________________________________________________________________
l1_dense (Dense_with_new_tre (None, 40)                1960      
_________________________________________________________________
l1_activation (Activation)   (None, 40)                0         
_________________________________________________________________
l2_dense (Dense_with_new_tre (None, 23)                943       
_________________________________________________________________
l2_activation (Activation)   (None, 23)                0         
_________________________________________________________________
l3_dense (Dense_with_new_tre (None, 17)                408       
_________________________________________________________________
l3_activation (Activation)   (None, 17)                0   

[root    |INFO|build_network.py:57] Build Network
[root    |INFO|build_network.py:58] Optimizer = adam
[root    |INFO|build_network.py:59] Loss = mean_squared_error
[root    |INFO|build_network.py:60] Metrics = correlation_coefficient


In [12]:
accuracy_list = []
for fold in range(kfold):
    network.fold = fold
    network.load_weights(file_path_fold(model_weight_path, fold), verbose=False)
    true_tree_weight_list = network.load_true_tree_weight_list(path_info['data_info']['data_path'])
    tree_weight_list = network.get_trained_weight()
    accuracy_list.append(np.array(texa_selection_accuracy(tree_weight_list, true_tree_weight_list)))
accuracy_list = np.array(accuracy_list)[:,:,1:]

# print('%7s, %12s, %12s, %12s, %12s, %12s, %12s' % ('Model','True (Total)','Selected','Sensitivity','Specificity','gMeasure','Accuracy'))
# print('---------------------------------------------------------------------------------------------------------------')
# values = []
# for i, (mean, std) in enumerate(zip(np.mean(accuracy_list, axis=0), np.std(accuracy_list, axis=0))):
#     tree_tw = true_tree_weight_list[i].astype(np.int32)
#     args = [network.tree_level_list[i], np.sum(np.sum(tree_tw, axis=1)>0), tree_tw.shape[0]]+ np.stack([mean, std]).T.flatten().tolist()
#     value = '%7s, %7d (%2d), %7d (%2d), %5.3f (%5.3f), %5.3f (%5.3f), %5.3f (%5.3f), %5.3f (%5.3f)' % tuple(args)
#     values.append(value.split(','))
    
print('%7s, %12s, %12s, %12s, %12s, %12s, %12s' % ('Model','True (Total)','Selected','Sensitivity','Specificity','gMeasure','Accuracy'))
print('---------------------------------------------------------------------------------------------------------------')
values = []
for i, (mean, std) in enumerate(zip(np.mean(accuracy_list, axis=0), np.std(accuracy_list, axis=0))):
    tree_tw = true_tree_weight_list[i].astype(np.int32)
    args = [network.tree_level_list[i], np.sum(np.sum(tree_tw, axis=1)>0), tree_tw.shape[0]]
    value = '%7s & %7d (%2d)' % tuple(args)
    value = '%s & %s \\\\' % (value, '&'.join(['%6.3f & %6.3f'%(m,s) for m, s in zip(mean, std)]))
    if i == 0: print('%10s & %s' % (model_aka, value))
    else: print('%10s & %s' % ('', value))
    values.append(value.split(','))
    
# if save: 
#     # filenametexa = '.'.join(["%s_select_texa_1" % filename.split('.')[0], filename.split('.')[1]])
#     colname = ['Tree','True (Total)','Selected','Sensitivity','Specificity','gMeasure','Accuracy']
#     with open('%s/%s' % (analysis_dir, filename), mode='a') as f:
#     #     f.write('---\ntitle: "%s texa selection ver.1"\noutput: html_document\n---\n\n' % filename.split('.')[0])
#         f.write('\n## Texa Selection Preformance (ver 1): %s\n\n' % model_aka)
#         f.write('| %s |\n' % ('|'.join([v for v in colname])))
#         f.write('|'+'---|'*len(colname)+'\n')
#         for value in values:
#             f.write('| %s |\n' % ('|'.join(value)))

  Model, True (Total),     Selected,  Sensitivity,  Specificity,     gMeasure,     Accuracy
---------------------------------------------------------------------------------------------------------------
    DNN+l1 &   Genus &      31 (48) &  0.000 &  0.000& 0.993 &  0.006& 0.000 &  0.000& 0.977 &  0.005 \\
           &  Family &      23 (40) &  0.000 &  0.000& 0.996 &  0.003& 0.000 &  0.000& 0.971 &  0.003 \\
           &   Order &       9 (23) &  0.000 &  0.000& 0.999 &  0.002& 0.000 &  0.000& 0.976 &  0.002 \\
           &   Class &       7 (17) &  0.000 &  0.000& 0.998 &  0.003& 0.000 &  0.000& 0.952 &  0.003 \\


In [13]:
accuracy_list = []
for fold in range(kfold):
    network.fold = fold
    network.load_weights(file_path_fold(model_weight_path, fold), verbose=False)
    true_tree_weight_list = network.load_true_tree_weight_list(path_info['data_info']['data_path'])
    tree_weight_list = network.get_trained_weight()
    accuracy_list.append(np.array(texa_selection_accuracy_2(tree_weight_list, true_tree_weight_list)))
accuracy_list = np.array(accuracy_list)[:,:,1:]

# print('%7s, %12s, %12s, %12s, %12s, %12s, %12s' % ('Model','True (Total)','Selected','Sensitivity','Specificity','gMeasure','Accuracy'))
# print('---------------------------------------------------------------------------------------------------------------')
# values = []
# for i, (mean, std) in enumerate(zip(np.mean(accuracy_list, axis=0), np.std(accuracy_list, axis=0))):
#     tree_tw = true_tree_weight_list[i].astype(np.int32)
#     args = [network.tree_level_list[i], np.sum(np.sum(tree_tw, axis=1)>0), tree_tw.shape[0]]+ np.stack([mean, std]).T.flatten().tolist()
#     value = '%7s, %7d (%2d), %7d (%2d), %5.3f (%5.3f), %5.3f (%5.3f), %5.3f (%5.3f), %5.3f (%5.3f)' % tuple(args)
#     values.append(value.split(','))
    
print('%7s, %12s, %12s, %12s, %12s, %12s, %12s' % ('Model','True (Total)','Selected','Sensitivity','Specificity','gMeasure','Accuracy'))
print('---------------------------------------------------------------------------------------------------------------')
values = []
for i, (mean, std) in enumerate(zip(np.mean(accuracy_list, axis=0), np.std(accuracy_list, axis=0))):
    tree_tw = true_tree_weight_list[i].astype(np.int32)
    args = [network.tree_level_list[i], np.sum(np.sum(tree_tw, axis=1)>0), tree_tw.shape[0]]
    value = '%7s & %7d (%2d)' % tuple(args)
    value = '%s & %s \\\\' % (value, '&'.join(['%6.3f & %6.3f'%(m,s) for m, s in zip(mean, std)]))
    if i == 0: print('%10s & %s' % (model_aka, value))
    else: print('%10s & %s' % ('', value))
    values.append(value.split(','))
    
# if save: 
#     # filenametexa = '.'.join(["%s_select_texa_1" % filename.split('.')[0], filename.split('.')[1]])
#     colname = ['Tree','True (Total)','Selected','Sensitivity','Specificity','gMeasure','Accuracy']
#     with open('%s/%s' % (analysis_dir, filename), mode='a') as f:
#     #     f.write('---\ntitle: "%s texa selection ver.1"\noutput: html_document\n---\n\n' % filename.split('.')[0])
#         f.write('\n## Texa Selection Preformance (ver 1): %s\n\n' % model_aka)
#         f.write('| %s |\n' % ('|'.join([v for v in colname])))
#         f.write('|'+'---|'*len(colname)+'\n')
#         for value in values:
#             f.write('| %s |\n' % ('|'.join(value)))

  Model, True (Total),     Selected,  Sensitivity,  Specificity,     gMeasure,     Accuracy
---------------------------------------------------------------------------------------------------------------
    DNN+l1 &   Genus &      31 (48) &  0.961 &  0.035& 0.036 &  0.006& 0.184 &  0.014& 0.050 &  0.005 \\
           &  Family &      23 (40) &  0.965 &  0.026& 0.030 &  0.003& 0.170 &  0.010& 0.053 &  0.004 \\
           &   Order &       9 (23) &  0.956 &  0.074& 0.020 &  0.006& 0.136 &  0.024& 0.041 &  0.006 \\
           &   Class &       7 (17) &  0.971 &  0.057& 0.018 &  0.014& 0.116 &  0.068& 0.062 &  0.014 \\
