# Oncological study

In [1]:
# for data frame analysis
import pandas as pd 

# for mathematical operations
import numpy as np 

# imports below are for plotly 
import ipywidgets as widgets
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode(connected=True)   # for offline mode use
import plotly.figure_factory as ff
import plotly.offline as offline


# matplotlib library for plotting
import matplotlib.pyplot as plt
%matplotlib inline

# For Normalizing data
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

# For statistical test
import scipy.stats as stats

# Split data set into training and test set
from sklearn.model_selection import train_test_split as tts

# SVN module
from sklearn import svm

# Kernel Functions used 
from sklearn.metrics.pairwise import rbf_kernel,laplacian_kernel

# module for chi square test
from scipy.stats import chisquare


# For dictionary 
from collections import defaultdict


## dont use cuda
import os

os.environ["CUDA_VISIBLE_DEVICES"]="-1" 

# for use of tensorflow
import tensorflow as tf
#from tensorflow.nn.rnn import *
from tensorflow.python.ops  import *

# for scaling arrays
from sklearn.preprocessing import MaxAbsScaler,MinMaxScaler


# for random sampling of validation set
import random


import yaml
import os

%load_ext autoreload
%autoreload 

In [2]:
# check GPU availability for tensorflow
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))


Num GPUs Available:  0


In [3]:
################################################################################
# load config file
try:
    with open("config.yml", 'r') as ymlfile:
            cfg = yaml.safe_load(ymlfile)
except (IOError):
    print('config file is required. Put config file in current directory')
################################################################################

In [4]:
# set important directories

# current working directory
cwd = os.getcwd()
# data folder
Datafolder = cwd + cfg['rawdatapath']['DataFolder']
# Data File name
datafileName= cfg['rawdatapath']['dataFileName']


# Load Data 

In [5]:
humanCachexiaData = pd.read_csv(Datafolder + datafileName )

In [6]:
humanCachexiaData.tail() # view data set 

Unnamed: 0,Patient ID,Muscle loss,"1,6-Anhydro-beta-D-glucose",1-Methylnicotinamide,2-Aminobutyrate,2-Hydroxyisobutyrate,2-Oxoglutarate,3-Aminoisobutyrate,3-Hydroxybutyrate,3-Hydroxyisovalerate,...,Tryptophan,Tyrosine,Uracil,Valine,Xylose,cis-Aconitate,myo-Inositol,trans-Aconitate,pi-Methylhistidine,tau-Methylhistidine
72,NETCR_019_V2,control,35.16,52.46,13.87,44.26,99.48,208.51,11.25,6.49,...,46.06,45.15,62.18,33.45,62.8,103.54,78.26,18.17,871.31,84.77
73,NETL_012_V1,control,16.95,15.8,10.49,22.42,62.8,10.91,6.96,3.46,...,21.33,21.33,31.19,13.2,14.3,36.23,11.59,12.3,53.52,44.7
74,NETL_012_V2,control,9.39,14.01,5.16,23.57,46.99,13.33,3.35,2.69,...,14.88,15.18,39.65,13.74,21.76,40.85,30.88,8.5,90.02,28.22
75,NETL_003_V1,control,37.71,18.17,26.05,15.03,23.34,33.45,6.05,5.26,...,17.46,29.96,13.46,14.59,36.97,90.92,17.64,12.43,897.85,90.02
76,NETL_003_V2,control,38.47,12.55,15.03,12.55,22.2,21.33,5.99,3.42,...,27.66,23.57,9.58,10.59,19.89,58.56,24.29,13.07,83.93,27.39


In [7]:
humanCachexiaData.describe()

Unnamed: 0,"1,6-Anhydro-beta-D-glucose",1-Methylnicotinamide,2-Aminobutyrate,2-Hydroxyisobutyrate,2-Oxoglutarate,3-Aminoisobutyrate,3-Hydroxybutyrate,3-Hydroxyisovalerate,3-Indoxylsulfate,4-Hydroxyphenylacetate,...,Tryptophan,Tyrosine,Uracil,Valine,Xylose,cis-Aconitate,myo-Inositol,trans-Aconitate,pi-Methylhistidine,tau-Methylhistidine
count,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,...,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0,77.0
mean,105.63039,71.573636,18.15974,37.250649,145.087143,76.756364,21.717013,21.647792,218.879221,112.021039,...,66.243117,81.757273,35.557662,35.667013,100.933377,204.21974,135.397532,40.63039,370.288312,89.686883
std,130.025595,133.192811,27.614526,23.956807,342.522174,191.014237,26.198904,24.946091,196.86873,120.812569,...,56.333318,83.248486,35.002894,29.697365,250.216325,278.141989,170.266474,39.566754,530.6893,77.239064
min,4.71,6.42,1.28,4.85,5.53,2.61,1.7,0.92,27.66,15.49,...,8.67,4.22,3.1,4.1,10.07,12.94,11.59,4.9,11.36,8.0
25%,28.79,15.8,5.26,15.8,22.42,11.7,5.99,5.26,82.27,41.68,...,21.33,23.57,11.94,12.18,29.96,36.23,30.27,12.43,67.36,27.39
50%,45.6,36.6,10.49,32.46,55.15,22.65,11.7,12.55,144.03,70.11,...,46.99,60.34,27.39,33.12,50.4,129.02,78.26,26.84,162.39,68.72
75%,141.17,73.7,19.49,54.6,92.76,56.26,29.96,30.27,333.62,145.47,...,96.54,113.3,44.26,50.4,89.12,254.68,167.34,57.4,387.61,130.32
max,685.4,1032.77,172.43,93.69,2465.13,1480.3,175.91,164.02,1043.15,796.32,...,259.82,539.15,179.47,160.77,2164.62,1863.11,854.06,217.02,2697.28,317.35


In [8]:
humanCachexiaData.rename(columns={'Muscle loss':'Muscle_loss'}, inplace=True)

In [9]:
#humanCachexiaData.to_csv('cachexic.csv',index=None)

In [10]:
# delete patient id for now
del humanCachexiaData['Patient ID']

In [11]:
clusterData = humanCachexiaData.copy(deep=True)

In [12]:
# reshuffling data
resuffle_time = 500
for i in range(resuffle_time):
    clusterData=clusterData.reindex(np.random.permutation(clusterData.index))

In [13]:
# reset data index
clusterData.reset_index(drop=True, inplace=True)

In [14]:
# encode as categorical variable
clusterData["Muscle_loss"] = clusterData["Muscle_loss"].astype('category')
clusterData["Muscle_loss"]= clusterData["Muscle_loss"].cat.codes


In [15]:
# Get the target data 
target_data = clusterData["Muscle_loss"].copy(deep=True)
del clusterData["Muscle_loss"]

In [16]:
#clusterData=clusterData.reindex(np.random.permutation(clusterData.index))


### LightGBM Model for Biomarker Selection

In [17]:
# lightgbm module
import lightgbm as lgb
#from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import normalize

In [18]:
VALIDATION_SIZE = 15

validation_set = clusterData[:VALIDATION_SIZE].values
validation_target = target_data[:VALIDATION_SIZE].values

train_set = clusterData[VALIDATION_SIZE:].values
train_target = target_data[VALIDATION_SIZE:].values

print('validation-set',validation_set.shape, '|' ,'Train-set', train_set.shape)
print('------------------------------------------------')
print('validation-target',validation_target.shape,'|','Train-Target' ,train_target.shape)

validation-set (15, 63) | Train-set (62, 63)
------------------------------------------------
validation-target (15,) | Train-Target (62,)


In [19]:
# load traning and target variable
# TO DO:
#      Include Validation set as in 
#      https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/advanced_example.py
lgb_train = lgb.Dataset(normalize(train_set,axis=0, norm='l2'), train_target, free_raw_data=False)
lgb_eval = lgb.Dataset(normalize(validation_set,axis=0, norm='l2'), validation_target, reference=lgb_train, free_raw_data=False)

In [20]:
#lgb_train = lgb.Dataset(clusterData.values, target_data.values, free_raw_data=False)
#lgb_eval = lgb.Dataset(validation_set, validation_target, reference=lgb_train, free_raw_data=False)

In [43]:
# Model Parameters
# change boosting type to dart to try it out
# 
params = {
    'boosting_type': 'gbdt', # dart
    'objective': 'binary', # cross-entropy
    'metric': ['auc', 'binary'],
    'num_leaves': 90,
    'num_iterations': 1000,
    'num_threads':3,
    'feature_fraction': 0.7,
    'force_col_wise':True,
    #'bagging_fraction': 0.4,
    'bagging_freq': 5,
    'verbose': 1
}

In [44]:
#early_stopping_rounds=5
#valid_sets=lgb_train,  # eval training data
#learning_rates=lambda iter: 0.05 * (0.99 ** iter),


In [45]:
names = clusterData.columns.to_list()

In [46]:
names = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in names]

In [47]:
len(names)

63

In [48]:
training_epoch = 300
step_decay = 200
num_boost_round = 7000
decay_rate = 5e-10
param = 0.5
early_stopping_rounds  = 6000
for i in range(training_epoch):
    if i == 0:
        gbm = lgb.train(params,
                            lgb_train,
                            num_boost_round=5000,
                            early_stopping_rounds=4000,
                            valid_sets=lgb_eval,  # eval training data
                            learning_rates = lambda iter: 0.5 * (0.99 ** iter),
                            #feval=log_root_mean_squared_error,
                            #fobj=logerror,
                            feature_name=names)
    else:
        #param -= decay_rate
        num_boost_round -= step_decay
        early_stopping_rounds  -= step_decay
        step_decay += 200
        params['feature_fraction'] = round(random.uniform(0.7, 1.), 1)
        params['bagging_freq'] = random.randint(5,15)
        params['num_leaves'] = random.randint(80, 200)
        gbm = lgb.train(params,
                            lgb_train,
                            num_boost_round=num_boost_round,
                            early_stopping_rounds=early_stopping_rounds,
                            init_model=gbm,
                            valid_sets=lgb_eval,  # eval training data
                            learning_rates = lambda iter: param * (0.99 ** iter),
                            #feval=log_root_mean_squared_error,
                            #fobj=logerror,
                            feature_name=names)

[1]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.417128
Training until validation scores don't improve for 4000 rounds
[2]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[3]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[4]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[5]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[6]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[7]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[8]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[9]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[10]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[11]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[12]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[13]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[14]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[15]	valid_0's auc: 0.962963	valid_0's binary_logl

[890]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[891]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[892]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[893]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[894]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[895]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[896]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[897]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[898]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[899]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[900]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[901]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[902]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[903]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[904]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.418956
[905]	valid_0's auc: 0.96

[725]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325591
[726]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325595
[727]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325594
[728]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325589
[729]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325587
[730]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325584
[731]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325579
[732]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325572
[733]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325567
[734]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325561
[735]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.32556
[736]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325564
[737]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325568
[738]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325571
[739]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.325571
[740]	valid_0's auc: 0.944

[792]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368161
[793]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368191
[794]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368208
[795]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368165
[796]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.36821
[797]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368226
[798]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368244
[799]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368225
[800]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368268
[801]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368243
[802]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368264
[803]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.36828
[804]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.3683
[805]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368263
[806]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.368243
[807]	valid_0's auc: 0.944444

[697]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364377
[698]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364716
[699]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364614
[700]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364499
[701]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364745
[702]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364586
[703]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.36448
[704]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364363
[705]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364598
[706]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364448
[707]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364342
[708]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364447
[709]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364231
[710]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364046
[711]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.364145
[712]	valid_0's auc: 0.944

[422]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.361204
[423]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.363961
[424]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.366135
[425]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.370234
[426]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.372165
[427]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.373743
[428]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.370302
[429]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.366593
[430]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.369379
[431]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.367495
[432]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.370009
[433]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.366593
[434]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.36851
[435]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.370009
[436]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.368937
[437]	valid_0's auc: 0.925

[1204]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384953
[1205]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384953
[1206]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384953
[1207]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384954
[1208]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384953
[1209]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384954
[1210]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384955
[1211]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384952
[1212]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384953
[1213]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384954
[1214]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384957
[1215]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384956
[1216]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384957
[1217]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384956
[1218]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.384958
[1219]	val

[1024]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384023
[1025]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384016
[1026]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384017
[1027]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384017
[1028]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384014
[1029]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384016
[1030]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384014
[1031]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384016
[1032]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384018
[1033]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384023
[1034]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384021
[1035]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384024
[1036]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384017
[1037]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384014
[1038]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.384016
[1039]	val

[891]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307819
[892]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.30782
[893]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307821
[894]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307844
[895]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307848
[896]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307847
[897]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307831
[898]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307837
[899]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307831
[900]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307832
[901]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307832
[902]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307833
[903]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307828
[904]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307907
[905]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.307907
[906]	valid_0's auc: 0.962

[327]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.338637
Training until validation scores don't improve for -1200 rounds
Early stopping, best iteration is:
[327]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.338637
[328]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.331274
Training until validation scores don't improve for -3000 rounds
Early stopping, best iteration is:
[328]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.331274
[329]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.344437
Training until validation scores don't improve for -5000 rounds
Early stopping, best iteration is:
[329]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.344437
[330]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.326586
Training until validation scores don't improve for -7200 rounds
Early stopping, best iteration is:
[330]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.326586
[331]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.33056
Training until valid

[379]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.376678
Training until validation scores don't improve for -360000 rounds
Early stopping, best iteration is:
[379]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.376678
[380]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.367417
Training until validation scores don't improve for -372200 rounds
Early stopping, best iteration is:
[380]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.367417
[381]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.353452
Training until validation scores don't improve for -384600 rounds
Early stopping, best iteration is:
[381]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.353452
[382]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.354135
Training until validation scores don't improve for -397200 rounds
Early stopping, best iteration is:
[382]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.354135
[383]	valid_0's auc: 0.944444	valid_0's binary_logloss: 0.350749
Training un

[422]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.337942
Training until validation scores don't improve for -1065200 rounds
Early stopping, best iteration is:
[422]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.337942
[423]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.362878
Training until validation scores don't improve for -1086000 rounds
Early stopping, best iteration is:
[423]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.362878
[424]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.369163
Training until validation scores don't improve for -1107000 rounds
Early stopping, best iteration is:
[424]	valid_0's auc: 0.962963	valid_0's binary_logloss: 0.369163
[425]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.389441
Training until validation scores don't improve for -1128200 rounds
Early stopping, best iteration is:
[425]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.389441
[426]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.402679
Trainin

[468]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.449999
Training until validation scores don't improve for -2229000 rounds
Early stopping, best iteration is:
[468]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.449999
[469]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.42963
Training until validation scores don't improve for -2259000 rounds
Early stopping, best iteration is:
[469]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.42963
[470]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.443143
Training until validation scores don't improve for -2289200 rounds
Early stopping, best iteration is:
[470]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.443143
[471]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.450288
Training until validation scores don't improve for -2319600 rounds
Early stopping, best iteration is:
[471]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.450288
[472]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.436076
Training 

[514]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.483291
Training until validation scores don't improve for -3816000 rounds
Early stopping, best iteration is:
[514]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.483291
[515]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.495979
Training until validation scores don't improve for -3855200 rounds
Early stopping, best iteration is:
[515]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.495979
[516]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.501971
Training until validation scores don't improve for -3894600 rounds
Early stopping, best iteration is:
[516]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.501971
[517]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.513232
Training until validation scores don't improve for -3934200 rounds
Early stopping, best iteration is:
[517]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.513232
[518]	valid_0's auc: 0.907407	valid_0's binary_logloss: 0.49805
Training

[553]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.547853
Training until validation scores don't improve for -5493000 rounds
Early stopping, best iteration is:
[553]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.547853
[554]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.561611
Training until validation scores don't improve for -5540000 rounds
Early stopping, best iteration is:
[554]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.561611
[555]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.572282
Training until validation scores don't improve for -5587200 rounds
Early stopping, best iteration is:
[555]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.572282
[556]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.569137
Training until validation scores don't improve for -5634600 rounds
Early stopping, best iteration is:
[556]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.569137
[557]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.584549
Trainin

[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
Training until validation scores don't improve for -8259600 rounds
Early stopping, best iteration is:
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
Training until validation scores don't improve for -8317200 rounds
Early stopping, best iteration is:
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
Training until validation scores don't improve for -8375000 rounds
Early stopping, best iteration is:
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
Training until validation scores don't improve for -8433000 rounds
Early stopping, best iteration is:
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
[592]	valid_0's auc: 0.925926	valid_0's binary_logloss: 0.635796
Trainin

In [49]:
# feature names
print('Feature names:', gbm.feature_name())



Feature names: ['1_6_Anhydro_beta_D_glucose', '1_Methylnicotinamide', '2_Aminobutyrate', '2_Hydroxyisobutyrate', '2_Oxoglutarate', '3_Aminoisobutyrate', '3_Hydroxybutyrate', '3_Hydroxyisovalerate', '3_Indoxylsulfate', '4_Hydroxyphenylacetate', 'Acetate', 'Acetone', 'Adipate', 'Alanine', 'Asparagine', 'Betaine', 'Carnitine', 'Citrate', 'Creatine', 'Creatinine', 'Dimethylamine', 'Ethanolamine', 'Formate', 'Fucose', 'Fumarate', 'Glucose', 'Glutamine', 'Glycine', 'Glycolate', 'Guanidoacetate', 'Hippurate', 'Histidine', 'Hypoxanthine', 'Isoleucine', 'Lactate', 'Leucine', 'Lysine', 'Methylamine', 'Methylguanidine', 'N_N_Dimethylglycine', 'O_Acetylcarnitine', 'Pantothenate', 'Pyroglutamate', 'Pyruvate', 'Quinolinate', 'Serine', 'Succinate', 'Sucrose', 'Tartrate', 'Taurine', 'Threonine', 'Trigonelline', 'Trimethylamine_N_oxide', 'Tryptophan', 'Tyrosine', 'Uracil', 'Valine', 'Xylose', 'cis_Aconitate', 'myo_Inositol', 'trans_Aconitate', 'pi_Methylhistidine', 'tau_Methylhistidine']


In [50]:
# feature importances
print('Feature importances:', list(gbm.feature_importance()))

Feature importances: [7, 9, 24, 0, 4, 38, 3, 14, 32, 14, 38, 12, 0, 1, 1, 9, 4, 7, 39, 4, 0, 0, 3, 8, 32, 1, 25, 32, 0, 24, 23, 2, 14, 33, 5, 3, 26, 13, 4, 30, 1, 2, 4, 5, 15, 0, 5, 42, 32, 6, 10, 25, 2, 4, 5, 32, 4, 4, 0, 0, 12, 32, 0]


In [51]:
feature_importance = list(zip(list(gbm.feature_name()), list(gbm.feature_importance())))


In [52]:
feature_importance.sort(key=lambda tup: tup[1],reverse=True)

In [53]:
feature_importance

[('Sucrose', 42),
 ('Creatine', 39),
 ('3_Aminoisobutyrate', 38),
 ('Acetate', 38),
 ('Isoleucine', 33),
 ('3_Indoxylsulfate', 32),
 ('Fumarate', 32),
 ('Glycine', 32),
 ('Tartrate', 32),
 ('Uracil', 32),
 ('pi_Methylhistidine', 32),
 ('N_N_Dimethylglycine', 30),
 ('Lysine', 26),
 ('Glutamine', 25),
 ('Trigonelline', 25),
 ('2_Aminobutyrate', 24),
 ('Guanidoacetate', 24),
 ('Hippurate', 23),
 ('Quinolinate', 15),
 ('3_Hydroxyisovalerate', 14),
 ('4_Hydroxyphenylacetate', 14),
 ('Hypoxanthine', 14),
 ('Methylamine', 13),
 ('Acetone', 12),
 ('trans_Aconitate', 12),
 ('Threonine', 10),
 ('1_Methylnicotinamide', 9),
 ('Betaine', 9),
 ('Fucose', 8),
 ('1_6_Anhydro_beta_D_glucose', 7),
 ('Citrate', 7),
 ('Taurine', 6),
 ('Lactate', 5),
 ('Pyruvate', 5),
 ('Succinate', 5),
 ('Tyrosine', 5),
 ('2_Oxoglutarate', 4),
 ('Carnitine', 4),
 ('Creatinine', 4),
 ('Methylguanidine', 4),
 ('Pyroglutamate', 4),
 ('Tryptophan', 4),
 ('Valine', 4),
 ('Xylose', 4),
 ('3_Hydroxybutyrate', 3),
 ('Formate', 3)

# Recursive Feature Selection

In [56]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

In [None]:
estimator = SVR(kernel="linear")
selector = RFE(estimator, 5, step=1)
selector = selector.fit(train_set, train_target)
selector.support_




In [None]:
selector.ranking_