In [7]:
import time
from collections import defaultdict
import json
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn-ticks')

import seaborn as sns
import dateutil

import xgboost as xgb

from scipy.stats import spearmanr, pearsonr, kendalltau
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, fcluster, fclusterdata

from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import (
    cross_val_score, cross_val_predict,
    RepeatedKFold, RepeatedStratifiedKFold,
    train_test_split, 
    GridSearchCV,
)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.metrics import (
    classification_report,
    roc_auc_score, roc_curve, RocCurveDisplay, auc,
    confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay,
    f1_score, 
    recall_score,
    accuracy_score,
    precision_score, precision_recall_curve, precision_recall_fscore_support,
    #     plot_roc_curve,
    #     plot_precision_recall_curve,  ## The plot_x_score API is deprecated
)
import sklearn
print(sklearn.__version__)

1.0.2


In [3]:
# colors
with open(r'..\data\metadata\color_schemes.json') as infile:
    colors = json.load(infile)
    
animals_used = [1091, 1093, 1060, 1062, 1074, 1092, 1102, 1076, 1082, 1101]
diabetic = [1076, 1082, 1101]
impaired = [1060, 1062, 1074, 1092, 1102]
normal = [1091, 1093]

obs = pd.read_excel(r'..\data\metadata\20210421_FBGRBGOGTT_Huishi sample sheet.xlsx', sheet_name='cleaned')
obs['animal'] = obs['animal'].str.strip('TM').astype('int')
obs['datetime'] = obs['when'].apply(dateutil.parser.parse)
animal_phenotypes = pd.read_excel(r'..\data\metadata\20210421_FBGRBGOGTT_Huishi sample sheet.xlsx', sheet_name='animal phenotypes')
animal_phenotypes['animal'] = animal_phenotypes['animal'].str.strip('TM').astype('int')

fg = pd.read_csv(r'..\data\metadata\combined_metab_lipid_file_grouping.csv', index_col=0)

# Use data that was sent to collaborators 
data = pd.read_excel(r'../data/processed/combined_metabolites_data.xlsx').set_index('i')
data_cols = data.filter(regex='FBG|RBG').columns
fbg_cols = data.filter(regex='FBG').columns
rbg_cols = data.filter(regex='RBG').columns
pval_cols = data.filter(regex='pval').columns

n = data.filter(regex='1091|1093')                 # normal
i = data.filter(regex='1060|1062|1074|1092|1102')  # impaired
d = data.filter(regex='1076|1082|1101')            # diabetic

f = data[fbg_cols]  # fasted
r = data[rbg_cols]  # random-fed

fn = n.filter(fbg_cols)  # fasted normal 
fi = i.filter(fbg_cols)  # fasted impaired
fd = d.filter(fbg_cols)  # fasted diabetic

rn = n.filter(rbg_cols)  # random normal
ri = i.filter(rbg_cols)  # random impaired
rd = d.filter(rbg_cols)  # random diabetic

# rename columns to remove '_RBG', '_FBG' for element-wise subtraction
rn.columns = rn.columns.str[:-4]
fn.columns = fn.columns.str[:-4]
ri.columns = ri.columns.str[:-4]
fi.columns = fi.columns.str[:-4]
rd.columns = rd.columns.str[:-4]
fd.columns = fd.columns.str[:-4]

qval_sampling = data['fdr corrected pval effect of sampling']
# qval_sampling.replace(np.nan, 1, inplace=True)  # For replacing blanks (model failed to converge) with 1
qval_gtol = data['fdr corrected pval effect of glucose tolerance']
qval_cross = data['fdr corrected pval effect of interaction sampling and glucose tolerance']

In [4]:
X_all = data[data_cols].T
X_only_conv = data.loc[~data[pval_cols].isna().all(axis=1), data_cols].T
gluc_tol_cols = ['fdr corrected pval effect of glucose tolerance', 
                 'fdr corrected pval effect of interaction sampling and glucose tolerance']
X_only_signf = data.loc[(data[gluc_tol_cols] < 0.05).any(axis=1), data_cols].T


y_3class = fg.set_index('combined_col_name')['glucose_tolerance']         # ['normal', 'impaired', 'diabetic']
y_3class_num = y_3class.map({'normal': 0, 'impaired': 1, 'diabetic': 2})  # [0, 1, 2]
y_bg = fg.set_index('combined_col_name')['bg']                            # random/fasted blood glucoose
ogtt_dict = animal_phenotypes.set_index('animal').to_dict()['OGTT (AUC)'] # OGTT AUC
y_ogtt = fg[['combined_col_name', 'animal']].set_index('combined_col_name')['animal'].map(ogtt_dict)  

# Grid Search Hyperparameters

In [5]:
def grid_search(X, y, model, params, cv, scoring):
    gs = GridSearchCV(estimator=model,
                      param_grid=params,
                      scoring=scoring,
                      cv=cv,
                      n_jobs=7,)
    result = gs.fit(X, y)
    print("Best: %f using %s" % (result.best_score_, result.best_params_))
    means = result.cv_results_['mean_test_score']
    stds = result.cv_results_['std_test_score']
    params = result.cv_results_['params']
    for mean, stdev, param in zip(means, stds, params):
        print("%f (%f) with: %r" % (mean, stdev, param))
    l = []
    for mean, param in zip(means, params):
        d = {'mean': mean, **param}
        l.append(d)
    return pd.DataFrame(l)

In [12]:
xgbr = xgb.XGBRegressor(random_state=1)
params = {
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': [10, 20, 40],
    'min_child_weight': [1, 3, 6],
}

# hyperparameter search #1
hp1 = grid_search(X_all, y_ogtt, 
                    xgbr,
                    params=params,
                    cv=10,
                    scoring='neg_mean_absolute_percentage_error'
                    )

Best: -0.127280 using {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.127280 (0.043969) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.128675 (0.044275) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 500}
-0.128675 (0.044276) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 1000}
-0.129619 (0.047946) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 100}
-0.130468 (0.047922) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 500}
-0.130468 (0.047921) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 1000}
-0.141665 (0.050936) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 6, 'n_estimators': 100}
-0.142696 (0.054001) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 6, 'n_estimators': 

In [15]:
hp1.groupby('learning_rate').mean()

Unnamed: 0_level_0,mean,max_depth,min_child_weight,n_estimators
learning_rate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0.05,-0.133583,23.333333,3.333333,533.333333
0.1,-0.13349,23.333333,3.333333,533.333333
0.15,-0.134834,23.333333,3.333333,533.333333


In [16]:
hp1.groupby('max_depth').mean()

Unnamed: 0_level_0,mean,learning_rate,min_child_weight,n_estimators
max_depth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
10,-0.133969,0.1,3.333333,533.333333
20,-0.133969,0.1,3.333333,533.333333
40,-0.133969,0.1,3.333333,533.333333


In [17]:
hp1.groupby('min_child_weight').mean()

Unnamed: 0_level_0,mean,learning_rate,max_depth,n_estimators
min_child_weight,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,-0.131316,0.1,23.333333,533.333333
3,-0.129791,0.1,23.333333,533.333333
6,-0.1408,0.1,23.333333,533.333333


In [18]:
hp1.groupby('n_estimators').mean()

Unnamed: 0_level_0,mean,learning_rate,max_depth,min_child_weight
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
100,-0.133713,0.1,23.333333,3.333333
500,-0.134097,0.1,23.333333,3.333333
1000,-0.134097,0.1,23.333333,3.333333


In [20]:
xgbr = xgb.XGBRegressor(random_state=1)
params = {
    'n_estimators': [100],
    'learning_rate': [0.05],
    'max_depth': [10],
    'min_child_weight': [0.5, 1, 1.5, 2, 2.5, 3, 3.5, 4],
}
hp2 = grid_search(X_all, y_ogtt, 
                    xgbr,
                    params=params,
                    cv=10,
                    scoring='neg_mean_absolute_percentage_error'
                    )

Best: -0.127280 using {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 0.5, 'n_estimators': 100}
-0.127280 (0.043969) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 0.5, 'n_estimators': 100}
-0.127280 (0.043969) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.127573 (0.043033) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1.5, 'n_estimators': 100}
-0.127573 (0.043033) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 2, 'n_estimators': 100}
-0.129619 (0.047946) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 2.5, 'n_estimators': 100}
-0.129619 (0.047946) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 3, 'n_estimators': 100}
-0.137693 (0.044530) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 3.5, 'n_estimators': 100}
-0.137693 (0.044530) with: {'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 4, 'n_estim

In [23]:
xgbr = xgb.XGBRegressor(random_state=1)
params = {
    'n_estimators': [100],
    'learning_rate': [0.05],
    'max_depth': [10],
    'min_child_weight': [1],
    'gamma': [0, 0.1, 0.5, 1, 2, 5, 10]
}
hp3 = grid_search(X_all, y_ogtt, 
                    xgbr,
                    params=params,
                    cv=6,
                    scoring='neg_mean_absolute_percentage_error'
                    )

Best: -0.135795 using {'gamma': 10, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.135863 (0.026671) with: {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.135877 (0.026649) with: {'gamma': 0.1, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.135883 (0.026657) with: {'gamma': 0.5, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.135848 (0.026692) with: {'gamma': 1, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.135855 (0.026643) with: {'gamma': 2, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.135839 (0.026591) with: {'gamma': 5, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}
-0.135795 (0.026714) with: {'gamma': 10, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100}


In [27]:
xgbr = xgb.XGBRegressor(n_jobs=7)
params = {
    'n_estimators': [100],
    'learning_rate': [0.05],
    'max_depth': [10],
    'min_child_weight': [1],
    'gamma': [0],
    'reg_alpha': [0.5],  # L1, generates sparse model
    'reg_lambda': [None]  # L2, quadratic penalty, removes limitation on number of selected variables
}
hp4 = grid_search(X_all, y_ogtt, 
                    xgbr,
                    params=params,
                    cv=10,
                    scoring='neg_mean_absolute_percentage_error'
                    )

Best: -0.128121 using {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': None}
-0.128950 (0.041863) with: {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.1, 'reg_lambda': None}
-0.128121 (0.038846) with: {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': None}
-0.130541 (0.039335) with: {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 1, 'reg_lambda': None}
-0.128304 (0.036865) with: {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 2, 'reg_lambda': None}
-0.131577 (0.038414) with: {'gamma': 0, 'learning_rate': 0.05, 'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 100, 'reg_alpha': 5, 'reg_lambda': None}


In [28]:
xgbfit = xgbr.fit(X_all, y_ogtt)


In [31]:
sum(xgbfit.feature_importances_ > 0)

258

# Hyperparameter search conclusions:
## learning_rate=0.05, max_depth=10, min_child_weight=1, n_estimators=100
## n_jobs=7, random_state=1

## XGBoost gives approximately 13% mean absolute percentage error in 10-fold CV on all 60 samples with all features

## Random Forest gives approximately 18% under the same conditions 