In [3]:
import time
from collections import defaultdict
import json
import math

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Keep text as text (not curves) in SVG output:
# https://stackoverflow.com/questions/34387893/output-matplotlib-figure-to-svg-with-text-as-text-not-curves
plt.rcParams['svg.fonttype'] = 'none'
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names; use the new name whenever this install has it.
_TICKS_STYLE = 'seaborn-v0_8-ticks'
plt.style.use(_TICKS_STYLE if _TICKS_STYLE in plt.style.available else 'seaborn-ticks')

import seaborn as sns
import dateutil
from scipy.stats import spearmanr, pearsonr, kendalltau
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import dendrogram, fcluster, fclusterdata

from sklearn.cluster import AgglomerativeClustering, FeatureAgglomeration
from sklearn.datasets import make_regression, make_classification
from sklearn.model_selection import (
    cross_val_score, cross_val_predict,
    RepeatedKFold, RepeatedStratifiedKFold,
    train_test_split, 
    GridSearchCV,
)
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.metrics import (
    classification_report,
    roc_auc_score, roc_curve, RocCurveDisplay, auc,
    confusion_matrix, multilabel_confusion_matrix, ConfusionMatrixDisplay,
    f1_score, 
    recall_score,
    accuracy_score,
    precision_score, precision_recall_curve, precision_recall_fscore_support,
    #     plot_roc_curve,
    #     plot_precision_recall_curve,  ## The plot_x_score API is deprecated
)
import sklearn
# Print the scikit-learn version in use so the recorded outputs can be tied to a release.
print(sklearn.__version__)

1.0.2


In [2]:
# colors and class-superclass maps
# Paths use forward slashes so they resolve on Windows, Linux and macOS alike
# (a raw '..\data' path only works on Windows).
with open('../data/metadata/color_schemes.json') as infile:
    colors = json.load(infile)

# Animal IDs grouped by glucose-tolerance class.
diabetic = [1076, 1082, 1101]
impaired = [1060, 1062, 1074, 1092, 1102]
normal = [1091, 1093]
# Derived from the three lists above so the groupings cannot drift apart.
animals_used = normal + impaired + diabetic
animal_tol = {
    animal: tolerance
    for tolerance, group in (('diabetic', diabetic), ('impaired', impaired), ('normal', normal))
    for animal in group
}
ap = pd.read_excel('../data/metadata/animal_phenotypes.xlsx', index_col=0)

fg = pd.read_csv('../data/metadata/combined_metab_lipid_file_grouping.csv', index_col=0)

# Use data that was sent to collaborators
data = pd.read_excel(r'../data/processed/combined_metabolites_data.xlsx').set_index('i')

# Column groups, selected by a pattern in the column name.
data_cols = data.filter(regex='FBG|RBG').columns  # FBG and RBG columns together
fbg_cols = data.filter(regex='FBG').columns
rbg_cols = data.filter(regex='RBG').columns
pval_cols = data.filter(regex='pval').columns


def _animal_regex(animal_ids):
    """Return an alternation pattern (e.g. '1091|1093') matching any of the given IDs."""
    return '|'.join(str(animal_id) for animal_id in animal_ids)


# Per-class column subsets. The patterns are built from the ID lists defined
# earlier in this cell instead of repeating the IDs as string literals.
n = data.filter(regex=_animal_regex(normal))    # normal
i = data.filter(regex=_animal_regex(impaired))  # impaired
d = data.filter(regex=_animal_regex(diabetic))  # diabetic

f = data[fbg_cols]  # fasted
r = data[rbg_cols]  # random-fed

fn = n.filter(items=fbg_cols)  # fasted normal
fi = i.filter(items=fbg_cols)  # fasted impaired
fd = d.filter(items=fbg_cols)  # fasted diabetic

rn = n.filter(items=rbg_cols)  # random normal
ri = i.filter(items=rbg_cols)  # random impaired
rd = d.filter(items=rbg_cols)  # random diabetic

# FDR-corrected p-value columns, one per tested effect.
qval_sampling = data['fdr corrected pval effect of sampling']
qval_gtol = data['fdr corrected pval effect of glucose tolerance']
qval_cross = data['fdr corrected pval effect of interaction sampling and glucose tolerance']

In [9]:
# Feature matrices: the FBG/RBG columns of `data`, transposed so each of them becomes a row.
X_all = data[data_cols].T

# Keep only the rows of `data` that have at least one non-missing p-value.
has_any_pval = data[pval_cols].notna().any(axis=1)
X_only_conv = data.loc[has_any_pval, data_cols].T

# Keep only the rows whose glucose-tolerance or interaction q-value is below 0.05.
gluc_tol_cols = ['fdr corrected pval effect of glucose tolerance',
                 'fdr corrected pval effect of interaction sampling and glucose tolerance']
is_signf = data[gluc_tol_cols].lt(0.05).any(axis=1)
X_only_signf = data.loc[is_signf, data_cols].T


# Targets, all indexed by the combined column name.
fg_by_col = fg.set_index('combined_col_name')
y_3class = fg_by_col['glucose_tolerance']                                 # ['normal', 'impaired', 'diabetic']
y_3class_num = y_3class.map({'normal': 0, 'impaired': 1, 'diabetic': 2})  # [0, 1, 2]
y_bg = fg_by_col['bg']                                                    # random/fasted blood glucose
ogtt_dict = ap['OGTT (AUC)'].to_dict()                                    # OGTT AUC keyed by the index of `ap`
y_ogtt = fg_by_col['animal'].map(ogtt_dict)

In [35]:
def grid_search(X, y, model, params, cv, scoring, print_all=True, n_jobs=7, **kwargs):
    """Run a ``GridSearchCV`` and tabulate the mean CV score of every candidate.

    Parameters
    ----------
    X, y
        Training data and target, passed to ``GridSearchCV.fit``.
    model
        Estimator to tune (``estimator`` argument of ``GridSearchCV``).
    params
        Hyper-parameter grid (``param_grid`` argument of ``GridSearchCV``).
    cv
        Cross-validation spec forwarded as ``cv``.
    scoring
        Scoring spec forwarded as ``scoring``.
    print_all : bool, default True
        When true, print mean and std of the test score for every candidate
        in addition to the best one.
    n_jobs : int, default 7
        Forwarded as ``n_jobs``. It used to be hard-coded, so passing
        ``n_jobs`` raised a duplicate-keyword ``TypeError``.
    **kwargs
        Any further keyword arguments for ``GridSearchCV``.

    Returns
    -------
    pandas.DataFrame
        One row per candidate: a ``'mean'`` column (mean test score) followed
        by one column per hyper-parameter.
    """
    search = GridSearchCV(estimator=model,
                          param_grid=params,
                          scoring=scoring,
                          cv=cv,
                          n_jobs=n_jobs,
                          **kwargs)
    result = search.fit(X, y)
    print("Best: %f using %s" % (result.best_score_, result.best_params_))

    cv_results = result.cv_results_
    means = cv_results['mean_test_score']
    # Separate name: do not rebind the `params` argument to the candidate list.
    candidates = cv_results['params']
    if print_all:
        for mean, stdev, candidate in zip(means, cv_results['std_test_score'], candidates):
            print("%f (%f) with: %r" % (mean, stdev, candidate))

    # 'mean' goes first so it is the leading column of the returned frame.
    return pd.DataFrame([{'mean': mean, **candidate}
                         for mean, candidate in zip(means, candidates)])

In [44]:
# Grid-search an SVR for every (column subset, target) combination.
model = SVR()
param_grid = {
    'kernel': ['rbf', 'linear', 'poly'],
    'C': [0.1, 0.4, 0.6, 0.7, 0.8, 0.85, 0.9, 0.93, 0.96, 0.98, 1.0],
}
column_sets = {'RBG': rbg_cols, 'FBG': fbg_cols, 'all': data_cols}
targets = {'ogtt': y_ogtt, 'bg': y_bg}

# results[column-set name][target name] -> DataFrame returned by grid_search
results = defaultdict(dict)
for col_name, cols in column_sets.items():
    for y_name, y in targets.items():
        print(col_name, y_name)
        results[col_name][y_name] = grid_search(
            X_all.loc[cols],
            y=y.loc[cols],
            model=model,
            params=param_grid,
            cv=RepeatedKFold(n_splits=10, n_repeats=3, random_state=1),
            print_all=False,
            scoring='neg_mean_absolute_percentage_error',
        )

RBG ogtt
Best: -0.142973 using {'C': 1.0, 'kernel': 'poly'}
RBG bg
Best: -0.286717 using {'C': 0.4, 'kernel': 'poly'}
FBG ogtt
Best: -0.164711 using {'C': 0.7, 'kernel': 'linear'}
FBG bg
Best: -0.185339 using {'C': 0.1, 'kernel': 'rbf'}
all ogtt
Best: -0.124892 using {'C': 0.6, 'kernel': 'poly'}
all bg
Best: -0.352308 using {'C': 0.1, 'kernel': 'poly'}


In [46]:
# Average the CV score over the C grid for each kernel.
# Lookup keys: results[<'RBG' | 'FBG' | 'all'>][<'ogtt' | 'bg'>]; frame columns: mean, C, kernel.
# NOTE(review): the 'C' column is averaged too, which carries no information.
results['RBG']['bg'].groupby('kernel').mean()

Unnamed: 0_level_0,mean,C
kernel,Unnamed: 1_level_1,Unnamed: 2_level_1
linear,-0.296067,0.747273
poly,-0.288324,0.747273
rbf,-0.810622,0.747273


In [14]:
# Grid-search an SVR on all features against OGTT, using plain 10-fold CV.
# Note: this rebinds `model` and `param_grid`.
model = SVR()
param_grid = {
    'kernel': ['rbf', 'linear'],
    'C': [0.1, 0.4, 0.6, 0.7, 0.8, 0.85, 0.9, 0.93, 0.96, 0.98, 1.0],
}
gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring='neg_mean_absolute_percentage_error',
    cv=10,
    n_jobs=7,
)
gs_result_all_ogtt = gs.fit(X_all, y_ogtt)

# summarize results: best candidate first, then every candidate's mean (std) score
print("Best: %f using %s" % (gs_result_all_ogtt.best_score_, gs_result_all_ogtt.best_params_))
cv_results = gs_result_all_ogtt.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: -0.118269 using {'C': 0.9, 'kernel': 'linear'}
-0.307511 (0.077348) with: {'C': 0.1, 'kernel': 'rbf'}
-0.180080 (0.052390) with: {'C': 0.1, 'kernel': 'linear'}
-0.307386 (0.077298) with: {'C': 0.4, 'kernel': 'rbf'}
-0.131720 (0.039868) with: {'C': 0.4, 'kernel': 'linear'}
-0.307303 (0.077265) with: {'C': 0.6, 'kernel': 'rbf'}
-0.122239 (0.044749) with: {'C': 0.6, 'kernel': 'linear'}
-0.307261 (0.077249) with: {'C': 0.7, 'kernel': 'rbf'}
-0.120474 (0.046193) with: {'C': 0.7, 'kernel': 'linear'}
-0.307218 (0.077233) with: {'C': 0.8, 'kernel': 'rbf'}
-0.118752 (0.046856) with: {'C': 0.8, 'kernel': 'linear'}
-0.307197 (0.077225) with: {'C': 0.85, 'kernel': 'rbf'}
-0.118371 (0.047042) with: {'C': 0.85, 'kernel': 'linear'}
-0.307175 (0.077217) with: {'C': 0.9, 'kernel': 'rbf'}
-0.118269 (0.047122) with: {'C': 0.9, 'kernel': 'linear'}
-0.307162 (0.077213) with: {'C': 0.93, 'kernel': 'rbf'}
-0.118276 (0.047168) with: {'C': 0.93, 'kernel': 'linear'}
-0.307149 (0.077208) with: {'C': 0.96, 

In [11]:
# Display the fitted search object; its repr lists the search configuration.
gs_result_all_ogtt

GridSearchCV(cv=10, estimator=SVR(), n_jobs=7,
             param_grid={'C': [0.1, 0.4, 0.6, 0.7, 0.8, 0.85, 0.9, 0.93, 0.96,
                               0.98, 1.0],
                         'kernel': ['rbf', 'linear']})