In [4]:
import json
from collections import defaultdict
import sys
import os
sys.path.insert(1, os.path.join(sys.path[0], '..'))
from models.data_subset_hyperparam_search import data_subset_hyperparam_search

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Write SVG text as real text elements instead of converting glyphs to curves.
plt.rcParams['svg.fonttype'] = 'none'  # https://stackoverflow.com/questions/34387893/output-matplotlib-figure-to-svg-with-text-as-text-not-curves
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases dropped the old names, so use whichever spelling this installation has.
for _style_name in ('seaborn-v0_8-ticks', 'seaborn-ticks'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither spelling is registered: fall through to the original call so
    # matplotlib reports the problem with its own error message.
    plt.style.use('seaborn-ticks')
import seaborn as sns

from sklearn.ensemble import (
    RandomForestClassifier, RandomForestRegressor,)
from sklearn.linear_model import (
    LogisticRegression, LogisticRegressionCV)
from sklearn.model_selection import (
    GridSearchCV, 
    RepeatedStratifiedKFold, 
    LeaveOneGroupOut,)
from sklearn.naive_bayes import GaussianNB 
from sklearn.preprocessing import StandardScaler
from sklearn.svm import (
    SVC, SVR)
from xgboost import XGBClassifier, XGBRegressor  

In [None]:
# Load the metadata tables and the processed metabolite matrix, then slice the
# matrix by glucose-tolerance group and by sampling condition (FBG / RBG).
#
# All paths use forward slashes: they resolve on Windows as well as on
# Linux/macOS, whereas backslash-separated paths only resolve on Windows.

# colors and class-superclass maps
with open('../data/metadata/color_schemes.json') as infile:
    colors = json.load(infile)

# Animal IDs per glucose-tolerance group. Everything below that needs the IDs
# (the combined list, the ID -> label map, the column regexes) is derived from
# these three lists so the groupings cannot drift apart.
diabetic =     [1076, 1082, 1101]
impaired =     [1060, 1062, 1074, 1092, 1102]
normal =       [1091, 1093]
animals_used = normal + impaired + diabetic
# animal ID -> glucose-tolerance label
animal_tol = {
    animal: label
    for label, group in (('diabetic', diabetic), ('impaired', impaired), ('normal', normal))
    for animal in group}
ap = pd.read_excel('../data/metadata/animal_phenotypes.xlsx', index_col=0)

fg = pd.read_csv('../data/metadata/combined_metab_lipid_file_grouping.csv', index_col=0)

# Use data that was sent to collaborators 
data = pd.read_excel('../data/processed/combined_metabolites_data.xlsx').set_index('i')
# Column subsets selected by substring match on the column names.
data_cols = data.filter(regex='FBG|RBG').columns
fbg_cols = data.filter(regex='FBG').columns
rbg_cols = data.filter(regex='RBG').columns
pval_cols = data.filter(regex='pval').columns

# Columns whose name contains any animal ID of the group ('id1|id2|...' regex).
n = data.filter(regex='|'.join(map(str, normal)))    # normal
i = data.filter(regex='|'.join(map(str, impaired)))  # impaired
d = data.filter(regex='|'.join(map(str, diabetic)))  # diabetic

f = data[fbg_cols]  # fasted
r = data[rbg_cols]  # random-fed

# Group x sampling-condition slices: filter(items=...) keeps only the listed
# columns that are present in the group frame.
fn = n.filter(items=fbg_cols)  # fasted normal
fi = i.filter(items=fbg_cols)  # fasted impaired
fd = d.filter(items=fbg_cols)  # fasted diabetic

rn = n.filter(items=rbg_cols)  # random normal
ri = i.filter(items=rbg_cols)  # random impaired
rd = d.filter(items=rbg_cols)  # random diabetic

# FDR-corrected p-value columns stored alongside the measurements in `data`.
qval_sampling = data['fdr corrected pval effect of sampling']
qval_gtol = data['fdr corrected pval effect of glucose tolerance']
qval_cross = data['fdr corrected pval effect of interaction sampling and glucose tolerance']

In [None]:
# Targets: one value per row of fg. Per-animal phenotypes from `ap` are looked
# up through fg's 'animal' column.
y_3class = fg['gluc_tol']                                                 # ['normal', 'impaired', 'diabetic']
y_3class_num = y_3class.map({'normal': 0, 'impaired': 1, 'diabetic': 2})  # [0, 1, 2]
y_bg = fg['bg']                                                           # random/fasted blood glucose
y_ogtt = fg['animal'].map(ap['OGTT (AUC)'])
y_weight = fg['animal'].map(ap['Weightprefastweek12'])
y_insulin = fg['animal'].map(ap['Insulin (AUC)'])

# Feature matrices: rows are the data columns named by fg's index (in fg's
# order), columns are the rows of `data`.
X_all = data.loc[:, y_3class.index.values].T  # untransformed data
# The scaled frames reuse X_all's own labels. Labelling the rows with data_cols
# (data's column order) instead would attach the wrong names to the rows
# whenever that order differs from fg's.
X_cent = pd.DataFrame(
    StandardScaler(with_std=False).fit_transform(X_all),
    index=X_all.index, columns=X_all.columns)  # mean-centered only
X_zs = pd.DataFrame(
    StandardScaler().fit_transform(X_all),
    index=X_all.index, columns=X_all.columns)  # z-scored

# generate new rbg_cols and fbg_cols that retain the order from fg
rbg_cols = fg.loc[fg['bg_type'] == 'RBG'].index
fbg_cols = fg.loc[fg['bg_type'] == 'FBG'].index

### Steps:
1. Build a data `pipeline` for each model
    1. Include mean-centering and z-scoring
    2. Alternatively, just use the precomputed `X_all`, `X_cent` (mean-centered), and `X_zs` (z-scored) datasets as appropriate
2. Run a hyperparameter search for every model (be selective; only tune the most important parameters)
3. Save the best hyperparameters for each model
4. Perform CV to score each model (using the best hyperparameters)
5. Perform CV to derive feature importances for each model

### Levels of searching:
1. Regression vs. classification
    1. Regression: OGTT, weight, insulin, weekly BG
    2. Classification: 3-class or 2-class
2. Models:
    - Regression and classification: RF, SVM, mixed/fixed effects model, LinReg L1, LinReg Elastic Net, XGBoost, sparse PLS
    - Regression-only: LARS regression
    - Classification-only: Naive Bayes, logistic regression
    - Regression and classification each require their own CV scheme



In [None]:
# Classifier registry: short name -> [estimator class, feature matrix].
# Every entry has the same two-element shape so it can be unpacked uniformly;
# the trailing comments record the cross-validation scheme noted for each model.
clf_models = {
    'RF': [RandomForestClassifier, X_all],  # RepeatedStratifiedKFold
    'SVM': [SVC, X_zs],  # RepeatedStratifiedKFold
    'LR': [LogisticRegressionCV, X_zs],  # LogisticRegressionCV
    # 'XGB': [],  # TODO: not configured yet
    # NOTE(review): this entry previously had no feature matrix; the z-scored
    # data is used here to match the other non-tree models -- confirm the choice.
    'NB': [GaussianNB, X_zs],
}