In [11]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV


In [10]:
SEED = 42

In [30]:
input_data_qm = pd.read_csv("aki_data/test_qm.csv")
design_matrix = pd.read_csv("aki_data/design_matrix.tsv", sep="\t")

In [52]:
# split data into training and test set
# every column that contains M2012 is test set
input_data_preprocessed = input_data_qm.fillna(0)
input_data = input_data_preprocessed.drop(['Protein'], axis=1)
X_test = input_data.loc[:, ~input_data.columns.str.contains('M2012')].transpose()
X_train = input_data.loc[:, input_data.columns.str.contains('M2012')].transpose()
# split design matrix into training and test set
# first row contains patient id, split by M2012, then remove first row
y_test = design_matrix['group'][~design_matrix['sample'].str.contains('M2012')]
y_train = design_matrix['group'][design_matrix['sample'].str.contains('M2012')]
# change labels to 0 and 1 from 1 and 2
y_test = y_test.replace(1, 0)
y_test = y_test.replace(2, 1)
y_train = y_train.replace(1, 0)
y_train = y_train.replace(2, 1)

print(X_test.shape)
print(X_train.shape)
print(y_test.shape)
print(y_train.shape)

(56, 554)
(141, 554)
(56,)
(141,)


In [33]:
input_data_preprocessed.head()

Unnamed: 0,Protein,TM_P1911_190,TM_P1911_191,TM_P1911_192,TM_P1911_193,TM_P1911_194,TM_P1911_196,TM_P1911_197,TM_M2012_010,TM_M2012_011,...,TM_M2012_190,TM_M2012_191,TM_M2012_192,TM_M2012_196,TM_M2012_197,TM_M2012_198,TM_M2012_199,TM_M2012_200,TM_M2012_202,TM_M2012_203
0,P08603,22.381866,22.773908,22.732549,22.96053,22.906198,23.167862,23.122564,23.110142,23.179716,...,23.416677,23.498007,23.459972,23.403313,23.454894,23.602666,23.682634,23.665858,24.01571,23.655648
1,P02671,25.349974,25.43134,25.459891,25.275259,25.592789,24.829806,24.208987,23.984077,26.075865,...,24.984516,25.023149,24.971465,23.369445,24.604836,24.623221,24.787905,25.095571,25.103341,24.914344
2,P01042,22.061788,21.87217,21.966596,22.25614,22.505168,22.993978,23.277504,22.963205,22.767097,...,22.953879,23.08917,23.018547,23.280626,23.503529,23.471356,23.471414,23.19375,24.101306,23.486766
3,P00450,22.647246,23.193086,23.33278,23.206429,22.959381,23.008403,22.770807,22.971128,23.373016,...,23.788756,23.932623,23.904721,23.273831,23.462794,23.783564,23.968122,23.956618,23.989086,23.834912
4,P05156,21.301448,21.435684,21.304184,21.459141,21.532018,22.006447,21.968122,21.688934,21.37261,...,21.85053,21.883567,21.936084,21.778412,22.051,22.187546,21.965964,21.82084,22.373783,22.076671


In [54]:
design_matrix[design_matrix['sample'].str.contains('M2012')]

Unnamed: 0,sample,group
7,TM_M2012_010,1
8,TM_M2012_011,1
9,TM_M2012_012,1
10,TM_M2012_013,1
11,TM_M2012_014,1
...,...,...
192,TM_M2012_198,2
193,TM_M2012_199,2
194,TM_M2012_200,2
195,TM_M2012_202,2


In [72]:
models = [DummyClassifier(random_state = SEED),
          SVC(random_state = SEED, probability = True),
          LogisticRegression(random_state = SEED, max_iter = 1000),
          RandomForestClassifier(random_state = SEED),
          GradientBoostingClassifier(random_state = SEED)]

grid = {"Dummy": {"strategy": ["most_frequent"]},
        "SVC": {"C": [0.1, 1, 2], "kernel": ["linear", "rbf", "poly"]},
        "LR": {"C": [0.01, 0.1, 1], "penalty": ["l1", "l2", "elasticnet"]},
        "RF": {"n_estimators": [100, 300, 500], "max_depth": [2, 4, 5], "min_samples_split": [2, 4, 5]},
        "GB": {"learning_rate": [0.01, 0.001], "n_estimators": [100, 500]}
}

# train models
best_scores = {}
best_params = {}

for model_name, names in zip(models, grid.keys()):
    print("Training model %s" % names)
    model = GridSearchCV(model_name, grid[names], cv = 5, scoring = "roc_auc")
    model.fit(X_train, y_train)
    best_scores[names] = model.best_score_
    best_params[names] = model.best_params_

    # print best parameters and best score
    print("Best parameters: %s" % model.best_params_)
    print("Best score: %f" % model.best_score_)

best_models = pd.DataFrame({"Model": list(best_scores.keys()),
                                "Best score": list(best_scores.values()),
                                "Best params": list(best_params.values())})

best_models.sort_values("Best score", ascending=False, inplace=True)




Training model Dummy
Best parameters: {'strategy': 'most_frequent'}
Best score: 0.500000
Training model SVC
Best parameters: {'C': 1, 'kernel': 'poly'}
Best score: 0.820037
Training model LR


30 fits failed out of a total of 45.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
15 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/Yves.Gorgen/anaconda3/envs/adlg/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/Yves.Gorgen/anaconda3/envs/adlg/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/Yves.Gorgen/anaconda3/envs/adlg/lib/python3.9/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver lbfgs supports only 

Best parameters: {'C': 0.1, 'penalty': 'l2'}
Best score: 0.806434
Training model RF
Best parameters: {'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 500}
Best score: 0.957292
Training model GB
Best parameters: {'learning_rate': 0.001, 'n_estimators': 100}
Best score: 0.864522


In [69]:
# Using each model with best parameters, predict test set
# calculate AUC, ROC curve, confusion matrix

def predict_model(model, X_train, y_train, X_test, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_proba = model.predict_proba(X_test)[:, 1]
    cm = confusion_matrix(y_test, y_pred)
    return y_pred, y_pred_proba, cm


def plot_confusion_matrix(cm):
    plt.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title("Confusion Matrix")
    plt.colorbar()
    plt.xticks([0,1], ["No AKI", "AKI"])
    plt.yticks([0,1], ["No AKI", "AKI"])
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

def print_metrics(cm, y_pred, y_pred_proba):
    recall_pheno1 = cm[1,1] / (cm[1,1] + cm[1,0] + 1e-10)
    recall_pheno0 = cm[0,0] / (cm[0,0] + cm[0,1] + 1e-10)
    precision_pheno1 = cm[1,1] / (cm[1,1] + cm[0,1] + 1e-10)
    precision_pheno0 = cm[0,0] / (cm[0,0] + cm[1,0] + 1e-10)
    f1_pheno1 = 2 * precision_pheno1 * recall_pheno1 / (precision_pheno1 + recall_pheno1)
    f1_pheno0 = 2 * precision_pheno0 * recall_pheno0 / (precision_pheno0 + recall_pheno0)
    accuracy = (cm[0,0] + cm[1,1]) / (cm[0,0] + cm[0,1] + cm[1,0] + cm[1,1])
    
    print("Recall pheno1: %f" % recall_pheno1)
    print("Recall pheno0: %f" % recall_pheno0)
    print("Precision pheno1: %f" % precision_pheno1)
    print("Precision pheno0: %f" % precision_pheno0)
    print("F1 pheno1: %f" % f1_pheno1)
    print("F1 pheno0: %f" % f1_pheno0)
    print("Accuracy: %f" % accuracy)
    print("AUC: %f" % roc_auc_score(y_test, y_pred_proba))




In [71]:
# loop through models and predict test set with best parameters

for model, names in zip(models, best_params.keys()):
    print("Predicting with model %s" % names)
    # print the model params
    model = model.set_params(**best_params[names])
    print(model)
    y_pred, y_pred_proba, cm = predict_model(model_name, X_train, y_train, X_test, y_test)
    print_metrics(cm, y_pred, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Recall pheno1: 1.000000
Recall pheno0: 0.857143
Precision pheno1: 0.954545
Precision pheno0: 1.000000
F1 pheno1: 0.976744
F1 pheno0: 0.923077
Accuracy: 0.964286
AUC: 1.000000
--------------------------------------
Predicting with model SVC
SVC(C=1, kernel='poly', probability=True, random_state=42)
Recall pheno1: 1.000000
Recall pheno0: 0.857143
Precision pheno1: 0.954545
Precision pheno0: 1.000000
F1 pheno1: 0.976744
F1 pheno0: 0.923077
Accuracy: 0.964286
AUC: 1.000000
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.1, max_iter=1000, random_state=42)
Recall pheno1: 1.000000
Recall pheno0: 0.857143
Precision pheno1: 0.954545
Precision pheno0: 1.000000
F1 pheno1: 0.976744
F1 pheno0: 0.923077
Accuracy: 0.964286
AUC: 1.000000
--------------------------------------
Predicting with model RF
RandomForestClassifier(max_depth=5, n_estimators=500, random_state=42)
Recall 