In [2]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# cust_functions folder
from cust_functions.training import *
from cust_functions.graph_networks import *
from cust_functions.graph_creation import *

# Single seed for the whole notebook: it is reused below as `random_state`
# for the estimators and for the COVID train/test split.
SEED = 42
# `set_seed` is provided by one of the `cust_functions` wildcard imports above;
# presumably it seeds the global RNGs -- TODO confirm which libraries it covers.
set_seed(SEED)


## 1.0 AKI Data

In [2]:
# Read the AKI protein table and the design matrix (it carries the 'sample' and 'group' columns)
input_data_qm = pd.read_csv("aki_data/test_qm.csv")
design_matrix = pd.read_csv("aki_data/design_matrix.tsv", sep="\t")

# Features: missing values become 0, then the 'Protein' column is removed
input_data_preprocessed = input_data_qm.fillna(0)
input_data = input_data_preprocessed.drop(columns=['Protein'])

# Labels: recode 1 -> 0 and 2 -> 1.
# NOTE(review): replace() acts on every column of the design matrix, not only 'group'.
design_matrix = design_matrix.replace(1, 0).replace(2, 1)

# Everything whose name contains 'M2012' goes to the training set, the rest to the test set
train_columns = input_data.columns.str.contains('M2012')
train_samples = design_matrix['sample'].str.contains('M2012')

# Transposing turns the selected columns into rows.
# NOTE(review): X rows follow the column order of the protein table while y follows the
# row order of the design matrix -- presumably both files list samples in the same order; verify.
X_train = input_data.loc[:, train_columns].transpose()
X_test = input_data.loc[:, ~train_columns].transpose()
y_train = design_matrix.loc[train_samples, 'group']
y_test = design_matrix.loc[~train_samples, 'group']


### 1.1 Cross Validation

In [None]:
# Candidate estimators; every estimator that accepts a seed gets the notebook-wide one.
# NOTE(review): the order here lines up with the key order of `grid`; presumably
# `gridsearch` pairs them positionally -- confirm in cust_functions.
seeded = {"random_state": SEED}
models = [
    DummyClassifier(**seeded),
    SVC(probability=True, **seeded),
    LogisticRegression(max_iter=10000, **seeded),
    KNeighborsClassifier(),
    RandomForestClassifier(**seeded),
    AdaBoostClassifier(**seeded),
]

# Hyper-parameter candidates, keyed by a short model name
grid = {}
grid["Dummy"] = {"strategy": ["most_frequent"]}
grid["SVC"] = {"C": [0.1, 1, 2], "kernel": ["linear", "rbf", "poly"]}
# NOTE(review): LogisticRegression keeps its default solver here. In scikit-learn the
# default 'lbfgs' solver does not support 'l1' or 'elasticnet' (the latter also needs
# `l1_ratio`), and C has no effect when penalty is 'none' -- check how `gridsearch`
# treats candidates whose fit fails.
grid["LR"] = {"C": [0.01, 0.1, 0.5, 1], "penalty": ["l1", "l2", "elasticnet", "none"]}
grid["KNN"] = {"n_neighbors": [1, 5, 10, 15], "weights": ["uniform", "distance"]}
grid["RF"] = {"n_estimators": [50, 100, 150]}
grid["AdaBoost"] = {"n_estimators": [50, 100, 150]}

# Metrics handed to the search; 'roc_auc' uses scikit-learn's built-in scorer name
scoring = {'roc_auc': 'roc_auc'}
scoring['accuracy'] = make_scorer(accuracy_score)
scoring['f1'] = make_scorer(f1_score)
scoring['f1_macro'] = make_scorer(f1_score, average='macro')
scoring['recall'] = make_scorer(recall_score)
scoring['precision'] = make_scorer(precision_score)

# `gridsearch` is a cust_functions helper; 'roc_auc' is passed as the refit metric
best_models, best_params = gridsearch(
    models, grid, X_train, y_train, scoring, refit='roc_auc'
)


In [4]:
# Show one ranking table per metric, highest "Best Score" first
for metric in best_models:
    table = best_models[metric]
    # In-place sort: the table stored in `best_models` stays sorted after this cell
    table.sort_values(by="Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{best_models[metric]}")


Best models based on roc_auc:
      Model  Best Score                                 Best Params
4        RF    0.954688                       {'n_estimators': 150}
5  AdaBoost    0.909498                       {'n_estimators': 150}
1       SVC    0.820037                  {'C': 1, 'kernel': 'poly'}
2        LR    0.810723              {'C': 0.01, 'penalty': 'none'}
3       KNN    0.750797  {'n_neighbors': 10, 'weights': 'distance'}
0     Dummy    0.500000               {'strategy': 'most_frequent'}

Best models based on accuracy:
      Model  Best Score                                 Best Params
5  AdaBoost    0.759360                       {'n_estimators': 150}
4        RF    0.751232                       {'n_estimators': 150}
3       KNN    0.730049  {'n_neighbors': 10, 'weights': 'distance'}
1       SVC    0.723892                  {'C': 1, 'kernel': 'poly'}
2        LR    0.702956              {'C': 0.01, 'penalty': 'none'}
0     Dummy    0.574384               {'strategy': 'm

### 1.2 Testing

In [6]:
# Evaluate every estimator with the hyper-parameters selected by the grid search.
# NOTE(review): estimators and parameter sets are paired purely by position, so this
# relies on `best_params` keeping the same order as `models` -- verify in `gridsearch`.
for model, (names, params) in zip(models, best_params.items()):
    print("Predicting with model %s" % names)
    # set_params() updates the estimator in place, so the instance held in
    # `models` carries the tuned parameters from here on
    model.set_params(**params)
    print(model)
    # cust_functions helpers: presumably fit on the training split and score on the
    # test split -- TODO confirm
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Recall pheno1: 1.000000
Recall pheno0: 0.000000
Precision pheno1: 0.750000
Precision pheno0: 0.000000
F1 pheno1: 0.857143
F1 pheno0: nan
Accuracy: 0.750000
AUC: 0.500000
--------------------------------------
Predicting with model SVC
SVC(C=1, kernel='poly', probability=True, random_state=42)
Recall pheno1: 0.976190
Recall pheno0: 1.000000
Precision pheno1: 1.000000
Precision pheno0: 0.933333
F1 pheno1: 0.987952
F1 pheno0: 0.965517
Accuracy: 0.982143
AUC: 0.996599
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.01, max_iter=10000, penalty='none', random_state=42)
Recall pheno1: 0.952381
Recall pheno0: 0.857143
Precision pheno1: 0.952381
Precision pheno0: 0.857143
F1 pheno1: 0.952381
F1 pheno0: 0.857143
Accuracy: 0.928571
AUC: 0.984694
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_neighbors=10, weights='distance')
Recall 

  f1_pheno0 = 2 * precision_pheno0 * recall_pheno0 / (precision_pheno0 + recall_pheno0)


Recall pheno1: 1.000000
Recall pheno0: 0.928571
Precision pheno1: 0.976744
Precision pheno0: 1.000000
F1 pheno1: 0.988235
F1 pheno0: 0.962963
Accuracy: 0.982143
AUC: 1.000000
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(n_estimators=150, random_state=42)
Recall pheno1: 0.904762
Recall pheno0: 0.785714
Precision pheno1: 0.926829
Precision pheno0: 0.733333
F1 pheno1: 0.915663
F1 pheno0: 0.758621
Accuracy: 0.875000
AUC: 0.947279
--------------------------------------


## 2.0 COVID-19 Data

In [34]:
# Read both COVID cohorts (protein tables and their design tables).
# NOTE(review): the X_train / X_test / y_train / y_test produced at the end of this cell
# are reassigned by the next cell, which loads the *_qm.csv files instead.
charite_data_X = pd.read_csv('covid_data/covid_train.tsv', sep='\t')
charite_data_y = pd.read_csv('covid_data/covid_train_design.tsv', sep='\t')
ins_data_X = pd.read_csv('covid_data/covid_test.txt', sep='\t', index_col=0)
ins_data_y = pd.read_csv('covid_data/covid_test_design.txt', sep='\t', index_col=0)

# Features: inner join on 'Protein' keeps only proteins found in both tables;
# NaNs become 0, 'Protein' is dropped and the frame is transposed
X = (
    pd.merge(charite_data_X, ins_data_X, on='Protein', how='inner')
    .fillna(0)
    .drop(columns=['Protein'])
    .transpose()
)

# Labels: stack both design tables, recode 1 -> 0 and 2 -> 1, keep the 'group' column
y = (
    pd.concat([charite_data_y, ins_data_y])
    .replace(1, 0)
    .replace(2, 1)
)['group']

# Stratified, shuffled 90/10 split with the notebook-wide seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=SEED,
    shuffle=True,
    stratify=y,
)


In [42]:
# Load the stored COVID train/test files (*_qm.csv). These assignments replace the
# X_train / X_test / y_train / y_test created by the train_test_split cell above.
# NOTE(review): no NaN filling or label recoding happens here -- presumably the
# *_qm files are already preprocessed; TODO confirm.

# Features: drop 'Protein' and transpose
X_train = (
    pd.read_csv('covid_data/covid_train_qm.csv', index_col=0)
    .drop(columns=['Protein'])
    .transpose()
)
X_test = (
    pd.read_csv('covid_data/covid_test_qm.csv', index_col=0)
    .drop(columns=['Protein'])
    .transpose()
)

# Labels: only the 'group' column of each design table is kept
y_train = pd.read_csv('covid_data/covid_train_design_qm.csv', index_col=0)['group']
y_test = pd.read_csv('covid_data/covid_test_design_qm.csv', index_col=0)['group']

### 2.1 Cross Validation

In [None]:
# Fresh estimator instances for the COVID experiment (the AKI testing loop called
# set_params() on the earlier ones, which modifies them in place).
# NOTE(review): the order here lines up with the key order of `grid`; presumably
# `gridsearch` pairs them positionally -- confirm in cust_functions.
seeded = {"random_state": SEED}
models = [
    DummyClassifier(**seeded),
    SVC(probability=True, **seeded),
    LogisticRegression(max_iter=10000, **seeded),
    KNeighborsClassifier(),
    RandomForestClassifier(**seeded),
    AdaBoostClassifier(**seeded),
]

# Hyper-parameter candidates, keyed by a short model name
grid = {}
grid["Dummy"] = {"strategy": ["most_frequent"]}
grid["SVC"] = {"C": [0.1, 1, 2], "kernel": ["linear", "rbf", "poly"]}
# NOTE(review): LogisticRegression keeps its default solver here. In scikit-learn the
# default 'lbfgs' solver does not support 'l1' or 'elasticnet' (the latter also needs
# `l1_ratio`), and C has no effect when penalty is 'none' -- check how `gridsearch`
# treats candidates whose fit fails.
grid["LR"] = {"C": [0.01, 0.1, 0.5, 1], "penalty": ["l1", "l2", "elasticnet", "none"]}
grid["KNN"] = {"n_neighbors": [1, 5, 10, 15], "weights": ["uniform", "distance"]}
grid["RF"] = {"n_estimators": [50, 100, 150]}
grid["AdaBoost"] = {"n_estimators": [50, 100, 150]}

# Metrics handed to the search; 'roc_auc' uses scikit-learn's built-in scorer name
scoring = {'roc_auc': 'roc_auc'}
scoring['accuracy'] = make_scorer(accuracy_score)
scoring['f1'] = make_scorer(f1_score)
scoring['f1_macro'] = make_scorer(f1_score, average='macro')
scoring['recall'] = make_scorer(recall_score)
scoring['precision'] = make_scorer(precision_score)

# `gridsearch` is a cust_functions helper; 'roc_auc' is passed as the refit metric
best_models, best_params = gridsearch(
    models, grid, X_train, y_train, scoring, refit='roc_auc'
)


In [46]:
# Show one ranking table per metric, highest "Best Score" first
for metric in best_models:
    table = best_models[metric]
    # In-place sort: the table stored in `best_models` stays sorted after this cell
    table.sort_values(by="Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{best_models[metric]}")


Best models based on roc_auc:
      Model  Best Score                                 Best Params
4        RF    0.944789                       {'n_estimators': 150}
1       SVC    0.915440                   {'C': 2, 'kernel': 'rbf'}
5  AdaBoost    0.909983                        {'n_estimators': 50}
2        LR    0.907310                {'C': 0.01, 'penalty': 'l2'}
3       KNN    0.898016  {'n_neighbors': 15, 'weights': 'distance'}
0     Dummy    0.500000               {'strategy': 'most_frequent'}

Best models based on accuracy:
      Model  Best Score                                 Best Params
4        RF    0.872610                       {'n_estimators': 150}
5  AdaBoost    0.828828                        {'n_estimators': 50}
2        LR    0.823184                {'C': 0.01, 'penalty': 'l2'}
1       SVC    0.823145                   {'C': 2, 'kernel': 'rbf'}
3       KNN    0.809030  {'n_neighbors': 15, 'weights': 'distance'}
0     Dummy    0.550215               {'strategy': 'm

### 2.2 Testing

In [47]:
# Evaluate every estimator with the hyper-parameters selected by the grid search.
# NOTE(review): estimators and parameter sets are paired purely by position, so this
# relies on `best_params` keeping the same order as `models` -- verify in `gridsearch`.
for model, (names, params) in zip(models, best_params.items()):
    print("Predicting with model %s" % names)
    # set_params() updates the estimator in place, so the instance held in
    # `models` carries the tuned parameters from here on
    model.set_params(**params)
    print(model)
    # cust_functions helpers: presumably fit on the training split and score on the
    # test split -- TODO confirm
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Recall pheno1: 1.000000
Recall pheno0: 0.000000
Precision pheno1: 0.518987
Precision pheno0: 0.000000
F1 pheno1: 0.683333
F1 pheno0: nan
Accuracy: 0.518987
AUC: 0.500000
--------------------------------------
Predicting with model SVC
SVC(C=2, probability=True, random_state=42)
Recall pheno1: 0.853659
Recall pheno0: 0.789474
Precision pheno1: 0.813953
Precision pheno0: 0.833333
F1 pheno1: 0.833333
F1 pheno0: 0.810811
Accuracy: 0.822785
AUC: 0.908858
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.01, max_iter=10000, random_state=42)
Recall pheno1: 0.804878
Recall pheno0: 0.789474
Precision pheno1: 0.804878
Precision pheno0: 0.789474
F1 pheno1: 0.804878
F1 pheno0: 0.789474
Accuracy: 0.797468
AUC: 0.895379
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_neighbors=15, weights='distance')
Recall pheno1: 0.853659
Recall pheno0:

  f1_pheno0 = 2 * precision_pheno0 * recall_pheno0 / (precision_pheno0 + recall_pheno0)


Recall pheno1: 0.829268
Recall pheno0: 0.894737
Precision pheno1: 0.894737
Precision pheno0: 0.829268
F1 pheno1: 0.860759
F1 pheno0: 0.860759
Accuracy: 0.860759
AUC: 0.927792
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(random_state=42)
Recall pheno1: 0.780488
Recall pheno0: 0.842105
Precision pheno1: 0.842105
Precision pheno0: 0.780488
F1 pheno1: 0.810127
F1 pheno0: 0.810127
Accuracy: 0.810127
AUC: 0.910783
--------------------------------------
