In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# cust_functions folder
from cust_functions.training import *
from cust_functions.graph_networks import *
from cust_functions.graph_creation import *

# Set random seed
# SEED is also passed as `random_state` to the estimators built in the
# cross-validation cells below. `set_seed` is provided by one of the wildcard
# `cust_functions` imports above.
# NOTE(review): presumably set_seed seeds the global RNGs (random / numpy / ...) — confirm.
SEED = 42
set_seed(SEED)


## 1.0 AKI Data

In [2]:
# Load data: the quantification matrix and the sample design matrix.
# NOTE(review): the matrix looks like proteins x samples (it is transposed below
# so that samples become rows) — confirm against the CSV.
input_data_qm = pd.read_csv("aki_data/test_qm.csv")
design_matrix = pd.read_csv("aki_data/design_matrix.tsv", sep="\t")

# Preprocess data: missing values become 0; the 'Protein' column is removed from
# the numeric matrix but stays available in `input_data_preprocessed`, which the
# feature-importance cell uses later.
input_data_preprocessed = input_data_qm.fillna(0)
input_data = input_data_preprocessed.drop(['Protein'], axis=1)

# Recode the class labels (1 -> 0, then 2 -> 1) in the 'group' column only.
# A frame-wide replace would also rewrite any 1 or 2 that happens to appear in
# other design-matrix columns (batch ids, replicate numbers, booleans, ...).
design_matrix = design_matrix.assign(
    group=design_matrix['group'].replace(1, 0).replace(2, 1)
)

# Split data into train and test: everything whose name contains 'M2012' is
# training data, everything else is held out for testing.
is_train_column = input_data.columns.str.contains('M2012')
is_train_sample = design_matrix['sample'].str.contains('M2012')

X_test = input_data.loc[:, ~is_train_column].transpose()
X_train = input_data.loc[:, is_train_column].transpose()
y_test = design_matrix['group'][~is_train_sample]
y_train = design_matrix['group'][is_train_sample]

# Features and labels are paired purely by position, so at minimum the counts
# must agree.
# NOTE(review): this also assumes the design-matrix rows are in the same order
# as the sample columns of the quantification matrix — confirm.
assert len(X_train) == len(y_train), "train features/labels differ in length"
assert len(X_test) == len(y_test), "test features/labels differ in length"


### 1.1 Cross Validation

In [None]:
# Candidate classifiers; SEED is passed wherever the estimator takes a random_state.
# NOTE(review): `gridsearch` presumably pairs `models` with the keys of `grid`
# by position — keep both in the same order (the testing cell zips them too).
models = [DummyClassifier(random_state = SEED),
          SVC(random_state = SEED, probability = True),
          LogisticRegression(random_state = SEED, max_iter = 10000),
          KNeighborsClassifier(),
          RandomForestClassifier(random_state = SEED),
          AdaBoostClassifier(random_state = SEED),]

# Hyper-parameter grid per model.
grid = {"Dummy": {"strategy": ["most_frequent"]},
        "SVC": {"C": [0.1, 1, 2], "kernel": ["linear", "rbf", "poly"]},
        # LogisticRegression is left on its default solver (lbfgs), which only
        # supports the "l2" and "none" penalties. "l1" and "elasticnet" can
        # never fit with this estimator, so they are not listed; searching
        # them would need solver="saga" (plus l1_ratio for elasticnet).
        # With penalty "none" the value of C has no effect.
        # NOTE: scikit-learn >= 1.2 spells the unpenalised option `None`; the
        # string "none" is kept to match the environment that produced the
        # recorded outputs.
        "LR": {"C": [0.01, 0.1, 0.5, 1], "penalty": ["l2", "none"]},
        "KNN": {"n_neighbors": [1, 5, 10, 15], "weights": ["uniform", "distance"]},
        "RF": {"n_estimators": [50, 100, 150]},
        "AdaBoost": {"n_estimators": [50, 100, 150]}}

# Metrics collected for every candidate; `refit` below names the one used to
# pick the best parameters.
scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'f1_macro': make_scorer(f1_score, average='macro'),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score)
}

best_models, best_params, fitted_models = gridsearch(models, grid, X_train, y_train, scoring, refit = 'roc_auc')


In [4]:
# Show one leaderboard per metric, best score first. The table stored in
# `best_models` is sorted in place, so it stays ordered for later cells.
for metric in best_models:
    ranking = best_models[metric]
    ranking.sort_values(by="Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{ranking}")


Best models based on roc_auc:
      Model  Best Score                                 Best Params
4        RF    0.954688                       {'n_estimators': 150}
5  AdaBoost    0.909498                       {'n_estimators': 150}
1       SVC    0.820037                  {'C': 1, 'kernel': 'poly'}
2        LR    0.810723              {'C': 0.01, 'penalty': 'none'}
3       KNN    0.750797  {'n_neighbors': 10, 'weights': 'distance'}
0     Dummy    0.500000               {'strategy': 'most_frequent'}

Best models based on accuracy:
      Model  Best Score                                 Best Params
5  AdaBoost    0.759360                       {'n_estimators': 150}
4        RF    0.751232                       {'n_estimators': 150}
3       KNN    0.730049  {'n_neighbors': 10, 'weights': 'distance'}
1       SVC    0.723892                  {'C': 1, 'kernel': 'poly'}
2        LR    0.702956              {'C': 0.01, 'penalty': 'none'}
0     Dummy    0.574384               {'strategy': 'm

In [6]:
# Feature ranking per fitted model (the recorded output lists RF and AdaBoost)
top_features_with_names = extract_top_features(fitted_models, X_train, input_data_preprocessed)

for model_name, features in top_features_with_names.items():
    print(f"Top features for {model_name}:")
    # Judging by the printed output, each entry holds
    # (feature index, protein accession, importance) at positions 0, 1, 2.
    for position, feature in enumerate(features, start=1):
        print(f"{position}. {feature[0]} ({feature[1]}): {feature[2]}")
    print("--------------------")

Top features for RF:
1. 232 (P59665): 0.029
2. 462 (P80188): 0.028
3. 223 (P19320): 0.027
4. 271 (P13987): 0.023
5. 148 (P08571): 0.022
6. 163 (P61769): 0.021
7. 153 (P16070): 0.021
8. 536 (P08637): 0.019
9. 16 (P01833): 0.017
10. 78 (P05362): 0.016
11. 60 (Q08380): 0.015
12. 341 (P02654): 0.015
13. 339 (Q06033): 0.012
14. 382 (Q6EMK4): 0.012
15. 238 (P00995): 0.012
16. 191 (P15291): 0.011
17. 19 (P19823): 0.011
18. 85 (Q9Y6R7): 0.011
19. 41 (P06727): 0.01
20. 56 (P43652): 0.01
21. 356 (Q96PD5): 0.009
22. 321 (P10451): 0.009
23. 139 (Q86VB7): 0.009
24. 347 (P08493): 0.009
25. 35 (P61626): 0.009
26. 63 (P19827): 0.009
27. 425 (P27169): 0.008
28. 397 (P14151): 0.008
29. 439 (P22352): 0.008
30. 319 (Q9UBR2): 0.007
--------------------
Top features for AdaBoost:
1. 413 (P35858): 0.073
2. 426 (P01880): 0.06
3. 232 (P59665): 0.047
4. 236 (P01625): 0.04
5. 35 (P61626): 0.033
6. 41 (P06727): 0.033
7. 253 (P00915): 0.033
8. 339 (Q06033): 0.033
9. 42 (P04114): 0.027
10. 148 (P08571): 0.027
11. 2

In [7]:
# Features that show up in the top lists of both Random Forest and AdaBoost
common_features_info = find_common_features(top_features_with_names, 'RF', 'AdaBoost')

print("Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:")
for feature, info in common_features_info.items():
    # One output line per shared feature: RF details first, then AdaBoost.
    rf_summary = f"{feature}: Protein - {info['RF_Protein']}, RandomForest Position - {info['RF_Position']}, Importance - {info['RF_Importance']}; "
    ada_summary = f"Protein - {info['AdaBoost_Protein']}, AdaBoost Position - {info['AdaBoost_Position']}, Importance - {info['AdaBoost_Importance']}"
    print(rf_summary + ada_summary)


Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:
223: Protein - P19320, RandomForest Position - 3, Importance - 0.027; Protein - P19320, AdaBoost Position - 13, Importance - 0.02
35: Protein - P61626, RandomForest Position - 25, Importance - 0.009; Protein - P61626, AdaBoost Position - 5, Importance - 0.033
356: Protein - Q96PD5, RandomForest Position - 21, Importance - 0.009; Protein - Q96PD5, AdaBoost Position - 24, Importance - 0.013
232: Protein - P59665, RandomForest Position - 1, Importance - 0.029; Protein - P59665, AdaBoost Position - 3, Importance - 0.047
41: Protein - P06727, RandomForest Position - 19, Importance - 0.01; Protein - P06727, AdaBoost Position - 6, Importance - 0.033
78: Protein - P05362, RandomForest Position - 10, Importance - 0.016; Protein - P05362, AdaBoost Position - 12, Importance - 0.02
462: Protein - P80188, RandomForest Position - 2, Importance - 0.028; Protein - P80188, AdaBoost Position - 25,

### 1.2 Testing

In [8]:
# Evaluate every classifier on the held-out samples using the parameters chosen
# during cross-validation. `models` and `best_params` are walked in parallel, so
# they must be in the same order.
for model, (names, params) in zip(models, best_params.items()):
    print("Predicting with model %s" % names)
    # Apply the selected parameters and echo the configured estimator
    model = model.set_params(**params)
    print(model)
    # NOTE(review): predict_ml_model presumably fits on the training split before
    # predicting on the test split — confirm in cust_functions.
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Recall pheno1: 1.000000
Recall pheno0: 0.000000
Precision pheno1: 0.750000
Precision pheno0: 0.000000
F1 pheno1: 0.857143
F1 pheno0: nan
Accuracy: 0.750000
AUC: 0.500000
--------------------------------------
Predicting with model SVC
SVC(C=1, kernel='poly', probability=True, random_state=42)
Recall pheno1: 0.976190
Recall pheno0: 1.000000
Precision pheno1: 1.000000
Precision pheno0: 0.933333
F1 pheno1: 0.987952
F1 pheno0: 0.965517
Accuracy: 0.982143
AUC: 0.996599
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.01, max_iter=10000, penalty='none', random_state=42)
Recall pheno1: 0.952381
Recall pheno0: 0.857143
Precision pheno1: 0.952381
Precision pheno0: 0.857143
F1 pheno1: 0.952381
F1 pheno0: 0.857143
Accuracy: 0.928571
AUC: 0.984694
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_neighbors=10, weights='distance')
Recall 

  f1_pheno0 = 2 * precision_pheno0 * recall_pheno0 / (precision_pheno0 + recall_pheno0)


Recall pheno1: 1.000000
Recall pheno0: 0.928571
Precision pheno1: 0.976744
Precision pheno0: 1.000000
F1 pheno1: 0.988235
F1 pheno0: 0.962963
Accuracy: 0.982143
AUC: 1.000000
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(n_estimators=150, random_state=42)
Recall pheno1: 0.904762
Recall pheno0: 0.785714
Precision pheno1: 0.926829
Precision pheno0: 0.733333
F1 pheno1: 0.915663
F1 pheno0: 0.758621
Accuracy: 0.875000
AUC: 0.947279
--------------------------------------


## 2.0 COVID-19 Data

In [12]:
# Read the pre-split COVID-19 quantification matrices and design tables; the
# first CSV column is used as the index in each.
covid_train_X = pd.read_csv('covid_data/covid_train_qm.csv', index_col=0)
covid_test_X = pd.read_csv('covid_data/covid_test_qm.csv', index_col=0)
covid_train_y = pd.read_csv('covid_data/covid_train_design_qm.csv', index_col=0)
covid_test_y = pd.read_csv('covid_data/covid_test_design_qm.csv', index_col=0)

# Reshape data: drop the 'Protein' column and transpose so the former columns
# become rows; labels come from the 'group' column of the design tables.
# NOTE: this rebinds X_train / X_test / y_train / y_test from the AKI section.
X_train = covid_train_X.drop(columns=['Protein']).T
X_test = covid_test_X.drop(columns=['Protein']).T
y_train = covid_train_y.loc[:, 'group']
y_test = covid_test_y.loc[:, 'group']

### 2.1 Cross Validation

In [None]:
# Candidate classifiers; SEED is passed wherever the estimator takes a random_state.
# NOTE(review): `gridsearch` presumably pairs `models` with the keys of `grid`
# by position — keep both in the same order (the testing cell zips them too).
models = [DummyClassifier(random_state = SEED),
          SVC(random_state = SEED, probability = True),
          LogisticRegression(random_state = SEED, max_iter = 10000),
          KNeighborsClassifier(),
          RandomForestClassifier(random_state = SEED),
          AdaBoostClassifier(random_state = SEED),]

# Hyper-parameter grid per model.
grid = {"Dummy": {"strategy": ["most_frequent"]},
        "SVC": {"C": [0.1, 1, 2], "kernel": ["linear", "rbf", "poly"]},
        # LogisticRegression is left on its default solver (lbfgs), which only
        # supports the "l2" and "none" penalties. "l1" and "elasticnet" can
        # never fit with this estimator, so they are not listed; searching
        # them would need solver="saga" (plus l1_ratio for elasticnet).
        # With penalty "none" the value of C has no effect.
        # NOTE: scikit-learn >= 1.2 spells the unpenalised option `None`; the
        # string "none" is kept to match the environment that produced the
        # recorded outputs.
        "LR": {"C": [0.01, 0.1, 0.5, 1], "penalty": ["l2", "none"]},
        "KNN": {"n_neighbors": [1, 5, 10, 15], "weights": ["uniform", "distance"]},
        "RF": {"n_estimators": [50, 100, 150]},
        "AdaBoost": {"n_estimators": [50, 100, 150]}}

# Metrics collected for every candidate; `refit` below names the one used to
# pick the best parameters.
scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'f1_macro': make_scorer(f1_score, average='macro'),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score)
}

best_models, best_params, fitted_models = gridsearch(models, grid, X_train, y_train, scoring, refit = 'roc_auc')


In [14]:
# Show one leaderboard per metric, best score first. The table stored in
# `best_models` is sorted in place, so it stays ordered for later cells.
for metric in best_models:
    ranking = best_models[metric]
    ranking.sort_values(by="Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{ranking}")


Best models based on roc_auc:
      Model  Best Score                                 Best Params
4        RF    0.944789                       {'n_estimators': 150}
1       SVC    0.915440                   {'C': 2, 'kernel': 'rbf'}
5  AdaBoost    0.909983                        {'n_estimators': 50}
2        LR    0.907310                {'C': 0.01, 'penalty': 'l2'}
3       KNN    0.898016  {'n_neighbors': 15, 'weights': 'distance'}
0     Dummy    0.500000               {'strategy': 'most_frequent'}

Best models based on accuracy:
      Model  Best Score                                 Best Params
4        RF    0.872610                       {'n_estimators': 150}
5  AdaBoost    0.828828                        {'n_estimators': 50}
2        LR    0.823184                {'C': 0.01, 'penalty': 'l2'}
1       SVC    0.823145                   {'C': 2, 'kernel': 'rbf'}
3       KNN    0.809030  {'n_neighbors': 15, 'weights': 'distance'}
0     Dummy    0.550215               {'strategy': 'm

In [19]:
# Feature ranking per fitted model (the recorded output lists RF and AdaBoost)
top_features_with_names = extract_top_features(fitted_models, X_train, covid_train_X)

for model_name, features in top_features_with_names.items():
    print(f"Top features for {model_name}: (Protein: Importance)")
    # Only positions 1 and 2 of each entry are shown here (protein and
    # importance, per the header printed above).
    for position, feature in enumerate(features, start=1):
        print(f"{position}. {feature[1]}: {feature[2]}")
    print("--------------------")

Top features for RF: (Protein: Importance)
1. P02790: 0.06
2. P01833: 0.044
3. P06396: 0.038
4. P01034: 0.036
5. P02775: 0.03
6. P02741: 0.029
7. P25311: 0.028
8. P36955: 0.026
9. P02760: 0.026
10. P0DJI9: 0.025
11. P02776: 0.024
12. Q14624: 0.021
13. P02649: 0.021
14. P05109: 0.02
15. P18428: 0.019
16. P07996: 0.016
17. P02748: 0.014
18. P00488: 0.014
19. P02750: 0.013
20. P00740: 0.012
21. P02749: 0.011
22. P00734: 0.011
23. P00748: 0.01
24. P02671: 0.01
25. P01031: 0.01
26. P08571: 0.009
27. P07998: 0.008
28. P02647: 0.008
29. Q9BXR6: 0.008
30. P04430: 0.007
--------------------
Top features for AdaBoost: (Protein: Importance)
1. P01833: 0.04
2. P25311: 0.04
3. P01019: 0.04
4. P00748: 0.04
5. P00488: 0.04
6. P00740: 0.04
7. P01023: 0.02
8. P01024: 0.02
9. P02748: 0.02
10. P22792: 0.02
11. P23142: 0.02
12. P13671: 0.02
13. P05109: 0.02
14. P02675: 0.02
15. P00747: 0.02
16. P23083: 0.02
17. P04430: 0.02
18. P01602: 0.02
19. P01619: 0.02
20. P18428: 0.02
21. P04003: 0.02
22. P02790: 0.

In [20]:
# Features that show up in the top lists of both Random Forest and AdaBoost
common_features_info = find_common_features(top_features_with_names, 'RF', 'AdaBoost')

print("Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:")
for feature, info in common_features_info.items():
    # One output line per shared feature: RF details first, then AdaBoost.
    rf_summary = f"{feature}: Protein - {info['RF_Protein']}, RandomForest Position - {info['RF_Position']}, Importance - {info['RF_Importance']}; "
    ada_summary = f"Protein - {info['AdaBoost_Protein']}, AdaBoost Position - {info['AdaBoost_Position']}, Importance - {info['AdaBoost_Importance']}"
    print(rf_summary + ada_summary)


Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:
6: Protein - P01833, RandomForest Position - 2, Importance - 0.044; Protein - P01833, AdaBoost Position - 1, Importance - 0.04
70: Protein - P00488, RandomForest Position - 18, Importance - 0.014; Protein - P00488, AdaBoost Position - 5, Importance - 0.04
40: Protein - P04430, RandomForest Position - 30, Importance - 0.007; Protein - P04430, AdaBoost Position - 17, Importance - 0.02
59: Protein - P02647, RandomForest Position - 28, Importance - 0.008; Protein - P02647, AdaBoost Position - 25, Importance - 0.02
10: Protein - P02748, RandomForest Position - 17, Importance - 0.014; Protein - P02748, AdaBoost Position - 9, Importance - 0.02
71: Protein - P00734, RandomForest Position - 22, Importance - 0.011; Protein - P00734, AdaBoost Position - 28, Importance - 0.02
140: Protein - P00740, RandomForest Position - 20, Importance - 0.012; Protein - P00740, AdaBoost Position - 6, Impor

### 2.2 Testing

In [None]:
# Evaluate every classifier on the held-out samples using the parameters chosen
# during cross-validation. `models` and `best_params` are walked in parallel, so
# they must be in the same order.
for model, (names, params) in zip(models, best_params.items()):
    print("Predicting with model %s" % names)
    # Apply the selected parameters and echo the configured estimator
    model = model.set_params(**params)
    print(model)
    # NOTE(review): predict_ml_model presumably fits on the training split before
    # predicting on the test split — confirm in cust_functions.
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Recall pheno1: 1.000000
Recall pheno0: 0.000000
Precision pheno1: 0.518987
Precision pheno0: 0.000000
F1 pheno1: 0.683333
F1 pheno0: nan
Accuracy: 0.518987
AUC: 0.500000
--------------------------------------
Predicting with model SVC
SVC(C=2, probability=True, random_state=42)
Recall pheno1: 0.853659
Recall pheno0: 0.789474
Precision pheno1: 0.813953
Precision pheno0: 0.833333
F1 pheno1: 0.833333
F1 pheno0: 0.810811
Accuracy: 0.822785
AUC: 0.908858
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.01, max_iter=10000, random_state=42)
Recall pheno1: 0.804878
Recall pheno0: 0.789474
Precision pheno1: 0.804878
Precision pheno0: 0.789474
F1 pheno1: 0.804878
F1 pheno0: 0.789474
Accuracy: 0.797468
AUC: 0.895379
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_neighbors=15, weights='distance')
Recall pheno1: 0.853659
Recall pheno0:

  f1_pheno0 = 2 * precision_pheno0 * recall_pheno0 / (precision_pheno0 + recall_pheno0)


Recall pheno1: 0.829268
Recall pheno0: 0.894737
Precision pheno1: 0.894737
Precision pheno0: 0.829268
F1 pheno1: 0.860759
F1 pheno0: 0.860759
Accuracy: 0.860759
AUC: 0.927792
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(random_state=42)
Recall pheno1: 0.780488
Recall pheno0: 0.842105
Precision pheno1: 0.842105
Precision pheno0: 0.780488
F1 pheno1: 0.810127
F1 pheno0: 0.810127
Accuracy: 0.810127
AUC: 0.910783
--------------------------------------
