In [40]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# cust_functions folder
from cust_functions.ml_helper import *
from cust_functions.training import set_seed

# Fix the random seed once for the whole notebook. SEED is also passed as
# `random_state` to the stochastic estimators built in the cross-validation cells.
# NOTE(review): set_seed is imported from cust_functions.training; presumably it
# seeds the global random number generators -- confirm against that module.
SEED = 42
set_seed(SEED)


## 1.0 AKI Data

In [41]:
# Load data: the quantification matrix (a 'Protein' column plus one column per
# sample) and the design matrix that holds each sample's name and group label.
input_data_qm = pd.read_csv("aki_data/test_qm.csv")
design_matrix = pd.read_csv("aki_data/design_matrix.tsv", sep="\t")

# Preprocess data: missing values become 0. `input_data_preprocessed` keeps the
# 'Protein' column (it is passed to extract_top_features later), while
# `input_data` is the purely numeric model input.
input_data_preprocessed = input_data_qm.fillna(0)
input_data = input_data_preprocessed.drop(['Protein'], axis=1)

# Recode the class labels 1 -> 0 and 2 -> 1. Only the 'group' column is recoded:
# a frame-wide replace would also rewrite any 1/2 found in unrelated columns.
design_matrix = design_matrix.assign(
    group=design_matrix['group'].replace({1: 0, 2: 1})
)

# Split data into train and test by sample name: samples containing 'M2012'
# form the training set, every other sample goes to the test set.
train_columns = input_data.columns.str.contains('M2012')
train_samples = design_matrix['sample'].str.contains('M2012')

X_test = input_data.loc[:, ~train_columns].transpose()
X_train = input_data.loc[:, train_columns].transpose()
y_test = design_matrix['group'][~train_samples]
y_train = design_matrix['group'][train_samples]

# Features and labels are paired purely by position, so fail early if the two
# files disagree on how many samples fall into each split.
# NOTE(review): this does not verify that both files list samples in the same order.
assert len(X_train) == len(y_train), "train samples and labels differ in count"
assert len(X_test) == len(y_test), "test samples and labels differ in count"


### 1.1 Cross Validation

In [None]:
# Candidate classifiers. Order matters: the testing cell zips `models` with the
# keys of `best_params`, so keep this list in the same order as `grid` below.
models = [
    DummyClassifier(random_state=SEED),
    SVC(random_state=SEED, probability=True),
    LogisticRegression(random_state=SEED, max_iter=10000),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
]

# Hyper-parameter search space for each model, keyed by the model's short name.
grid = {
    "Dummy": {"strategy": ["most_frequent"]},
    "SVC": {"C": [0.1, 1, 10, 15], "kernel": ["linear", "rbf", "poly"]},
    # LogisticRegression above uses its default solver, which only supports the
    # L2 penalty or no penalty. "l1" and "elasticnet" candidates could never be
    # fitted (elasticnet would additionally need l1_ratio), so they are left out.
    # NOTE(review): newer scikit-learn spells the unpenalised option None instead
    # of "none" -- switch when the environment is upgraded.
    "LR": {"C": [0.0001, 0.001, 0.01, 0.1], "penalty": ["l2", "none"]},
    "KNN": {"n_neighbors": [5, 10, 15, 20], "weights": ["uniform", "distance"]},
    # "auto" is omitted: for RandomForestClassifier it was an alias of "sqrt" and
    # is rejected by newer scikit-learn, so it only added duplicate or failed fits.
    "RF": {
        "n_estimators": [50, 100, 150],
        "max_depth": [3, 5, 10],
        "max_features": ["sqrt", "log2"],
        "min_samples_leaf": [1, 2, 4],
    },
    "AdaBoost": {"n_estimators": [50, 100, 150, 200], "learning_rate": [0.1, 0.5, 1]},
}

# Metrics reported for every candidate; model selection itself uses ROC AUC
# (see `refit` in the gridsearch call).
scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'f1_macro': make_scorer(f1_score, average='macro'),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
}

# gridsearch is a project helper (cust_functions.ml_helper). Its three results
# are used below as: per-metric ranking tables, best parameters per model name,
# and the fitted models handed to extract_top_features.
best_models, best_params, fitted_models = gridsearch(
    models, grid, X_train, y_train, scoring, refit='roc_auc'
)


In [44]:
# Show one ranking table per metric, best-scoring model first.
for metric in best_models:
    ranked_table = best_models[metric]
    # Sort the stored table itself (in place) so `best_models` stays ordered.
    ranked_table.sort_values(by="Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{ranked_table}")


Best models based on roc_auc:
      Model  Best Score  St. Dev.  \
4        RF    0.955208  0.028067   
5  AdaBoost    0.909498  0.096624   
1       SVC    0.821262  0.128525   
2        LR    0.810723  0.130290   
3       KNN    0.750797  0.089472   
0     Dummy    0.500000  0.000000   

                                         Best Params  
4  {'max_depth': 5, 'max_features': 'sqrt', 'min_...  
5          {'learning_rate': 1, 'n_estimators': 150}  
1                         {'C': 10, 'kernel': 'rbf'}  
2                   {'C': 0.0001, 'penalty': 'none'}  
3         {'n_neighbors': 10, 'weights': 'distance'}  
0                      {'strategy': 'most_frequent'}  

Best models based on accuracy:
      Model  Best Score  St. Dev.  \
4        RF    0.772414  0.074706   
5  AdaBoost    0.759360  0.097823   
3       KNN    0.730049  0.059540   
1       SVC    0.709606  0.151231   
2        LR    0.702956  0.141383   
0     Dummy    0.574384  0.005911   

                                

In [45]:
# Collect the top-ranked features of the fitted tree-based models (the output
# below lists RF and AdaBoost). Each entry is indexed as
# (feature id, protein name, importance).
top_features_with_names = extract_top_features(fitted_models, X_train, input_data_preprocessed)

for model_name, features in top_features_with_names.items():
    print(f"Top features for {model_name}:")
    for rank, feature in enumerate(features):
        position = rank + 1  # ranks are reported 1-based
        feature_id, protein, importance = feature[0], feature[1], feature[2]
        print(f"{position}. {feature_id} ({protein}): {importance}")
    print("--------------------")

Top features for RF:
1. 232 (P59665): 0.029
2. 462 (P80188): 0.029
3. 223 (P19320): 0.028
4. 271 (P13987): 0.023
5. 148 (P08571): 0.022
6. 163 (P61769): 0.021
7. 153 (P16070): 0.021
8. 536 (P08637): 0.02
9. 16 (P01833): 0.018
10. 78 (P05362): 0.017
11. 60 (Q08380): 0.015
12. 341 (P02654): 0.015
13. 382 (Q6EMK4): 0.012
14. 339 (Q06033): 0.012
15. 85 (Q9Y6R7): 0.012
16. 238 (P00995): 0.012
17. 191 (P15291): 0.011
18. 19 (P19823): 0.01
19. 41 (P06727): 0.01
20. 56 (P43652): 0.01
21. 356 (Q96PD5): 0.01
22. 321 (P10451): 0.009
23. 347 (P08493): 0.009
24. 35 (P61626): 0.009
25. 139 (Q86VB7): 0.009
26. 63 (P19827): 0.009
27. 425 (P27169): 0.009
28. 439 (P22352): 0.009
29. 397 (P14151): 0.009
30. 413 (P35858): 0.008
--------------------
Top features for AdaBoost:
1. 413 (P35858): 0.073
2. 426 (P01880): 0.06
3. 232 (P59665): 0.047
4. 236 (P01625): 0.04
5. 35 (P61626): 0.033
6. 41 (P06727): 0.033
7. 253 (P00915): 0.033
8. 339 (Q06033): 0.033
9. 42 (P04114): 0.027
10. 148 (P08571): 0.027
11. 238 

In [46]:
# Intersect the top-feature lists of Random Forest and AdaBoost and report, for
# every shared feature, its protein, rank and importance in each model.
common_features_info = find_common_features(top_features_with_names, 'RF', 'AdaBoost')

print("Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:")
for feature, info in common_features_info.items():
    # Build the Random Forest half and the AdaBoost half separately, then print
    # them as a single line.
    rf_summary = (
        f"{feature}: Protein - {info['RF_Protein']}, "
        f"RandomForest Position - {info['RF_Position']}, "
        f"Importance - {info['RF_Importance']}; "
    )
    ada_summary = (
        f"Protein - {info['AdaBoost_Protein']}, "
        f"AdaBoost Position - {info['AdaBoost_Position']}, "
        f"Importance - {info['AdaBoost_Importance']}"
    )
    print(rf_summary + ada_summary)


Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:
35: Protein - P61626, RandomForest Position - 24, Importance - 0.009; Protein - P61626, AdaBoost Position - 5, Importance - 0.033
356: Protein - Q96PD5, RandomForest Position - 21, Importance - 0.01; Protein - Q96PD5, AdaBoost Position - 24, Importance - 0.013
232: Protein - P59665, RandomForest Position - 1, Importance - 0.029; Protein - P59665, AdaBoost Position - 3, Importance - 0.047
41: Protein - P06727, RandomForest Position - 19, Importance - 0.01; Protein - P06727, AdaBoost Position - 6, Importance - 0.033
78: Protein - P05362, RandomForest Position - 10, Importance - 0.017; Protein - P05362, AdaBoost Position - 12, Importance - 0.02
462: Protein - P80188, RandomForest Position - 2, Importance - 0.029; Protein - P80188, AdaBoost Position - 25, Importance - 0.013
238: Protein - P00995, RandomForest Position - 16, Importance - 0.012; Protein - P00995, AdaBoost Position - 11

### 1.2 Testing

In [33]:
# Evaluate every classifier on the held-out test set using the best
# hyper-parameters found during cross-validation. `models` and the keys of
# `best_params` are paired by position.
for model, names in zip(models, best_params.keys()):
    print("Predicting with model %s" % names)
    # Apply the tuned hyper-parameters, then show the resulting estimator.
    tuned_params = best_params[names]
    model = model.set_params(**tuned_params)
    print(model)
    # predict_ml_model / print_ml_metrics are project helpers; the call receives
    # both the training and the test split and returns predictions,
    # probabilities and a confusion matrix.
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Accuracy: 0.750000
AUC: 0.500000
F1 Macro: nan
F1 pheno2: 0.857143
F1 pheno1: nan
Recall pheno2: 1.000000
Recall pheno1: 0.000000
Precision pheno2: 0.750000
Precision pheno1: 0.000000
--------------------------------------
Predicting with model SVC
SVC(C=10, probability=True, random_state=42)
Accuracy: 0.946429
AUC: 0.996599
F1 Macro: 0.926797
F1 pheno2: 0.964706
F1 pheno1: 0.888889
Recall pheno2: 0.976190
Recall pheno1: 0.857143
Precision pheno2: 0.953488
Precision pheno1: 0.923077
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.0001, max_iter=10000, penalty='none', random_state=42)
Accuracy: 0.928571
AUC: 0.984694
F1 Macro: 0.904762
F1 pheno2: 0.952381
F1 pheno1: 0.857143
Recall pheno2: 0.952381
Recall pheno1: 0.857143
Precision pheno2: 0.952381
Precision pheno1: 0.857143
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_n

  f1_pheno1 = 2 * precision_pheno0 * recall_pheno1 / (precision_pheno0 + recall_pheno1)


Accuracy: 0.982143
AUC: 1.000000
F1 Macro: 0.975599
F1 pheno2: 0.988235
F1 pheno1: 0.962963
Recall pheno2: 1.000000
Recall pheno1: 0.928571
Precision pheno2: 0.976744
Precision pheno1: 1.000000
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(learning_rate=1, n_estimators=150, random_state=42)
Accuracy: 0.875000
AUC: 0.947279
F1 Macro: 0.837142
F1 pheno2: 0.915663
F1 pheno1: 0.758621
Recall pheno2: 0.904762
Recall pheno1: 0.785714
Precision pheno2: 0.926829
Precision pheno1: 0.733333
--------------------------------------


## 2.0 COVID-19 Data

In [34]:
# Load the pre-split COVID-19 quantification matrices and their design matrices
# (first CSV column is used as the index).
covid_train_X = pd.read_csv('covid_data/covid_train_qm.csv', index_col=0)
covid_test_X = pd.read_csv('covid_data/covid_test_qm.csv', index_col=0)
covid_train_y = pd.read_csv('covid_data/covid_train_design_qm.csv', index_col=0)
covid_test_y = pd.read_csv('covid_data/covid_test_design_qm.csv', index_col=0)

# Reshape data: remove the 'Protein' identifier column and flip the matrices so
# that each row is one sample; the labels are the 'group' column.
X_train = covid_train_X.drop(columns=['Protein']).T
X_test = covid_test_X.drop(columns=['Protein']).T
y_train = covid_train_y['group']
y_test = covid_test_y['group']

### 2.1 Cross Validation

In [None]:
# Candidate classifiers. Order matters: the testing cell zips `models` with the
# keys of `best_params`, so keep this list in the same order as `grid` below.
models = [
    DummyClassifier(random_state=SEED),
    SVC(random_state=SEED, probability=True),
    LogisticRegression(random_state=SEED, max_iter=10000),
    KNeighborsClassifier(),
    RandomForestClassifier(random_state=SEED),
    AdaBoostClassifier(random_state=SEED),
]

# Hyper-parameter search space for each model, keyed by the model's short name.
grid = {
    "Dummy": {"strategy": ["most_frequent"]},
    "SVC": {"C": [0.1, 1, 5, 10], "kernel": ["linear", "rbf", "poly"]},
    # LogisticRegression above uses its default solver, which only supports the
    # L2 penalty or no penalty. "l1" and "elasticnet" candidates could never be
    # fitted (elasticnet would additionally need l1_ratio), so they are left out.
    # NOTE(review): newer scikit-learn spells the unpenalised option None instead
    # of "none" -- switch when the environment is upgraded.
    "LR": {"C": [0.0001, 0.001, 0.01, 0.1], "penalty": ["l2", "none"]},
    "KNN": {"n_neighbors": [5, 10, 30, 50], "weights": ["uniform", "distance"]},
    # "auto" is omitted: for RandomForestClassifier it was an alias of "sqrt" and
    # is rejected by newer scikit-learn, so it only added duplicate or failed fits.
    "RF": {
        "n_estimators": [50, 100, 150, 200],
        "max_depth": [10, 15, 20],
        "max_features": ["sqrt", "log2"],
    },
    "AdaBoost": {"n_estimators": [100, 150, 200], "learning_rate": [0.01, 0.05, 0.1, 1]},
}

# Metrics reported for every candidate; model selection itself uses ROC AUC
# (see `refit` in the gridsearch call).
scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'f1_macro': make_scorer(f1_score, average='macro'),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score),
}

# gridsearch is a project helper (cust_functions.ml_helper). Its three results
# are used below as: per-metric ranking tables, best parameters per model name,
# and the fitted models handed to extract_top_features.
best_models, best_params, fitted_models = gridsearch(
    models, grid, X_train, y_train, scoring, refit='roc_auc'
)


In [36]:
# Show one ranking table per metric, best-scoring model first.
for metric in best_models:
    ranked_table = best_models[metric]
    # Sort the stored table itself (in place) so `best_models` stays ordered.
    ranked_table.sort_values(by="Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{ranked_table}")


Best models based on roc_auc:
      Model  Best Score  St. Dev.  \
4        RF    0.949431  0.020366   
5  AdaBoost    0.927131  0.020283   
1       SVC    0.916160  0.019215   
2        LR    0.908927  0.036591   
3       KNN    0.904996  0.027914   
0     Dummy    0.500000  0.000000   

                                         Best Params  
4  {'max_depth': 10, 'max_features': 'log2', 'n_e...  
5        {'learning_rate': 0.1, 'n_estimators': 150}  
1                          {'C': 5, 'kernel': 'rbf'}  
2                      {'C': 0.001, 'penalty': 'l2'}  
3         {'n_neighbors': 30, 'weights': 'distance'}  
0                      {'strategy': 'most_frequent'}  

Best models based on accuracy:
      Model  Best Score  St. Dev.  \
4        RF    0.869803  0.021194   
5  AdaBoost    0.854240  0.030831   
3       KNN    0.827390  0.018658   
1       SVC    0.826001  0.022341   
2        LR    0.821706  0.034890   
0     Dummy    0.550215  0.002696   

                                

In [37]:
# Collect the top-ranked features of the fitted tree-based models (the output
# below lists RF and AdaBoost). Only the protein name and the importance of
# each entry are printed here.
top_features_with_names = extract_top_features(fitted_models, X_train, covid_train_X)

for model_name, features in top_features_with_names.items():
    print(f"Top features for {model_name}: (Protein: Importance)")
    for rank, feature in enumerate(features):
        position = rank + 1  # ranks are reported 1-based
        protein, importance = feature[1], feature[2]
        print(f"{position}. {protein}: {importance}")
    print("--------------------")

Top features for RF: (Protein: Importance)
1. P02790: 0.039
2. P01833: 0.035
3. P01034: 0.032
4. P06396: 0.028
5. P02760: 0.028
6. P02741: 0.027
7. P02776: 0.025
8. P36955: 0.024
9. P25311: 0.023
10. P05109: 0.022
11. P18428: 0.022
12. P0DJI9: 0.02
13. P02775: 0.02
14. P00740: 0.017
15. Q14624: 0.017
16. P02649: 0.015
17. P02748: 0.015
18. P02750: 0.014
19. P07996: 0.014
20. P02671: 0.013
21. P00488: 0.012
22. P00734: 0.012
23. P07998: 0.01
24. P02747: 0.01
25. P02745: 0.01
26. P08571: 0.009
27. P01019: 0.009
28. Q9BXR6: 0.009
29. P01031: 0.009
30. P00748: 0.009
--------------------
Top features for AdaBoost: (Protein: Importance)
1. P06396: 0.073
2. P00740: 0.067
3. P01833: 0.06
4. P25311: 0.06
5. P00748: 0.06
6. P02775: 0.053
7. P02760: 0.047
8. P23083: 0.04
9. P04430: 0.04
10. P00488: 0.04
11. P00734: 0.04
12. P02748: 0.027
13. P02649: 0.027
14. P05109: 0.027
15. P18428: 0.027
16. P02790: 0.027
17. P04004: 0.027
18. Q9BXR6: 0.027
19. A0A075B6K4: 0.027
20. A0A0B4J1Y9: 0.027
21. P0102

In [38]:
# Intersect the top-feature lists of Random Forest and AdaBoost and report, for
# every shared feature, its protein, rank and importance in each model.
common_features_info = find_common_features(top_features_with_names, 'RF', 'AdaBoost')

print("Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:")
for feature, info in common_features_info.items():
    # Build the Random Forest half and the AdaBoost half separately, then print
    # them as a single line.
    rf_summary = (
        f"{feature}: Protein - {info['RF_Protein']}, "
        f"RandomForest Position - {info['RF_Position']}, "
        f"Importance - {info['RF_Importance']}; "
    )
    ada_summary = (
        f"Protein - {info['AdaBoost_Protein']}, "
        f"AdaBoost Position - {info['AdaBoost_Position']}, "
        f"Importance - {info['AdaBoost_Importance']}"
    )
    print(rf_summary + ada_summary)


Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:
89: Protein - Q9BXR6, RandomForest Position - 28, Importance - 0.009; Protein - Q9BXR6, AdaBoost Position - 18, Importance - 0.027
6: Protein - P01833, RandomForest Position - 2, Importance - 0.035; Protein - P01833, AdaBoost Position - 3, Importance - 0.06
70: Protein - P00488, RandomForest Position - 21, Importance - 0.012; Protein - P00488, AdaBoost Position - 10, Importance - 0.04
71: Protein - P00734, RandomForest Position - 22, Importance - 0.012; Protein - P00734, AdaBoost Position - 11, Importance - 0.04
10: Protein - P02748, RandomForest Position - 17, Importance - 0.015; Protein - P02748, AdaBoost Position - 12, Importance - 0.027
140: Protein - P00740, RandomForest Position - 14, Importance - 0.017; Protein - P00740, AdaBoost Position - 2, Importance - 0.067
12: Protein - P01034, RandomForest Position - 3, Importance - 0.032; Protein - P01034, AdaBoost Position - 27, I

### 2.2 Testing

In [39]:
# Evaluate every classifier on the held-out test set using the best
# hyper-parameters found during cross-validation. `models` and the keys of
# `best_params` are paired by position.
for model, names in zip(models, best_params.keys()):
    print("Predicting with model %s" % names)
    # Apply the tuned hyper-parameters, then show the resulting estimator.
    tuned_params = best_params[names]
    model = model.set_params(**tuned_params)
    print(model)
    # predict_ml_model / print_ml_metrics are project helpers; the call receives
    # both the training and the test split and returns predictions,
    # probabilities and a confusion matrix.
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Accuracy: 0.518987
AUC: 0.500000
F1 Macro: nan
F1 pheno2: 0.683333
F1 pheno1: nan
Recall pheno2: 1.000000
Recall pheno1: 0.000000
Precision pheno2: 0.518987
Precision pheno1: 0.000000
--------------------------------------
Predicting with model SVC
SVC(C=5, probability=True, random_state=42)
Accuracy: 0.835443
AUC: 0.905006
F1 Macro: 0.835020
F1 pheno2: 0.843373
F1 pheno1: 0.826667
Recall pheno2: 0.853659
Recall pheno1: 0.815789
Precision pheno2: 0.833333
Precision pheno1: 0.837838
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.001, max_iter=10000, random_state=42)
Accuracy: 0.822785
AUC: 0.892811
F1 Macro: 0.821382
F1 pheno2: 0.837209
F1 pheno1: 0.805556
Recall pheno2: 0.878049
Recall pheno1: 0.763158
Precision pheno2: 0.800000
Precision pheno1: 0.852941
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_neighbors=30, weigh

  f1_pheno1 = 2 * precision_pheno0 * recall_pheno1 / (precision_pheno0 + recall_pheno1)


Accuracy: 0.822785
AUC: 0.917202
F1 Macro: 0.822529
F1 pheno2: 0.829268
F1 pheno1: 0.815789
Recall pheno2: 0.829268
Recall pheno1: 0.815789
Precision pheno2: 0.829268
Precision pheno1: 0.815789
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(learning_rate=0.1, n_estimators=150, random_state=42)
Accuracy: 0.797468
AUC: 0.895379
F1 Macro: 0.797436
F1 pheno2: 0.800000
F1 pheno1: 0.794872
Recall pheno2: 0.780488
Recall pheno1: 0.815789
Precision pheno2: 0.820513
Precision pheno1: 0.775000
--------------------------------------
