In [1]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

# cust_functions folder
from cust_functions.training import *
from cust_functions.graph_networks import *
from cust_functions.graph_creation import *

# Set random seed
SEED = 42
set_seed(SEED)


## 1.0 AKI Data

In [2]:
# Load data
input_data_qm = pd.read_csv("aki_data/test_qm.csv")
design_matrix = pd.read_csv("aki_data/design_matrix.tsv", sep="\t")

# Preprocess data
input_data_preprocessed = input_data_qm.fillna(0)
input_data = input_data_preprocessed.drop(['Protein'], axis=1)
design_matrix = design_matrix.replace(1, 0)
design_matrix = design_matrix.replace(2, 1)

# Split data into train and test
X_test = input_data.loc[:, ~input_data.columns.str.contains('M2012')].transpose()
X_train = input_data.loc[:, input_data.columns.str.contains('M2012')].transpose()
y_test = design_matrix['group'][~design_matrix['sample'].str.contains('M2012')]
y_train = design_matrix['group'][design_matrix['sample'].str.contains('M2012')]


### 1.1 Cross Validation

In [3]:
models = [DummyClassifier(random_state = SEED),
          SVC(random_state = SEED, probability = True),
          LogisticRegression(random_state = SEED, max_iter = 10000),
          KNeighborsClassifier(),
          RandomForestClassifier(random_state = SEED),
          AdaBoostClassifier(random_state = SEED),]

grid = {"Dummy": {"strategy": ["most_frequent"]},
        "SVC": {"C": [0.1, 1, 5, 10], "kernel": ["linear", "rbf", "poly"]},
        "LR": {"C": [0.0001, 0.001, 0.01, 0.1], "penalty": ["l1", "l2", "elasticnet", "none"]},
        "KNN": {"n_neighbors": [5, 10, 15, 20], "weights": ["uniform", "distance"]},
        "RF": {"n_estimators": [50, 100, 150, 200], "max_depth": [5, 10, 15, 20], "max_features": ["auto", "sqrt", "log2"], "min_samples_leaf": [1, 2, 4]},
        "AdaBoost": {"n_estimators": [50, 100, 150, 200], "learning_rate": [0.01, 0.1, 0.5, 1]}}

scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'f1_macro': make_scorer(f1_score, average='macro'),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score)
}

best_models, best_params, fitted_models = gridsearch(models, grid, X_train, y_train, scoring, refit = 'roc_auc')


Training model Dummy


Best parameters for Dummy: {'strategy': 'most_frequent'}
Best ROC AUC score for Dummy: 0.5
Training model SVC
Best parameters for SVC: {'C': 10, 'kernel': 'rbf'}
Best ROC AUC score for SVC: 0.8212622549019608
Training model LR


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/scratch/yves/mi

Best parameters for LR: {'C': 0.0001, 'penalty': 'none'}
Best ROC AUC score for LR: 0.8107230392156863
Training model KNN


Traceback (most recent call last):
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 136, in __call__
    score = scorer._score(
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 353, in _score
    y_pred = method_caller(estimator, "predict", X)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/utils/_response.py", line 85, in _get_response_values
    y_pred = prediction_method(X)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/neighbors/_classification.py", line 246, in predict
    if self._fit_method == "brute" and ArgKminClassMode.is_usable_for(
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages

Best parameters for KNN: {'n_neighbors': 10, 'weights': 'distance'}
Best ROC AUC score for KNN: 0.750796568627451
Training model RF


240 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
34 fits failed with the following error:
Traceback (most recent call last):
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packa

Best parameters for RF: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'n_estimators': 150}
Best ROC AUC score for RF: 0.9552083333333332
Training model AdaBoost
Best parameters for AdaBoost: {'learning_rate': 1, 'n_estimators': 150}
Best ROC AUC score for AdaBoost: 0.9094975490196078


In [4]:
for metric in best_models:
    best_models[metric].sort_values("Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{best_models[metric]}")


Best models based on roc_auc:
      Model  Best Score                                        Best Params
4        RF    0.955208  {'max_depth': 5, 'max_features': 'sqrt', 'min_...
5  AdaBoost    0.909498          {'learning_rate': 1, 'n_estimators': 150}
1       SVC    0.821262                         {'C': 10, 'kernel': 'rbf'}
2        LR    0.810723                   {'C': 0.0001, 'penalty': 'none'}
3       KNN    0.750797         {'n_neighbors': 10, 'weights': 'distance'}
0     Dummy    0.500000                      {'strategy': 'most_frequent'}

Best models based on accuracy:
      Model  Best Score                                        Best Params
4        RF    0.772414  {'max_depth': 5, 'max_features': 'sqrt', 'min_...
5  AdaBoost    0.759360          {'learning_rate': 1, 'n_estimators': 150}
3       KNN    0.730049         {'n_neighbors': 10, 'weights': 'distance'}
1       SVC    0.709606                         {'C': 10, 'kernel': 'rbf'}
2        LR    0.702956              

In [5]:
# Get top features of AdaBoost and Random Forest
top_features_with_names = extract_top_features(fitted_models, X_train, input_data_preprocessed)

for model_name, features in top_features_with_names.items():
    print(f"Top features for {model_name}:")
    for rank, feature in enumerate(features):
        print(f"{rank + 1}. {feature[0]} ({feature[1]}): {feature[2]}")
    print("--------------------")

Top features for RF:
1. 232 (P59665): 0.029
2. 462 (P80188): 0.029
3. 223 (P19320): 0.028
4. 271 (P13987): 0.023
5. 148 (P08571): 0.022
6. 163 (P61769): 0.021
7. 153 (P16070): 0.021
8. 536 (P08637): 0.02
9. 16 (P01833): 0.018
10. 78 (P05362): 0.017
11. 60 (Q08380): 0.015
12. 341 (P02654): 0.015
13. 382 (Q6EMK4): 0.012
14. 339 (Q06033): 0.012
15. 85 (Q9Y6R7): 0.012
16. 238 (P00995): 0.012
17. 191 (P15291): 0.011
18. 19 (P19823): 0.01
19. 41 (P06727): 0.01
20. 56 (P43652): 0.01
21. 356 (Q96PD5): 0.01
22. 321 (P10451): 0.009
23. 347 (P08493): 0.009
24. 35 (P61626): 0.009
25. 139 (Q86VB7): 0.009
26. 63 (P19827): 0.009
27. 425 (P27169): 0.009
28. 439 (P22352): 0.009
29. 397 (P14151): 0.009
30. 413 (P35858): 0.008
--------------------
Top features for AdaBoost:
1. 413 (P35858): 0.073
2. 426 (P01880): 0.06
3. 232 (P59665): 0.047
4. 236 (P01625): 0.04
5. 35 (P61626): 0.033
6. 41 (P06727): 0.033
7. 253 (P00915): 0.033
8. 339 (Q06033): 0.033
9. 42 (P04114): 0.027
10. 148 (P08571): 0.027
11. 238 

In [6]:
# Find common features between AdaBoost and Random Forest
common_features_info = find_common_features(top_features_with_names, 'RF', 'AdaBoost')

print("Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:")
for feature, info in common_features_info.items():
    print(f"{feature}: Protein - {info['RF_Protein']}, RandomForest Position - {info['RF_Position']}, Importance - {info['RF_Importance']}; "
          f"Protein - {info['AdaBoost_Protein']}, AdaBoost Position - {info['AdaBoost_Position']}, Importance - {info['AdaBoost_Importance']}")


Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:
35: Protein - P61626, RandomForest Position - 24, Importance - 0.009; Protein - P61626, AdaBoost Position - 5, Importance - 0.033
356: Protein - Q96PD5, RandomForest Position - 21, Importance - 0.01; Protein - Q96PD5, AdaBoost Position - 24, Importance - 0.013
232: Protein - P59665, RandomForest Position - 1, Importance - 0.029; Protein - P59665, AdaBoost Position - 3, Importance - 0.047
41: Protein - P06727, RandomForest Position - 19, Importance - 0.01; Protein - P06727, AdaBoost Position - 6, Importance - 0.033
78: Protein - P05362, RandomForest Position - 10, Importance - 0.017; Protein - P05362, AdaBoost Position - 12, Importance - 0.02
462: Protein - P80188, RandomForest Position - 2, Importance - 0.029; Protein - P80188, AdaBoost Position - 25, Importance - 0.013
238: Protein - P00995, RandomForest Position - 16, Importance - 0.012; Protein - P00995, AdaBoost Position - 11

### 1.2 Testing

In [27]:
for model, names in zip(models, best_params.keys()):
    print("Predicting with model %s" % names)
    # print the model params
    model = model.set_params(**best_params[names])
    print(model)
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Accuracy: 0.518987
AUC: 0.500000
F1 Macro: nan
F1 pheno1: 0.683333
F1 pheno0: nan
Recall pheno1: 1.000000
Recall pheno0: 0.000000
Precision pheno1: 0.518987
Precision pheno0: 0.000000
--------------------------------------
Predicting with model SVC
SVC(C=5, probability=True, random_state=42)
Accuracy: 0.835443
AUC: 0.905006
F1 Macro: 0.835020
F1 pheno1: 0.843373
F1 pheno0: 0.826667
Recall pheno1: 0.853659
Recall pheno0: 0.815789
Precision pheno1: 0.833333
Precision pheno0: 0.837838
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.001, max_iter=10000, random_state=42)
Accuracy: 0.822785
AUC: 0.892811
F1 Macro: 0.821382
F1 pheno1: 0.837209
F1 pheno0: 0.805556
Recall pheno1: 0.878049
Recall pheno0: 0.763158
Precision pheno1: 0.800000
Precision pheno0: 0.852941
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_neighbors=20, weigh

  f1_pheno1 = 2 * precision_pheno0 * recall_pheno1 / (precision_pheno0 + recall_pheno1)


Accuracy: 0.822785
AUC: 0.917202
F1 Macro: 0.822529
F1 pheno1: 0.829268
F1 pheno0: 0.815789
Recall pheno1: 0.829268
Recall pheno0: 0.815789
Precision pheno1: 0.829268
Precision pheno0: 0.815789
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(learning_rate=0.1, n_estimators=150, random_state=42)
Accuracy: 0.797468
AUC: 0.895379
F1 Macro: 0.797436
F1 pheno1: 0.800000
F1 pheno0: 0.794872
Recall pheno1: 0.780488
Recall pheno0: 0.815789
Precision pheno1: 0.820513
Precision pheno0: 0.775000
--------------------------------------


## 2.0 COVID-19 Data

In [17]:
covid_train_X = pd.read_csv('covid_data/covid_train_qm.csv', index_col=0)
covid_test_X = pd.read_csv('covid_data/covid_test_qm.csv', index_col=0)
covid_train_y = pd.read_csv('covid_data/covid_train_design_qm.csv', index_col=0)
covid_test_y = pd.read_csv('covid_data/covid_test_design_qm.csv', index_col=0)

# Reshape data
X_train = covid_train_X.drop(['Protein'], axis=1).transpose()
X_test = covid_test_X.drop(['Protein'], axis=1).transpose()
y_train = covid_train_y['group']
y_test = covid_test_y['group']

### 2.1 Cross Validation

In [18]:
models = [DummyClassifier(random_state = SEED),
          SVC(random_state = SEED, probability = True),
          LogisticRegression(random_state = SEED, max_iter = 10000),
          KNeighborsClassifier(),
          RandomForestClassifier(random_state = SEED),
          AdaBoostClassifier(random_state = SEED),]

grid = {"Dummy": {"strategy": ["most_frequent"]},
        "SVC": {"C": [0.1, 1, 5, 10], "kernel": ["linear", "rbf", "poly"]},
        "LR": {"C": [0.0001, 0.001, 0.01, 0.1], "penalty": ["l1", "l2", "elasticnet", "none"]},
        "KNN": {"n_neighbors": [5, 10, 20, 30], "weights": ["uniform", "distance"]},
        "RF": {"n_estimators": [50, 100, 150, 200], "max_depth": [5, 10, 15, 20], "max_features": ["auto", "sqrt", "log2"]},
        "AdaBoost": {"n_estimators": [50, 100, 150, 200], "learning_rate": [0.01, 0.1, 0.5, 1]}}

scoring = {
    'roc_auc': 'roc_auc',
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'f1_macro': make_scorer(f1_score, average='macro'),
    'recall': make_scorer(recall_score),
    'precision': make_scorer(precision_score)
}

best_models, best_params, fitted_models = gridsearch(models, grid, X_train, y_train, scoring, refit = 'roc_auc')


Training model Dummy
Best parameters for Dummy: {'strategy': 'most_frequent'}
Best ROC AUC score for Dummy: 0.5
Training model SVC
Best parameters for SVC: {'C': 5, 'kernel': 'rbf'}
Best ROC AUC score for SVC: 0.916159766159766
Training model LR


40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/linear_model/_logistic.py", line 1168, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/scratch/yves/mi

Best parameters for LR: {'C': 0.001, 'penalty': 'l2'}
Best ROC AUC score for LR: 0.9089267792392792
Training model KNN
Best parameters for KNN: {'n_neighbors': 30, 'weights': 'distance'}
Best ROC AUC score for KNN: 0.9049956409331408
Training model RF


Traceback (most recent call last):
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 459, in _score
    y_pred = method_caller(clf, "decision_function", X, pos_label=pos_label)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/metrics/_scorer.py", line 86, in _cached_call
    result, _ = _get_response_values(
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/utils/_response.py", line 73, in _get_response_values
    prediction_method = _check_response_method(estimator, response_method)
  File "/scratch/yves/miniconda/envs/pytorch_cuda/lib/python3.8/site-packages/sklearn/utils/validation.py", line 1940, in _check_response_method
    raise AttributeError(
AttributeError: KNeighborsClassifier has none of the following attributes: decision_function.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File

Best parameters for RF: {'max_depth': 10, 'max_features': 'log2', 'n_estimators': 150}
Best ROC AUC score for RF: 0.949430893180893
Training model AdaBoost
Best parameters for AdaBoost: {'learning_rate': 0.1, 'n_estimators': 150}
Best ROC AUC score for AdaBoost: 0.9271309130684131


In [19]:
for metric in best_models:
    best_models[metric].sort_values("Best Score", ascending=False, inplace=True)
    print(f"\nBest models based on {metric}:\n{best_models[metric]}")


Best models based on roc_auc:
      Model  Best Score                                        Best Params
4        RF    0.949431  {'max_depth': 10, 'max_features': 'log2', 'n_e...
5  AdaBoost    0.927131        {'learning_rate': 0.1, 'n_estimators': 150}
1       SVC    0.916160                          {'C': 5, 'kernel': 'rbf'}
2        LR    0.908927                      {'C': 0.001, 'penalty': 'l2'}
3       KNN    0.904996         {'n_neighbors': 30, 'weights': 'distance'}
0     Dummy    0.500000                      {'strategy': 'most_frequent'}

Best models based on accuracy:
      Model  Best Score                                        Best Params
4        RF    0.869803  {'max_depth': 10, 'max_features': 'log2', 'n_e...
5  AdaBoost    0.854240        {'learning_rate': 0.1, 'n_estimators': 150}
3       KNN    0.827390         {'n_neighbors': 30, 'weights': 'distance'}
1       SVC    0.826001                          {'C': 5, 'kernel': 'rbf'}
2        LR    0.821706              

In [20]:
# Get top features of AdaBoost and Random Forest
top_features_with_names = extract_top_features(fitted_models, X_train, covid_train_X)

for model_name, features in top_features_with_names.items():
    print(f"Top features for {model_name}: (Protein: Importance)")
    for rank, feature in enumerate(features):
        print(f"{rank + 1}. {feature[1]}: {feature[2]}")
    print("--------------------")

Top features for RF: (Protein: Importance)
1. P02790: 0.039
2. P01833: 0.035
3. P01034: 0.032
4. P06396: 0.028
5. P02760: 0.028
6. P02741: 0.027
7. P02776: 0.025
8. P36955: 0.024
9. P25311: 0.023
10. P05109: 0.022
11. P18428: 0.022
12. P0DJI9: 0.02
13. P02775: 0.02
14. P00740: 0.017
15. Q14624: 0.017
16. P02649: 0.015
17. P02748: 0.015
18. P02750: 0.014
19. P07996: 0.014
20. P02671: 0.013
21. P00488: 0.012
22. P00734: 0.012
23. P07998: 0.01
24. P02747: 0.01
25. P02745: 0.01
26. P08571: 0.009
27. P01019: 0.009
28. Q9BXR6: 0.009
29. P01031: 0.009
30. P00748: 0.009
--------------------
Top features for AdaBoost: (Protein: Importance)
1. P06396: 0.073
2. P00740: 0.067
3. P01833: 0.06
4. P25311: 0.06
5. P00748: 0.06
6. P02775: 0.053
7. P02760: 0.047
8. P23083: 0.04
9. P04430: 0.04
10. P00488: 0.04
11. P00734: 0.04
12. P02748: 0.027
13. P02649: 0.027
14. P05109: 0.027
15. P18428: 0.027
16. P02790: 0.027
17. P04004: 0.027
18. Q9BXR6: 0.027
19. A0A075B6K4: 0.027
20. A0A0B4J1Y9: 0.027
21. P0102

In [21]:
# Find common features between AdaBoost and Random Forest
common_features_info = find_common_features(top_features_with_names, 'RF', 'AdaBoost')

print("Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:")
for feature, info in common_features_info.items():
    print(f"{feature}: Protein - {info['RF_Protein']}, RandomForest Position - {info['RF_Position']}, Importance - {info['RF_Importance']}; "
          f"Protein - {info['AdaBoost_Protein']}, AdaBoost Position - {info['AdaBoost_Position']}, Importance - {info['AdaBoost_Importance']}")


Common top features between RandomForest and AdaBoost with positions, importance values, and protein names:
89: Protein - Q9BXR6, RandomForest Position - 28, Importance - 0.009; Protein - Q9BXR6, AdaBoost Position - 18, Importance - 0.027
6: Protein - P01833, RandomForest Position - 2, Importance - 0.035; Protein - P01833, AdaBoost Position - 3, Importance - 0.06
70: Protein - P00488, RandomForest Position - 21, Importance - 0.012; Protein - P00488, AdaBoost Position - 10, Importance - 0.04
71: Protein - P00734, RandomForest Position - 22, Importance - 0.012; Protein - P00734, AdaBoost Position - 11, Importance - 0.04
10: Protein - P02748, RandomForest Position - 17, Importance - 0.015; Protein - P02748, AdaBoost Position - 12, Importance - 0.027
140: Protein - P00740, RandomForest Position - 14, Importance - 0.017; Protein - P00740, AdaBoost Position - 2, Importance - 0.067
12: Protein - P01034, RandomForest Position - 3, Importance - 0.032; Protein - P01034, AdaBoost Position - 27, I

### 2.2 Testing

In [15]:
for model, names in zip(models, best_params.keys()):
    print("Predicting with model %s" % names)
    # print the model params
    model = model.set_params(**best_params[names])
    print(model)
    y_pred, y_pred_proba, cm = predict_ml_model(model, X_train, y_train, X_test, y_test)
    print_ml_metrics(cm, y_test, y_pred_proba)
    print("--------------------------------------")

Predicting with model Dummy
DummyClassifier(random_state=42, strategy='most_frequent')
Accuracy: 0.518987
AUC: 0.500000
F1 Macro: nan
F1 pheno1: 0.683333
F1 pheno0: nan
Recall pheno1: 1.000000
Recall pheno0: 0.000000
Precision pheno1: 0.518987
Precision pheno0: 0.000000
--------------------------------------
Predicting with model SVC
SVC(C=5, probability=True, random_state=42)
Accuracy: 0.835443
AUC: 0.905006
F1 Macro: 0.835020
F1 pheno1: 0.843373
F1 pheno0: 0.826667
Recall pheno1: 0.853659
Recall pheno0: 0.815789
Precision pheno1: 0.833333
Precision pheno0: 0.837838
--------------------------------------
Predicting with model LR
LogisticRegression(C=0.001, max_iter=10000, random_state=42)
Accuracy: 0.822785
AUC: 0.892811
F1 Macro: 0.821382
F1 pheno1: 0.837209
F1 pheno0: 0.805556
Recall pheno1: 0.878049
Recall pheno0: 0.763158
Precision pheno1: 0.800000
Precision pheno0: 0.852941
--------------------------------------
Predicting with model KNN
KNeighborsClassifier(n_neighbors=30, weigh

  f1_pheno1 = 2 * precision_pheno0 * recall_pheno1 / (precision_pheno0 + recall_pheno1)


Accuracy: 0.886076
AUC: 0.945122
F1 Macro: 0.886003
F1 pheno1: 0.888889
F1 pheno0: 0.883117
Recall pheno1: 0.878049
Recall pheno0: 0.894737
Precision pheno1: 0.900000
Precision pheno0: 0.871795
--------------------------------------
Predicting with model AdaBoost
AdaBoostClassifier(learning_rate=0.1, n_estimators=150, random_state=42)
Accuracy: 0.797468
AUC: 0.895379
F1 Macro: 0.797436
F1 pheno1: 0.800000
F1 pheno0: 0.794872
Recall pheno1: 0.780488
Recall pheno0: 0.815789
Precision pheno1: 0.820513
Precision pheno0: 0.775000
--------------------------------------
