# Library import

In [None]:
from peptdeep.pretrained_models import ModelManager
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import RandomOverSampler

import sys
# sys.path.append("C:/Users/Walraff/OneDrive - Universite de Liege/Documents/Ulg/Master2/TFE/")
import utils

import warnings
warnings.simplefilter("ignore", category=FutureWarning)

In [2]:
data_path = "C:/Users/Walraff/OneDrive - Universite de Liege/Documents/Ulg/Master2/TFE/data"
original_df = pd.read_csv(f'{data_path}/final_status_SPARE.csv')
original_df

Unnamed: 0,ProteinName_SPARE,Peptide_SPARE,Status_SPARE
0,sp|P02751|FINC_HUMAN,VDVIPVNLPGEHGQR,bon
1,sp|P02751|FINC_HUMAN,STTPDITGYR,bon
2,sp|P02751|FINC_HUMAN,SYTITGLQPGTDYK,bon
3,sp|P02751|FINC_HUMAN,IYLYTLNDNAR,bon
4,sp|P04114|APOB_HUMAN,TGISPLALIK,bon
...,...,...,...
150,sp|P02743|SAMP_HUMAN,VGEYSLYIGR,bon
151,sp|P04004|VTNC_HUMAN,GQYCYELDEK,mauvais
152,sp|P04004|VTNC_HUMAN,FEDGVLDPDYPR,bon
153,sp|P04004|VTNC_HUMAN,DWHGVPGQVDAAMAGR,bon


# Functions

In [None]:
def encode_ccs_feature(dataframe, model_manager):
    """
    Encodes CCS features for a given DataFrame using a pretrained CCS encoder.

    Args:
        dataframe (pd.DataFrame): Input DataFrame containing peptide sequences and required metadata.
        model_manager (ModelManager): Manager object providing access to the pretrained CCS model.

    Returns:
        np.ndarray: Numpy array of encoded peptide features (shape: [num_samples, embedding_dim]).
    """
    encoder = model_manager.ccs_model.model.ccs_encoder
    encoded_features = []

    for i in range(len(dataframe)):
        row = dataframe.iloc[[i]]

        # Extract amino acid index features
        aa_indices = model_manager.ccs_model._get_26aa_indice_features(row)

        # Extract modification features
        mod_x = model_manager.ccs_model._get_mod_features(row)

        # Extract charge (required for CCS model input)
        charge = torch.tensor(row["charge"].values)

        # Encode the peptide representation using the encoder
        with torch.no_grad():
            row_encoded_features = encoder(aa_indices, mod_x, charge)

        # Store the encoded representation as a NumPy array
        encoded_features.append(row_encoded_features.squeeze(0).numpy())

    encoded_features = np.array(encoded_features)
    return encoded_features


In [None]:
# Creating a dataframe with the sequences and labels
df = pd.DataFrame()
df["sequence"] = original_df["Peptide_SPARE"]
# column required for CCS model
df["mods"] = ''
df["mod_sites"] = ''
df["nAA"] = df.sequence.str.len()
df["charge"] = 0
df["quantotypic"] = original_df.apply(lambda row: 0 if row['Status_SPARE'] =='bon' else 1, axis=1)

positive_df = df[df['quantotypic'] == 0]
negative_df = df[df['quantotypic'] == 1]

class_counts = df['quantotypic'].value_counts()
max_len = df['sequence'].str.len().max()
pos_weight = class_counts[0] / class_counts[1]
print(class_counts)
print(pos_weight)

quantotypic
0    117
1     38
Name: count, dtype: int64
3.0789473684210527


# Random Forest

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
count = 1

# Trying with and without weighting
for weighting in [False, True]:
    temp_result_list = []
    # loop over folds
    for fold, (train_idx, test_idx) in enumerate(kf.split(df['sequence'], df['quantotypic'])):
        # Split the data into training and test sets
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        print(f"Fold: {fold}")
        print()

        model_mgr = ModelManager()

        # Prepare training and test data
        X_train = encode_ccs_feature(train_df, model_mgr)
        y_train = train_df['quantotypic'].values

        X_test = encode_ccs_feature(test_df, model_mgr)
        y_test = test_df['quantotypic'].values

        # Param grid for RandomForestClassifier
        param_grid = {
            'max_depth': [None, 1, 3, 5, 6],
            'max_features': ['sqrt', 'log2', None], # None => max feature = n features
        }

        # If weighting is True, add class_weight to the param_grid
        if weighting:
            rf = RandomForestClassifier(n_estimators=1000, random_state=42, class_weight='balanced')
        else:
            rf = RandomForestClassifier(n_estimators=1000, random_state=42)

        # Train the model using GridSearchCV
        grid_search_rf = GridSearchCV(rf, param_grid, cv=5, scoring='average_precision', verbose=4, n_jobs=-1)
        grid_search_rf.fit(X_train, y_train)

        print(grid_search_rf.best_params_)
        print(grid_search_rf.best_score_)

        # Retrieve the best parameters
        best_params = grid_search_rf.best_params_

        # Initialize a new RandomForestClassifier with the best parameters
        if weighting:
            best_rf = RandomForestClassifier(n_estimators=1000,
                **best_params,
                random_state=42,
                class_weight='balanced'  # Include any additional arguments you used in GridSearchCV
            )
        else:
            best_rf = RandomForestClassifier(n_estimators=1000,
                **best_params,
                random_state=42  # Include any additional arguments you used in GridSearchCV
            )

        # Retrain the model on the entire training set
        best_rf.fit(X_train, y_train)

        # Predict on the test set
        y_pred = best_rf.predict(X_test)

        # Evaluate the model using the provided eval function
        accuracy, precision, recall, f1, roc_auc, pr_auc = utils.eval(y_test, y_pred)

        metrics = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "roc_auc": roc_auc,
            "pr_auc": pr_auc
        }

        temp_result_list.append(metrics)

        # Print the results
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"ROC AUC: {roc_auc:.4f}")
        print(f"PR AUC: {pr_auc:.4f}")

    accuracy_list = [result['accuracy'] for result in temp_result_list]
    precision_list = [result['precision'] for result in temp_result_list]
    recall_list = [result['recall'] for result in temp_result_list]
    f1_list = [result['f1'] for result in temp_result_list]
    roc_auc_list = [result['roc_auc'] for result in temp_result_list]
    pr_auc_list = [result['pr_auc'] for result in temp_result_list]

    metrics_summary = {
        "accuracy_mean": np.mean(accuracy_list),
        "accuracy_std": np.std(accuracy_list),
        "precision_mean": np.mean(precision_list),
        "precision_std": np.std(precision_list),
        "recall_mean": np.mean(recall_list),
        "recall_std": np.std(recall_list),
        "f1_mean": np.mean(f1_list),
        "f1_std": np.std(f1_list),
        "roc_auc_mean": np.mean(roc_auc_list),
        "roc_auc_std": np.std(roc_auc_list),
        "pr_auc_mean": np.mean(pr_auc_list),
        "pr_auc_std": np.std(pr_auc_list)
    }

    dict_save = {
        "weighting": weighting,
        "oversampling": False,
        "metrics_summary": metrics_summary
    }

    path = "C:/Users/Walraff/OneDrive - Universite de Liege/Documents/Ulg/Master2/TFE/Results/ValidProtocol"
    utils.write_into_json(dict_save, f"{path}/CCS_RF_experiment_{count}.json")
    count+=1

Fold: 0

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'max_depth': 1, 'max_features': 'log2'}
0.5207055871258658


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.7742
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.5000
PR AUC: 0.2258
Fold: 1

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'max_depth': 1, 'max_features': 'log2'}
0.47665149571854465
Accuracy: 0.8065
Precision: 1.0000
Recall: 0.1429
F1 Score: 0.2500
ROC AUC: 0.5714
PR AUC: 0.3364
Fold: 2

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'max_depth': 5, 'max_features': 'sqrt'}
0.47116841547871974
Accuracy: 0.7742
Precision: 0.6667
Recall: 0.2500
F1 Score: 0.3636
ROC AUC: 0.6033
PR AUC: 0.3602
Fold: 3

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'max_depth': 3, 'max_features': 'log2'}
0.4153400993571272
Accuracy: 0.7742
Precision: 1.0000
Recall: 0.1250
F1 Score: 0.2222
ROC AUC: 0.5625
PR AUC: 0.3508
Fold: 4

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'max_depth': None, 'max_features': None}
0.5089559691989359
Accuracy: 0.6774
Precision: 0.3333
Recall: 0.2500
F1 Score: 0.2857
ROC AUC: 0.5380
PR

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# New loop with oversampling using ImbPipeline
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

temp_result_list = []
# loop over folds
for fold, (train_idx, test_idx) in enumerate(kf.split(df['sequence'], df['quantotypic'])):
    # Split the data into training and test sets
    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]
    print(f"Fold: {fold}")
    print()

    # Create a ModelManager instance
    model_mgr = ModelManager()

    # Prepare training and test data
    X_train = encode_ccs_feature(train_df, model_mgr)
    y_train = train_df['quantotypic'].values

    X_test = encode_ccs_feature(test_df, model_mgr)
    y_test = test_df['quantotypic'].values

    # Param grid for RandomForestClassifier
    param_grid = {
    'classifier__max_depth': [None, 1, 3, 5, 6],
    'classifier__max_features': ['sqrt', 'log2', None],  # None = use all the features
    }

    # Creating the pipeline with oversampling and RandomForestClassifier
    pipeline = ImbPipeline([
        ('oversampler', RandomOverSampler(random_state=42)),
        ('classifier', RandomForestClassifier(
            n_estimators=1000,
            random_state=42
        ))
    ])

    # Train the model using GridSearchCV
    grid_search_rf = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5,
        scoring='average_precision',
        verbose=4,
        n_jobs=-1
    )

    grid_search_rf.fit(X_train, y_train)

    print(grid_search_rf.best_params_)
    print(grid_search_rf.best_score_)

    # Retrieve the best parameters
    best_params = grid_search_rf.best_params_

    best_rf = RandomForestClassifier(
        n_estimators=1000, 
        **{k.split('__')[1]: v for k, v in best_params.items()}, 
        random_state=42
    )

    # Retrain the model on the entire training set with oversampling
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    best_rf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = best_rf.predict(X_test)

    # Evaluate the model using the provided eval function
    accuracy, precision, recall, f1, roc_auc, pr_auc = utils.eval(y_test, y_pred)

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc
    }

    temp_result_list.append(metrics)

    # Print the results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")

accuracy_list = [result['accuracy'] for result in temp_result_list]
precision_list = [result['precision'] for result in temp_result_list]
recall_list = [result['recall'] for result in temp_result_list]
f1_list = [result['f1'] for result in temp_result_list]
roc_auc_list = [result['roc_auc'] for result in temp_result_list]
pr_auc_list = [result['pr_auc'] for result in temp_result_list]

metrics_summary = {
    "accuracy_mean": np.mean(accuracy_list),
    "accuracy_std": np.std(accuracy_list),
    "precision_mean": np.mean(precision_list),
    "precision_std": np.std(precision_list),
    "recall_mean": np.mean(recall_list),
    "recall_std": np.std(recall_list),
    "f1_mean": np.mean(f1_list),
    "f1_std": np.std(f1_list),
    "roc_auc_mean": np.mean(roc_auc_list),
    "roc_auc_std": np.std(roc_auc_list),
    "pr_auc_mean": np.mean(pr_auc_list),
    "pr_auc_std": np.std(pr_auc_list)
}

dict_save = {
    "weighting": False,
    "oversampling": True,
    "metrics_summary": metrics_summary
}

path = "C:/Users/Walraff/OneDrive - Universite de Liege/Documents/Ulg/Master2/TFE/Results/ValidProtocol"
utils.write_into_json(dict_save, f"{path}/CCS_RF_experiment_3.json")

Fold: 0

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': 'sqrt'}
0.485981244458688


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Accuracy: 0.7742
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.5000
PR AUC: 0.2258
Fold: 1

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': 'sqrt'}
0.43731086641911493
Accuracy: 0.8065
Precision: 1.0000
Recall: 0.1429
F1 Score: 0.2500
ROC AUC: 0.5714
PR AUC: 0.3364
Fold: 2

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 5, 'classifier__max_features': 'sqrt'}
0.4325548091436838
Accuracy: 0.7742
Precision: 0.6667
Recall: 0.2500
F1 Score: 0.3636
ROC AUC: 0.6033
PR AUC: 0.3602
Fold: 3

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': None}
0.3923907030972248
Accuracy: 0.7742
Precision: 0.6667
Recall: 0.2500
F1 Score: 0.3636
ROC AUC: 0.6033
PR AUC: 0.3602
Fold: 4

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': None, 'classifier__max_features': 'log2'}
0.4928408

# XGBoost

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
count = 1

# Trying with and without weighting
for weighting in [False, True]:
    temp_result_list = []

    # loop over folds
    for fold, (train_idx, test_idx) in enumerate(kf.split(df['sequence'], df['quantotypic'])):
        # Split the data into training and test sets
        train_df = df.iloc[train_idx]
        test_df = df.iloc[test_idx]
        print(f"Fold: {fold}")
        print()

        # Create a ModelManager instance
        model_mgr = ModelManager()

        # Prepare training and test data
        X_train = encode_ccs_feature(train_df, model_mgr)
        y_train = train_df['quantotypic'].values

        X_test = encode_ccs_feature(test_df, model_mgr)
        y_test = test_df['quantotypic'].values

        # Param grid for XGBClassifier
        param_grid = {
            'max_depth': [None, 1, 3, 5, 6],              # Maximum tree depth, max 7
            'learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4],
        }

        # If weighting is True, add scale_pos_weight to the param_grid
        if weighting:
            model = XGBClassifier(
                n_estimators=1000,
                scale_pos_weight=pos_weight,  # Compensation du déséquilibre
                random_state=42
            )
        else:
            model = XGBClassifier(
                n_estimators=1000,
                random_state=42
            )

        # Train the model using GridSearchCV
        grid_search = GridSearchCV(
            model, param_grid, cv=5, scoring="average_precision", verbose=4, n_jobs=-1
        )
        grid_search_rf.fit(X_train, y_train)

        print(grid_search_rf.best_params_)
        print(grid_search_rf.best_score_)

        # Retrieve the best parameters
        best_params = grid_search_rf.best_params_

        # Initialize a new RandomForestClassifier with the best parameters
        if weighting:
            best_model = XGBClassifier(
                n_estimators=1000,
                scale_pos_weight=pos_weight,
                random_state=42,
                **best_params
            )
        else:
            best_model = XGBClassifier(
                n_estimators=1000,
                random_state=42,
                **best_params
            )

        # Retrain the model on the entire training set
        best_rf.fit(X_train, y_train)

        # Predict on the test set
        y_pred = best_rf.predict(X_test)

        # Evaluate the model using the provided eval function
        accuracy, precision, recall, f1, roc_auc, pr_auc = utils.eval(y_test, y_pred)

        metrics = {
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "roc_auc": roc_auc,
            "pr_auc": pr_auc
        }

        temp_result_list.append(metrics)

        # Print the results
        print(f"Accuracy: {accuracy:.4f}")
        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")
        print(f"ROC AUC: {roc_auc:.4f}")
        print(f"PR AUC: {pr_auc:.4f}")

    accuracy_list = [result['accuracy'] for result in temp_result_list]
    precision_list = [result['precision'] for result in temp_result_list]
    recall_list = [result['recall'] for result in temp_result_list]
    f1_list = [result['f1'] for result in temp_result_list]
    roc_auc_list = [result['roc_auc'] for result in temp_result_list]
    pr_auc_list = [result['pr_auc'] for result in temp_result_list]

    metrics_summary = {
        "accuracy_mean": np.mean(accuracy_list),
        "accuracy_std": np.std(accuracy_list),
        "precision_mean": np.mean(precision_list),
        "precision_std": np.std(precision_list),
        "recall_mean": np.mean(recall_list),
        "recall_std": np.std(recall_list),
        "f1_mean": np.mean(f1_list),
        "f1_std": np.std(f1_list),
        "roc_auc_mean": np.mean(roc_auc_list),
        "roc_auc_std": np.std(roc_auc_list),
        "pr_auc_mean": np.mean(pr_auc_list),
        "pr_auc_std": np.std(pr_auc_list)
    }

    dict_save = {
        "weighting": weighting,
        "oversampling": False,
        "metrics_summary": metrics_summary
    }

    path = "C:/Users/Walraff/OneDrive - Universite de Liege/Documents/Ulg/Master2/TFE/Results/ValidProtocol"
    utils.write_into_json(dict_save, f"{path}/CCS_XGB_experiment_{count}.json")
    count+=1

Fold: 0

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': 'sqrt'}
0.485981244458688
Accuracy: 0.7419
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.4792
PR AUC: 0.2258
Fold: 1

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': 'sqrt'}
0.43731086641911493
Accuracy: 0.7742
Precision: 0.5000
Recall: 0.1429
F1 Score: 0.2222
ROC AUC: 0.5506
PR AUC: 0.2650
Fold: 2

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 5, 'classifier__max_features': 'sqrt'}
0.4325548091436838
Accuracy: 0.7742
Precision: 0.6667
Recall: 0.2500
F1 Score: 0.3636
ROC AUC: 0.6033
PR AUC: 0.3602
Fold: 3

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': None}
0.3923907030972248
Accuracy: 0.7742
Precision: 1.0000
Recall: 0.1250
F1 Score: 0.2222
ROC AUC: 0.5625
PR AUC: 0

In [None]:
# New loop with oversampling using ImbPipeline
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

temp_result_list = []
# loop over folds
for fold, (train_idx, test_idx) in enumerate(kf.split(df['sequence'], df['quantotypic'])):
    # Split the data into training and test sets
    train_df = df.iloc[train_idx]
    test_df = df.iloc[test_idx]
    print(f"Fold: {fold}")
    print()

    # Create a ModelManager instance
    model_mgr = ModelManager()

    # Prepare training and test data
    X_train = encode_ccs_feature(train_df, model_mgr)
    y_train = train_df['quantotypic'].values

    X_test = encode_ccs_feature(test_df, model_mgr)
    y_test = test_df['quantotypic'].values

    # Param grid for RandomForestClassifier
    param_grid = {
        'classifier__max_depth': [None, 1, 3, 5, 6], 
        'classifier__learning_rate': [0.01, 0.1, 0.2, 0.3, 0.4], 
    }

    # Creating the pipeline with oversampling and XGBClassifier
    pipeline = ImbPipeline([
        ('oversampler', RandomOverSampler(random_state=42)), 
        ('classifier', XGBClassifier(
            n_estimators=1000, 
            random_state=42
        ))
    ])

    # Train the model using GridSearchCV
    grid_search_xgb = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=5, 
        scoring='average_precision',
        verbose=4,
        n_jobs=-1
    )

    grid_search_rf.fit(X_train, y_train)

    print(grid_search_rf.best_params_)
    print(grid_search_rf.best_score_)

    # Retrieve the best parameters
    best_params = grid_search_rf.best_params_

    best_xgb = XGBClassifier(
        n_estimators=1000,
        random_state=42,
        **best_params 
    )

    # Retrain the model on the entire training set with oversampling
    oversampler = RandomOverSampler(random_state=42)
    X_resampled, y_resampled = oversampler.fit_resample(X_train, y_train)
    best_rf.fit(X_train, y_train)

    # Predict on the test set
    y_pred = best_rf.predict(X_test)

    # Evaluate the model using the provided eval function
    accuracy, precision, recall, f1, roc_auc, pr_auc = utils.eval(y_test, y_pred)

    metrics = {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "roc_auc": roc_auc,
        "pr_auc": pr_auc
    }

    temp_result_list.append(metrics)

    # Print the results
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")
    print(f"PR AUC: {pr_auc:.4f}")

accuracy_list = [result['accuracy'] for result in temp_result_list]
precision_list = [result['precision'] for result in temp_result_list]
recall_list = [result['recall'] for result in temp_result_list]
f1_list = [result['f1'] for result in temp_result_list]
roc_auc_list = [result['roc_auc'] for result in temp_result_list]
pr_auc_list = [result['pr_auc'] for result in temp_result_list]

metrics_summary = {
    "accuracy_mean": np.mean(accuracy_list),
    "accuracy_std": np.std(accuracy_list),
    "precision_mean": np.mean(precision_list),
    "precision_std": np.std(precision_list),
    "recall_mean": np.mean(recall_list),
    "recall_std": np.std(recall_list),
    "f1_mean": np.mean(f1_list),
    "f1_std": np.std(f1_list),
    "roc_auc_mean": np.mean(roc_auc_list),
    "roc_auc_std": np.std(roc_auc_list),
    "pr_auc_mean": np.mean(pr_auc_list),
    "pr_auc_std": np.std(pr_auc_list)
}

dict_save = {
    "weighting": False,
    "oversampling": True,
    "metrics_summary": metrics_summary
}

path = "C:/Users/Walraff/OneDrive - Universite de Liege/Documents/Ulg/Master2/TFE/Results/ValidProtocol"
utils.write_into_json(dict_save, f"{path}/CCS_XGB_experiment_3.json")

Fold: 0

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': 'sqrt'}
0.485981244458688
Accuracy: 0.7419
Precision: 0.0000
Recall: 0.0000
F1 Score: 0.0000
ROC AUC: 0.4792
PR AUC: 0.2258
Fold: 1

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': 'sqrt'}
0.43731086641911493
Accuracy: 0.7742
Precision: 0.5000
Recall: 0.1429
F1 Score: 0.2222
ROC AUC: 0.5506
PR AUC: 0.2650
Fold: 2

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 5, 'classifier__max_features': 'sqrt'}
0.4325548091436838
Accuracy: 0.7742
Precision: 0.6667
Recall: 0.2500
F1 Score: 0.3636
ROC AUC: 0.6033
PR AUC: 0.3602
Fold: 3

Fitting 5 folds for each of 15 candidates, totalling 75 fits
{'classifier__max_depth': 1, 'classifier__max_features': None}
0.3923907030972248
Accuracy: 0.7742
Precision: 1.0000
Recall: 0.1250
F1 Score: 0.2222
ROC AUC: 0.5625
PR AUC: 0