In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from scipy.stats import randint, uniform
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
    precision_score,
    recall_score,
    roc_auc_score,
)
from sklearn.model_selection import (
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    cross_validate,
)
from xgboost import XGBClassifier

from config import (
    HRV_DATA_DIR,
    DATA_DIR,
    PARAMS_LIST,
    DEFAULT_FILTER_FN,
    DEFAULT_SMOOTH_FN,
    DEFAULT_NORMALIZE_FN,
)
from utils.dataset_preparation import prepare_dataset

In [8]:
# from itertools import product
# import mlflow
# mlflow.autolog(disable=True)

In [2]:
warnings.filterwarnings("ignore", category=DeprecationWarning, message="`trapz` is deprecated. Use `trapezoid` instead")

In [7]:
# Build the modelling table with the project's dataset builder, passing the
# data directories, feature parameter list and default preprocessing callables
# imported from config.
dataset_options = {
    "hrv_data_dir": HRV_DATA_DIR,
    "data_dir": DATA_DIR,
    "params_list": PARAMS_LIST,
    "filter_fn": DEFAULT_FILTER_FN,
    "smooth_fn": DEFAULT_SMOOTH_FN,
    "normalize_fn": DEFAULT_NORMALIZE_FN,
}
df = prepare_dataset(**dataset_options)

# Disabled alternative: read the table from a CSV stored under DATA_DIR.
# df = pd.read_csv(f'{DATA_DIR}/dataset.csv')

# Bare last expression so the notebook renders a preview of the first rows.
df.head()

Unnamed: 0,max_1_300_300,ADHD,min_4_600_150,lf_std_8_4000_2000,lf_hf_ratio_median_8_4000_2000,lf_hf_ratio_cv_2_150_50,tinn_180_60,poincare_sd1_sd2_ratio_400_100
0,0.138568,1,0.000488,0.162894,1.464409,1.268023,1.070247,0.194075
1,0.037795,0,0.000223,0.125344,0.930363,1.004642,0.458473,0.095535
2,0.099146,1,0.001406,0.05523,0.944683,1.134255,0.968127,0.210137
3,0.077804,1,0.002636,0.115625,1.265316,0.98803,0.622282,0.104197
4,0.041324,1,0.000881,0.074661,1.376249,1.576969,0.504681,0.121715


In [8]:
# Separate the label column from the predictors: every column except the
# target is used as a feature.
target_col = 'ADHD'
features = df.columns[df.columns != target_col].tolist()
X = df.loc[:, features]
y = df.loc[:, target_col]

In [9]:
# Baseline: XGBoost with library-default hyperparameters, scored by 5-fold
# cross-validated accuracy. With an integer `cv` and a classifier,
# scikit-learn uses stratified, unshuffled folds, so the split is
# deterministic.
baseline_settings = {"eval_metric": "logloss", "random_state": 42}
model = XGBClassifier(**baseline_settings)

cv_scores = cross_val_score(estimator=model, X=X, y=y, scoring='accuracy', cv=5)

# Report the mean and the spread (population std, ddof=0) across the folds.
for summary_line in (
    f"Mean accuracy: {np.mean(cv_scores):.4f}",
    f"Std dev: {np.std(cv_scores):.4f}",
):
    print(summary_line)

Mean accuracy: 0.7875
Std dev: 0.1016


In [10]:
# Hyperparameter tuning: randomized search over a narrow XGBoost search space.
# scipy conventions used below:
#   randint(low, high)  -> integers in [low, high)       (high is excluded)
#   uniform(loc, scale) -> floats in [loc, loc + scale]  (scale is a width)
param_distributions = dict(
    n_estimators=randint(low=90, high=110),         # 90 .. 109 trees
    max_depth=randint(low=1, high=4),               # depth 1 .. 3
    learning_rate=uniform(loc=0.03, scale=0.04),    # 0.03 .. 0.07
    subsample=uniform(loc=0.75, scale=0.1),         # 0.75 .. 0.85
    colsample_bytree=uniform(loc=0.65, scale=0.1),  # 0.65 .. 0.75
)

# 10000 sampled candidates, each scored by 5-fold CV accuracy (50000 fits in
# total), spread over all available cores. Both the sampler and the estimator
# are seeded so that reruns draw the same candidates.
random_search = RandomizedSearchCV(
    estimator=XGBClassifier(eval_metric='logloss', random_state=42),
    param_distributions=param_distributions,
    n_iter=10000,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X, y)

# Keep the winning configuration and its mean CV score for the cells below.
best_params_random, best_score_random = (
    random_search.best_params_,
    random_search.best_score_,
)

print("Best Parameters (Randomized Search):", best_params_random)
print(f"Best Cross-Validation Accuracy (Randomized Search): {best_score_random:.2f}")

Best Parameters (Randomized Search): {'colsample_bytree': 0.7301970064625416, 'learning_rate': 0.06404475163103546, 'max_depth': 3, 'n_estimators': 109, 'subsample': 0.7932446742814552}
Best Cross-Validation Accuracy (Randomized Search): 0.82


In [11]:
# Re-evaluate the tuned configuration on several metrics with 5-fold CV.
# precision/recall/f1 use the scorers' defaults (binary averaging on label 1).
scoring = {
    'accuracy': 'accuracy',
    'roc_auc': 'roc_auc',
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score)
}

# Same estimator settings as the search, plus the selected hyperparameters.
best_model = XGBClassifier(
    eval_metric='logloss',
    random_state=42,
    **best_params_random
)

# Fit on the full dataset so `best_model` is a trained estimator for any
# later use. This does not influence the scores below: cross_validate fits
# fresh clones of the estimator on each training fold.
best_model.fit(X, y)

# NOTE(review): `cv=5` yields the same deterministic folds that the randomized
# search used to pick `best_params_random`, so these scores are likely
# optimistic; confirm on held-out data or with nested CV before reporting.
# (`cross_validate` is imported in the notebook's import cell.)
results = cross_validate(
    estimator=best_model,
    X=X,
    y=y,
    scoring=scoring,
    cv=5,
    return_train_score=False
)

# One line per metric: mean of the per-fold test scores.
for metric in scoring:
    print(f"Mean CV {metric.capitalize()}: {results['test_' + metric].mean():.4f}")


Mean CV Accuracy: 0.8250
Mean CV Roc_auc: 0.8267
Mean CV Precision: 0.8216
Mean CV Recall: 0.8357
Mean CV F1: 0.8191


In [20]:
# kf = StratifiedKFold(n_splits=5, shuffle=True)

# best_model = XGBClassifier(**best_params_random, eval_metric='logloss')

# for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
#     best_model.fit(X.iloc[train_idx], y.iloc[train_idx])
#     y_pred_fold = best_model.predict(X.iloc[test_idx])
    
#     cm = confusion_matrix(y.iloc[test_idx], y_pred_fold)
#     print(f"Confusion Matrix for Fold {fold}:")
#     print(cm)
#     ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1']).plot(cmap='Blues')


In [27]:
# kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# best_model = XGBClassifier(**best_params_random, eval_metric='logloss', random_state=42)

# for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):
#     best_model.fit(X.iloc[train_idx], y.iloc[train_idx])
#     y_pred_fold = best_model.predict(X.iloc[test_idx])
#     y_proba_fold = best_model.predict_proba(X.iloc[test_idx])[:, 1]

#     cm = confusion_matrix(y.iloc[test_idx], y_pred_fold)
#     print(f"Confusion Matrix for Fold {fold}:")
#     print(cm)
#     ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Class 0', 'Class 1']).plot(cmap='Blues')

#     accuracy = accuracy_score(y.iloc[test_idx], y_pred_fold)
#     roc_auc = roc_auc_score(y.iloc[test_idx], y_proba_fold)
#     precision = precision_score(y.iloc[test_idx], y_pred_fold)
#     recall = recall_score(y.iloc[test_idx], y_pred_fold)
#     f1 = f1_score(y.iloc[test_idx], y_pred_fold)

#     print(f"Fold {fold} - Accuracy: {accuracy:.4f}, ROC AUC: {roc_auc:.4f}, "
#           f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
#     print("Classification Report:")
#     print(classification_report(y.iloc[test_idx], y_pred_fold, target_names=['Class 0', 'Class 1']))


In [None]:
# filter_functions = [default_filter_fn]
# smooth_functions = [rolling_mean_smooth_fn, no_smooth_fn]
# normalize_functions = [z_score_normalize_fn, min_max_scaler_fn, robust_scaler_fn, standard_scaler_fn, no_normalize_fn]

# params_variations_options = [
#     # Configurations for hf_max_feature
#     [
#         {"params": [(1, 300, 300)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(2, 600, 300)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(4, 1200, 600)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(8, 2400, 1200)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(10, 3000, 1500)], "feature_func": hf_power_features, "feature_name": hf_max_feature}
#     ],
#     # Configurations for hf_min_feature
#     [
#         {"params": [(4, 600, 150)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(6, 900, 450)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(8, 1200, 600)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(12, 1800, 900)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(16, 2400, 1200)], "feature_func": hf_power_features, "feature_name": hf_min_feature}
#     ],
#     # Configurations for lf_std_feature
#     [
#         {"params": [(4, 1500, 750)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(4, 1000, 500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(4, 2000, 1000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(6, 3000, 1500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(8, 4000, 2000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature}
#     ],
#     # Configurations for lf_hf_ratio_median_feature
#     [
#         {"params": [(4, 2000, 1000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(4, 1000, 500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(6, 3000, 1500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(8, 4000, 2000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(10, 5000, 2500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature}
#     ],
#     # Configurations for lf_hf_ratio_cv_feature
#     [
#         {"params": [(2, 150, 50)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(2, 200, 75)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(2, 300, 100)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(4, 600, 200)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(6, 900, 300)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature}
#     ],
#     # Configurations for tinn_feature
#     [
#         {"params": [(180, 60)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(200, 80)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(250, 100)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(300, 150)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(400, 200)], "feature_func": time_domain_features, "feature_name": tinn_feature}
#     ],
#     # Configurations for poincare_sd1_sd2_ratio_feature
#     [
#         {"params": [(1200, 300)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(1000, 250)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(800, 200)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(600, 150)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(400, 100)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature}
#     ]
# ]

# results = []
# max_score_so_far = 0

# for filter_fn, smooth_fn, normalize_fn, param_combination in product(
#     filter_functions,
#     smooth_functions,
#     normalize_functions,
#     product(*params_variations_options)
# ):
#     params_variations = list(param_combination)
    
#     signals, labels, times = preprocess_files(
#         files, df_patients,
#         filter_fn=filter_fn,
#         smooth_fn=smooth_fn,
#         normalize_fn=normalize_fn
#     )

#     dataframes = []
#     for feature_info in params_variations:
#         df_features = process_signals(
#             signals, labels, feature_info["params"], feature_info["feature_func"], feature_info["feature_name"]
#         )
#         df_features['ADHD'] = labels
#         dataframes.append(df_features)

#     df_combined = dataframes[0]
#     for df_add in dataframes[1:]:
#         df_combined = df_combined.merge(df_add, on=['id', 'ADHD'], how='inner')

#     df_combined = df_combined.drop(columns=['id'])

#     target_col = 'ADHD'
#     features = [col for col in df_combined.columns if col != target_col]
#     X = df_combined[features]
#     y = df_combined[target_col]

#     model = XGBClassifier(eval_metric="logloss", random_state=42)
#     cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

#     results.append({
#         "filter_fn": filter_fn.__name__,
#         "smooth_fn": smooth_fn.__name__,
#         "normalize_fn": normalize_fn.__name__,
#         "params_combination": param_combination,
#         "mean_accuracy": np.mean(cv_scores),
#         "std_accuracy": np.std(cv_scores)
#     })
#     if np.mean(cv_scores) > max_score_so_far:
#         max_score_so_far = np.mean(cv_scores)
        
#         print(f"""Best score: {np.mean(cv_scores)} - for
#                     "filter_fn": {filter_fn.__name__},
#                     "smooth_fn": {smooth_fn.__name__},
#                     "normalize_fn": {normalize_fn.__name__},
#                     "params_combination": {param_combination }
#               """)

#     print(f"{np.mean(cv_scores)} +/- {np.std(cv_scores)}")

# results_df = pd.DataFrame(results)
# results_df.to_csv("preprocessing_feature_combinations_results.csv", index=False)
# display(results_df)  # DataFrame has no .display() method; use IPython's display()

In [None]:
# filter_functions = [default_filter_fn]
# smooth_functions = [rolling_mean_smooth_fn, no_smooth_fn]
# normalize_functions = [z_score_normalize_fn, min_max_scaler_fn, robust_scaler_fn, standard_scaler_fn, no_normalize_fn]

# params_variations_options = [
#     # Configurations for hf_max_feature
#     [
#         {"params": [(1, 300, 300)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(2, 600, 300)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(4, 1200, 600)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(8, 2400, 1200)], "feature_func": hf_power_features, "feature_name": hf_max_feature},
#         {"params": [(10, 3000, 1500)], "feature_func": hf_power_features, "feature_name": hf_max_feature}
#     ],
#     # Configurations for hf_min_feature
#     [
#         {"params": [(4, 600, 150)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(6, 900, 450)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(8, 1200, 600)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(12, 1800, 900)], "feature_func": hf_power_features, "feature_name": hf_min_feature},
#         {"params": [(16, 2400, 1200)], "feature_func": hf_power_features, "feature_name": hf_min_feature}
#     ],
#     # Configurations for lf_std_feature
#     [
#         {"params": [(4, 1500, 750)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(4, 1000, 500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(4, 2000, 1000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(6, 3000, 1500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature},
#         {"params": [(8, 4000, 2000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_std_feature}
#     ],
#     # Configurations for lf_hf_ratio_median_feature
#     [
#         {"params": [(4, 2000, 1000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(4, 1000, 500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(6, 3000, 1500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(8, 4000, 2000)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature},
#         {"params": [(10, 5000, 2500)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_median_feature}
#     ],
#     # Configurations for lf_hf_ratio_cv_feature
#     [
#         {"params": [(2, 150, 50)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(2, 200, 75)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(2, 300, 100)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(4, 600, 200)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature},
#         {"params": [(6, 900, 300)], "feature_func": lf_hf_ratio_features, "feature_name": lf_hf_ratio_cv_feature}
#     ],
#     # Configurations for tinn_feature
#     [
#         {"params": [(180, 60)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(200, 80)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(250, 100)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(300, 150)], "feature_func": time_domain_features, "feature_name": tinn_feature},
#         {"params": [(400, 200)], "feature_func": time_domain_features, "feature_name": tinn_feature}
#     ],
#     # Configurations for poincare_sd1_sd2_ratio_feature
#     [
#         {"params": [(1200, 300)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(1000, 250)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(800, 200)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(600, 150)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature},
#         {"params": [(400, 100)], "feature_func": time_domain_features, "feature_name": poincare_sd1_sd2_ratio_feature}
#     ]
# ]

# results = []
# max_score_so_far = 0
# output_file = "preprocessing_feature_combinations_results.csv"

# if not os.path.exists(output_file):
#     with open(output_file, 'w') as f:
#         f.write("filter_fn,smooth_fn,normalize_fn,params_combination,mean_accuracy,std_accuracy\n")

# for filter_fn, smooth_fn, normalize_fn, param_combination in product(
#     filter_functions,
#     smooth_functions,
#     normalize_functions,
#     product(*params_variations_options)
# ):
#     params_variations = list(param_combination)
    
#     signals, labels, times = preprocess_files(
#         files, df_patients,
#         filter_fn=filter_fn,
#         smooth_fn=smooth_fn,
#         normalize_fn=normalize_fn
#     )

#     dataframes = []
#     for feature_info in params_variations:
#         df_features = process_signals(
#             signals, labels, feature_info["params"], feature_info["feature_func"], feature_info["feature_name"]
#         )
#         df_features['ADHD'] = labels
#         dataframes.append(df_features)

#     df_combined = dataframes[0]
#     for df_add in dataframes[1:]:
#         df_combined = df_combined.merge(df_add, on=['id', 'ADHD'], how='inner')

#     df_combined = df_combined.drop(columns=['id'])

#     target_col = 'ADHD'
#     features = [col for col in df_combined.columns if col != target_col]
#     X = df_combined[features]
#     y = df_combined[target_col]

#     model = XGBClassifier(eval_metric="logloss", random_state=42)
#     cv_scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')

#     mean_accuracy = np.mean(cv_scores)
#     std_accuracy = np.std(cv_scores)

#     results.append({
#         "filter_fn": filter_fn.__name__,
#         "smooth_fn": smooth_fn.__name__,
#         "normalize_fn": normalize_fn.__name__,
#         "params_combination": param_combination,
#         "mean_accuracy": mean_accuracy,
#         "std_accuracy": std_accuracy
#     })

#     with open(output_file, 'a') as f:
#         f.write(f"{filter_fn.__name__},{smooth_fn.__name__},{normalize_fn.__name__},"
#                 f"{param_combination},{mean_accuracy},{std_accuracy}\n")

#     if mean_accuracy > max_score_so_far:
#         max_score_so_far = mean_accuracy
#         print(f"""Best score: {mean_accuracy} - for
#                     "filter_fn": {filter_fn.__name__},
#                     "smooth_fn": {smooth_fn.__name__},
#                     "normalize_fn": {normalize_fn.__name__},
#                     "params_combination": {param_combination}
#               """)

#     print(f"{mean_accuracy} +/- {std_accuracy}")
