# Utils

In [1]:
from copy import deepcopy
import os
from typing import Dict, List

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut, StratifiedKFold
from scipy.stats import mode
from sklearn.svm import SVC
import tsfresh
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.transformers import RelevantFeatureAugmenter, FeatureAugmenter, FeatureSelector

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from utils import Dataset, variance_thresholding, standardize, mcc, calculate_metrics, calculate_metrics_statistics

In [2]:
# directory for processed data: extracted feature CSVs are written here and
# the label files ("<dataset>_y.csv") are read from here
PROCESSED_DATA_DIR = "processed_data24h"

# Automated feature extraction

## Utilities and preprocessing

In [3]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Normalizes raw recordings that have "timestamp", "date" and "activity"
    columns. The input DataFrames are left untouched.

    For every DataFrame:
    - "timestamp" is parsed as YYYY-MM-DD HH:MM:SS datetimes
    - the redundant "date" column is removed
    - "activity" is cast to float32

    :param data: list of DataFrames
    :returns: list of cleaned DataFrames, in the same order as the input
    """
    cleaned = []

    for raw in data:
        frame = raw.copy()  # never modify the caller's DataFrame
        frame["timestamp"] = pd.to_datetime(
            frame["timestamp"], format="%Y-%m-%d %H:%M:%S"
        )
        frame = frame.drop(columns="date")
        frame["activity"] = frame["activity"].astype(np.float32)
        cleaned.append(frame)

    return cleaned


def get_day_part(df: pd.DataFrame, part: str) -> pd.DataFrame:
    """
    Selects the rows of a DataFrame whose "timestamp" falls into the requested
    part of the day.

    The two parts are:
    - "day": hours [8:00, 21:00)
    - "night": hours [21:00, 8:00)

    :param df: DataFrame with a datetime "timestamp" column
    :param part: part of day, either "day" or "night"
    :returns: DataFrame, subset of rows of df
    :raises ValueError: if part is neither "day" nor "night"
    """
    if part not in ("day", "night"):
        raise ValueError(f'Part should be "day" or "night", got "{part}"')

    hour = df["timestamp"].dt.hour

    if part == "day":
        selected = (hour >= 8) & (hour < 21)
    else:
        selected = (hour >= 21) | (hour < 8)

    return df.loc[selected]


def fill_missing_activity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Puts the recording on a gap-free one-minute grid between its first and
    last timestamp and replaces every missing "activity" value with the mean
    activity of the recording. The input DataFrame is not modified.

    :param df: DataFrame with "timestamp" and "activity" columns
    :returns: new DataFrame with "timestamp" and "activity" columns
    """
    # minute-level resampling inserts NaN rows for minutes without data;
    # reset_index() turns the resampled index back into a "timestamp" column
    per_minute = df.resample("min", on="timestamp").mean().reset_index()

    # impute gaps with the overall mean activity
    mean_activity = per_minute["activity"].mean()
    per_minute["activity"] = per_minute["activity"].fillna(mean_activity)

    return per_minute


def resample(df: pd.DataFrame, freq: str = "H") -> pd.DataFrame:
    """
    Aggregates a time series into segments of the given frequency, taking the
    mean of each segment. The input DataFrame is not modified.

    :param df: DataFrame with "timestamp" and "activity" columns
    :param freq: resampling frequency passed to Pandas resample() function
    :returns: DataFrame with "timestamp" and "activity" columns
    """
    # start from a gap-free, NaN-free minute-resolution series
    minute_df = fill_missing_activity(df)

    # aggregate with the requested frequency, then turn the resampled index
    # back into a regular "timestamp" column
    return minute_df.resample(freq, on="timestamp").mean().reset_index()


def get_clean_dataframes(dfs: List[pd.DataFrame], freq: str = "H") \
        -> Dict[str, List[pd.DataFrame]]:
    """
    Cleans DataFrames, filling missing values and resampling with given
    frequency, and keeps only the daytime rows.

    Only the "day" part, i.e. hours [8:00, 21:00), is produced; the returned
    dictionary has no "full_24h" or "night" entries.

    :param dfs: list of DataFrames to clean; each one has to have "timestamp",
    "date" and "activity" columns (as required by basic_data_cleaning)
    :param freq: resampling frequency
    :returns: dictionary with a single key "day", holding one cleaned and
    resampled DataFrame per input DataFrame (in the same order)
    """
    cleaned_dfs = basic_data_cleaning(dfs)

    # resample() fills gaps at minute resolution itself, so no separate
    # fill_missing_activity() pass is needed before it
    resampled_dfs = [resample(df, freq=freq) for df in cleaned_dfs]

    day_dfs = [get_day_part(df, part="day") for df in resampled_dfs]

    return {"day": day_dfs}


def get_tsfresh_flat_format_df(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Creates DataFrame in a "flat" format for tsfresh from list of DataFrames.
    Each one is assumed to have "timestamp" and "activity" columns.

    Every row gets an "id" equal to the position of its source DataFrame in
    dfs. The input DataFrames are not modified.

    :param dfs: list of DataFrames; each one has to have "timestamp" and
    "activity" columns
    :returns: DataFrame in tsfresh "flat" format with columns "id",
    "timestamp", "activity" (followed by any extra input columns) and a fresh
    0..n-1 index; an empty DataFrame with those three columns if dfs is empty
    """
    leading_columns = ["id", "timestamp", "activity"]

    if not dfs:
        return pd.DataFrame(columns=leading_columns)

    # assign() returns a new frame, so the caller's DataFrames stay untouched
    tagged = [df.assign(id=idx) for idx, df in enumerate(dfs)]

    # one concat over all frames: linear instead of repeatedly copying the
    # growing result, and no empty object-dtype frame influencing the dtypes
    flat_df = pd.concat(tagged, ignore_index=True)

    extra_columns = [col for col in flat_df.columns
                     if col not in leading_columns]
    return flat_df.reindex(columns=leading_columns + extra_columns)

## Parameters and constants

In [4]:
# Base estimators, keyed by the short names used in param_grids and in the
# evaluation loops. Hyperparameters that are tuned live in param_grids; the
# values here are fixed for every grid-search candidate.
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",  # the L1/L2 mix ("l1_ratio") is set by the grid
        random_state=0,
        solver="saga",
        max_iter=50
    ),
    "SVM": SVC(
        kernel="rbf",
        cache_size=512
    ),
    "RF": RandomForestClassifier(
        n_estimators=50,
        criterion="entropy",
        # seeded like the other randomized models so that repeated runs give
        # the same forest and therefore the same scores
        random_state=0
    ),
    "LGBM": LGBMClassifier(
        n_estimators=50,
        verbosity=-1,
        random_state=0
    ),
    "XGB": XGBClassifier(
        n_estimators=50,
        random_state=0
    )
}


# Grid-search spaces, one per entry of `classifiers` (same keys). Each inner
# mapping goes from an estimator parameter name to its candidate values.
param_grids = dict(
    LR=dict(
        C=[0.5, 1, 2, 5, 10, 25],
        class_weight=["balanced"],
        l1_ratio=[0.3, 0.4, 0.45, 0.55, 0.6],
    ),
    SVM=dict(
        C=[1, 10],
        gamma=["scale"],
        class_weight=[None, "balanced"],
    ),
    RF=dict(
        class_weight=[None, "balanced", "balanced_subsample"],
    ),
    LGBM=dict(
        num_leaves=[31],
        min_child_samples=[20],
        class_weight=["balanced"],
    ),
    XGB=dict(
        learning_rate=[0.01, 0.1],
        max_depth=[3, 6],
        subsample=[0.8, 1.0],
        colsample_bytree=[0.8, 1.0],
        scale_pos_weight=[1, 10],
    ),
)

## tsfresh

### Utilities

In [5]:
def extract_tsfresh_features(dfs: List[pd.DataFrame], settings: Dict) \
        -> pd.DataFrame:
    """
    Runs tsfresh feature extraction (extraction only, no feature selection)
    on a list of time series.

    :param dfs: list of DataFrames with time series, each with "timestamp" and
    "activity" columns
    :param settings: tsfresh settings, one of: ComprehensiveFCParameters,
    EfficientFCParameters, MinimalFCParameters
    :returns: DataFrame with extracted features, indexed by the series ids
    (positions in dfs) in order of first appearance
    """
    flat_ts = get_tsfresh_flat_format_df(dfs)

    extractor = FeatureAugmenter(
        default_fc_parameters=settings,
        column_id="id",
        column_sort="timestamp",
        column_value="activity",
        chunksize=1,
        n_jobs=4
    )
    extractor.set_timeseries_container(flat_ts)

    # the augmenter adds feature columns to an existing frame, so start from
    # an empty frame that only carries the series ids as its index
    id_frame = pd.DataFrame(index=flat_ts["id"].unique())

    return extractor.transform(id_frame)


class IncreasingFDRFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Selects features using tsfresh feature selector and increasing FDR, if no
    features are selected at default FDR=0.05.

    The FDR level is raised in steps of 0.05 up to 1.0; the first level at
    which at least one feature is relevant is kept.
    """
    def __init__(self, verbose: bool = False):
        # fitted tsfresh FeatureSelector; None until fit() has been called
        self.selector: FeatureSelector = None
        # if True, fit() prints the FDR level that was finally used
        self.verbose: bool = verbose

    def fit(self, X, y):
        """
        Fits tsfresh FeatureSelector with increasing FDR levels until at
        least one feature is selected.

        :param X: feature matrix passed to FeatureSelector.fit()
        :param y: target vector passed to FeatureSelector.fit()
        :returns: self, so that fit_transform() and pipelines work
        :raises ValueError: if no feature is selected at any FDR level
        """
        for alpha in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                      0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]:
            self.selector = FeatureSelector(
                fdr_level=alpha,
                n_jobs=4,
                chunksize=1
            )
            self.selector.fit(X, y)
            if len(self.selector.relevant_features) > 0:
                if self.verbose:
                    # report the level that actually produced features
                    print("FDR:", alpha)
                return self

        raise ValueError("Failed to select any features")

    def transform(self, X):
        """
        Reduces X to the features selected during fit().

        :param X: feature matrix with the same features as used in fit()
        :returns: X restricted to the relevant features
        """
        return self.selector.transform(X)


class TsfreshTopNFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Selects top N features using tsfresh feature selector.

    Features are ranked by the p-value from the tsfresh relevance table and
    the N features with the lowest p-values are kept.
    """
    def __init__(self, n: int = 10):
        # number of features to keep
        self.n: int = n
        # column labels of the selected features; None until fit() is called
        self.features: List = None

    def fit(self, X, y):
        """
        Ranks features by tsfresh relevance and remembers the best n of them.

        :param X: feature matrix, DataFrame or 2D array-like
        :param y: target vector, Series or 1D array-like
        :returns: self, so that fit_transform() and pipelines work
        """
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)

        if not isinstance(y, pd.Series):
            # share X's index so that rows and labels stay aligned even if X
            # does not have a default 0..n-1 index
            y = pd.Series(y, index=X.index)

        relevance_table = calculate_relevance_table(X, y)
        top_rows = relevance_table.sort_values("p_value").head(self.n)
        self.features = list(top_rows["feature"].values)

        # column positions of the selected features; these work for both
        # DataFrames and plain arrays in transform()
        self.feature_positions_ = [X.columns.get_loc(name)
                                   for name in self.features]
        return self

    def transform(self, X, y=None):
        """
        Reduces X to the features selected during fit().

        :param X: feature matrix with the same column layout as used in fit(),
        DataFrame or 2D array
        :param y: ignored, present for scikit-learn API compatibility
        :returns: X restricted to the selected columns, same container type
        :raises ValueError: if called before fit()
        """
        if self.features is None:
            # without this check, indexing an array with None would silently
            # add a new axis instead of failing
            raise ValueError("TsfreshTopNFeatureSelector is not fitted yet; "
                             "call fit() first")

        if isinstance(X, pd.DataFrame):
            return X.iloc[:, self.feature_positions_]

        return X[:, self.feature_positions_]

### Feature extraction

In [6]:
dataset_str = "depresjon"  # "depresjon", "psykose" or "hyperaktiv"

In [7]:
# load the recordings of the chosen dataset from data_day/<dataset_str>
dataset = Dataset(dirpath=os.path.join("data_day", dataset_str))
condition = dataset.condition
control = dataset.control
# sanity check: prints a list of DataFrames with "timestamp", "date" and
# "activity" columns
print(condition)

[               timestamp        date  activity
0    2003-05-08 08:00:00  2003-05-08         3
1    2003-05-08 08:01:00  2003-05-08         0
2    2003-05-08 08:02:00  2003-05-08         0
3    2003-05-08 08:03:00  2003-05-08         0
4    2003-05-08 08:04:00  2003-05-08         0
..                   ...         ...       ...
775  2003-05-08 20:55:00  2003-05-08        52
776  2003-05-08 20:56:00  2003-05-08        20
777  2003-05-08 20:57:00  2003-05-08         5
778  2003-05-08 20:58:00  2003-05-08         0
779  2003-05-08 20:59:00  2003-05-08         0

[780 rows x 3 columns],                timestamp        date  activity
0    2003-05-09 08:00:00  2003-05-09         0
1    2003-05-09 08:01:00  2003-05-09         0
2    2003-05-09 08:02:00  2003-05-09         0
3    2003-05-09 08:03:00  2003-05-09         0
4    2003-05-09 08:04:00  2003-05-09         0
..                   ...         ...       ...
775  2003-05-09 20:55:00  2003-05-09       568
776  2003-05-09 20:56:00  2003-05-

In [8]:
# clean both groups and resample them to the "H" frequency; each result is a
# dict keyed by part of day
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

# part of day -> list of time series, condition recordings first, then
# control recordings
datasets = {}

for part in ["day"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]

    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

# labels for the recordings: a headerless integer CSV flattened to a 1D array
y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [9]:
# tsfresh feature sets to extract, keyed by the name used in the output file;
# only the "minimal" set is enabled (EfficientFCParameters is not used here)
settings_dict = {"minimal": MinimalFCParameters(),}

# one CSV per (part of day, feature set) combination
for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        # index=False: the series ids in the index are not written to the file
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 1034/1034 [00:02<00:00, 416.27it/s]


# Feature extraction - Psykose

In [10]:
dataset_str = "psykose"  # "depresjon", "psykose" or "hyperaktiv"

In [11]:
# load the recordings of the chosen dataset from data_day/<dataset_str>
dataset = Dataset(dirpath=os.path.join("data_day", dataset_str))
condition = dataset.condition
control = dataset.control

In [12]:
# clean both groups and resample them to the "H" frequency; each result is a
# dict keyed by part of day
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

# part of day -> list of time series, condition recordings first, then
# control recordings
datasets = {}

for part in ["day"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]

    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

# labels for the recordings: a headerless integer CSV flattened to a 1D array
y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [13]:
# tsfresh feature sets to extract, keyed by the name used in the output file;
# only the "minimal" set is enabled (EfficientFCParameters is not used here)
settings_dict = {"minimal": MinimalFCParameters(),}

# one CSV per (part of day, feature set) combination
for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        # index=False: the series ids in the index are not written to the file
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 1021/1021 [00:01<00:00, 612.38it/s]


In [14]:
# Feature extraction - Hyperaktiv

In [15]:
dataset_str = "hyperaktiv"  # "depresjon", "psykose" or "hyperaktiv"

In [16]:
# load the recordings of the chosen dataset from data_day/<dataset_str>
dataset = Dataset(dirpath=os.path.join("data_day", dataset_str))
condition = dataset.condition
control = dataset.control

In [17]:
# clean both groups and resample them to the "H" frequency; each result is a
# dict keyed by part of day
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

# part of day -> list of time series, condition recordings first, then
# control recordings
datasets = {}

for part in ["day"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]

    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

# labels for the recordings: a headerless integer CSV flattened to a 1D array
y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [18]:
# tsfresh feature sets to extract, keyed by the name used in the output file;
# only the "minimal" set is enabled (EfficientFCParameters is not used here)
settings_dict = {"minimal": MinimalFCParameters(),}

# one CSV per (part of day, feature set) combination
for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        # index=False: the series ids in the index are not written to the file
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 514/514 [00:02<00:00, 254.85it/s]


### Minimal settings

In [19]:
dataset_str = "depresjon"  # "depresjon", "psykose" or "hyperaktiv"

In [20]:
import os

def sort_key(filename):
    """
    Builds a numeric sort key for a segment file name, so that files are
    ordered by number instead of alphabetically.

    The name is split on "_"; the second token and the fourth token (with
    everything from the first "." removed) are read as integers.

    :param filename: file name with at least four "_"-separated tokens
    :returns: tuple (second token as int, fourth token as int)
    """
    tokens = filename.split("_")
    # second token, e.g. the "1" in "condition_1"
    group_number = int(tokens[1])
    # fourth token without its extension, e.g. the "2" in "2.csv"
    segment_stem = tokens[3].split(".")[0]
    return (group_number, int(segment_stem))

In [26]:
# Evaluation on Depresjon: 5-fold stratified CV over segments, per-fold grid
# search, then majority voting of segment predictions per subject.
print("Wyniki dla Depresjon:")
for part in ["day"]:
    print(f"PART: {part}")

    # features extracted earlier; missing feature values are replaced with 0
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)

    # subject identifier per segment file: the first two "_"-separated tokens
    # of the file name joined together
    # (note: `filename` is reused as the loop variable; `filepath` above has
    # already been computed, so this does not affect the loaded features)
    folder_control = './data_day/depresjon/control'
    folder_condition = './data_day/depresjon/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])

    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])

    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)

    # condition subjects first, then control, aligned with X by position
    # NOTE(review): assumes the rows of X follow the same file order as the
    # sorted directory listings - confirm against the Dataset loader
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    # NOTE(review): X had fillna(0) applied above, so this mask is True for
    # every row and the filtering below keeps everything
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_depresjon = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        # NOTE(review): folds are stratified by label only, so segments of one
        # subject can appear in both train and test - confirm this is intended
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        # (subject id, true label) -> list of segment predictions
        # NOTE(review): created once per classifier, outside the fold loop, so
        # votes accumulate across folds and the metrics of each fold also
        # include predictions from earlier folds - confirm this is intended
        group_results = {}

        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            # y_test is not used below; true labels are read from y directly
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = standardize(X_train, X_test)

            # inner hyperparameter search on the training part (default cv)
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
            )
            grid_search.fit(X_train, y_train)

            # refit=True has already fitted best_estimator_ on X_train; the
            # fit() call below fits it once more on the same data
            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            # collect the segment-level predictions per subject
            for i in range(len(test_idx)):

                condition_id = z_depresjon.iloc[test_idx[i]]
                class_label = y[test_idx[i]]
                unique_id = (condition_id, class_label)

                if unique_id not in group_results:
                    group_results[unique_id] = []
                group_results[unique_id].append(y_pred[i])
            # subject-level prediction: most frequent segment prediction
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote
            # true label is the second element of the (subject, label) key
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)

        # aggregate the per-fold metrics into (mean, stddev) per metric
        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

Wyniki dla Depresjon:
PART: day
(1034,)
  LR




    accuracy: 0.6842 +- 0.0727
    balanced_accuracy: 0.6902 +- 0.0868
    f1: 0.6546 +- 0.1216
    precision: 0.5958 +- 0.0706
    recall: 0.7366 +- 0.1831
    specificity: 0.6438 +- 0.0375
    ROC_AUC: 0.6902 +- 0.0868
    MCC: 0.3805 +- 0.1726

  SVM
    accuracy: 0.7560 +- 0.0386
    balanced_accuracy: 0.7202 +- 0.0457
    f1: 0.6218 +- 0.0802
    precision: 0.8990 +- 0.0500
    recall: 0.4779 +- 0.0811
    specificity: 0.9625 +- 0.0125
    ROC_AUC: 0.7202 +- 0.0457
    MCC: 0.5191 +- 0.0866

  RF
    accuracy: 0.7562 +- 0.0096
    balanced_accuracy: 0.7249 +- 0.0113
    f1: 0.6416 +- 0.0189
    precision: 0.8611 +- 0.0360
    recall: 0.5123 +- 0.0252
    specificity: 0.9375 +- 0.0198
    ROC_AUC: 0.7249 +- 0.0113
    MCC: 0.5117 +- 0.0261

  LGBM
    accuracy: 0.7598 +- 0.0151
    balanced_accuracy: 0.7471 +- 0.0175
    f1: 0.7009 +- 0.0278
    precision: 0.7479 +- 0.0318
    recall: 0.6630 +- 0.0523
    specificity: 0.8313 +- 0.0375
    ROC_AUC: 0.7471 +- 0.0175
    MCC: 0.5057 +

# Classification - Psykose

In [29]:
dataset_str = "psykose"  # "depresjon", "psykose" or "hyperaktiv"

In [30]:

# Evaluation on Psykose: 5-fold stratified CV over segments, per-fold grid
# search, then majority voting of segment predictions per subject.
print("Wyniki dla Psykose:")
for part in ["day"]:
    print(f"PART: {part}")

    # features extracted earlier; missing feature values are replaced with 0
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)

    # subject identifier per segment file: the first two "_"-separated tokens
    # of the file name joined together
    # (note: `filename` is reused as the loop variable; `filepath` above has
    # already been computed, so this does not affect the loaded features)
    folder_control = './data_day/psykose/control'
    folder_condition = './data_day/psykose/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])

    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])

    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)

    # condition subjects first, then control, aligned with X by position
    # NOTE(review): assumes the rows of X follow the same file order as the
    # sorted directory listings - confirm against the Dataset loader
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    # NOTE(review): X had fillna(0) applied above, so this mask is True for
    # every row and the filtering below keeps everything
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_psykose = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        # NOTE(review): folds are stratified by label only, so segments of one
        # subject can appear in both train and test - confirm this is intended
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        # (subject id, true label) -> list of segment predictions
        # NOTE(review): created once per classifier, outside the fold loop, so
        # votes accumulate across folds and the metrics of each fold also
        # include predictions from earlier folds - confirm this is intended
        group_results = {}

        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            # y_test is not used below; true labels are read from y directly
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = standardize(X_train, X_test)

            # inner hyperparameter search on the training part (default cv)
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
            )
            grid_search.fit(X_train, y_train)

            # refit=True has already fitted best_estimator_ on X_train; the
            # fit() call below fits it once more on the same data
            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            # collect the segment-level predictions per subject
            for i in range(len(test_idx)):

                condition_id = z_psykose.iloc[test_idx[i]]
                class_label = y[test_idx[i]]
                unique_id = (condition_id, class_label)

                if unique_id not in group_results:
                    group_results[unique_id] = []
                group_results[unique_id].append(y_pred[i])
            # subject-level prediction: most frequent segment prediction
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote
            # true label is the second element of the (subject, label) key
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)

        # aggregate the per-fold metrics into (mean, stddev) per metric
        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

Wyniki dla Psykose:
PART: day
(1021,)
  LR




    accuracy: 0.6896 +- 0.0348
    balanced_accuracy: 0.7154 +- 0.0290
    f1: 0.7001 +- 0.0256
    precision: 0.5873 +- 0.0344
    recall: 0.8684 +- 0.0024
    specificity: 0.5625 +- 0.0559
    ROC_AUC: 0.7154 +- 0.0290
    MCC: 0.4370 +- 0.0528

  SVM
    accuracy: 0.8428 +- 0.0388
    balanced_accuracy: 0.8270 +- 0.0481
    f1: 0.7913 +- 0.0682
    precision: 0.8685 +- 0.0418
    recall: 0.7352 +- 0.1061
    specificity: 0.9187 +- 0.0375
    ROC_AUC: 0.8270 +- 0.0481
    MCC: 0.6774 +- 0.0800

  RF
    accuracy: 0.8209 +- 0.0402
    balanced_accuracy: 0.7921 +- 0.0463
    f1: 0.7404 +- 0.0693
    precision: 0.9197 +- 0.0525
    recall: 0.6217 +- 0.0773
    specificity: 0.9625 +- 0.0234
    ROC_AUC: 0.7921 +- 0.0463
    MCC: 0.6403 +- 0.0875

  LGBM
    accuracy: 0.8573 +- 0.0560
    balanced_accuracy: 0.8368 +- 0.0653
    f1: 0.8017 +- 0.0937
    precision: 0.9184 +- 0.0583
    recall: 0.7174 +- 0.1167
    specificity: 0.9563 +- 0.0319
    ROC_AUC: 0.8368 +- 0.0653
    MCC: 0.7095 +

In [31]:
dataset_str = "hyperaktiv"  # "depresjon", "psykose" or "hyperaktiv"

In [35]:

# Evaluation on Hyperaktiv: 5-fold stratified CV over segments, per-fold grid
# search, then majority voting of segment predictions per subject.
print("Wyniki dla hyperaktiv:")
for part in ["day"]:
    print(f"PART: {part}")

    # features extracted earlier; missing feature values are replaced with 0
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)

    # subject identifier per segment file: the first two "_"-separated tokens
    # of the file name joined together
    # (note: `filename` is reused as the loop variable; `filepath` above has
    # already been computed, so this does not affect the loaded features)
    folder_control = './data_day/hyperaktiv/control'
    folder_condition = './data_day/hyperaktiv/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])

    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])

    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)

    # condition subjects first, then control, aligned with X by position
    # NOTE(review): assumes the rows of X follow the same file order as the
    # sorted directory listings - confirm against the Dataset loader
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    # NOTE(review): X had fillna(0) applied above, so this mask is True for
    # every row and the filtering below keeps everything
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_hyperaktiv = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        # NOTE(review): folds are stratified by label only, so segments of one
        # subject can appear in both train and test - confirm this is intended
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

        test_scores = []
        # (subject id, true label) -> list of segment predictions
        # NOTE(review): created once per classifier, outside the fold loop, so
        # votes accumulate across folds and the metrics of each fold also
        # include predictions from earlier folds - confirm this is intended
        group_results = {}

        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
            # y_test is not used below; true labels are read from y directly
            y_train, y_test = y[train_idx], y[test_idx]

            X_train, X_test = standardize(X_train, X_test)

            # inner hyperparameter search on the training part (default cv)
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type],
                param_grid=param_grids[clf_type],
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
            )
            grid_search.fit(X_train, y_train)

            # refit=True has already fitted best_estimator_ on X_train; the
            # fit() call below fits it once more on the same data
            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            # collect the segment-level predictions per subject
            for i in range(len(test_idx)):

                condition_id = z_hyperaktiv.iloc[test_idx[i]]
                class_label = y[test_idx[i]]
                unique_id = (condition_id, class_label)

                if unique_id not in group_results:
                    group_results[unique_id] = []
                group_results[unique_id].append(y_pred[i])
            # subject-level prediction: most frequent segment prediction
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote
            # true label is the second element of the (subject, label) key
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)

        # aggregate the per-fold metrics into (mean, stddev) per metric
        final_scores = calculate_metrics_statistics(test_scores)

        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")

        print()

Wyniki dla hyperaktiv:
PART: day
(514,)
  LR




    accuracy: 0.5416 +- 0.0215
    balanced_accuracy: 0.5540 +- 0.0221
    f1: 0.5271 +- 0.0402
    precision: 0.6327 +- 0.0329
    recall: 0.4541 +- 0.0540
    specificity: 0.6540 +- 0.0584
    ROC_AUC: 0.5540 +- 0.0221
    MCC: 0.1096 +- 0.0452

  SVM
    accuracy: 0.5409 +- 0.0224
    balanced_accuracy: 0.5563 +- 0.0301
    f1: 0.5143 +- 0.0379
    precision: 0.6469 +- 0.0547
    recall: 0.4330 +- 0.0631
    specificity: 0.6795 +- 0.1052
    ROC_AUC: 0.5563 +- 0.0301
    MCC: 0.1180 +- 0.0663

  RF
    accuracy: 0.6204 +- 0.0150
    balanced_accuracy: 0.6325 +- 0.0163
    f1: 0.6163 +- 0.0214
    precision: 0.7203 +- 0.0242
    recall: 0.5389 +- 0.0236
    specificity: 0.7262 +- 0.0284
    ROC_AUC: 0.6325 +- 0.0163
    MCC: 0.2660 +- 0.0324

  LGBM
    accuracy: 0.5939 +- 0.0228
    balanced_accuracy: 0.6044 +- 0.0247
    f1: 0.5940 +- 0.0175
    precision: 0.6860 +- 0.0300
    recall: 0.5241 +- 0.0139
    specificity: 0.6846 +- 0.0450
    ROC_AUC: 0.6044 +- 0.0247
    MCC: 0.2091 +