# Utils

In [1]:
from copy import deepcopy
import os
from typing import Dict, List

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut, StratifiedKFold
from scipy.stats import mode
from sklearn.svm import SVC
import tsfresh
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.transformers import RelevantFeatureAugmenter, FeatureAugmenter, FeatureSelector

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from utils import Dataset, variance_thresholding, standardize, mcc, calculate_metrics, calculate_metrics_statistics

In [2]:
# parameters for saving data
PROCESSED_DATA_DIR = "processed_data24h"

|# Automated feature extraction

## Utilities and preprocessing

In [3]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Assumes DataFrames with "timestamp", "date" and "activity" columns.
    
    Performs cleaning operations:
    - assure format YYYY-MM-DD HH:MM:SS for "timestamp"
    - drop redundant "date" column
    - assure float32 format for "activity"
    
    :param data: list of DataFrames
    :returns: list of cleaned DataFrames
    """
    data = [df.copy() for df in data]  # create copy to avoid side effects
    
    for df in data:
        df["timestamp"] = pd.to_datetime(df["timestamp"],
                                         format="%Y-%m-%d %H:%M:%S")
        df.drop("date", axis=1, inplace=True)
        df["activity"] = df["activity"].astype(np.float32)
    
    return data


def get_day_part(df: pd.DataFrame, part: str) -> pd.DataFrame:
    """
    For given DataFrame with "timestamp" column returns only those rows that
    correspond to the chosen part of day.
    
    Parts are "day" and "night", defined as:
    - "day": [8:00, 21:00)
    - "night": [21:00, 8:00)
    
    :param df: DataFrame to select rows from
    :param part: part of day, either "day" or "night"
    :returns: DataFrame, subset of rows of df
    """
    if part == "day":
        df = df.loc[(df["timestamp"].dt.hour >= 8) &
                    (df["timestamp"].dt.hour < 21)]
    elif part == "night":
        df = df.loc[(df["timestamp"].dt.hour >= 21) |
                    (df["timestamp"].dt.hour < 8)]
    else:
        raise ValueError(f'Part should be "day" or "night", got "{part}"')
        
    return df


def fill_missing_activity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Makes sure that "timestamp" column has minute resolution with no missing
    values from start to end and replaces all NaNs in "activity" column with
    mean average value.
    
    :param data: DataFrame with "timestamp" and "activity" columns
    :returns: cleaned DataFrame
    """
    df = df.copy()  # create copy to avoid side effects
    
    # resample to the basic frequency, i.e. minute; this will create NaNs for
    # any rows that may be missing
    df = df.resample("min", on="timestamp").mean()
    
    # recreate index and "timestamp" column
    df = df.reset_index()
    
    # fill any NaNs with mean activity value
    df["activity"] = df["activity"].fillna(df["activity"].mean())

    return df


def resample(df: pd.DataFrame, freq: str = "H") -> pd.DataFrame:
    """
    Resamples time series DataFrame with given frequency, aggregating each
    segment with a mean.

    :param df: DataFrame with "timestamp" and "activity" columns
    :param freq: resampling frequency passed to Pandas resample() function
    :returns: DataFrame with "timestamp" and "activity" columns
    """
    df = df.copy()  # create copy to avoid side effects
    
    # make sure that data has minute resolution with no missing parts from
    # start to end, with no missing values
    df = fill_missing_activity(df)
    
    # group with given frequency
    df = df.resample(freq, on="timestamp").mean()

    # recreate "timestamp" column
    df = df.reset_index()

    return df


def get_clean_dataframes(dfs: List[pd.DataFrame], freq: str = "H") \
        -> Dict[str, List[pd.DataFrame]]:
    """
    Cleans DataFrames, filling missing values and resampling with given
    frequency.
    
    Returns three lists of DataFrames:
    - full 24hs
    - days: [8:00, 21:00)
    - nights: [21:00, 8:00)
    
    :param dfs: list of DataFrames to clean; each one has to have "timestamp"
    and "activity" columns
    :param freq: resampling frequency
    :returns: dictionary with keys "full_24h", "day" and "night", corresponding
    to data from given parts of day
    """
    full_dfs = basic_data_cleaning(dfs)
    #print(full_dfs)
    full_dfs = [fill_missing_activity(df) for df in full_dfs]
    full_dfs = [resample(df, freq=freq) for df in full_dfs]
    
#    night_dfs = [get_day_part(df, part="night") for df in full_dfs]
#    day_dfs = [get_day_part(df, part="day") for df in full_dfs]

    datasets = {
        "full_24h": full_dfs,
#        "night": night_dfs,
#        "day": day_dfs
    }

    return datasets


def get_tsfresh_flat_format_df(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Creates DataFrame in a "flat" format for tsfresh from list of DataFrames.
    Each one is assumed to have "timestamp" and "activity" columns.
    
    :param dfs: list of DataFrames; each one has to have "timestamp" and
    "activity" columns
    :returns: DataFrame in tsfresh "flat" format
    """
    dfs = deepcopy(dfs)  # create copy to avoid side effects
    
    flat_df = pd.DataFrame(columns=["id", "timestamp", "activity"])

    for idx, df in enumerate(dfs):
        df["id"] = idx
        flat_df = pd.concat([flat_df, df], ignore_index=True)

    flat_df = flat_df.reset_index(drop=True)
        
    return flat_df

## Parameters and constants

In [4]:
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=50
    ),
    "SVM": SVC(
        kernel="rbf",
        cache_size=512
    ),
    "RF": RandomForestClassifier(
        n_estimators=50,
        criterion="entropy"
    ),
    "LGBM": LGBMClassifier(
        n_estimators=50,
        verbosity=-1,
        random_state=0
    ),
    "XGB": XGBClassifier(
        n_estimators=50,
        random_state=0 
    )
}


param_grids = {
    "LR": {
        "C": [0.5, 1, 2, 5, 10, 25],
        "class_weight": ["balanced"],
        "l1_ratio": [0.3, 0.4, 0.45,
                     0.55, 0.6,]
    },
    "SVM": {
        "C": [1, 10],
        "gamma": ["scale"],
        "class_weight": [None, "balanced"]
    },
    "RF": {
        "class_weight": [None, "balanced", "balanced_subsample"]
    },
    "LGBM": {
        "num_leaves": [31],
        "min_child_samples": [20],
        "class_weight": ["balanced"]
    },
    "XGB": {
        "learning_rate": [0.01, 0.1],
        "max_depth": [3, 6],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        "scale_pos_weight": [1, 10]
    }
}

## tsfresh

### Utilities

In [5]:
def extract_tsfresh_features(dfs: List[pd.DataFrame], settings: Dict) \
        -> pd.DataFrame:
    """
    Performs feature extraction (only extraction, not selection) using tsfresh.
    
    :param dfs: list of DataFrames with time series, each with "timestamp" and
    "activity" columns
    :param settings: tsfresh settings, one of: ComprehensiveFCParameters,
    EfficientFCParameters, MinimalFCParameters
    :returns: DataFrame with extracted features, with one row per original
    DataFrame with time series (in the same order)
    """
    ts = get_tsfresh_flat_format_df(dfs)
    ids = ts["id"].unique()
    X = pd.DataFrame(index=ids)
    
    augmenter = FeatureAugmenter(
        default_fc_parameters=settings,
        column_id="id",
        column_sort="timestamp",
        column_value="activity",
        chunksize=1,
        n_jobs=4
    )
    
    augmenter.set_timeseries_container(ts)
    X = augmenter.transform(X)
    
    return X


class IncreasingFDRFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Selects features using tsfresh feature selector and increasing FDR, if no
    features are selected at default FDR=0.05.
    """
    def __init__(self, verbose: bool = False):
        self.selector: FeatureSelector = None
        self.verbose: bool = verbose

    def fit(self, X, y):
        final_alpha = None
        for alpha in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                      0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]:
            self.selector = FeatureSelector(
                fdr_level=alpha,
                n_jobs=4,
                chunksize=1
            )
            self.selector.fit(X, y)
            if len(self.selector.relevant_features) > 0:
                if self.verbose:
                    print("FDR:", final_alpha)
                return selector

        raise ValueError("Failed to select any features")
    
    def transform(self, X):
        return self.selector.transform(X)


class TsfreshTopNFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Selects top N features using tsfresh feature selector.
    """
    def __init__(self, n: int = 10):
        self.n: int = n
        self.features: List[int] = None
    
    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        
        if not isinstance(y, pd.Series):
            y = pd.Series(y)
        
        relevance_table = calculate_relevance_table(X, y)
        relevance_table.sort_values("p_value", inplace=True)
        features = relevance_table.head(self.n)["feature"]
        self.features = list(features.values)
    
    def transform(self, X, y=None):
        return X[:, self.features]

### Feature extraction

In [6]:
dataset_str = "depresjon"  # "depresjon" or "psykose"

In [7]:
dataset = Dataset(dirpath=os.path.join("data_24h", dataset_str))
condition = dataset.condition
#print(condition)
control = dataset.control
print(condition)

[                timestamp        date  activity
0     2003-05-07 12:00:00  2003-05-07         0
1     2003-05-07 12:01:00  2003-05-07       143
2     2003-05-07 12:02:00  2003-05-07         0
3     2003-05-07 12:03:00  2003-05-07        20
4     2003-05-07 12:04:00  2003-05-07       166
...                   ...         ...       ...
1435  2003-05-08 11:55:00  2003-05-08       259
1436  2003-05-08 11:56:00  2003-05-08       190
1437  2003-05-08 11:57:00  2003-05-08       306
1438  2003-05-08 11:58:00  2003-05-08        91
1439  2003-05-08 11:59:00  2003-05-08       296

[1440 rows x 3 columns],                 timestamp        date  activity
0     2003-05-08 12:00:00  2003-05-08       139
1     2003-05-08 12:01:00  2003-05-08       259
2     2003-05-08 12:02:00  2003-05-08       178
3     2003-05-08 12:03:00  2003-05-08       235
4     2003-05-08 12:04:00  2003-05-08       235
...                   ...         ...       ...
1435  2003-05-09 11:55:00  2003-05-09         3
1436  2003-05

In [8]:
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]
    
    dfs_list = condition_dfs_list + control_dfs_list
    #print(len(condition_dfs_list))
    #print(dfs_list)
    #print(len(dfs_list))
    #entire_df = pd.concat([condition_dfs_list[0], control_dfs_list[0]], ignore_index=False)
    #print(entire_df)
    #print(condition_dfs_list)
#     print(entire_df)
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [9]:
settings_dict = {"minimal": MinimalFCParameters(),}
                 #"efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 1063/1063 [00:01<00:00, 708.43it/s]


# Feature extracgion - Psykose

In [10]:
dataset_str = "psykose"  # "depresjon" or "psykose"

In [11]:
dataset = Dataset(dirpath=os.path.join("data_24h", dataset_str))
condition = dataset.condition
control = dataset.control

In [12]:
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]
    
    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [13]:
settings_dict = {"minimal": MinimalFCParameters(),}
                 #"efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 1054/1054 [00:01<00:00, 733.36it/s]


In [14]:
# Feature extraction - Hyperaktiv

In [15]:
dataset_str = "hyperaktiv"  # "depresjon" or "psykose"

In [16]:
dataset = Dataset(dirpath=os.path.join("data_24h", dataset_str))
condition = dataset.condition
control = dataset.control

In [17]:
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

datasets = {}

for part in ["full_24h"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]
    
    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [18]:
settings_dict = {"minimal": MinimalFCParameters(),}
                 #"efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 549/549 [00:02<00:00, 184.09it/s]


### Minimal settings

In [19]:
dataset_str = "depresjon"  # "depresjon" or "psykose"

In [20]:
import os

def sort_key(filename):
    parts = filename.split("_")
    # Wydobycie numerów jako liczb całkowitych
    condition_number = int(parts[1])  # Druga część np. "1" z "condition_1"
    segment_number = int(parts[3].split(".")[0])  # Trzecia część np. "1" z "segmen1.csv"
    return (condition_number, segment_number)

In [21]:
print("Wyniki dla Depresjon:")
for part in ["full_24h"]:
    print(f"PART: {part}")
    
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)#.values
    
    folder_control = './data_24h/depresjon/control'
    folder_condition = './data_24h/depresjon/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])
        
    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])
        
    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)
    
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_depresjon = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        test_scores = []
        group_results = {}  
        
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx)
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 
            y_train, y_test = y[train_idx], y[test_idx]
            
            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            for i in range(len(test_idx)):
                
                condition_id = z_depresjon.iloc[test_idx[i]] 
                class_label = y[test_idx[i]] 
                unique_id = (condition_id, class_label)  

                if unique_id not in group_results:
                    group_results[unique_id] = []  
                group_results[unique_id].append(y_pred[i])  
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote 
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            #print(len(y_true_grouped))
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)
            
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla Depresjon:
PART: full_24h
  LR
    accuracy: 0.7435 +- 0.0038
    balanced_accuracy: 0.7053 +- 0.0051
    f1: 0.6065 +- 0.0092
    precision: 0.8436 +- 0.0051
    recall: 0.4735 +- 0.0095
    specificity: 0.9371 +- 0.0008
    ROC_AUC: 0.7053 +- 0.0051
    MCC: 0.4780 +- 0.0094

  SVM
    accuracy: 0.7435 +- 0.0038
    balanced_accuracy: 0.7053 +- 0.0051
    f1: 0.6065 +- 0.0092
    precision: 0.8436 +- 0.0051
    recall: 0.4735 +- 0.0095
    specificity: 0.9371 +- 0.0008
    ROC_AUC: 0.7053 +- 0.0051
    MCC: 0.4780 +- 0.0094

  RF
    accuracy: 0.7435 +- 0.0038
    balanced_accuracy: 0.7053 +- 0.0051
    f1: 0.6065 +- 0.0092
    precision: 0.8436 +- 0.0051
    recall: 0.4735 +- 0.0095
    specificity: 0.9371 +- 0.0008
    ROC_AUC: 0.7053 +- 0.0051
    MCC: 0.4780 +- 0.0094

  LGBM
    accuracy: 0.7435 +- 0.0038
    balanced_accuracy: 0.7053 +- 0.0051
    f1: 0.6065 +- 0.0092
    precision: 0.8436 +- 0.0051
    recall: 0.4735 +- 0.0095
    specificity: 0.9371 +- 0.0008
    R

# Classification - Psykose

In [22]:
dataset_str = "psykose"  # "depresjon" or "psykose"

In [23]:

print("Wyniki dla Psykose:")
for part in ["full_24h"]:
    print(f"PART: {part}")
    
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)#.values
    
    folder_control = './data_24h/psykose/control'
    folder_condition = './data_24h/psykose/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])
        
    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])
        
    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)
    
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_psykose = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        test_scores = []
        group_results = {}  
        
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx)
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 
            y_train, y_test = y[train_idx], y[test_idx]
            
            #X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            for i in range(len(test_idx)):
                
                condition_id = z_psykose.iloc[test_idx[i]] 
                class_label = y[test_idx[i]] 
                unique_id = (condition_id, class_label)  

                if unique_id not in group_results:
                    group_results[unique_id] = []  
                group_results[unique_id].append(y_pred[i])  
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote 
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            #print(len(y_true_grouped))
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)
            
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla Psykose:
PART: full_24h
  LR




    accuracy: 0.8395 +- 0.0104
    balanced_accuracy: 0.8502 +- 0.0128
    f1: 0.8229 +- 0.0118
    precision: 0.7521 +- 0.0111
    recall: 0.9091 +- 0.0287
    specificity: 0.7913 +- 0.0129
    ROC_AUC: 0.8502 +- 0.0128
    MCC: 0.6892 +- 0.0242

  SVM
    accuracy: 0.8580 +- 0.0238
    balanced_accuracy: 0.8380 +- 0.0231
    f1: 0.8081 +- 0.0295
    precision: 0.9103 +- 0.0412
    recall: 0.7273 +- 0.0287
    specificity: 0.9488 +- 0.0272
    ROC_AUC: 0.8380 +- 0.0231
    MCC: 0.7089 +- 0.0505

  RF
    accuracy: 0.8652 +- 0.0299
    balanced_accuracy: 0.8430 +- 0.0304
    f1: 0.8140 +- 0.0403
    precision: 0.9404 +- 0.0396
    recall: 0.7182 +- 0.0445
    specificity: 0.9679 +- 0.0211
    ROC_AUC: 0.8430 +- 0.0304
    MCC: 0.7276 +- 0.0618

  LGBM
    accuracy: 0.8802 +- 0.0342
    balanced_accuracy: 0.8710 +- 0.0357
    f1: 0.8487 +- 0.0431
    precision: 0.8822 +- 0.0399
    recall: 0.8182 +- 0.0498
    specificity: 0.9238 +- 0.0260
    ROC_AUC: 0.8710 +- 0.0357
    MCC: 0.7517 +

In [24]:
dataset_str = "hyperaktiv"  # "depresjon" or "psykose"

In [25]:

print("Wyniki dla hyperaktiv:")
for part in ["full_24h"]:
    print(f"PART: {part}")
    
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)#.values
    
    folder_control = './data_24h/hyperaktiv/control'
    folder_condition = './data_24h/hyperaktiv/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])
        
    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])
        
    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)
    
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_hyperaktiv = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        test_scores = []
        group_results = {}  
        
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx)
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 
            y_train, y_test = y[train_idx], y[test_idx]
            
            #X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            for i in range(len(test_idx)):
                
                condition_id = z_hyperaktiv.iloc[test_idx[i]] 
                class_label = y[test_idx[i]] 
                unique_id = (condition_id, class_label)  

                if unique_id not in group_results:
                    group_results[unique_id] = []  
                group_results[unique_id].append(y_pred[i])  
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote 
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            #print(len(y_true_grouped))
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)
            
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla hyperaktiv:
PART: full_24h
  LR




    accuracy: 0.5559 +- 0.0140
    balanced_accuracy: 0.5593 +- 0.0135
    f1: 0.5244 +- 0.0207
    precision: 0.5949 +- 0.0140
    recall: 0.4705 +- 0.0357
    specificity: 0.6480 +- 0.0499
    ROC_AUC: 0.5593 +- 0.0135
    MCC: 0.1209 +- 0.0285

  SVM
    accuracy: 0.6171 +- 0.0256
    balanced_accuracy: 0.6168 +- 0.0269
    f1: 0.6279 +- 0.0186
    precision: 0.6389 +- 0.0336
    recall: 0.6189 +- 0.0262
    specificity: 0.6148 +- 0.0618
    ROC_AUC: 0.6168 +- 0.0269
    MCC: 0.2341 +- 0.0539

  RF
    accuracy: 0.6211 +- 0.0304
    balanced_accuracy: 0.6245 +- 0.0296
    f1: 0.5971 +- 0.0272
    precision: 0.6718 +- 0.0348
    recall: 0.5379 +- 0.0276
    specificity: 0.7112 +- 0.0445
    ROC_AUC: 0.6245 +- 0.0296
    MCC: 0.2528 +- 0.0614

  LGBM
    accuracy: 0.6194 +- 0.0325
    balanced_accuracy: 0.6219 +- 0.0316
    f1: 0.6075 +- 0.0323
    precision: 0.6574 +- 0.0275
    recall: 0.5649 +- 0.0362
    specificity: 0.6789 +- 0.0290
    ROC_AUC: 0.6219 +- 0.0316
    MCC: 0.2449 +