# Utils

In [22]:
from copy import deepcopy
import os
from typing import Dict, List

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, LeaveOneOut, StratifiedKFold
from scipy.stats import mode
from sklearn.svm import SVC
import tsfresh
from tsfresh.feature_extraction.settings import ComprehensiveFCParameters, EfficientFCParameters, MinimalFCParameters
from tsfresh.feature_selection.relevance import calculate_relevance_table
from tsfresh.transformers import RelevantFeatureAugmenter, FeatureAugmenter, FeatureSelector

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from utils import Dataset, variance_thresholding, standardize, mcc, calculate_metrics, calculate_metrics_statistics

In [23]:
# parameters for saving data
PROCESSED_DATA_DIR = "processed_data24h"

|# Automated feature extraction

## Utilities and preprocessing

In [24]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Assumes DataFrames with "timestamp", "date" and "activity" columns.
    
    Performs cleaning operations:
    - assure format YYYY-MM-DD HH:MM:SS for "timestamp"
    - drop redundant "date" column
    - assure float32 format for "activity"
    
    :param data: list of DataFrames
    :returns: list of cleaned DataFrames
    """
    data = [df.copy() for df in data]  # create copy to avoid side effects
    
    for df in data:
        df["timestamp"] = pd.to_datetime(df["timestamp"],
                                         format="%Y-%m-%d %H:%M:%S")
        df.drop("date", axis=1, inplace=True)
        df["activity"] = df["activity"].astype(np.float32)
    
    return data


def get_day_part(df: pd.DataFrame, part: str) -> pd.DataFrame:
    """
    For given DataFrame with "timestamp" column returns only those rows that
    correspond to the chosen part of day.
    
    Parts are "day" and "night", defined as:
    - "day": [8:00, 21:00)
    - "night": [21:00, 8:00)
    
    :param df: DataFrame to select rows from
    :param part: part of day, either "day" or "night"
    :returns: DataFrame, subset of rows of df
    """
    if part == "day":
        df = df.loc[(df["timestamp"].dt.hour >= 8) &
                    (df["timestamp"].dt.hour < 21)]
    elif part == "night":
        df = df.loc[(df["timestamp"].dt.hour >= 21) |
                    (df["timestamp"].dt.hour < 8)]
    else:
        raise ValueError(f'Part should be "day" or "night", got "{part}"')
        
    return df


def fill_missing_activity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Makes sure that "timestamp" column has minute resolution with no missing
    values from start to end and replaces all NaNs in "activity" column with
    mean average value.
    
    :param data: DataFrame with "timestamp" and "activity" columns
    :returns: cleaned DataFrame
    """
    df = df.copy()  # create copy to avoid side effects
    
    # resample to the basic frequency, i.e. minute; this will create NaNs for
    # any rows that may be missing
    df = df.resample("min", on="timestamp").mean()
    
    # recreate index and "timestamp" column
    df = df.reset_index()
    
    # fill any NaNs with mean activity value
    df["activity"] = df["activity"].fillna(df["activity"].mean())

    return df


def resample(df: pd.DataFrame, freq: str = "H") -> pd.DataFrame:
    """
    Resamples time series DataFrame with given frequency, aggregating each
    segment with a mean.

    :param df: DataFrame with "timestamp" and "activity" columns
    :param freq: resampling frequency passed to Pandas resample() function
    :returns: DataFrame with "timestamp" and "activity" columns
    """
    df = df.copy()  # create copy to avoid side effects
    
    # make sure that data has minute resolution with no missing parts from
    # start to end, with no missing values
    df = fill_missing_activity(df)
    
    # group with given frequency
    df = df.resample(freq, on="timestamp").mean()

    # recreate "timestamp" column
    df = df.reset_index()

    return df


def get_clean_dataframes(dfs: List[pd.DataFrame], freq: str = "H") \
        -> Dict[str, List[pd.DataFrame]]:
    """
    Cleans DataFrames, filling missing values and resampling with given
    frequency.
    
    Returns three lists of DataFrames:
    - full 24hs
    - days: [8:00, 21:00)
    - nights: [21:00, 8:00)
    
    :param dfs: list of DataFrames to clean; each one has to have "timestamp"
    and "activity" columns
    :param freq: resampling frequency
    :returns: dictionary with keys "night", "day" and "night", corresponding
    to data from given parts of day
    """
    full_dfs = basic_data_cleaning(dfs)
    #print(full_dfs)
    full_dfs = [fill_missing_activity(df) for df in full_dfs]
    full_dfs = [resample(df, freq=freq) for df in full_dfs]
    
    night_dfs = [get_day_part(df, part="night") for df in full_dfs]
#    day_dfs = [get_day_part(df, part="day") for df in full_dfs]

    datasets = {
#        "night": full_dfs,
        "night": night_dfs,
#        "day": day_dfs
    }

    return datasets


def get_tsfresh_flat_format_df(dfs: List[pd.DataFrame]) -> pd.DataFrame:
    """
    Creates DataFrame in a "flat" format for tsfresh from list of DataFrames.
    Each one is assumed to have "timestamp" and "activity" columns.
    
    :param dfs: list of DataFrames; each one has to have "timestamp" and
    "activity" columns
    :returns: DataFrame in tsfresh "flat" format
    """
    dfs = deepcopy(dfs)  # create copy to avoid side effects
    
    flat_df = pd.DataFrame(columns=["id", "timestamp", "activity"])

    for idx, df in enumerate(dfs):
        df["id"] = idx
        flat_df = pd.concat([flat_df, df], ignore_index=True)

    flat_df = flat_df.reset_index(drop=True)
        
    return flat_df

## Parameters and constants

In [25]:
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",
        random_state=0,
        solver="saga",
        max_iter=50
    ),
    "SVM": SVC(
        kernel="rbf",
        cache_size=512
    ),
    "RF": RandomForestClassifier(
        n_estimators=50,
        criterion="entropy"
    ),
    "LGBM": LGBMClassifier(
        n_estimators=50,
        verbosity=-1,
        random_state=0
    ),
    "XGB": XGBClassifier(
        n_estimators=50,
        random_state=0 
    )
}


param_grids = {
    "LR": {
        "C": [0.5, 1, 2, 5, 10, 25],
        "class_weight": ["balanced"],
        "l1_ratio": [0.3, 0.4, 0.45,
                     0.55, 0.6,]
    },
    "SVM": {
        "C": [1, 10],
        "gamma": ["scale"],
        "class_weight": [None, "balanced"]
    },
    "RF": {
        "class_weight": [None, "balanced", "balanced_subsample"]
    },
    "LGBM": {
        "num_leaves": [31],
        "min_child_samples": [20],
        "class_weight": ["balanced"]
    },
    "XGB": {
        "learning_rate": [0.01, 0.1],
        "max_depth": [3, 6],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        "scale_pos_weight": [1, 10]
    }
}

## tsfresh

### Utilities

In [26]:
def extract_tsfresh_features(dfs: List[pd.DataFrame], settings: Dict) \
        -> pd.DataFrame:
    """
    Performs feature extraction (only extraction, not selection) using tsfresh.
    
    :param dfs: list of DataFrames with time series, each with "timestamp" and
    "activity" columns
    :param settings: tsfresh settings, one of: ComprehensiveFCParameters,
    EfficientFCParameters, MinimalFCParameters
    :returns: DataFrame with extracted features, with one row per original
    DataFrame with time series (in the same order)
    """
    ts = get_tsfresh_flat_format_df(dfs)
    ids = ts["id"].unique()
    X = pd.DataFrame(index=ids)
    
    augmenter = FeatureAugmenter(
        default_fc_parameters=settings,
        column_id="id",
        column_sort="timestamp",
        column_value="activity",
        chunksize=1,
        n_jobs=4
    )
    
    augmenter.set_timeseries_container(ts)
    X = augmenter.transform(X)
    
    return X


class IncreasingFDRFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Selects features using tsfresh feature selector and increasing FDR, if no
    features are selected at default FDR=0.05.
    """
    def __init__(self, verbose: bool = False):
        self.selector: FeatureSelector = None
        self.verbose: bool = verbose

    def fit(self, X, y):
        final_alpha = None
        for alpha in [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                      0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1.0]:
            self.selector = FeatureSelector(
                fdr_level=alpha,
                n_jobs=4,
                chunksize=1
            )
            self.selector.fit(X, y)
            if len(self.selector.relevant_features) > 0:
                if self.verbose:
                    print("FDR:", final_alpha)
                return selector

        raise ValueError("Failed to select any features")
    
    def transform(self, X):
        return self.selector.transform(X)


class TsfreshTopNFeatureSelector(BaseEstimator, TransformerMixin):
    """
    Selects top N features using tsfresh feature selector.
    """
    def __init__(self, n: int = 10):
        self.n: int = n
        self.features: List[int] = None
    
    def fit(self, X, y):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        
        if not isinstance(y, pd.Series):
            y = pd.Series(y)
        
        relevance_table = calculate_relevance_table(X, y)
        relevance_table.sort_values("p_value", inplace=True)
        features = relevance_table.head(self.n)["feature"]
        self.features = list(features.values)
    
    def transform(self, X, y=None):
        return X[:, self.features]

### Feature extraction

In [27]:
dataset_str = "depresjon"  # "depresjon" or "psykose"

In [28]:
dataset = Dataset(dirpath=os.path.join("data_night", dataset_str))
condition = dataset.condition
#print(condition)
control = dataset.control
print(condition)

[               timestamp        date  activity
0    2003-05-07 21:00:00  2003-05-07       212
1    2003-05-07 21:01:00  2003-05-07       212
2    2003-05-07 21:02:00  2003-05-07        17
3    2003-05-07 21:03:00  2003-05-07        38
4    2003-05-07 21:04:00  2003-05-07        82
..                   ...         ...       ...
655  2003-05-08 07:55:00  2003-05-08         9
656  2003-05-08 07:56:00  2003-05-08        23
657  2003-05-08 07:57:00  2003-05-08         0
658  2003-05-08 07:58:00  2003-05-08         0
659  2003-05-08 07:59:00  2003-05-08        20

[660 rows x 3 columns],                timestamp        date  activity
0    2003-05-08 21:00:00  2003-05-08         0
1    2003-05-08 21:01:00  2003-05-08       166
2    2003-05-08 21:02:00  2003-05-08         0
3    2003-05-08 21:03:00  2003-05-08       242
4    2003-05-08 21:04:00  2003-05-08        18
..                   ...         ...       ...
655  2003-05-09 07:55:00  2003-05-09         0
656  2003-05-09 07:56:00  2003-05-

In [29]:
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

datasets = {}

for part in ["night"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]
    
    dfs_list = condition_dfs_list + control_dfs_list
    #print(len(condition_dfs_list))
    #print(dfs_list)
    #print(len(dfs_list))
    #entire_df = pd.concat([condition_dfs_list[0], control_dfs_list[0]], ignore_index=False)
    #print(entire_df)
    #print(condition_dfs_list)
#     print(entire_df)
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [30]:
settings_dict = {"minimal": MinimalFCParameters(),}
                 #"efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 1083/1083 [00:01<00:00, 621.01it/s]


# Feature extracgion - Psykose

In [31]:
dataset_str = "psykose"  # "depresjon" or "psykose"

In [32]:
dataset = Dataset(dirpath=os.path.join("data_night", dataset_str))
condition = dataset.condition
control = dataset.control

In [33]:
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

datasets = {}

for part in ["night"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]
    
    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [34]:
settings_dict = {"minimal": MinimalFCParameters(),}
                 #"efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 1063/1063 [00:02<00:00, 525.48it/s]


In [35]:
# Feature extraction - Hyperaktiv

In [36]:
dataset_str = "hyperaktiv"  # "depresjon" or "psykose"

In [37]:
dataset = Dataset(dirpath=os.path.join("data_night", dataset_str))
condition = dataset.condition
control = dataset.control

In [38]:
condition_parts_dfs = get_clean_dataframes(condition, freq="H")
control_parts_dfs = get_clean_dataframes(control, freq="H")

datasets = {}

for part in ["night"]:
    condition_dfs_list = condition_parts_dfs[part]
    control_dfs_list = control_parts_dfs[part]
    
    dfs_list = condition_dfs_list + control_dfs_list
    datasets[part] = dfs_list

y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
y = y.values.ravel()

In [39]:
settings_dict = {"minimal": MinimalFCParameters(),}
                 #"efficient": EfficientFCParameters()}

for part, dfs in datasets.items():
    for settings_name, settings in settings_dict.items():
        X = extract_tsfresh_features(dfs, settings)
        filename = f"automatic_{dataset_str}_{settings_name}_{part}.csv"
        filepath = os.path.join(PROCESSED_DATA_DIR, filename)
        X.to_csv(filepath, index=False)

Feature Extraction: 100%|██████████| 572/572 [00:01<00:00, 389.42it/s]


### Minimal settings

In [40]:
dataset_str = "depresjon"  # "depresjon" or "psykose"

In [41]:
import os

def sort_key(filename):
    parts = filename.split("_")
    # Wydobycie numerów jako liczb całkowitych
    condition_number = int(parts[1])  # Druga część np. "1" z "condition_1"
    segment_number = int(parts[3].split(".")[0])  # Trzecia część np. "1" z "segmen1.csv"
    return (condition_number, segment_number)

In [50]:
print("Wyniki dla Depresjon:")
for part in ["night"]:
    print(f"PART: {part}")
    
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)#.values
    
    folder_control = './data_night/depresjon/control'
    folder_condition = './data_night/depresjon/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])
        
    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])
        
    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)
    
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    mask = X.notna().all(axis=1)
    X = X[mask]
    mask = mask[:1063]
    #y = y[:106]
    y = y[mask]
    print(z.shape)
    z = z[:1063]
    X = X[:1063]
    #prin
    z_depresjon = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        test_scores = []
        group_results = {}  
        
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx)
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 
            y_train, y_test = y[train_idx], y[test_idx]
            
            #X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            for i in range(len(test_idx)):
                
                condition_id = z_depresjon.iloc[test_idx[i]] 
                class_label = y[test_idx[i]] 
                unique_id = (condition_id, class_label)  

                if unique_id not in group_results:
                    group_results[unique_id] = []  
                group_results[unique_id].append(y_pred[i])  
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote 
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            #print(len(y_true_grouped))
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)
            
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla Depresjon:
PART: night
(1083,)
  LR




    accuracy: 0.6857 +- 0.0267
    balanced_accuracy: 0.7096 +- 0.0152
    f1: 0.6875 +- 0.0138
    precision: 0.5861 +- 0.0359
    recall: 0.8435 +- 0.0806
    specificity: 0.5758 +- 0.0958
    ROC_AUC: 0.7096 +- 0.0152
    MCC: 0.4272 +- 0.0223

  SVM
    accuracy: 0.8214 +- 0.0196
    balanced_accuracy: 0.7918 +- 0.0215
    f1: 0.7419 +- 0.0313
    precision: 0.9115 +- 0.0306
    recall: 0.6261 +- 0.0348
    specificity: 0.9576 +- 0.0148
    ROC_AUC: 0.7918 +- 0.0215
    MCC: 0.6381 +- 0.0427

  RF
    accuracy: 0.7821 +- 0.0381
    balanced_accuracy: 0.7480 +- 0.0397
    f1: 0.6770 +- 0.0573
    precision: 0.8654 +- 0.0712
    recall: 0.5565 +- 0.0507
    specificity: 0.9394 +- 0.0332
    ROC_AUC: 0.7480 +- 0.0397
    MCC: 0.5535 +- 0.0871

  LGBM
    accuracy: 0.8214 +- 0.0375
    balanced_accuracy: 0.8024 +- 0.0395
    f1: 0.7615 +- 0.0516
    precision: 0.8426 +- 0.0537
    recall: 0.6957 +- 0.0550
    specificity: 0.9091 +- 0.0332
    ROC_AUC: 0.8024 +- 0.0395
    MCC: 0.6287 +

# Classification - Psykose

In [51]:
dataset_str = "psykose"  # "depresjon" or "psykose"

In [58]:

print("Wyniki dla Psykose:")
for part in ["night"]:
    print(f"PART: {part}")
    
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)#.values
    
    folder_control = './data_night/psykose/control'
    folder_condition = './data_night/psykose/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])
        
    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])
        
    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)
    
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_psykose = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        test_scores = []
        group_results = {}  
        
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx)
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 
            y_train, y_test = y[train_idx], y[test_idx]
            
            #X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            for i in range(len(test_idx)):
                
                condition_id = z_psykose.iloc[test_idx[i]] 
                class_label = y[test_idx[i]] 
                unique_id = (condition_id, class_label)  

                if unique_id not in group_results:
                    group_results[unique_id] = []  
                group_results[unique_id].append(y_pred[i])  
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote 
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            #print(len(y_true_grouped))
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)
            
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla Psykose:
PART: night
(1063,)
  LR




    accuracy: 0.7370 +- 0.0296
    balanced_accuracy: 0.7653 +- 0.0226
    f1: 0.7407 +- 0.0179
    precision: 0.6218 +- 0.0311
    recall: 0.9182 +- 0.0182
    specificity: 0.6125 +- 0.0612
    ROC_AUC: 0.7653 +- 0.0226
    MCC: 0.5346 +- 0.0343

  SVM
    accuracy: 0.8852 +- 0.0139
    balanced_accuracy: 0.8705 +- 0.0145
    f1: 0.8487 +- 0.0184
    precision: 0.9163 +- 0.0243
    recall: 0.7909 +- 0.0223
    specificity: 0.9500 +- 0.0153
    ROC_AUC: 0.8705 +- 0.0145
    MCC: 0.7626 +- 0.0296

  RF
    accuracy: 0.9259 +- 0.0262
    balanced_accuracy: 0.9134 +- 0.0303
    f1: 0.9021 +- 0.0364
    precision: 0.9688 +- 0.0256
    recall: 0.8455 +- 0.0545
    specificity: 0.9812 +- 0.0153
    ROC_AUC: 0.9134 +- 0.0303
    MCC: 0.8490 +- 0.0537

  LGBM
    accuracy: 0.9333 +- 0.0277
    balanced_accuracy: 0.9224 +- 0.0310
    f1: 0.9129 +- 0.0380
    precision: 0.9689 +- 0.0255
    recall: 0.8636 +- 0.0498
    specificity: 0.9812 +- 0.0153
    ROC_AUC: 0.9224 +- 0.0310
    MCC: 0.8633 +

In [61]:
dataset_str = "hyperaktiv"  # "depresjon" or "psykose"

In [63]:

print("Wyniki dla hyperaktiv:")
for part in ["night"]:
    print(f"PART: {part}")
    
    filename = f"automatic_{dataset_str}_minimal_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    X = pd.read_csv(filepath, header=0).fillna(0)#.values
    
    folder_control = './data_night/hyperaktiv/control'
    folder_condition = './data_night/hyperaktiv/condition'
    z_control = []
    z_condition = []
    for filename in sorted(os.listdir(folder_control), key=sort_key):
        z_control.append(filename.split("_")[0] + filename.split("_")[1])
        
    for filename in sorted(os.listdir(folder_condition), key=sort_key):
        z_condition.append(filename.split("_")[0] + filename.split("_")[1])
        
    z_control = pd.Series(z_control)
    z_condition = pd.Series(z_condition)
    
    z = pd.concat([z_condition, z_control])
    z.index = X.index
    y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, f"{dataset_str}_y.csv"), header=None, dtype=int)
    y = y.values.ravel()
    mask = X.notna().all(axis=1)
    X = X[mask]
    y = y[mask]
    z_hyperaktiv = z[mask]



    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        test_scores = []
        group_results = {}  
        
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx)
            X_train, X_test = X.iloc[train_idx], X.iloc[test_idx] 
            y_train, y_test = y[train_idx], y[test_idx]
            
            #X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="f1_weighted",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)

            clf = grid_search.best_estimator_
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)
            for i in range(len(test_idx)):
                
                condition_id = z_hyperaktiv.iloc[test_idx[i]] 
                class_label = y[test_idx[i]] 
                unique_id = (condition_id, class_label)  

                if unique_id not in group_results:
                    group_results[unique_id] = []  
                group_results[unique_id].append(y_pred[i])  
            final_predictions = {}
            for unique_id, preds in group_results.items():
                condition_id, class_label = unique_id
                majority_vote = mode(preds, keepdims=False).mode
                final_predictions[unique_id] = majority_vote 
            y_true_grouped = []
            y_pred_grouped =  []
            for k,v in final_predictions.items():
                y_true_grouped.append(k[1])
                y_pred_grouped.append(v)
            #print(len(y_true_grouped))
            metrics = calculate_metrics(y_true_grouped, y_pred_grouped)
            test_scores.append(metrics)
            
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla hyperaktiv:
PART: night
(572,)
  LR




    accuracy: 0.5485 +- 0.0109
    balanced_accuracy: 0.5551 +- 0.0127
    f1: 0.5164 +- 0.0142
    precision: 0.6048 +- 0.0186
    recall: 0.4519 +- 0.0277
    specificity: 0.6582 +- 0.0508
    ROC_AUC: 0.5551 +- 0.0127
    MCC: 0.1129 +- 0.0277

  SVM
    accuracy: 0.5366 +- 0.0326
    balanced_accuracy: 0.5430 +- 0.0326
    f1: 0.5018 +- 0.0475
    precision: 0.5885 +- 0.0412
    recall: 0.4393 +- 0.0574
    specificity: 0.6467 +- 0.0586
    ROC_AUC: 0.5430 +- 0.0326
    MCC: 0.0879 +- 0.0667

  RF
    accuracy: 0.5592 +- 0.0353
    balanced_accuracy: 0.5619 +- 0.0337
    f1: 0.5586 +- 0.0400
    precision: 0.5997 +- 0.0269
    recall: 0.5238 +- 0.0512
    specificity: 0.6000 +- 0.0307
    ROC_AUC: 0.5619 +- 0.0337
    MCC: 0.1239 +- 0.0673

  LGBM
    accuracy: 0.5563 +- 0.0568
    balanced_accuracy: 0.5596 +- 0.0556
    f1: 0.5521 +- 0.0683
    precision: 0.5960 +- 0.0583
    recall: 0.5155 +- 0.0784
    specificity: 0.6037 +- 0.0385
    ROC_AUC: 0.5596 +- 0.0556
    MCC: 0.1191 +