# Imports

In [1]:
import os
import warnings
warnings.filterwarnings("ignore")

from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
import scipy as sp
import scipy.signal
import scipy.stats
from scipy.stats.mstats import gmean

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, LeaveOneOut, StratifiedKFold
from sklearn.svm import SVC

from utils import Dataset, variance_thresholding, standardize, mcc, calculate_metrics, calculate_metrics_statistics

In [2]:
# parameters for Welch's method for estimating power spectrum
# NOTE(review): power_spectral_density() and spectral_flatness() below pass
# nperseg=11 and noverlap=10 as literals, so only NFFT and WINDOW from this
# group are actually read by the Welch calls; NOVERLAP evaluates to 8 here.

NPERSEG = 11                    # length of segment
NOVERLAP = int(0.75 * NPERSEG)  # overlap of segments
NFFT = NPERSEG                  # length of FFT
WINDOW = "hann"                 # window function type

# parameters for saving data
# directory that feature tables and label files are written to / read from
PROCESSED_DATA_DIR = "processed_data_new"
# per-dataset filename prefixes; files are named "<prefix>_<part>.csv"
DEPRESJON_PREFIX = "manual_depresjon"
HYPERAKTIV_PREFIX = "manual_hyperaktiv"
PSYKOSE_PREFIX = "manual_psykose"

# Manual feature extraction

## Helper functions

In [3]:
def basic_data_cleaning(data: List[pd.DataFrame]) -> List[pd.DataFrame]:
    """
    Normalizes raw recordings that carry "timestamp", "date" and "activity"
    columns. The input DataFrames are left untouched.

    For every DataFrame:
    - "timestamp" is parsed as YYYY-MM-DD HH:MM:SS datetimes
    - the redundant "date" column is removed
    - "activity" is converted to float32

    :param data: list of DataFrames
    :returns: list of cleaned DataFrames, in the same order as the input
    """
    def _clean(frame: pd.DataFrame) -> pd.DataFrame:
        # work on a copy so the caller's DataFrame is never modified
        cleaned = frame.copy()
        cleaned["timestamp"] = pd.to_datetime(
            cleaned["timestamp"],
            format="%Y-%m-%d %H:%M:%S",
        )
        cleaned = cleaned.drop(columns="date")
        cleaned["activity"] = cleaned["activity"].astype(np.float32)
        return cleaned

    return [_clean(frame) for frame in data]


def fill_missing_activity(df: pd.DataFrame) -> pd.DataFrame:
    """
    Puts the recording on a gap-free one-minute grid between its first and
    last timestamp and replaces every missing "activity" value with the mean
    of the available activity values.

    :param df: DataFrame with "timestamp" and "activity" columns
    :returns: new DataFrame with "timestamp" restored as a regular column
    """
    # Resampling to minutes inserts a NaN row for every missing minute;
    # reset_index() turns the resulting index back into a "timestamp" column.
    per_minute = (
        df.copy()
        .resample("min", on="timestamp")
        .mean()
        .reset_index()
    )

    # impute the gaps with the mean activity
    mean_activity = per_minute["activity"].mean()
    per_minute["activity"] = per_minute["activity"].fillna(mean_activity)

    return per_minute


def resample(df: pd.DataFrame, freq: str = "H") -> pd.DataFrame:
    """
    Downsamples a time series to the given frequency, replacing every
    segment with the mean of its values.

    :param df: DataFrame with "timestamp" and "activity" columns
    :param freq: resampling frequency passed to Pandas resample() function
    :returns: new DataFrame with "timestamp" and "activity" columns
    """
    # bucket a copy of the data by timestamp so the input stays untouched
    buckets = df.copy().resample(freq, on="timestamp")

    # average each bucket and move the timestamps back into a column
    return buckets.mean().reset_index()


def proportion_of_zeros(x: np.ndarray) -> float:
    """
    Returns the fraction of entries of the array that are (approximately)
    zero: the count of such entries divided by the array length.

    :param x: 1D Numpy array
    :returns: proportion of zeros
    """
    # values may be floats, so compare with a tolerance instead of == 0
    is_zero = np.isclose(x, 0)
    return is_zero.sum() / len(x)


def power_spectral_density(df: pd.DataFrame) -> np.ndarray:
    """
    Estimates the power spectral density (PSD) of the "activity" column
    with Welch's method.

    :param df: DataFrame with "activity" column
    :returns: 1D Numpy array with power spectral density
    """
    signal = df["activity"].values

    # NOTE(review): fs, nperseg and noverlap are literals here and do not use
    # the module-level NPERSEG / NOVERLAP constants (NOVERLAP evaluates to 8,
    # not 10) - confirm which values are intended.
    # welch() returns (frequencies, spectrum); only the spectrum is needed.
    _, psd = scipy.signal.welch(
        x=signal,
        fs=(1/11),
        nperseg=11,
        noverlap=10,
        nfft=NFFT,
        window=WINDOW,
        scaling="density"
    )
    return psd


def spectral_flatness(df: pd.DataFrame) -> float:
    """
    Computes the spectral flatness of the "activity" signal: the geometric
    mean of its power spectrum divided by the arithmetic mean of that
    spectrum.

    Frequency bins whose power is close to zero are discarded before both
    means are taken, so that log(0) is never evaluated.

    :param df: DataFrame with "activity" column
    :returns: spectral flatness value
    """
    # NOTE(review): fs, nperseg and noverlap are literals here and do not use
    # the module-level NPERSEG / NOVERLAP constants - confirm the intent.
    # welch() returns (frequencies, spectrum); only the spectrum is needed.
    _, power_spectrum = scipy.signal.welch(
        df["activity"].values,
        fs=(1/11),
        nperseg=11,
        noverlap=10,
        nfft=NFFT,
        window=WINDOW,
        scaling="spectrum"
    )

    # keep only the bins that carry non-negligible power
    nonzero_bins = power_spectrum[~np.isclose(power_spectrum, 0)]

    geometric_mean = scipy.stats.gmean(nonzero_bins)
    arithmetic_mean = nonzero_bins.mean()
    return geometric_mean / arithmetic_mean

## Feature extraction

In [4]:
def extract_time_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes time-domain summary statistics of the activity signal.

    :param df: DataFrame with "activity" column
    :returns: DataFrame with a single row, one column per feature
    """
    signal = df["activity"].values

    # keyword order defines the column order of the resulting row
    row = dict(
        minimum=np.min(signal),
        maximum=np.max(signal),
        mean=np.mean(signal),
        median=np.median(signal),
        variance=np.var(signal, ddof=1),  # sample variance (Bessel's correction)
        kurtosis=sp.stats.kurtosis(signal),
        skewness=sp.stats.skew(signal),
        coeff_of_var=sp.stats.variation(signal),
        iqr=sp.stats.iqr(signal),
        trimmed_mean=sp.stats.trim_mean(signal, proportiontocut=0.1),
        entropy=sp.stats.entropy(signal, base=2),
        proportion_of_zeros=proportion_of_zeros(signal),
    )

    return pd.DataFrame([row])

In [5]:
def extract_frequency_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Computes frequency-domain features of the activity signal: summary
    statistics of its Power Spectral Density (PSD) plus its spectral
    flatness.

    :param df: DataFrame with "activity" column
    :returns: DataFrame with a single row, one column per feature
    """
    spectrum = power_spectral_density(df)

    # keyword order defines the column order of the resulting row
    row = dict(
        minimum=np.min(spectrum),
        maximum=np.max(spectrum),
        mean=np.mean(spectrum),
        median=np.median(spectrum),
        variance=np.var(spectrum),
        kurtosis=sp.stats.kurtosis(spectrum),
        skewness=sp.stats.skew(spectrum),
        coeff_of_var=sp.stats.variation(spectrum),
        iqr=sp.stats.iqr(spectrum),
        trimmed_mean=sp.stats.trim_mean(spectrum, proportiontocut=0.1),
        entropy=sp.stats.entropy(spectrum, base=2),
        # computed from the raw signal, not from the PSD array above
        spectral_flatness=spectral_flatness(df),
    )

    return pd.DataFrame([row])

In [14]:
def extract_features_for_dataframes(dfs: List[pd.DataFrame], freq: str = "H") \
        -> Dict[str, pd.DataFrame]:
    """
    Builds one feature row per input DataFrame, combining time-domain and
    frequency-domain features. Each recording is cleaned, put on a gap-free
    minute grid and resampled to the given frequency first.

    The result holds a single part:
    - nights: [21:00, 8:00)

    :param dfs: list of DataFrames to extract features from; each one has to
    have "timestamp" and "activity" columns
    :param freq: resampling frequency
    :returns: dictionary with the key "night", mapping to a DataFrame with one
    row of features per input DataFrame
    """
    def _feature_row(frame: pd.DataFrame) -> pd.DataFrame:
        # Both feature tables are single rows with index 0, so an index join
        # places them side by side; feature names present in both tables get
        # the "_time" / "_freq" suffix.
        return pd.merge(
            extract_time_features(frame),
            extract_frequency_features(frame),
            left_index=True,
            right_index=True,
            suffixes=["_time", "_freq"]
        )

    # preprocessing: clean -> fill gaps -> resample
    cleaned = basic_data_cleaning(dfs)
    filled = list(map(fill_missing_activity, cleaned))
    resampled = [resample(frame, freq=freq) for frame in filled]

    frames_by_part = {"night": resampled}

    datasets = {}
    for part, frames in frames_by_part.items():
        rows = [_feature_row(frame) for frame in frames]
        # stack the rows and renumber them 0..n-1
        datasets[part] = pd.concat(rows).reset_index(drop=True)

    return datasets



## Depresjon

In [15]:
# Load the Depresjon recordings from data_night/depresjon via the project's
# Dataset helper (imported from utils).
dataset = Dataset(dirpath=os.path.join("data_night", "depresjon"))
# Both attributes are later passed to extract_features_for_dataframes(), so
# they are presumably lists of per-recording DataFrames - TODO confirm in utils.
condition = dataset.condition
control = dataset.control

In [16]:
# preview the first condition recording
condition[0]

Unnamed: 0,timestamp,date,activity
0,2003-05-07 21:00:00,2003-05-07,212
1,2003-05-07 21:01:00,2003-05-07,212
2,2003-05-07 21:02:00,2003-05-07,17
3,2003-05-07 21:03:00,2003-05-07,38
4,2003-05-07 21:04:00,2003-05-07,82
...,...,...,...
655,2003-05-08 07:55:00,2003-05-08,9
656,2003-05-08 07:56:00,2003-05-08,23
657,2003-05-08 07:57:00,2003-05-08,0
658,2003-05-08 07:58:00,2003-05-08,0


In [17]:
# Extract features for the condition and control recordings separately,
# resampling each recording to hourly means (freq="H").
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

# part name -> feature table holding condition rows first, then control rows
datasets = {}

for part in ["night"]:
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]
    
    # stack both groups; ignore_index renumbers the rows from 0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    
    # store the result in the datasets dictionary
    datasets[part] = entire_df

In [18]:
# Write each part's feature table to PROCESSED_DATA_DIR as
# "<DEPRESJON_PREFIX>_<part>.csv", without the index column.
# NOTE(review): nothing in this notebook creates PROCESSED_DATA_DIR - confirm
# it exists before running.
for part, df in datasets.items():
    filename = f"{DEPRESJON_PREFIX}_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    df.to_csv(filepath, index=False)

In [19]:
# Labels in the same row order as the feature table: 1 for every condition
# recording followed by 0 for every control recording.
y = np.concatenate((np.ones(len(condition)), np.zeros(len(control))))
y = pd.Series(y, dtype=int)

# one label per line, with neither a header nor an index column
filepath = os.path.join(PROCESSED_DATA_DIR, f"depresjon_y.csv")
y.to_csv(filepath, header=False, index=False)

In [20]:
# Drop rows containing NaN from the in-memory table only: the CSV was already
# written above and `y` is not filtered here, so this affects just the preview
# below (the original row labels are kept, hence the gaps in the index).
datasets["night"].dropna(inplace=True)
datasets["night"]

Unnamed: 0,minimum_time,maximum_time,mean_time,median_time,variance_time,kurtosis_time,skewness_time,coeff_of_var_time,iqr_time,trimmed_mean_time,...,mean_freq,median_freq,variance_freq,kurtosis_freq,skewness_freq,coeff_of_var_freq,iqr_freq,trimmed_mean_freq,entropy_freq,spectral_flatness
0,0.966667,148.733337,25.707579,10.000000,1821.470337,4.635292,2.443026,1.582900,17.500001,14.787038,...,13916.573242,1.013862e+04,9.799489e+07,1.124900,1.744361,0.711327,1125.172607,13916.574219,2.302669,0.847654
1,2.083333,120.050003,33.522724,12.650000,1455.296509,0.244883,1.169350,1.085026,51.141668,27.401852,...,13156.655273,7.906803e+03,2.379827e+08,0.439530,1.376406,1.172539,11228.363770,13156.653320,1.720446,0.405417
2,0.583333,178.116669,46.118183,17.783333,3332.064453,0.344395,1.195673,1.193406,74.924999,36.511112,...,39708.550781,2.489199e+04,1.254300e+09,-0.393184,0.991210,0.891901,37194.058594,39708.550781,2.048502,0.632379
3,0.916667,85.483330,25.483332,10.866667,968.310364,-0.284877,1.166769,1.164273,28.716667,21.546295,...,8917.659180,6.521941e+03,5.759565e+07,-0.566328,0.864708,0.851028,9563.873108,8917.659180,2.086632,0.667920
4,2.666667,111.233330,24.503029,9.816667,1099.974243,2.282830,1.835684,1.290551,20.741667,17.292593,...,9094.897461,6.941023e+03,4.072176e+07,0.514084,1.383717,0.701642,3578.424805,9094.897461,2.283463,0.816446
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1061,0.000000,244.866669,49.728786,0.000000,7793.731445,0.412738,1.430338,1.692655,56.075001,33.572220,...,76659.085938,6.107158e+04,1.471531e+09,1.152510,1.761784,0.500404,5097.581055,76659.085938,2.438116,0.916808
1063,0.000000,620.766663,57.430302,0.000000,34919.089844,6.094527,2.844448,3.102373,0.000000,1.218518,...,75711.507812,1.406139e+04,7.993649e+09,-1.399616,0.742290,1.180893,130002.110107,75711.500000,1.617533,0.423004
1064,0.000000,0.950000,0.086364,0.000000,0.082045,6.099999,2.846050,3.162278,0.000000,0.000000,...,0.173160,3.032454e-02,4.116758e-02,-1.421096,0.735065,1.171736,0.296610,0.173160,1.633390,0.432769
1076,0.000000,0.233333,0.021212,0.000000,0.004949,6.099999,2.846050,3.162278,0.000000,0.000000,...,0.009074,9.772276e-17,1.921239e-04,-0.367347,1.122264,1.527525,0.013611,0.009074,0.918296,0.942809


## HYPERAKTIV

In [21]:
# Load the HYPERAKTIV recordings from data_night/hyperaktiv via the project's
# Dataset helper (imported from utils).
dataset = Dataset(dirpath=os.path.join("data_night", "hyperaktiv"))
# Both attributes are later passed to extract_features_for_dataframes(), so
# they are presumably lists of per-recording DataFrames - TODO confirm in utils.
condition = dataset.condition
control = dataset.control

In [22]:
# Extract features for the condition and control recordings separately,
# resampling each recording to hourly means (freq="H").
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

# part name -> feature table holding condition rows first, then control rows
datasets = {}

for part in ["night"]:
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]
    
    # stack both groups; ignore_index renumbers the rows from 0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    
    # store the result in the datasets dictionary
    datasets[part] = entire_df

In [23]:
# Write each part's feature table to PROCESSED_DATA_DIR as
# "<HYPERAKTIV_PREFIX>_<part>.csv", without the index column.
# NOTE(review): nothing in this notebook creates PROCESSED_DATA_DIR - confirm
# it exists before running.
for part, df in datasets.items():
    filename = f"{HYPERAKTIV_PREFIX}_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    df.to_csv(filepath, index=False)

In [24]:
# Labels in the same row order as the feature table: 1 for every condition
# recording followed by 0 for every control recording.
y = np.concatenate((np.ones(len(condition)), np.zeros(len(control))))
y = pd.Series(y, dtype=int)

# one label per line, with neither a header nor an index column
filepath = os.path.join(PROCESSED_DATA_DIR, f"hyperaktiv_y.csv")
y.to_csv(filepath, header=False, index=False)

In [25]:
# Drop rows containing NaN from the in-memory table only: the CSV was already
# written above and `y` is not filtered here, so this affects just the preview
# below.
datasets["night"].dropna(inplace=True)
datasets["night"]

Unnamed: 0,minimum_time,maximum_time,mean_time,median_time,variance_time,kurtosis_time,skewness_time,coeff_of_var_time,iqr_time,trimmed_mean_time,...,mean_freq,median_freq,variance_freq,kurtosis_freq,skewness_freq,coeff_of_var_freq,iqr_freq,trimmed_mean_freq,entropy_freq,spectral_flatness
0,6.016667,230.183334,91.416679,65.266670,7912.653320,-1.510052,0.450157,0.927768,176.150001,85.487038,...,116072.601562,109778.625000,2.681830e+09,-1.578465,0.134022,0.446155,90220.271484,116072.593750,2.436998,0.894241
1,0.000000,73.683334,24.281818,16.816668,588.533997,-0.423892,0.953657,0.952595,30.108334,21.490740,...,12793.206055,10470.508789,1.164978e+08,-1.695204,0.203371,0.843683,20261.454712,12793.206055,1.983479,0.438438
2,0.000000,131.483337,42.448486,27.299999,2579.278076,-0.948834,0.820242,1.140750,70.308334,37.272224,...,48747.886719,56377.863281,9.152884e+08,-1.230018,-0.380039,0.620616,42063.210938,48747.890625,2.208131,0.535190
3,0.000000,250.916672,57.416664,16.816668,5966.505371,1.531150,1.564291,1.282702,77.891667,42.296295,...,37182.328125,37570.835938,4.365628e+08,-1.241227,-0.135190,0.561936,30318.465332,37182.332031,2.315746,0.757899
4,4.283333,166.449997,42.993935,16.766666,2538.044434,1.227072,1.483795,1.117239,50.774999,33.577774,...,68386.632812,43126.476562,3.511677e+09,-1.304017,0.637380,0.866535,89065.209961,68386.632812,2.046365,0.625485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
567,6.650000,298.216675,90.040909,61.849998,8973.646484,0.482736,1.356871,1.003109,62.099999,76.175926,...,59718.839844,26482.294922,4.067366e+09,-1.308131,0.701559,1.067936,98704.467773,59718.839844,1.760489,0.440585
568,5.550000,513.766663,88.033333,41.150002,21160.406250,5.111922,2.553533,1.575500,74.858335,49.894444,...,103515.695312,25824.949219,1.592926e+10,-1.049457,0.854721,1.219247,159958.384033,103515.695312,1.541632,0.284065
569,5.216667,197.933334,65.021217,36.383335,4138.562500,-0.087868,1.107714,0.943351,59.875000,56.898151,...,52158.675781,23151.261719,3.426864e+09,0.262236,1.349471,1.122333,44557.569580,52158.675781,1.827806,0.558175
570,5.033333,426.116669,91.619698,28.716667,16389.708984,2.290914,1.819472,1.332295,97.125001,64.074074,...,111410.710938,11569.268555,2.198547e+10,-1.314444,0.771620,1.330886,208656.183105,111410.726562,1.297107,0.216173


## Psykose

In [26]:
# Load the Psykose recordings from data_night/psykose via the project's
# Dataset helper (imported from utils).
dataset = Dataset(dirpath=os.path.join("data_night", "psykose"))
# Both attributes are later passed to extract_features_for_dataframes(), so
# they are presumably lists of per-recording DataFrames - TODO confirm in utils.
condition = dataset.condition
control = dataset.control

In [27]:
# Extract features for the condition and control recordings separately,
# resampling each recording to hourly means (freq="H").
condition_parts_dfs = extract_features_for_dataframes(condition, freq="H")
control_parts_dfs = extract_features_for_dataframes(control, freq="H")

# part name -> feature table holding condition rows first, then control rows
datasets = {}

for part in ["night"]:
    condition_df = condition_parts_dfs[part]
    control_df = control_parts_dfs[part]
    
    # stack both groups; ignore_index renumbers the rows from 0
    entire_df = pd.concat([condition_df, control_df], ignore_index=True)
    
    # store the result in the datasets dictionary
    datasets[part] = entire_df

In [28]:
# Write each part's feature table to PROCESSED_DATA_DIR as
# "<PSYKOSE_PREFIX>_<part>.csv", without the index column.
# NOTE(review): nothing in this notebook creates PROCESSED_DATA_DIR - confirm
# it exists before running.
for part, df in datasets.items():
    filename = f"{PSYKOSE_PREFIX}_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    df.to_csv(filepath, index=False)

In [29]:
# Labels in the same row order as the feature table: 1 for every condition
# recording followed by 0 for every control recording.
y = np.concatenate((np.ones(len(condition)), np.zeros(len(control))))
y = pd.Series(y, dtype=int)

# one label per line, with neither a header nor an index column
filepath = os.path.join(PROCESSED_DATA_DIR, f"psykose_y.csv")
y.to_csv(filepath, header=False, index=False)

In [30]:
# Drop rows containing NaN from the in-memory table only: the CSV was already
# written above and `y` is not filtered here, so this affects just the preview
# below.
datasets["night"].dropna(inplace=True)
datasets["night"]

Unnamed: 0,minimum_time,maximum_time,mean_time,median_time,variance_time,kurtosis_time,skewness_time,coeff_of_var_time,iqr_time,trimmed_mean_time,...,mean_freq,median_freq,variance_freq,kurtosis_freq,skewness_freq,coeff_of_var_freq,iqr_freq,trimmed_mean_freq,entropy_freq,spectral_flatness
0,6.166667,160.016663,44.565151,30.316668,2629.267578,0.604993,1.425820,1.097048,35.958333,36.003704,...,13878.147461,1.240089e+04,5.287275e+07,0.144590,1.030644,0.523943,4096.194092,13878.147461,2.401149,0.876730
1,0.000000,297.399994,61.269691,12.966666,9292.070312,1.198874,1.569812,1.500079,75.141669,41.840740,...,75981.937500,8.027233e+04,1.023709e+09,-0.419269,-0.357850,0.421093,24955.370117,75981.937500,2.435629,0.870968
2,3.216667,305.216675,49.492420,10.200000,7874.186035,4.626294,2.438118,1.709495,44.483332,26.220371,...,24158.111328,2.403707e+04,1.292187e+08,-1.511788,-0.050919,0.470543,20195.407471,24158.115234,2.412895,0.869895
3,0.000000,82.000000,29.131819,17.466667,723.492310,-0.292331,0.925495,0.880345,22.491665,26.494446,...,13504.660156,1.379241e+04,9.577318e+07,-0.887764,0.263115,0.724667,11409.377197,13504.660156,2.122197,0.441411
4,0.000000,130.016663,47.121212,32.650002,1837.908447,-0.745079,0.753324,0.867459,60.408334,43.146294,...,18704.667969,1.590351e+04,2.954637e+08,0.331379,1.251495,0.918972,12192.484619,18704.666016,2.038111,0.633711
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1041,0.000000,244.866669,49.728786,0.000000,7793.731445,0.412738,1.430338,1.692655,56.075001,33.572220,...,76659.085938,6.107158e+04,1.471531e+09,1.152510,1.761784,0.500404,5097.581055,76659.085938,2.438116,0.916808
1043,0.000000,620.766663,57.430302,0.000000,34919.089844,6.094527,2.844448,3.102373,0.000000,1.218518,...,75711.507812,1.406139e+04,7.993649e+09,-1.399616,0.742290,1.180893,130002.110107,75711.500000,1.617533,0.423004
1044,0.000000,0.950000,0.086364,0.000000,0.082045,6.099999,2.846050,3.162278,0.000000,0.000000,...,0.173160,3.032454e-02,4.116758e-02,-1.421096,0.735065,1.171736,0.296610,0.173160,1.633390,0.432769
1056,0.000000,0.233333,0.021212,0.000000,0.004949,6.099999,2.846050,3.162278,0.000000,0.000000,...,0.009074,9.772276e-17,1.921239e-04,-0.367347,1.122264,1.527525,0.013611,0.009074,0.918296,0.942809


# Classification

## Classifiers, parameters, constants

In [31]:
# Base estimators tuned by GridSearchCV in the classification cells, keyed by
# the short names that are also used in `param_grids` and in the printed
# results. Every estimator that accepts a seed uses random_state=0 so that
# repeated runs of the notebook give the same scores.
classifiers = {
    "LR": LogisticRegression(
        penalty="elasticnet",  # the l1_ratio mix is searched in param_grids
        random_state=0,
        solver="saga",
        max_iter=500
    ),
    "SVM": SVC(
        kernel="rbf",
        cache_size=512
    ),
    "RF": RandomForestClassifier(
        n_estimators=200,
        criterion="entropy",
        random_state=0  # was unseeded, which made RF results non-reproducible
    ),
    "LGBM": LGBMClassifier(
        n_estimators=200,
        verbosity=-1,
        random_state=0
    ),
    "XGB": XGBClassifier(
        n_estimators=200,
        random_state=0
    )
}


# Hyperparameter grids searched by GridSearchCV, keyed by the same short names
# as `classifiers`; each inner key is passed to the estimator as a parameter.
param_grids = {
    "LR": {
        # 13 values of C x 21 values of l1_ratio = 273 candidates
        "C": [0.001, 0.01, 0.1, 0.5, 1, 2, 5, 10, 25, 50, 100, 500, 1000],
        "class_weight": ["balanced"],
        "l1_ratio": [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5,
                     0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95, 1]
    },
    "SVM": {
        "C": [1, 10],
        "gamma": ["scale"],
        "class_weight": [None, "balanced"]
    },
    "RF": {
        # only the class weighting strategy is tuned for the random forest
        "class_weight": [None, "balanced", "balanced_subsample"]
    },
    "LGBM": {
        "num_leaves": [31, 50],
        "min_child_samples": [10, 20],
        "class_weight": [None, "balanced"]
    },
    "XGB": {
        "learning_rate": [0.01, 0.1],
        "max_depth": [3, 6],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
        "scale_pos_weight": [1, 10]
    }
}

## Classification Depresjon

In [32]:
# Select which saved feature files and label file the following cells load.
# Note that `dataset` is rebound here from a Dataset object to a filename
# prefix string.
dataset = DEPRESJON_PREFIX
y_filename = "depresjon_y.csv"

In [33]:
# Load the saved feature tables and labels for the selected dataset and drop
# every sample that has a NaN feature value.
# Requires PROCESSED_DATA_DIR, `dataset` and `y_filename` to be defined.
datasets = {}

# Load datasets: one 2D array of feature values per part
for part in ["night"]:
    filename = f"{dataset}_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    datasets[part] = pd.read_csv(filepath, header=0).values

# Load y values: one integer label per row of the feature tables
y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, y_filename), header=None, dtype=int)
y = y.values.ravel()

# Mark the rows that contain NaN in any column of any part. The mask is built
# over all parts first and applied exactly once, so that every part and `y`
# stay aligned row by row; filtering `y` inside the per-part loop would filter
# it repeatedly as soon as more than one part is loaded.
nan_indices = np.zeros(len(y), dtype=bool)
for part in datasets:
    nan_indices |= np.isnan(datasets[part]).any(axis=1)

# Remove the marked rows from every part and from y
for part in datasets:
    datasets[part] = datasets[part][~nan_indices]
y = y[~nan_indices]

In [34]:
# Evaluate every classifier on the Depresjon features with 5-fold stratified
# cross-validation; hyperparameters are tuned inside each training fold.
print("Wyniki dla depresjon")
for part in ["night"]:
    print(f"PART: {part}")
    
    X = datasets[part]

    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        # fixed seed, so every classifier is evaluated on the same splits
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        # one metrics result per outer fold
        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx, test_idx)
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            # Preprocessing helpers from utils; presumably fitted on the
            # training fold and applied to both folds - TODO confirm in utils.
            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            # Inner search over param_grids[clf_type], selected by accuracy;
            # no cv argument is given, so scikit-learn's default splitting is
            # used. refit=True retrains the best candidate on the whole
            # training fold.
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)
            
            # score the tuned model on the held-out fold
            clf = grid_search.best_estimator_
            y_pred  = clf.predict(X_test)
            metrics = calculate_metrics(y_test, y_pred)
            #print(metrics)
            test_scores.append(metrics)
        
        # aggregate over folds: metric name -> (mean, standard deviation)
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla depresjon
PART: night
  LR
    accuracy: 0.5337 +- 0.0281
    balanced_accuracy: 0.5276 +- 0.0259
    f1: 0.4568 +- 0.0304
    precision: 0.4229 +- 0.0287
    recall: 0.4987 +- 0.0435
    specificity: 0.5565 +- 0.0490
    ROC_AUC: 0.5276 +- 0.0259
    MCC: 0.0542 +- 0.0510

  SVM
    accuracy: 0.6166 +- 0.0139
    balanced_accuracy: 0.5264 +- 0.0192
    f1: 0.1642 +- 0.0872
    precision: 0.5956 +- 0.1483
    recall: 0.1013 +- 0.0646
    specificity: 0.9515 +- 0.0380
    ROC_AUC: 0.5264 +- 0.0192
    MCC: 0.1021 +- 0.0656

  RF
    accuracy: 0.5336 +- 0.0333
    balanced_accuracy: 0.4976 +- 0.0371
    f1: 0.3546 +- 0.0571
    precision: 0.3886 +- 0.0488
    recall: 0.3280 +- 0.0667
    specificity: 0.6673 +- 0.0355
    ROC_AUC: 0.4976 +- 0.0371
    MCC: -0.0055 +- 0.0756

  LGBM
    accuracy: 0.5000 +- 0.0300
    balanced_accuracy: 0.4770 +- 0.0346
    f1: 0.3650 +- 0.0560
    precision: 0.3637 +- 0.0450
    recall: 0.3680 +- 0.0693
    specificity: 0.5859 +- 0.0349
    ROC_

## Classification HYPERAKTIV

In [35]:
# Select which saved feature files and label file the following cells load.
# Note that `dataset` is rebound here to a filename prefix string.
dataset = HYPERAKTIV_PREFIX
y_filename = "hyperaktiv_y.csv"

In [36]:
# Load the saved feature tables and labels for the selected dataset and drop
# every sample that has a NaN feature value.
# Requires PROCESSED_DATA_DIR, `dataset` and `y_filename` to be defined.
datasets = {}

# Load datasets: one 2D array of feature values per part
for part in ["night"]:
    filename = f"{dataset}_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    datasets[part] = pd.read_csv(filepath, header=0).values

# Load y values: one integer label per row of the feature tables
y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, y_filename), header=None, dtype=int)
y = y.values.ravel()

# Mark the rows that contain NaN in any column of any part. The mask is built
# over all parts first and applied exactly once, so that every part and `y`
# stay aligned row by row; filtering `y` inside the per-part loop would filter
# it repeatedly as soon as more than one part is loaded.
nan_indices = np.zeros(len(y), dtype=bool)
for part in datasets:
    nan_indices |= np.isnan(datasets[part]).any(axis=1)

# Remove the marked rows from every part and from y
for part in datasets:
    datasets[part] = datasets[part][~nan_indices]
y = y[~nan_indices]

In [37]:
# Evaluate every classifier on the HYPERAKTIV features with 5-fold stratified
# cross-validation; hyperparameters are tuned inside each training fold.
# The header names the dataset actually evaluated here (it used to print
# "depresjon", a leftover from the cell this one was copied from).
print("Wyniki dla hyperaktiv")
for part in ["night"]:
    print(f"PART: {part}")
    
    X = datasets[part]

    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        # fixed seed, so every classifier is evaluated on the same splits
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        # one metrics result per outer fold
        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            # Preprocessing helpers from utils; presumably fitted on the
            # training fold and applied to both folds - TODO confirm in utils.
            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.03)
            X_train, X_test = standardize(X_train, X_test)
            
            # Inner search over param_grids[clf_type], selected by accuracy;
            # no cv argument is given, so scikit-learn's default splitting is
            # used. refit=True retrains the best candidate on the whole
            # training fold.
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
            )
            grid_search.fit(X_train, y_train)
            
            # score the tuned model on the held-out fold
            clf = grid_search.best_estimator_
            y_pred  = clf.predict(X_test)
            metrics = calculate_metrics(y_test, y_pred)
            test_scores.append(metrics)
        
        # aggregate over folds: metric name -> (mean, standard deviation)
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla depresjon
PART: night
  LR
    accuracy: 0.5088 +- 0.0055
    balanced_accuracy: 0.5073 +- 0.0070
    f1: 0.5362 +- 0.0704
    precision: 0.5168 +- 0.0081
    recall: 0.5897 +- 0.2073
    specificity: 0.4250 +- 0.2159
    ROC_AUC: 0.5073 +- 0.0070
    MCC: 0.0147 +- 0.0141

  SVM
    accuracy: 0.5193 +- 0.0301
    balanced_accuracy: 0.5173 +- 0.0296
    f1: 0.5695 +- 0.0457
    precision: 0.5219 +- 0.0226
    recall: 0.6310 +- 0.0849
    specificity: 0.4036 +- 0.0625
    ROC_AUC: 0.5173 +- 0.0296
    MCC: 0.0368 +- 0.0623

  RF
    accuracy: 0.5193 +- 0.0465
    balanced_accuracy: 0.5189 +- 0.0465
    f1: 0.5335 +- 0.0492
    precision: 0.5265 +- 0.0454
    recall: 0.5414 +- 0.0573
    specificity: 0.4964 +- 0.0497
    ROC_AUC: 0.5189 +- 0.0465
    MCC: 0.0380 +- 0.0931

  LGBM
    accuracy: 0.5298 +- 0.0181
    balanced_accuracy: 0.5296 +- 0.0181
    f1: 0.5403 +- 0.0250
    precision: 0.5378 +- 0.0174
    recall: 0.5448 +- 0.0444
    specificity: 0.5143 +- 0.0484
    ROC_A

## Classification Psykose

In [38]:
# Select which saved feature files and label file the following cells load.
# Note that `dataset` is rebound here to a filename prefix string.
dataset = PSYKOSE_PREFIX
y_filename = "psykose_y.csv"

In [39]:
# Load the saved feature tables and labels for the selected dataset and drop
# every sample that has a NaN feature value.
# Requires PROCESSED_DATA_DIR, `dataset` and `y_filename` to be defined.
datasets = {}

# Load datasets: one 2D array of feature values per part
for part in ["night"]:
    filename = f"{dataset}_{part}.csv"
    filepath = os.path.join(PROCESSED_DATA_DIR, filename)
    datasets[part] = pd.read_csv(filepath, header=0).values

# Load y values: one integer label per row of the feature tables
y = pd.read_csv(os.path.join(PROCESSED_DATA_DIR, y_filename), header=None, dtype=int)
y = y.values.ravel()

# Mark the rows that contain NaN in any column of any part. The mask is built
# over all parts first and applied exactly once, so that every part and `y`
# stay aligned row by row; filtering `y` inside the per-part loop would filter
# it repeatedly as soon as more than one part is loaded.
nan_indices = np.zeros(len(y), dtype=bool)
for part in datasets:
    nan_indices |= np.isnan(datasets[part]).any(axis=1)

# Remove the marked rows from every part and from y
for part in datasets:
    datasets[part] = datasets[part][~nan_indices]
y = y[~nan_indices]

In [40]:
# Evaluate every classifier on the Psykose features with 5-fold stratified
# cross-validation; hyperparameters are tuned inside each training fold.
print("Wyniki dla psykose")
for part in ["night"]:
    print(f"PART: {part}")
    
    X = datasets[part]

    for clf_type in ["LR", "SVM", "RF", "LGBM", "XGB"]:
        print(f"  {clf_type}")
        # fixed seed, so every classifier is evaluated on the same splits
        folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
        
        # one metrics result per outer fold
        test_scores = []
        for train_idx, test_idx in folds.split(X, y):
            #print(train_idx, test_idx)
            X_train, X_test = X[train_idx], X[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]
            
            # Preprocessing helpers from utils; presumably fitted on the
            # training fold and applied to both folds - TODO confirm in utils.
            X_train, X_test = variance_thresholding(X_train, X_test, threshold=0.05)
            X_train, X_test = standardize(X_train, X_test)
            
            # Inner search over param_grids[clf_type], selected by accuracy;
            # no cv argument is given, so scikit-learn's default splitting is
            # used. refit=True retrains the best candidate on the whole
            # training fold.
            grid_search = GridSearchCV(
                estimator=classifiers[clf_type], 
                param_grid=param_grids[clf_type], 
                scoring="accuracy",
                n_jobs=-1,
                refit=True,
                #cv=LeaveOneOut()
            )
            grid_search.fit(X_train, y_train)
            
            # score the tuned model on the held-out fold
            clf = grid_search.best_estimator_
            y_pred  = clf.predict(X_test)
            metrics = calculate_metrics(y_test, y_pred)
            #print(metrics)
            test_scores.append(metrics)
        
        # aggregate over folds: metric name -> (mean, standard deviation)
        final_scores = calculate_metrics_statistics(test_scores)
        
        for metric, (mean, stddev) in final_scores.items():
            print(f"    {metric}: {mean:.4f} +- {stddev:.4f}")
        
        print()

Wyniki dla psykose
PART: night
  LR
    accuracy: 0.6279 +- 0.0388
    balanced_accuracy: 0.6375 +- 0.0366
    f1: 0.5710 +- 0.0358
    precision: 0.4960 +- 0.0372
    recall: 0.6738 +- 0.0361
    specificity: 0.6013 +- 0.0485
    ROC_AUC: 0.6375 +- 0.0366
    MCC: 0.2654 +- 0.0703

  SVM
    accuracy: 0.7134 +- 0.0459
    balanced_accuracy: 0.6407 +- 0.0555
    f1: 0.4805 +- 0.0986
    precision: 0.7046 +- 0.0885
    recall: 0.3682 +- 0.0966
    specificity: 0.9133 +- 0.0266
    ROC_AUC: 0.6407 +- 0.0555
    MCC: 0.3430 +- 0.1159

  RF
    accuracy: 0.6597 +- 0.0176
    balanced_accuracy: 0.6102 +- 0.0191
    f1: 0.4774 +- 0.0311
    precision: 0.5468 +- 0.0305
    recall: 0.4251 +- 0.0397
    specificity: 0.7954 +- 0.0254
    ROC_AUC: 0.6102 +- 0.0191
    MCC: 0.2356 +- 0.0397

  LGBM
    accuracy: 0.6410 +- 0.0195
    balanced_accuracy: 0.6057 +- 0.0179
    f1: 0.4916 +- 0.0206
    precision: 0.5122 +- 0.0272
    recall: 0.4730 +- 0.0198
    specificity: 0.7383 +- 0.0264
    ROC_AUC