Importy + config

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

from scipy.signal import find_peaks
from scipy.stats import skew, kurtosis

from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from tqdm import tqdm
from collections import defaultdict

In [2]:
DATA_ROOT = "/Volumes/blue_nateck/WESAD"
FS = 700  # sampling
PURITY_THRESHOLD = 0.9  # windows must have ≥ 90% purity

funkcje pomocnicze

In [3]:
def safe_1d(x):
    x = np.array(x)
    return x.reshape(-1)

ładowanie sygnałów

In [4]:
def load_subject(subject_id):
    path = os.path.join(DATA_ROOT, subject_id, f"{subject_id}.pkl")
    data = pickle.load(open(path, "rb"), encoding="latin1")

    acc = np.array(data["signal"]["chest"]["ACC"])  # (N,3)
    eda = safe_1d(data["signal"]["chest"]["EDA"])
    resp = safe_1d(data["signal"]["chest"]["Resp"])
    ecg = safe_1d(data["signal"]["chest"]["ECG"])
    labels = safe_1d(data["label"])

    min_len = min(len(acc), len(eda), len(resp), len(ecg), len(labels))

    return pd.DataFrame({
        "ACC_x": acc[:min_len, 0],
        "ACC_y": acc[:min_len, 1],
        "ACC_z": acc[:min_len, 2],
        "EDA": eda[:min_len],
        "RESP": resp[:min_len],
        "ECG": ecg[:min_len],
        "label": labels[:min_len],
        "subject": subject_id
    })

extract features

In [5]:
def compute_hr_hrv(ecg_signal):
    peaks, _ = find_peaks(ecg_signal, distance=int(0.2 * FS))
    if len(peaks) < 2:
        return np.nan, np.nan
    rr = np.diff(peaks) / FS
    hr = 60 / rr
    return np.mean(hr), np.std(rr)


# CACHE CECH: (subject, start, stop) → dict z cechami
feature_cache = {"df": {}, "feat": {}}


def extract_features_cached(subj, start, stop):
    key = (subj, start, stop)

    # jeśli już policzone → zwróć
    if key in feature_cache["feat"]:
        return feature_cache["feat"][key]

    df = feature_cache["df"][subj]
    w = df.iloc[start:stop]

    feats = {}
    cols = ["ACC_x", "ACC_y", "ACC_z", "EDA", "RESP", "ECG"]

    for col in cols:
        x = w[col].values
        feats[f"{col}_mean"] = np.mean(x)
        feats[f"{col}_std"] = np.std(x)
        feats[f"{col}_min"] = np.min(x)
        feats[f"{col}_max"] = np.max(x)
        feats[f"{col}_var"] = np.var(x)
        feats[f"{col}_skew"] = skew(x, nan_policy="omit")
        feats[f"{col}_kurtosis"] = kurtosis(x, nan_policy="omit")

    hr, hrv = compute_hr_hrv(w["ECG"].values)
    feats["HR_mean"] = hr
    feats["HRV_SDNN"] = hrv

    feature_cache["feat"][key] = feats
    return feats

windowing z purity

In [39]:
window_cache = defaultdict(dict)

In [40]:
def compute_purity(labels_window):
    labels_window = np.asarray(labels_window)
    vals, counts = np.unique(labels_window, return_counts=True)
    purity = counts.max() / counts.sum()
    dominant = vals[counts.argmax()]
    return purity, dominant

In [41]:
def precompute_windows(df_all, subjects, window_sec):
    if window_sec in window_cache:
        return window_cache[window_sec]

    win_size = int(window_sec * FS)
    stride = win_size // 2

    out = {}

    for subj in tqdm(subjects, desc=f"Windows {window_sec}s"):
        df = df_all[subj]
        labels = df["label"].values

        wins = []
        for start in range(0, len(df) - win_size, stride):
            stop = start + win_size
            purity, lab = compute_purity(labels[start:stop])

            if purity >= PURITY_THRESHOLD:
                lab_binary = 1 if lab == 2 else 0       # stress = 1, reszta = 0
                # mapowanie do 3 klas: baseline=0, amusement=1, stress=2
                mapping_3class = {0: 0, 3: 1, 2: 2}

                if lab not in mapping_3class:
                    continue  # pomiń okna z klasami 1,4,5,6,7

                lab_3class = mapping_3class[lab]

                wins.append((start, stop, lab, lab_binary, lab_3class))

        out[subj] = wins

    window_cache[window_sec] = out
    return out

wczytanie bazy

In [42]:
subjects = sorted([s for s in os.listdir(DATA_ROOT) if s.startswith("S")])
df_all = {s: load_subject(s) for s in subjects}

# inicjalizacja cache df
feature_cache["df"] = df_all.copy()

print("Loaded:", subjects)

Loaded: ['S10', 'S11', 'S13', 'S14', 'S15', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']


funkcja loso 

In [43]:
def loso_fast(df_all, subjects, window_sec):
    wins = precompute_windows(df_all, subjects, window_sec)

    f1_binary_scores = []
    f1_3class_scores = []

    for test_s in tqdm(subjects, desc=f"LOSO {window_sec}s"):
        train_s = [s for s in subjects if s != test_s]

        X_train_bin, y_train_bin = [], []
        X_train_3c, y_train_3c = [], []

        X_test_bin,  y_test_bin  = [], []
        X_test_3c,   y_test_3c   = [], []

        # TRAIN
        for s in train_s:
            for (start, stop, lab, lab_binary, lab_3class) in wins[s]:
                feats = extract_features_cached(s, start, stop)

                # binary classifier
                X_train_bin.append(feats)
                y_train_bin.append(lab_binary)

                # 3-class classifier
                X_train_3c.append(feats)
                y_train_3c.append(lab_3class)

        # TEST
        for (start, stop, lab, lab_binary, lab_3class) in wins[test_s]:
            feats = extract_features_cached(test_s, start, stop)

            X_test_bin.append(feats)
            y_test_bin.append(lab_binary)

            X_test_3c.append(feats)
            y_test_3c.append(lab_3class)

        if len(y_test_bin) == 0 or len(y_test_3c) == 0:
            continue

        # =======================
        # CLASSIFIER — BINARY
        # =======================
        clf_bin = RandomForestClassifier(
            n_estimators=150,
            class_weight="balanced",
            random_state=42
        )
        clf_bin.fit(pd.DataFrame(X_train_bin), y_train_bin)
        pred_bin = clf_bin.predict(pd.DataFrame(X_test_bin))
        f1_binary_scores.append(f1_score(y_test_bin, pred_bin, average="macro"))

        # =======================
        # CLASSIFIER — 3-CLASS
        # =======================
        clf_3c = RandomForestClassifier(
            n_estimators=150,
            class_weight="balanced",
            random_state=42
        )
        clf_3c.fit(pd.DataFrame(X_train_3c), y_train_3c)
        pred_3c = clf_3c.predict(pd.DataFrame(X_test_3c))
        f1_3class_scores.append(f1_score(y_test_3c, pred_3c, average="macro"))

    return {
        "f1_binary": np.mean(f1_binary_scores) if len(f1_binary_scores) else None,
        "f1_3class": np.mean(f1_3class_scores) if len(f1_3class_scores) else None
    }


test okien od 5 sekund do 60 sekund

In [44]:
window_candidates = list(range(29, 61, 2))

results = []

for w in window_candidates:
    print(f"\n=== Testing {w}s ===")
    res = loso_fast(df_all, subjects, w)

    results.append({
        "window_sec": w,
        "f1_binary": res["f1_binary"],
        "f1_3class": res["f1_3class"]
    })

# DataFrame z wynikami + sortowanie najpierw po binary, potem po 3-class
results_df = (
    pd.DataFrame(results)
    .sort_values(by=["f1_binary", "f1_3class"], ascending=False)
)

results_df



=== Testing 29s ===


Windows 29s: 100%|██████████| 15/15 [00:00<00:00, 59.63it/s]
LOSO 29s: 100%|██████████| 15/15 [00:37<00:00,  2.50s/it]



=== Testing 31s ===


Windows 31s: 100%|██████████| 15/15 [00:00<00:00, 79.53it/s]
LOSO 31s: 100%|██████████| 15/15 [00:34<00:00,  2.27s/it]



=== Testing 33s ===


Windows 33s: 100%|██████████| 15/15 [00:00<00:00, 68.63it/s]
LOSO 33s: 100%|██████████| 15/15 [00:31<00:00,  2.10s/it]



=== Testing 35s ===


Windows 35s: 100%|██████████| 15/15 [00:00<00:00, 73.00it/s]
LOSO 35s: 100%|██████████| 15/15 [00:29<00:00,  1.96s/it]



=== Testing 37s ===


Windows 37s: 100%|██████████| 15/15 [00:00<00:00, 77.91it/s]
LOSO 37s: 100%|██████████| 15/15 [00:27<00:00,  1.82s/it]



=== Testing 39s ===


Windows 39s: 100%|██████████| 15/15 [00:00<00:00, 79.57it/s]
LOSO 39s: 100%|██████████| 15/15 [00:25<00:00,  1.71s/it]



=== Testing 41s ===


Windows 41s: 100%|██████████| 15/15 [00:00<00:00, 79.86it/s]
LOSO 41s: 100%|██████████| 15/15 [00:23<00:00,  1.60s/it]



=== Testing 43s ===


Windows 43s: 100%|██████████| 15/15 [00:00<00:00, 86.01it/s]
LOSO 43s: 100%|██████████| 15/15 [00:22<00:00,  1.51s/it]



=== Testing 45s ===


Windows 45s: 100%|██████████| 15/15 [00:00<00:00, 81.09it/s]
LOSO 45s: 100%|██████████| 15/15 [00:22<00:00,  1.49s/it]



=== Testing 47s ===


Windows 47s: 100%|██████████| 15/15 [00:00<00:00, 68.69it/s]
LOSO 47s: 100%|██████████| 15/15 [00:24<00:00,  1.66s/it]



=== Testing 49s ===


Windows 49s: 100%|██████████| 15/15 [00:00<00:00, 59.24it/s]
LOSO 49s: 100%|██████████| 15/15 [00:28<00:00,  1.87s/it]



=== Testing 51s ===


Windows 51s: 100%|██████████| 15/15 [00:00<00:00, 54.53it/s]
LOSO 51s: 100%|██████████| 15/15 [00:30<00:00,  2.01s/it]



=== Testing 53s ===


Windows 53s: 100%|██████████| 15/15 [00:00<00:00, 49.64it/s]
LOSO 53s: 100%|██████████| 15/15 [00:28<00:00,  1.90s/it]



=== Testing 55s ===


Windows 55s: 100%|██████████| 15/15 [00:00<00:00, 45.56it/s]
LOSO 55s: 100%|██████████| 15/15 [00:26<00:00,  1.77s/it]



=== Testing 57s ===


Windows 57s: 100%|██████████| 15/15 [00:00<00:00, 59.97it/s]
LOSO 57s: 100%|██████████| 15/15 [00:24<00:00,  1.63s/it]



=== Testing 59s ===


Windows 59s: 100%|██████████| 15/15 [00:00<00:00, 51.71it/s]
LOSO 59s: 100%|██████████| 15/15 [00:21<00:00,  1.45s/it]


Unnamed: 0,window_sec,f1_binary,f1_3class
13,55,0.690578,0.479765
9,47,0.680773,0.486478
14,57,0.679265,0.501074
10,49,0.678253,0.473998
11,51,0.677562,0.472378
6,41,0.672846,0.463151
15,59,0.671614,0.500371
4,37,0.669711,0.467653
12,53,0.668503,0.482913
7,43,0.667425,0.468365


Funkcje pomocnicze HRV, RMS, energia jerk, signal utils

In [45]:
import numpy as np
from scipy.signal import find_peaks, detrend
from scipy.stats import skew, kurtosis

FS = 700  # sampling rate

RMS i energia

In [46]:
def rms(x):
    return np.sqrt(np.mean(x**2))

def signal_energy(x):
    return np.sum(x**2)

HR + HRV (SDNN + RMSSD)

In [47]:
def compute_hrv_features(ecg, fs=FS):
    peaks, _ = find_peaks(ecg, distance=int(0.2 * fs))

    if len(peaks) < 3:
        return np.nan, np.nan, np.nan

    rr = np.diff(peaks) / fs  # RR intervals (seconds)

    sdnn = np.std(rr)
    rmssd = np.sqrt(np.mean(np.diff(rr)**2))
    hr = 60 / np.mean(rr)

    return hr, sdnn, rmssd

EDA tonic + phasic (prosta szybka wersja)

In [48]:
def eda_tonic_phasic(eda_sig):
    tonic = np.mean(eda_sig)
    phasic = np.std(eda_sig - tonic)
    return tonic, phasic

ACC magnitude + jerk

In [49]:
def acc_magnitude(df):
    return np.sqrt(df["ACC_x"]**2 + df["ACC_y"]**2 + df["ACC_z"]**2)

def acc_jerk(acc_mag, fs=FS):
    jerk = np.diff(acc_mag) * fs
    return np.mean(np.abs(jerk)), rms(jerk)

RESP amplitude + cycles/min

In [50]:
def resp_features(resp, fs=FS):
    resp_d = detrend(resp)
    peaks, _ = find_peaks(resp_d, distance=int(0.8 * fs))

    if len(peaks) < 2:
        return np.nan, np.nan

    amplitude = np.mean(resp_d[peaks])
    cycles_per_min = (len(peaks) / (len(resp) / fs)) * 60

    return amplitude, cycles_per_min

Ekstrakcja pełych cech z jednego okna

In [51]:
def extract_full_features(df_win):

    feats = {}
    cols = ["ACC_x", "ACC_y", "ACC_z", "EDA", "RESP", "ECG"]

    # --- BASIC STATS ---
    for col in cols:
        x = df_win[col].values
        feats[f"{col}_mean"] = np.mean(x)
        feats[f"{col}_std"] = np.std(x)
        feats[f"{col}_var"] = np.var(x)
        feats[f"{col}_min"] = np.min(x)
        feats[f"{col}_max"] = np.max(x)

        # percentyles
        feats[f"{col}_p25"] = np.percentile(x, 25)
        feats[f"{col}_p50"] = np.percentile(x, 50)
        feats[f"{col}_p75"] = np.percentile(x, 75)

        # shape
        feats[f"{col}_skew"] = skew(x)
        feats[f"{col}_kurtosis"] = kurtosis(x)

        # RMS + energy
        feats[f"{col}_rms"] = rms(x)
        feats[f"{col}_energy"] = signal_energy(x)

    # --- HR / HRV ---
    hr, sdnn, rmssd = compute_hrv_features(df_win["ECG"].values)
    feats["HR"] = hr
    feats["HRV_SDNN"] = sdnn
    feats["HRV_RMSSD"] = rmssd

    # --- EDA tonic/phasic ---
    tonic, phasic = eda_tonic_phasic(df_win["EDA"].values)
    feats["EDA_tonic"] = tonic
    feats["EDA_phasic"] = phasic

    # --- ACC magnitude + jerk ---
    acc_mag = acc_magnitude(df_win)
    feats["ACC_mag_mean"] = np.mean(acc_mag)
    feats["ACC_mag_std"] = np.std(acc_mag)
    feats["ACC_mag_energy"] = signal_energy(acc_mag)

    jerk_mean, jerk_rms = acc_jerk(acc_mag)
    feats["ACC_jerk_mean"] = jerk_mean
    feats["ACC_jerk_rms"] = jerk_rms

    # --- RESP ---
    amp, rpm = resp_features(df_win["RESP"].values)
    feats["RESP_amplitude"] = amp
    feats["RESP_cycles_per_min"] = rpm

    return feats

Generowanie pełnego DataFrame cech dla okna 55 sekund

In [52]:
BEST_WINDOW = 55  # sekund

wins = precompute_windows(df_all, subjects, BEST_WINDOW)

all_feats = []
all_labels = []
all_subjs = []

win_size = int(BEST_WINDOW * FS)

In [53]:
for subj in subjects:
    df_s = df_all[subj]

    for (start, stop, lab, lab_binary, lab_3class) in wins[subj]:
        df_win = df_s.iloc[start:stop]
        feats = extract_full_features(df_win)

        feats["subject"] = subj
        feats["label_3class"] = lab
        feats["label_binary"] = lab_binary
        feats["start"] = start
        feats["stop"] = stop

        all_feats.append(feats)


In [54]:
features_df = pd.DataFrame(all_feats)
features_df

Unnamed: 0,ACC_x_mean,ACC_x_std,ACC_x_var,ACC_x_min,ACC_x_max,ACC_x_p25,ACC_x_p50,ACC_x_p75,ACC_x_skew,ACC_x_kurtosis,...,ACC_mag_energy,ACC_jerk_mean,ACC_jerk_rms,RESP_amplitude,RESP_cycles_per_min,subject,label_3class,label_binary,start,stop
0,0.882208,0.031776,0.001010,0.5602,1.2294,0.8728,0.8782,0.8858,1.911054,13.533833,...,32654.400707,1.338237,3.960427,0.295490,66.545455,S10,0,0,0,38500
1,0.885211,0.033685,0.001135,0.7192,1.1518,0.8714,0.8858,0.8954,1.198394,8.304768,...,32591.043118,1.258852,1.675577,0.440130,64.363636,S10,0,0,19250,57750
2,0.891634,0.011030,0.000122,0.7794,0.9654,0.8900,0.8932,0.8958,-3.004685,20.811858,...,33052.766400,1.005625,1.268256,0.457992,67.636364,S10,0,0,981750,1020250
3,0.892550,0.004292,0.000018,0.8702,0.9146,0.8898,0.8926,0.8952,0.091347,1.283043,...,33058.478215,0.982568,1.234491,0.427230,66.545455,S10,0,0,1001000,1039500
4,0.891248,0.004149,0.000017,0.8702,0.9138,0.8888,0.8910,0.8938,0.179997,1.242828,...,33067.474499,0.982842,1.233404,0.423868,67.636364,S10,0,0,1020250,1058750
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1729,0.912857,0.009140,0.000084,0.8262,1.0620,0.9090,0.9140,0.9174,-1.013655,17.674208,...,32761.984246,0.981592,1.375539,0.600626,65.454545,S9,0,0,3214750,3253250
1730,0.896148,0.034823,0.001213,0.7370,0.9978,0.8978,0.9092,0.9158,-1.666698,1.657749,...,32840.873359,1.020449,1.305440,0.831908,63.272727,S9,0,0,3234000,3272500
1731,0.861278,0.139359,0.019421,0.3932,1.2442,0.8942,0.9106,0.9166,-2.446184,4.671934,...,33127.825459,1.179100,1.680459,0.562957,66.545455,S9,0,0,3561250,3599750
1732,0.914380,0.008983,0.000081,0.8344,1.0074,0.9118,0.9150,0.9178,-0.143045,16.367909,...,32762.361249,0.973480,1.245791,0.515182,67.636364,S9,0,0,3580500,3619000


EDA

In [55]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

sns.set_theme(style="whitegrid")
plt.rcParams["figure.dpi"] = 130

# === Folder główny ===
EDA_SAVE_DIR = "/Volumes/blue_nateck/EDA_plots"
os.makedirs(EDA_SAVE_DIR, exist_ok=True)

# === Subfoldery ===
subfolders = [
    "hist",
    "box_binary", "box_3class",
    "violin_binary", "violin_3class",
    "kde_binary", "kde_3class",
    "corr",
    "pca",
    "rf_importance"
]

for sf in subfolders:
    os.makedirs(os.path.join(EDA_SAVE_DIR, sf), exist_ok=True)

print("Plots will be saved to:", EDA_SAVE_DIR)

# === DataFrame z cechami ===
df = features_df.copy()

num_cols = [c for c in df.columns 
            if c not in ["subject", "label_3class", "label_binary", "start", "stop"]]


Plots will be saved to: /Volumes/blue_nateck/EDA_plots


In [56]:
print("Liczba okien:", len(df))
print("Liczba cech:", len(num_cols))
print("\nKlasy (binary):\n", df["label_binary"].value_counts())
print("\nKlasy (3-class):\n", df["label_3class"].value_counts())
print("\nBraki danych:\n", df.isna().sum().sort_values(ascending=False))


Liczba okien: 1734
Liczba cech: 84

Klasy (binary):
 label_binary
0    1397
1     337
Name: count, dtype: int64

Klasy (3-class):
 label_3class
0    1218
2     337
3     179
Name: count, dtype: int64

Braki danych:
 ACC_x_mean      0
EDA_kurtosis    0
ECG_p25         0
ECG_max         0
ECG_min         0
               ..
ACC_z_max       0
ACC_z_min       0
ACC_z_var       0
ACC_z_std       0
stop            0
Length: 89, dtype: int64


Histogramy

In [57]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.histplot(df[col], kde=True, color="#1f77b4")
    plt.title(f"Histogram: {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, "hist", f"{col}.png"))
    plt.close()


Boxplot — binary (stress vs non-stress)

In [58]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df["label_binary"], y=df[col], palette="Set2")
    plt.title(f"Boxplot (Binary): {col}")
    plt.xlabel("0 = non-stress, 1 = stress")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, "box_binary", f"{col}.png"))
    plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_binary"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_binary"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_binary"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_binary"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in 

Boxplot — 3-class

In [59]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df["label_3class"], y=df[col], palette="Set3")
    plt.title(f"Boxplot (3-class): {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, "box_3class", f"{col}.png"))
    plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_3class"], y=df[col], palette="Set3")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_3class"], y=df[col], palette="Set3")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_3class"], y=df[col], palette="Set3")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label_3class"], y=df[col], palette="Set3")

Passing `palette` without assigning `hue` is deprecated and will be removed in 

Violin — binary

In [60]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.violinplot(x="label_binary", y=col, data=df, palette="Set2", cut=0)
    plt.title(f"Violin (Binary): {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, "violin_binary", f"{col}.png"))
    plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_binary", y=col, data=df, palette="Set2", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_binary", y=col, data=df, palette="Set2", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_binary", y=col, data=df, palette="Set2", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_binary", y=col, data=df, palette="Set2", cut=0)

Passing `palette` without assigning

Violin — 3-class

In [61]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.violinplot(x="label_3class", y=col, data=df, palette="Set3", cut=0)
    plt.title(f"Violin (3-class): {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, "violin_3class", f"{col}.png"))
    plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_3class", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_3class", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_3class", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label_3class", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning

KDE — binary

In [62]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    for lab in [0, 1]:
        sns.kdeplot(df[df["label_binary"] == lab][col], label=f"{lab}")
    plt.title(f"KDE (Binary): {col}")
    plt.legend(title="class")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, "kde_binary", f"{col}.png"))
    plt.close()


KDE — 3-class

In [63]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    for lab in sorted(df["label_3class"].unique()):
        sns.kdeplot(df[df["label_3class"] == lab][col], label=f"{lab}")
    plt.title(f"KDE (3-class): {col}")
    plt.legend(title="class")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, "kde_3class", f"{col}.png"))
    plt.close()


Korelacja cech
heatmap

In [64]:
corr = df[num_cols].corr()

plt.figure(figsize=(14, 10))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.savefig(os.path.join(EDA_SAVE_DIR, "corr", "heatmap.png"))
plt.close()


clustermap

In [65]:
sns.clustermap(corr, cmap="coolwarm", figsize=(14, 14))
plt.savefig(os.path.join(EDA_SAVE_DIR, "corr", "cluster.png"))
plt.close()


PCA — 2D

In [66]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[num_cols])

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=df["label_binary"], palette="Set1")
plt.title("PCA (Binary labels)")
plt.tight_layout()
plt.savefig(os.path.join(EDA_SAVE_DIR, "pca", "pca_binary.png"))
plt.close()

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=df["label_3class"], palette="Set2")
plt.title("PCA (3-class labels)")
plt.tight_layout()
plt.savefig(os.path.join(EDA_SAVE_DIR, "pca", "pca_3class.png"))
plt.close()


Feature importance — Random Forest

In [68]:
rf = RandomForestClassifier(
    n_estimators=300,
    class_weight="balanced",
    random_state=42
)

rf.fit(df[num_cols], df["label_binary"])

importances = pd.DataFrame({
    "feature": num_cols,
    "importance": rf.feature_importances_
}).sort_values("importance", ascending=False)

plt.figure(figsize=(8, 16))
sns.barplot(data=importances.head(30), y="feature", x="importance", palette="viridis")
plt.title("Top 30 Features (Binary)")
plt.tight_layout()
plt.savefig(os.path.join(EDA_SAVE_DIR, "rf_importance", "top30_binary.png"))
plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=importances.head(30), y="feature", x="importance", palette="viridis")


LOW VARIANCE REMOVAL

In [70]:
import numpy as np
import pandas as pd

# Kopia, żeby nie ruszać oryginału
df_lv = df.copy()

# Uwzględniamy tylko kolumny numeryczne
num_cols = [c for c in df_lv.columns 
            if c not in ["subject", "label_binary", "label_3class", "start", "stop"]]

# --- 1. Policzenie wariancji ---
variances = df_lv[num_cols].var()
median_var = variances.median()

# Kryteria:
crit_A = variances < 1e-5
crit_B = variances < (0.1 * median_var)

# Usuwamy jeśli spełnia A OR B
to_drop = variances.index[crit_A | crit_B].tolist()
to_keep = [c for c in num_cols if c not in to_drop]

print("======================")
print("LOW VARIANCE REMOVAL")
print("======================")
print(f"Liczba cech przed: {len(num_cols)}")
print(f"Liczba cech usuniętych: {len(to_drop)}")
print(f"Liczba cech po: {len(to_keep)}")
print(f"Procent usuniętych: {100 * len(to_drop) / len(num_cols):.1f}%")

print("\n--- Usunięte cechy ---")
for c in to_drop:
    print(c)

print("\n--- Zachowane cechy ---")
for c in to_keep:
    print(c)

# --- 2. Aktualizacja DataFrame ---
df_lv = df_lv[to_keep + ["label_binary", "label_3class", "subject", "start", "stop"]]


LOW VARIANCE REMOVAL
Liczba cech przed: 84
Liczba cech usuniętych: 24
Liczba cech po: 60
Procent usuniętych: 28.6%

--- Usunięte cechy ---
ACC_x_mean
ACC_x_std
ACC_x_var
ACC_x_p75
ACC_x_rms
ACC_y_mean
ACC_y_std
ACC_y_var
ACC_y_p50
ACC_y_p75
ACC_y_rms
ACC_z_std
ACC_z_var
ECG_mean
ECG_std
ECG_var
ECG_p25
ECG_p50
ECG_p75
ECG_rms
HRV_SDNN
HRV_RMSSD
ACC_mag_mean
ACC_mag_std

--- Zachowane cechy ---
ACC_x_min
ACC_x_max
ACC_x_p25
ACC_x_p50
ACC_x_skew
ACC_x_kurtosis
ACC_x_energy
ACC_y_min
ACC_y_max
ACC_y_p25
ACC_y_skew
ACC_y_kurtosis
ACC_y_energy
ACC_z_mean
ACC_z_min
ACC_z_max
ACC_z_p25
ACC_z_p50
ACC_z_p75
ACC_z_skew
ACC_z_kurtosis
ACC_z_rms
ACC_z_energy
EDA_mean
EDA_std
EDA_var
EDA_min
EDA_max
EDA_p25
EDA_p50
EDA_p75
EDA_skew
EDA_kurtosis
EDA_rms
EDA_energy
RESP_mean
RESP_std
RESP_var
RESP_min
RESP_max
RESP_p25
RESP_p50
RESP_p75
RESP_skew
RESP_kurtosis
RESP_rms
RESP_energy
ECG_min
ECG_max
ECG_skew
ECG_kurtosis
ECG_energy
HR
EDA_tonic
EDA_phasic
ACC_mag_energy
ACC_jerk_mean
ACC_jerk_rms
RESP_a

FEATURE CORRELATION

In [72]:
import numpy as np
import pandas as pd

# Kopia DataFrame po Low Variance Removal
df_corr = df_lv.copy()

num_cols = [c for c in df_corr.columns 
            if c not in ["subject", "label_binary", "label_3class", "start", "stop"]]

# --- 1. Korelacje ---
corr_matrix = df_corr[num_cols].corr().abs()  # wartości bezwzględne

# --- 2. Wariancje cech ---
variances = df_corr[num_cols].var()

# --- 3. Znajdź pary cech o |corr| > 0.9 ---
high_corr_pairs = []
threshold = 0.9

for i in range(len(num_cols)):
    for j in range(i + 1, len(num_cols)):
        c1 = num_cols[i]
        c2 = num_cols[j]
        corr_val = corr_matrix.iloc[i, j]

        if corr_val > threshold:
            high_corr_pairs.append((c1, c2, corr_val))

# --- 4. Wybór cech do usunięcia ---
to_drop_corr = set()

for c1, c2, co in high_corr_pairs:
    # zachowaj cechę o większej wariancji
    if variances[c1] < variances[c2]:
        to_drop_corr.add(c1)
    else:
        to_drop_corr.add(c2)

to_drop_corr = list(to_drop_corr)

to_keep_corr = [c for c in num_cols if c not in to_drop_corr]

# --- 5. Raport ---
print("=============================")
print("FEATURE CORRELATION REMOVAL")
print("=============================")
print(f"Liczba cech przed: {len(num_cols)}")
print(f"Usuniętych (corr > 0.9): {len(to_drop_corr)}")
print(f"Pozostało: {len(to_keep_corr)}")
print(f"Procent usuniętych: {100 * len(to_drop_corr) / len(num_cols):.1f}%")

print("\n--- Usunięte cechy (wysoka korelacja) ---")
for c in sorted(to_drop_corr):
    print(c)

print("\n--- Zachowane cechy ---")
for c in sorted(to_keep_corr):
    print(c)

# --- 6. Finalny DataFrame po usunięciu korelacji ---
df_corr = df_corr[to_keep_corr + ["label_binary", "label_3class", "subject", "start", "stop"]]


FEATURE CORRELATION REMOVAL
Liczba cech przed: 60
Usuniętych (corr > 0.9): 19
Pozostało: 41
Procent usuniętych: 31.7%

--- Usunięte cechy (wysoka korelacja) ---
ACC_x_energy
ACC_x_p25
ACC_x_p50
ACC_z_mean
ACC_z_p25
ACC_z_p50
ACC_z_rms
EDA_max
EDA_mean
EDA_min
EDA_p25
EDA_p50
EDA_p75
EDA_phasic
EDA_rms
EDA_tonic
RESP_rms
RESP_std
RESP_var

--- Zachowane cechy ---
ACC_jerk_mean
ACC_jerk_rms
ACC_mag_energy
ACC_x_kurtosis
ACC_x_max
ACC_x_min
ACC_x_skew
ACC_y_energy
ACC_y_kurtosis
ACC_y_max
ACC_y_min
ACC_y_p25
ACC_y_skew
ACC_z_energy
ACC_z_kurtosis
ACC_z_max
ACC_z_min
ACC_z_p75
ACC_z_skew
ECG_energy
ECG_kurtosis
ECG_max
ECG_min
ECG_skew
EDA_energy
EDA_kurtosis
EDA_skew
EDA_std
EDA_var
HR
RESP_amplitude
RESP_cycles_per_min
RESP_energy
RESP_kurtosis
RESP_max
RESP_mean
RESP_min
RESP_p25
RESP_p50
RESP_p75
RESP_skew


lista cech, które wchodzą do modelu

In [74]:
# === Zapis listy cech zachowanych ===
keep_path = os.path.join(EDA_SAVE_DIR, "selected_features_after_LV_and_corr.csv")
pd.DataFrame({"feature": sorted(to_keep_corr)}).to_csv(keep_path, index=False)

print("\nZapisano listę zachowanych cech do:")
print(keep_path)



Zapisano listę zachowanych cech do:
/Volumes/blue_nateck/EDA_plots/selected_features_after_LV_and_corr.csv


lista cech odrzuconych

In [75]:
# === Zapis listy cech usuniętych ===
drop_path = os.path.join(EDA_SAVE_DIR, "dropped_features_LV_and_corr.csv")
pd.DataFrame({"feature": sorted(list(to_drop_corr))}).to_csv(drop_path, index=False)

print("\nZapisano listę usuniętych cech do:")
print(drop_path)



Zapisano listę usuniętych cech do:
/Volumes/blue_nateck/EDA_plots/dropped_features_LV_and_corr.csv


finalny DataFrame gotowy do modelowania

In [76]:
clean_df_path = os.path.join(EDA_SAVE_DIR, "features_cleaned.csv")
df_corr.to_csv(clean_df_path, index=False)

print("\nZapisano pełny oczyszczony DataFrame do:")
print(clean_df_path)



Zapisano pełny oczyszczony DataFrame do:
/Volumes/blue_nateck/EDA_plots/features_cleaned.csv


In [77]:
!pip install scikit-posthocs

Collecting scikit-posthocs
  Obtaining dependency information for scikit-posthocs from https://files.pythonhosted.org/packages/d6/21/506f0ab734ad73f9215b09e04a05b393170c391349778f0c676a7b88cb7a/scikit_posthocs-0.11.4-py3-none-any.whl.metadata
  Downloading scikit_posthocs-0.11.4-py3-none-any.whl.metadata (5.8 kB)
Collecting statsmodels (from scikit-posthocs)
  Obtaining dependency information for statsmodels from https://files.pythonhosted.org/packages/05/30/affbabf3c27fb501ec7b5808230c619d4d1a4525c07301074eb4bda92fa9/statsmodels-0.14.6-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Downloading statsmodels-0.14.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.6 (from statsmodels->scikit-posthocs)
  Obtaining dependency information for patsy>=0.5.6 from https://files.pythonhosted.org/packages/f1/70/ba4b949bdc0490ab78d545459acd7702b211dfccf7eb89bbc1060f52818d/patsy-1.0.2-py2.py3-none-any.whl.metadata
  Downloading patsy-1.0.2-py2.py3-none-any.whl.metadata

STATISTICAL TESTS

In [79]:
import pandas as pd
import numpy as np
from scipy.stats import kruskal, mannwhitneyu
import scikit_posthocs as sp

df_stats = df_corr.copy()

num_cols = [c for c in df_stats.columns 
            if c not in ["subject", "label_binary", "label_3class", "start", "stop"]]

results_kruskal = []
results_mann = []
results_posthoc = []

In [80]:
# ============================
# 1. KRUSKAL–WALLIS (3-class)
# ============================

print("Running Kruskal–Wallis tests...")

for col in num_cols:
    classes = []
    for cls in sorted(df_stats["label_3class"].unique()):
        classes.append(df_stats[df_stats["label_3class"] == cls][col].dropna())

    stat, p = kruskal(*classes)

    results_kruskal.append({
        "feature": col,
        "H_statistic": stat,
        "p_value": p
    })


df_kruskal = pd.DataFrame(results_kruskal)
df_kruskal["significant"] = df_kruskal["p_value"] < 0.05

# zapis
kruskal_path = os.path.join(EDA_SAVE_DIR, "stats_3class_kruskal.csv")
df_kruskal.to_csv(kruskal_path, index=False)


Running Kruskal–Wallis tests...


In [81]:
# ============================
# 2. POST-HOC DUNN TEST
# tylko dla cech istotnych
# ============================

print("Running post-hoc Dunn tests...")

significant_features = df_kruskal[df_kruskal["significant"]]["feature"].tolist()

for col in significant_features:
    data = df_stats[[col, "label_3class"]].dropna()

    posthoc = sp.posthoc_dunn(
        data, 
        val_col=col,
        group_col="label_3class",
        p_adjust="bonferroni"
    )

    posthoc["feature"] = col
    results_posthoc.append(posthoc)


# zapis
if len(results_posthoc) > 0:
    posthoc_all = pd.concat(results_posthoc)
    posthoc_path = os.path.join(EDA_SAVE_DIR, "stats_3class_posthoc_dunn.csv")
    posthoc_all.to_csv(posthoc_path)
else:
    posthoc_path = None



Running post-hoc Dunn tests...


In [82]:
# ============================
# 3. MANN–WHITNEY (binary)
# ============================

print("Running Mann–Whitney U tests...")

for col in num_cols:
    group0 = df_stats[df_stats["label_binary"] == 0][col].dropna()
    group1 = df_stats[df_stats["label_binary"] == 1][col].dropna()

    stat, p = mannwhitneyu(group0, group1, alternative="two-sided")

    results_mann.append({
        "feature": col,
        "U_statistic": stat,
        "p_value": p,
        "significant": p < 0.05
    })

df_mann = pd.DataFrame(results_mann)

# zapis
mann_path = os.path.join(EDA_SAVE_DIR, "stats_binary_mannwhitney.csv")
df_mann.to_csv(mann_path, index=False)


Running Mann–Whitney U tests...


In [83]:
# ============================
# PODSUMOWANIE
# ============================

print("\n=== STATISTICAL TESTS COMPLETED ===")
print(f"Kruskal–Wallis results saved to: {kruskal_path}")
print(f"Mann–Whitney results saved to: {mann_path}")

if posthoc_path:
    print(f"Dunn post-hoc results saved to: {posthoc_path}")
else:
    print("No significant features -> no post-hoc tests performed.")


=== STATISTICAL TESTS COMPLETED ===
Kruskal–Wallis results saved to: /Volumes/blue_nateck/EDA_plots/stats_3class_kruskal.csv
Mann–Whitney results saved to: /Volumes/blue_nateck/EDA_plots/stats_binary_mannwhitney.csv
Dunn post-hoc results saved to: /Volumes/blue_nateck/EDA_plots/stats_3class_posthoc_dunn.csv


In [84]:
# ============================
# AUTOMATIC STATISTICAL FEATURE SELECTION
# ============================

# 1) cechy istotne w 3-class (Kruskal–Wallis)
signif_3class = df_kruskal[df_kruskal["p_value"] < 0.05]["feature"].tolist()

# 2) cechy istotne w binary (Mann–Whitney)
signif_binary = df_mann[df_mann["p_value"] < 0.05]["feature"].tolist()

# 3) UNION — najlepszy wybór
signif_union = sorted(list(set(signif_3class) | set(signif_binary)))

# 4) INTERSECTION (opcjonalnie — bardziej restrykcyjne)
signif_intersection = sorted(list(set(signif_3class) & set(signif_binary)))




In [85]:
# ============================
# Raport
# ============================

print("\n===== STATISTICAL FEATURE SELECTION =====")
print(f"Liczba cech po LV + corr: {len(df_corr.columns) - 5}")
print(f"Cechy istotne w 3-class (p<0.05): {len(signif_3class)}")
print(f"Cechy istotne w binary (p<0.05): {len(signif_binary)}")
print(f"Cechy istotne w UNION (zalecane): {len(signif_union)}")
print(f"Cechy istotne w INTERSECTION (opcjonalne): {len(signif_intersection)}")

print("\n--- UNION FEATURES ---")
for c in signif_union:
    print(c)




===== STATISTICAL FEATURE SELECTION =====
Liczba cech po LV + corr: 41
Cechy istotne w 3-class (p<0.05): 37
Cechy istotne w binary (p<0.05): 28
Cechy istotne w UNION (zalecane): 37
Cechy istotne w INTERSECTION (opcjonalne): 28

--- UNION FEATURES ---
ACC_jerk_mean
ACC_jerk_rms
ACC_mag_energy
ACC_x_max
ACC_x_min
ACC_x_skew
ACC_y_energy
ACC_y_kurtosis
ACC_y_max
ACC_y_min
ACC_y_p25
ACC_y_skew
ACC_z_energy
ACC_z_kurtosis
ACC_z_max
ACC_z_min
ACC_z_p75
ACC_z_skew
ECG_energy
ECG_kurtosis
ECG_max
ECG_min
ECG_skew
EDA_energy
EDA_kurtosis
EDA_std
EDA_var
RESP_amplitude
RESP_cycles_per_min
RESP_energy
RESP_kurtosis
RESP_max
RESP_min
RESP_p25
RESP_p50
RESP_p75
RESP_skew


In [86]:
# ============================
# 5. Utworzenie finalnego DataFrame z wybranymi cechami
# ============================

df_stat = df_corr[signif_union + ["label_binary", "label_3class", "subject", "start", "stop"]]

In [87]:
# ============================
# 6. Zapis do CSV
# ============================

# lista cech
feat_path = os.path.join(EDA_SAVE_DIR, "selected_features_statistical.csv")
pd.DataFrame({"feature": signif_union}).to_csv(feat_path, index=False)

# finalny dataset
df_path = os.path.join(EDA_SAVE_DIR, "features_final_after_statistical_selection.csv")
df_stat.to_csv(df_path, index=False)

print("\nZapisano listę cech do:", feat_path)
print("Zapisano finalny dataset do:", df_path)


Zapisano listę cech do: /Volumes/blue_nateck/EDA_plots/selected_features_statistical.csv
Zapisano finalny dataset do: /Volumes/blue_nateck/EDA_plots/features_final_after_statistical_selection.csv


80/10/10 patient wise split

In [88]:
import numpy as np
import pandas as pd

# Wczytaj finalny dataset, który stworzyłaś
df = pd.read_csv("/Volumes/blue_nateck/EDA_plots/features_final_after_statistical_selection.csv")

In [89]:
# ----------------------------
# 1. Patient-wise split
# ----------------------------

subjects = df["subject"].unique()
np.random.seed(42)
np.random.shuffle(subjects)

n = len(subjects)

train_subj = subjects[:int(0.8 * n)]
val_subj   = subjects[int(0.8 * n):int(0.9 * n)]
test_subj  = subjects[int(0.9 * n):]

print("Train subjects:", train_subj)
print("Validation subjects:", val_subj)
print("Test subjects:", test_subj)



Train subjects: ['S4' 'S6' 'S10' 'S8' 'S16' 'S3' 'S13' 'S11' 'S9' 'S15' 'S2' 'S5']
Validation subjects: ['S7']
Test subjects: ['S14' 'S17']


In [90]:
# ----------------------------
# 2. Stwórz zbiory
# ----------------------------

train_df = df[df["subject"].isin(train_subj)]
val_df   = df[df["subject"].isin(val_subj)]
test_df  = df[df["subject"].isin(test_subj)]

print("Train size:", len(train_df))
print("Validation size:", len(val_df))
print("Test size:", len(test_df))



Train size: 1411
Validation size: 97
Test size: 226


In [91]:
# ----------------------------
# 3. Zapisz do CSV
# ----------------------------

train_path = "/Volumes/blue_nateck/EDA_plots/train_patientwise.csv"
val_path   = "/Volumes/blue_nateck/EDA_plots/val_patientwise.csv"
test_path  = "/Volumes/blue_nateck/EDA_plots/test_patientwise.csv"

train_df.to_csv(train_path, index=False)
val_df.to_csv(val_path, index=False)
test_df.to_csv(test_path, index=False)

print("\nZapisano:")
print("Train  →", train_path)
print("Val    →", val_path)
print("Test   →", test_path)


Zapisano:
Train  → /Volumes/blue_nateck/EDA_plots/train_patientwise.csv
Val    → /Volumes/blue_nateck/EDA_plots/val_patientwise.csv
Test   → /Volumes/blue_nateck/EDA_plots/test_patientwise.csv



Zapisano pełny oczyszczony DataFrame do:
/Volumes/blue_nateck/EDA_plots/features_cleaned.csv
