Importy + config

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

from scipy.signal import find_peaks
from scipy.stats import skew, kurtosis

from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier

from tqdm import tqdm
from collections import defaultdict


In [2]:
DATA_ROOT = "/Volumes/blue_nateck/WESAD"
FS = 700  # sampling
PURITY_THRESHOLD = 0.9  # windows must have ≥ 90% purity


funkcje pomocnicze

In [3]:
def safe_1d(x):
    x = np.array(x)
    return x.reshape(-1)


ładowanie sygnałów

In [4]:
def load_subject(subject_id):
    path = os.path.join(DATA_ROOT, subject_id, f"{subject_id}.pkl")
    data = pickle.load(open(path, "rb"), encoding="latin1")

    acc = np.array(data["signal"]["chest"]["ACC"])  # (N,3)
    eda = safe_1d(data["signal"]["chest"]["EDA"])
    resp = safe_1d(data["signal"]["chest"]["Resp"])
    ecg = safe_1d(data["signal"]["chest"]["ECG"])
    labels = safe_1d(data["label"])

    min_len = min(len(acc), len(eda), len(resp), len(ecg), len(labels))

    return pd.DataFrame({
        "ACC_x": acc[:min_len, 0],
        "ACC_y": acc[:min_len, 1],
        "ACC_z": acc[:min_len, 2],
        "EDA": eda[:min_len],
        "RESP": resp[:min_len],
        "ECG": ecg[:min_len],
        "label": labels[:min_len],
        "subject": subject_id
    })


extract features

In [5]:
def compute_hr_hrv(ecg_signal):
    peaks, _ = find_peaks(ecg_signal, distance=int(0.2 * FS))
    if len(peaks) < 2:
        return np.nan, np.nan
    rr = np.diff(peaks) / FS
    hr = 60 / rr
    return np.mean(hr), np.std(rr)


# CACHE CECH: (subject, start, stop) → dict z cechami
feature_cache = {"df": {}, "feat": {}}


def extract_features_cached(subj, start, stop):
    key = (subj, start, stop)

    # jeśli już policzone → zwróć
    if key in feature_cache["feat"]:
        return feature_cache["feat"][key]

    df = feature_cache["df"][subj]
    w = df.iloc[start:stop]

    feats = {}
    cols = ["ACC_x", "ACC_y", "ACC_z", "EDA", "RESP", "ECG"]

    for col in cols:
        x = w[col].values
        feats[f"{col}_mean"] = np.mean(x)
        feats[f"{col}_std"] = np.std(x)
        feats[f"{col}_min"] = np.min(x)
        feats[f"{col}_max"] = np.max(x)
        feats[f"{col}_var"] = np.var(x)
        feats[f"{col}_skew"] = skew(x, nan_policy="omit")
        feats[f"{col}_kurtosis"] = kurtosis(x, nan_policy="omit")

    hr, hrv = compute_hr_hrv(w["ECG"].values)
    feats["HR_mean"] = hr
    feats["HRV_SDNN"] = hrv

    feature_cache["feat"][key] = feats
    return feats




windowing z purity

In [6]:
window_cache = defaultdict(dict)


In [7]:
def compute_purity(labels_window):
    labels_window = np.asarray(labels_window)
    vals, counts = np.unique(labels_window, return_counts=True)
    purity = counts.max() / counts.sum()
    dominant = vals[counts.argmax()]
    return purity, dominant


In [8]:
def precompute_windows(df_all, subjects, window_sec):
    if window_sec in window_cache:
        return window_cache[window_sec]

    win_size = int(window_sec * FS)
    stride = win_size // 2

    out = {}

    for subj in tqdm(subjects, desc=f"Windows {window_sec}s"):
        df = df_all[subj]
        labels = df["label"].values

        wins = []
        for start in range(0, len(df) - win_size, stride):
            stop = start + win_size
            purity, lab = compute_purity(labels[start:stop])
            if purity >= PURITY_THRESHOLD:
                wins.append((start, stop, lab))
        out[subj] = wins

    window_cache[window_sec] = out
    return out


wczytanie bazy

In [9]:
subjects = sorted([s for s in os.listdir(DATA_ROOT) if s.startswith("S")])
df_all = {s: load_subject(s) for s in subjects}

# inicjalizacja cache df
feature_cache["df"] = df_all.copy()

print("Loaded:", subjects)


Loaded: ['S10', 'S11', 'S13', 'S14', 'S15', 'S16', 'S17', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9']


funkcja loso 

In [10]:
def loso_fast(df_all, subjects, window_sec):
    wins = precompute_windows(df_all, subjects, window_sec)
    f1_scores = []

    for test_s in tqdm(subjects, desc=f"LOSO {window_sec}s"):
        train_s = [s for s in subjects if s != test_s]

        X_train, y_train = [], []
        X_test, y_test = [], []

        # TRAIN
        for s in train_s:
            for (start, stop, lab) in wins[s]:
                feats = extract_features_cached(s, start, stop)
                X_train.append(feats)
                y_train.append(lab)

        # TEST
        for (start, stop, lab) in wins[test_s]:
            feats = extract_features_cached(test_s, start, stop)
            X_test.append(feats)
            y_test.append(lab)

        if len(y_test) == 0:
            continue

        clf = RandomForestClassifier(
            n_estimators=150,
            class_weight="balanced",
            random_state=42
        )

        clf.fit(pd.DataFrame(X_train), y_train)
        pred = clf.predict(pd.DataFrame(X_test))
        f1 = f1_score(y_test, pred, average="macro")
        f1_scores.append(f1)

    if len(f1_scores) == 0:
        return None

    return np.mean(f1_scores)


test okien od 5 sekund do 60 sekund

In [11]:
window_candidates = list(range(5, 61, 2))

results = []
for w in window_candidates:
    print(f"\n=== Testing {w}s ===")
    f1 = loso_fast(df_all, subjects, w)
    results.append({"window_sec": w, "f1_macro": f1})

results_df = pd.DataFrame(results).sort_values("f1_macro", ascending=False)
results_df



=== Testing 5s ===


Windows 5s: 100%|██████████| 15/15 [00:00<00:00, 26.87it/s]
LOSO 5s: 100%|██████████| 15/15 [06:13<00:00, 24.87s/it]



=== Testing 7s ===


Windows 7s: 100%|██████████| 15/15 [00:00<00:00, 44.19it/s]
LOSO 7s: 100%|██████████| 15/15 [04:20<00:00, 17.37s/it]



=== Testing 9s ===


Windows 9s: 100%|██████████| 15/15 [00:00<00:00, 43.96it/s]
LOSO 9s: 100%|██████████| 15/15 [14:55<00:00, 59.70s/it] 



=== Testing 11s ===


Windows 11s: 100%|██████████| 15/15 [00:00<00:00, 52.81it/s]
LOSO 11s: 100%|██████████| 15/15 [02:38<00:00, 10.57s/it]



=== Testing 13s ===


Windows 13s: 100%|██████████| 15/15 [00:00<00:00, 51.19it/s]
LOSO 13s: 100%|██████████| 15/15 [02:12<00:00,  8.83s/it]



=== Testing 15s ===


Windows 15s: 100%|██████████| 15/15 [00:00<00:00, 74.73it/s]
LOSO 15s: 100%|██████████| 15/15 [01:52<00:00,  7.50s/it]



=== Testing 17s ===


Windows 17s: 100%|██████████| 15/15 [00:00<00:00, 66.00it/s]
LOSO 17s: 100%|██████████| 15/15 [01:36<00:00,  6.42s/it]



=== Testing 19s ===


Windows 19s: 100%|██████████| 15/15 [00:00<00:00, 62.97it/s]
LOSO 19s: 100%|██████████| 15/15 [01:25<00:00,  5.71s/it]



=== Testing 21s ===


Windows 21s: 100%|██████████| 15/15 [00:00<00:00, 32.89it/s]
LOSO 21s: 100%|██████████| 15/15 [01:16<00:00,  5.09s/it]



=== Testing 23s ===


Windows 23s: 100%|██████████| 15/15 [00:00<00:00, 62.66it/s]
LOSO 23s: 100%|██████████| 15/15 [01:09<00:00,  4.63s/it]



=== Testing 25s ===


Windows 25s: 100%|██████████| 15/15 [00:00<00:00, 67.76it/s]
LOSO 25s: 100%|██████████| 15/15 [01:04<00:00,  4.27s/it]



=== Testing 27s ===


Windows 27s: 100%|██████████| 15/15 [00:00<00:00, 63.16it/s]
LOSO 27s: 100%|██████████| 15/15 [00:58<00:00,  3.93s/it]



=== Testing 29s ===


Windows 29s: 100%|██████████| 15/15 [00:00<00:00, 67.19it/s]
LOSO 29s: 100%|██████████| 15/15 [00:55<00:00,  3.69s/it]



=== Testing 31s ===


Windows 31s: 100%|██████████| 15/15 [00:00<00:00, 99.09it/s] 
LOSO 31s: 100%|██████████| 15/15 [00:50<00:00,  3.40s/it]



=== Testing 33s ===


Windows 33s: 100%|██████████| 15/15 [00:00<00:00, 69.24it/s]
LOSO 33s: 100%|██████████| 15/15 [00:49<00:00,  3.33s/it]



=== Testing 35s ===


Windows 35s: 100%|██████████| 15/15 [00:00<00:00, 66.77it/s]
LOSO 35s: 100%|██████████| 15/15 [00:47<00:00,  3.17s/it]



=== Testing 37s ===


Windows 37s: 100%|██████████| 15/15 [00:00<00:00, 73.62it/s]
LOSO 37s: 100%|██████████| 15/15 [00:44<00:00,  2.99s/it]



=== Testing 39s ===


Windows 39s: 100%|██████████| 15/15 [00:00<00:00, 75.75it/s]
LOSO 39s: 100%|██████████| 15/15 [00:42<00:00,  2.81s/it]



=== Testing 41s ===


Windows 41s: 100%|██████████| 15/15 [00:00<00:00, 80.56it/s]
LOSO 41s: 100%|██████████| 15/15 [00:40<00:00,  2.67s/it]



=== Testing 43s ===


Windows 43s: 100%|██████████| 15/15 [00:00<00:00, 73.81it/s]
LOSO 43s: 100%|██████████| 15/15 [01:02<00:00,  4.15s/it]



=== Testing 45s ===


Windows 45s: 100%|██████████| 15/15 [00:00<00:00, 56.67it/s]
LOSO 45s: 100%|██████████| 15/15 [01:02<00:00,  4.17s/it]



=== Testing 47s ===


Windows 47s: 100%|██████████| 15/15 [00:00<00:00, 16.79it/s]
LOSO 47s: 100%|██████████| 15/15 [01:00<00:00,  4.01s/it]



=== Testing 49s ===


Windows 49s: 100%|██████████| 15/15 [00:00<00:00, 17.85it/s]
LOSO 49s: 100%|██████████| 15/15 [00:58<00:00,  3.92s/it]



=== Testing 51s ===


Windows 51s: 100%|██████████| 15/15 [00:00<00:00, 21.50it/s]
LOSO 51s: 100%|██████████| 15/15 [00:55<00:00,  3.73s/it]



=== Testing 53s ===


Windows 53s: 100%|██████████| 15/15 [00:00<00:00, 20.97it/s]
LOSO 53s: 100%|██████████| 15/15 [00:53<00:00,  3.57s/it]



=== Testing 55s ===


Windows 55s: 100%|██████████| 15/15 [00:00<00:00, 20.18it/s]
LOSO 55s: 100%|██████████| 15/15 [00:52<00:00,  3.47s/it]



=== Testing 57s ===


Windows 57s: 100%|██████████| 15/15 [00:00<00:00, 22.64it/s]
LOSO 57s: 100%|██████████| 15/15 [00:50<00:00,  3.34s/it]



=== Testing 59s ===


Windows 59s: 100%|██████████| 15/15 [00:00<00:00, 22.21it/s]
LOSO 59s: 100%|██████████| 15/15 [00:49<00:00,  3.27s/it]


Unnamed: 0,window_sec,f1_macro
25,55,0.379565
27,59,0.378855
26,57,0.37184
24,53,0.344166
23,51,0.337736
22,49,0.313165
20,45,0.311292
21,47,0.307171
19,43,0.292869
18,41,0.27878


sortowanie okien

In [12]:
results_df = results_df.sort_values("f1_macro", ascending=False)
results_df

Unnamed: 0,window_sec,f1_macro
25,55,0.379565
27,59,0.378855
26,57,0.37184
24,53,0.344166
23,51,0.337736
22,49,0.313165
20,45,0.311292
21,47,0.307171
19,43,0.292869
18,41,0.27878


Funkcje pomocnicze HRV, RMS, energia jerk, signal utils

In [13]:
import numpy as np
from scipy.signal import find_peaks, detrend
from scipy.stats import skew, kurtosis

FS = 700  # sampling rate

RMS i energia

In [14]:
def rms(x):
    return np.sqrt(np.mean(x**2))

def signal_energy(x):
    return np.sum(x**2)


HR + HRV (SDNN + RMSSD)

In [15]:
def compute_hrv_features(ecg, fs=FS):
    peaks, _ = find_peaks(ecg, distance=int(0.2 * fs))

    if len(peaks) < 3:
        return np.nan, np.nan, np.nan

    rr = np.diff(peaks) / fs  # RR intervals (seconds)

    sdnn = np.std(rr)
    rmssd = np.sqrt(np.mean(np.diff(rr)**2))
    hr = 60 / np.mean(rr)

    return hr, sdnn, rmssd


EDA tonic + phasic (prosta szybka wersja)

In [16]:
def eda_tonic_phasic(eda_sig):
    tonic = np.mean(eda_sig)
    phasic = np.std(eda_sig - tonic)
    return tonic, phasic


ACC magnitude + jerk

In [17]:
def acc_magnitude(df):
    return np.sqrt(df["ACC_x"]**2 + df["ACC_y"]**2 + df["ACC_z"]**2)

def acc_jerk(acc_mag, fs=FS):
    jerk = np.diff(acc_mag) * fs
    return np.mean(np.abs(jerk)), rms(jerk)


RESP amplitude + cycles/min

In [18]:
def resp_features(resp, fs=FS):
    resp_d = detrend(resp)
    peaks, _ = find_peaks(resp_d, distance=int(0.8 * fs))

    if len(peaks) < 2:
        return np.nan, np.nan

    amplitude = np.mean(resp_d[peaks])
    cycles_per_min = (len(peaks) / (len(resp) / fs)) * 60

    return amplitude, cycles_per_min


Ekstrakcja pełych cech z jednego okna

In [19]:
def extract_full_features(df_win):

    feats = {}
    cols = ["ACC_x", "ACC_y", "ACC_z", "EDA", "RESP", "ECG"]

    # --- BASIC STATS ---
    for col in cols:
        x = df_win[col].values
        feats[f"{col}_mean"] = np.mean(x)
        feats[f"{col}_std"] = np.std(x)
        feats[f"{col}_var"] = np.var(x)
        feats[f"{col}_min"] = np.min(x)
        feats[f"{col}_max"] = np.max(x)

        # percentyles
        feats[f"{col}_p25"] = np.percentile(x, 25)
        feats[f"{col}_p50"] = np.percentile(x, 50)
        feats[f"{col}_p75"] = np.percentile(x, 75)

        # shape
        feats[f"{col}_skew"] = skew(x)
        feats[f"{col}_kurtosis"] = kurtosis(x)

        # RMS + energy
        feats[f"{col}_rms"] = rms(x)
        feats[f"{col}_energy"] = signal_energy(x)

    # --- HR / HRV ---
    hr, sdnn, rmssd = compute_hrv_features(df_win["ECG"].values)
    feats["HR"] = hr
    feats["HRV_SDNN"] = sdnn
    feats["HRV_RMSSD"] = rmssd

    # --- EDA tonic/phasic ---
    tonic, phasic = eda_tonic_phasic(df_win["EDA"].values)
    feats["EDA_tonic"] = tonic
    feats["EDA_phasic"] = phasic

    # --- ACC magnitude + jerk ---
    acc_mag = acc_magnitude(df_win)
    feats["ACC_mag_mean"] = np.mean(acc_mag)
    feats["ACC_mag_std"] = np.std(acc_mag)
    feats["ACC_mag_energy"] = signal_energy(acc_mag)

    jerk_mean, jerk_rms = acc_jerk(acc_mag)
    feats["ACC_jerk_mean"] = jerk_mean
    feats["ACC_jerk_rms"] = jerk_rms

    # --- RESP ---
    amp, rpm = resp_features(df_win["RESP"].values)
    feats["RESP_amplitude"] = amp
    feats["RESP_cycles_per_min"] = rpm

    return feats


Generowanie pełnego DataFrame cech dla okna 55 sekund

In [20]:
BEST_WINDOW = 55  # sekund

wins = precompute_windows(df_all, subjects, BEST_WINDOW)

all_feats = []
all_labels = []
all_subjs = []

win_size = int(BEST_WINDOW * FS)


In [21]:
for subj in subjects:
    df_s = df_all[subj]

    for (start, stop, label) in wins[subj]:
        df_win = df_s.iloc[start:stop]
        feats = extract_full_features(df_win)

        feats["subject"] = subj
        feats["label"] = label
        feats["start"] = start
        feats["stop"] = stop

        all_feats.append(feats)


In [22]:
features_df = pd.DataFrame(all_feats)
features_df


Unnamed: 0,ACC_x_mean,ACC_x_std,ACC_x_var,ACC_x_min,ACC_x_max,ACC_x_p25,ACC_x_p50,ACC_x_p75,ACC_x_skew,ACC_x_kurtosis,...,ACC_mag_std,ACC_mag_energy,ACC_jerk_mean,ACC_jerk_rms,RESP_amplitude,RESP_cycles_per_min,subject,label,start,stop
0,0.882208,0.031776,0.001010,0.5602,1.2294,0.8728,0.8782,0.8858,1.911054,13.533833,...,0.034935,32654.400707,1.338237,3.960427,0.295490,66.545455,S10,0,0,38500
1,0.885211,0.033685,0.001135,0.7192,1.1518,0.8714,0.8858,0.8954,1.198394,8.304768,...,0.031420,32591.043118,1.258852,1.675577,0.440130,64.363636,S10,0,19250,57750
2,0.878179,0.013905,0.000193,0.7928,1.0168,0.8698,0.8828,0.8860,0.078198,7.897674,...,0.008580,32178.525802,0.981493,1.245254,0.401539,67.636364,S10,1,77000,115500
3,0.865818,0.014461,0.000209,0.7928,1.0168,0.8568,0.8606,0.8710,1.840900,9.670427,...,0.008511,32407.425611,1.015275,1.286601,0.273996,68.727273,S10,1,96250,134750
4,0.862190,0.005044,0.000025,0.8430,0.8794,0.8580,0.8630,0.8658,-0.239928,-0.584432,...,0.003931,32495.477823,1.004899,1.264256,0.349650,67.636364,S10,1,115500,154000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2738,0.459360,0.011326,0.000128,0.4170,0.4794,0.4536,0.4634,0.4674,-1.098955,0.440122,...,0.007149,35795.070747,1.507421,1.929509,0.212079,68.727273,S9,4,3503500,3542000
2739,0.456321,0.019967,0.000399,0.3772,0.4838,0.4504,0.4642,0.4692,-1.743806,2.549293,...,0.007574,35822.430763,1.522084,1.955053,0.319825,69.818182,S9,4,3522750,3561250
2740,0.861278,0.139359,0.019421,0.3932,1.2442,0.8942,0.9106,0.9166,-2.446184,4.671934,...,0.030971,33127.825459,1.179100,1.680459,0.562957,66.545455,S9,0,3561250,3599750
2741,0.914380,0.008983,0.000081,0.8344,1.0074,0.9118,0.9150,0.9178,-0.143045,16.367909,...,0.008638,32762.361249,0.973480,1.245791,0.515182,67.636364,S9,0,3580500,3619000


EDA

In [26]:
# ŚCIEŻKA NA DYSKU ZEWNĘTRZNYM
EDA_SAVE_DIR = "/Volumes/blue_nateck/EDA_plots"

# tworzymy folder jeśli nie istnieje
os.makedirs(EDA_SAVE_DIR, exist_ok=True)

print("Wykresy będą zapisywane tutaj:", EDA_SAVE_DIR)


Wykresy będą zapisywane tutaj: /Volumes/blue_nateck/EDA_plots


In [23]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier

sns.set_theme(style="whitegrid")
plt.rcParams["figure.dpi"] = 130

# folder na wykresy
os.makedirs("eda_plots", exist_ok=True)

# Twój DataFrame
df = features_df.copy()


Podstawowe informacje do pracy

In [24]:
print("Liczba okien:", len(df))
print("Liczba cech:", df.shape[1] - 3)  # bez label, subject, indices
print("\nKlasy:\n", df["label"].value_counts())
print("\nBraki danych:\n", df.isna().sum().sort_values(ascending=False))


Liczba okien: 2743
Liczba cech: 85
The history saving thread hit an unexpected error (OperationalError('unable to open database file')).History will not be written to the database.

Klasy:
 label
0    1218
1     616
4     378
2     337
3     179
7       6
5       5
6       4
Name: count, dtype: int64

Braki danych:
 ACC_x_mean    0
ACC_x_std     0
ECG_max       0
ECG_min       0
ECG_var       0
             ..
ACC_z_min     0
ACC_z_var     0
ACC_z_std     0
ACC_z_mean    0
stop          0
Length: 88, dtype: int64


Histogramy wszytskich cech

In [27]:
num_cols = [c for c in df.columns if c not in ["subject", "label", "start", "stop"]]

for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.histplot(df[col], kde=True, color="#1f77b4")
    plt.title(f"Histogram: {col}", fontsize=13)
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, f"hist_{col}.png"))
    plt.close()


boxploty wszystkich cech

In [28]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df[col], color="#ff7f0e")
    plt.title(f"Boxplot: {col}", fontsize=13)
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, f"box_{col}.png"))
    plt.close()


boxploty per class

In [29]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.boxplot(x=df["label"], y=df[col], palette="Set2")
    plt.title(f"Boxplot per class: {col}", fontsize=13)
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, f"box_class_{col}.png"))
    plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.boxplot(x=df["label"], y=df[col], palette="Set2")

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` vari

violin plots

In [30]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    sns.violinplot(x="label", y=col, data=df, palette="Set3", cut=0)
    plt.title(f"Violin plot per class: {col}")
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, f"violin_{col}.png"))
    plt.close()



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  sns.violinplot(x="label", y=col, data=df, palette="Set3", cut=0)

Passing `palette` without assigning `hue` is deprecated and wil

KDE

In [31]:
for col in num_cols:
    plt.figure(figsize=(6, 4))
    for lab in sorted(df["label"].unique()):
        sns.kdeplot(df[df["label"] == lab][col], label=f"class {lab}")
    plt.title(f"KDE distribution: {col}")
    plt.legend()
    plt.tight_layout()
    plt.savefig(os.path.join(EDA_SAVE_DIR, f"kde_{col}.png"))
    plt.close()


korelacja cech

In [32]:
corr = df[num_cols].corr()

plt.figure(figsize=(14, 10))
sns.heatmap(corr, cmap="coolwarm", center=0)
plt.title("Feature Correlation Heatmap")
plt.tight_layout()
plt.savefig(os.path.join(EDA_SAVE_DIR, "corr_heatmap.png"))
plt.close()


klasteryzowana heatmapa

In [33]:
plt.figure(figsize=(14, 12))
sns.clustermap(corr, cmap="coolwarm", figsize=(14, 14))
plt.title("Clustered Feature Correlation")
plt.savefig(os.path.join(EDA_SAVE_DIR, "corr_cluster.png"))
plt.close()


<Figure size 1820x1560 with 0 Axes>

outlier detection

In [34]:
outlier_stats = {}

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    low = Q1 - 1.5*IQR
    high = Q3 + 1.5*IQR

    outliers = ((df[col] < low) | (df[col] > high)).sum()

    outlier_stats[col] = outliers

outliers_df = pd.DataFrame.from_dict(outlier_stats, orient="index", columns=["outliers"])
outliers_df.sort_values("outliers", ascending=False)


Unnamed: 0,outliers
ECG_max,610
EDA_var,514
ACC_z_var,483
EDA_phasic,401
EDA_std,401
...,...
EDA_mean,50
ACC_z_p75,48
ACC_z_max,31
ECG_kurtosis,0


PCA 2d wizualizacja separacji klas

In [35]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(df[num_cols])

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

plt.figure(figsize=(6, 5))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=df["label"], palette="Set1")
plt.title("PCA: Feature space projection")
plt.xlabel(f"PC1 ({pca.explained_variance_ratio_[0]*100:.1f}%)")
plt.ylabel(f"PC2 ({pca.explained_variance_ratio_[1]*100:.1f}%)")
plt.tight_layout()
plt.savefig(os.path.join(EDA_SAVE_DIR, "pca.png"))
plt.close()


Feature importance RF - wstepna selekcja cech

In [36]:
rf = RandomForestClassifier(n_estimators=300, class_weight="balanced", random_state=42)

rf.fit(df[num_cols], df["label"])

importances = pd.DataFrame({
    "feature": num_cols,
    "importance": rf.feature_importances_
}).sort_values("importance", ascending=False)

plt.figure(figsize=(8, 16))
sns.barplot(data=importances.head(30), y="feature", x="importance", palette="viridis")
plt.title("Top 30 Most Informative Features")
plt.tight_layout()
plt.savefig(os.path.join(EDA_SAVE_DIR, "feature_importance_top30.png"))
plt.close()

importances.head(30)



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(data=importances.head(30), y="feature", x="importance", palette="viridis")


Unnamed: 0,feature,importance
56,RESP_skew,0.042486
54,RESP_p50,0.04031
13,ACC_y_std,0.025734
78,ACC_mag_std,0.023711
14,ACC_y_var,0.022742
83,RESP_cycles_per_min,0.018378
3,ACC_x_min,0.018053
53,RESP_p25,0.017247
24,ACC_z_mean,0.01721
27,ACC_z_min,0.017134
