In [1]:
import os
import pickle
import numpy as np
import pandas as pd
from scipy.signal import find_peaks
from scipy.stats import skew, kurtosis

In [2]:
DATA_ROOT = "/Volumes/blue_nateck/WESAD"

subjects = [d for d in os.listdir(DATA_ROOT) if d.startswith("S")]
subjects


['S10',
 'S11',
 'S13',
 'S14',
 'S15',
 'S16',
 'S17',
 'S2',
 'S3',
 'S4',
 'S5',
 'S6',
 'S7',
 'S8',
 'S9']

In [3]:
def compute_hr_hrv(ecg_signal, fs=700):
    peaks, _ = find_peaks(ecg_signal, distance=int(0.2 * fs))
    if len(peaks) < 2:
        return np.nan, np.nan

    rr_intervals = np.diff(peaks) / fs
    hr_values = 60 / rr_intervals
    
    hr_mean = np.mean(hr_values)
    hrv_sdnn = np.std(rr_intervals)
    
    return hr_mean, hrv_sdnn


In [4]:
def compute_purity(labels_window):
    """
    labels_window: array of labels inside a window
    purity = fraction of samples belonging to the dominant class
    """
    labels_window = np.asarray(labels_window)
    values, counts = np.unique(labels_window, return_counts=True)
    dominant_label = values[counts.argmax()]
    purity = counts.max() / counts.sum()
    return purity, dominant_label


In [5]:
def extract_features_from_window(window_df, fs=700):
    """
    Compute features from one window (DataFrame).
    window_df: contains raw signals in columns:
       ACC_x, ACC_y, ACC_z, EDA, RESP, ECG, HR_mean, HRV_SDNN
    Returns: dict with features for ML.
    """

    feats = {}

    signal_cols = ["ACC_x", "ACC_y", "ACC_z", "EDA", "RESP", "ECG"]

    # --- Basic statistics for each signal ---
    for col in signal_cols:
        x = window_df[col].values
        feats[f"{col}_mean"] = np.mean(x)
        feats[f"{col}_std"] = np.std(x)
        feats[f"{col}_min"] = np.min(x)
        feats[f"{col}_max"] = np.max(x)
        feats[f"{col}_var"] = np.var(x)
        feats[f"{col}_skew"] = skew(x, nan_policy="omit")
        feats[f"{col}_kurtosis"] = kurtosis(x, nan_policy="omit")

    # --- ACC magnitude ---
    acc_mag = np.sqrt(
        window_df["ACC_x"].values**2 +
        window_df["ACC_y"].values**2 +
        window_df["ACC_z"].values**2
    )
    feats["ACC_mag_mean"] = np.mean(acc_mag)
    feats["ACC_mag_std"] = np.std(acc_mag)

    # --- HR and HRV (computed inside the window) ---
    hr_mean, hrv_sdnn = compute_hr_hrv(window_df["ECG"].values, fs=fs)
    feats["HR_mean"] = hr_mean
    feats["HRV_SDNN"] = hrv_sdnn

    return feats

In [6]:
def safe_1d(x):
    """
    Convert any array to a 1-D numpy array.
    Acceptable shapes:
      - (N,)
      - (N,1)
      - (1,N)
    If more dimensions → flatten.
    """
    x = np.array(x)
    if x.ndim == 1:
        return x
    elif x.ndim == 2:
        # e.g. (N,1) or (1,N)
        return x.reshape(-1)
    else:
        # emergency fallback
        return x.flatten()


def load_subject_full(subject_id):
    print(f"Loading subject: {subject_id}")
    path = os.path.join(DATA_ROOT, subject_id, f"{subject_id}.pkl")

    with open(path, "rb") as f:
        data = pickle.load(f, encoding="latin1")

    acc = np.array(data["signal"]["chest"]["ACC"])
    eda = safe_1d(data["signal"]["chest"]["EDA"])
    resp = safe_1d(data["signal"]["chest"]["Resp"])
    ecg = safe_1d(data["signal"]["chest"]["ECG"])
    labels = safe_1d(data["label"])

    min_len = min(len(acc), len(eda), len(resp), len(ecg), len(labels))

    acc = acc[:min_len]
    eda = eda[:min_len]
    resp = resp[:min_len]
    ecg = ecg[:min_len]
    labels = labels[:min_len]

    df = pd.DataFrame({
        "ACC_x": acc[:, 0],
        "ACC_y": acc[:, 1],
        "ACC_z": acc[:, 2],
        "EDA": eda,
        "RESP": resp,
        "ECG": ecg,
        "label": labels,
        "subject": subject_id
    })

    return df



In [7]:
all_data = []

for s in subjects:
    df_s = load_subject_full(s)
    all_data.append(df_s)

df_all = pd.concat(all_data, axis=0)
df_all.head()


Loading subject: S10
Loading subject: S11
Loading subject: S13
Loading subject: S14
Loading subject: S15
Loading subject: S16
Loading subject: S17
Loading subject: S2
Loading subject: S3
Loading subject: S4
Loading subject: S5
Loading subject: S6
Loading subject: S7
Loading subject: S8
Loading subject: S9


Unnamed: 0,ACC_x,ACC_y,ACC_z,EDA,RESP,ECG,label,subject
0,1.1278,0.152,0.3416,0.716019,0.213623,-1.333694,0,S10
1,1.0932,0.1888,0.2922,0.714493,0.192261,-1.327744,0,S10
2,1.0354,0.2094,0.1858,0.715637,0.205994,-1.322067,0,S10
3,0.9666,0.2118,0.0412,0.714874,0.193787,-1.316345,0,S10
4,0.8916,0.204,-0.1228,0.715256,0.172424,-1.310257,0,S10


STAŁY PODZIAL 

In [8]:
import random

subjects = sorted([d for d in os.listdir(DATA_ROOT) if d.startswith("S")])

random.seed(42)            # stała losowość → zawsze ten sam podział
random.shuffle(subjects)

split_idx = int(0.7 * len(subjects))
train_subjects = subjects[:split_idx]
test_subjects = subjects[split_idx:]

print("TRAIN:", train_subjects)
print("TEST:", test_subjects)


TRAIN: ['S3', 'S8', 'S2', 'S17', 'S9', 'S7', 'S16', 'S13', 'S4', 'S14']
TEST: ['S15', 'S6', 'S10', 'S11', 'S5']


WINDOWING


In [9]:
def window_subject_with_purity(df_s, window_sec, fs=700, purity_threshold=0.9):
    window_size = int(window_sec * fs)
    stride = window_size // 2  # 50% overlap

    features = []
    y = []

    labels_array = df_s["label"].values

    for start in range(0, len(df_s) - window_size, stride):
        stop = start + window_size

        w_df = df_s.iloc[start:stop].reset_index(drop=True)
        w_labels = labels_array[start:stop]

        purity, dominant_label = compute_purity(w_labels)

        if purity >= purity_threshold:
            feats = extract_features_from_window(w_df)
            features.append(feats)
            y.append(dominant_label)

    return features, y

In [13]:
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier


def loso_evaluate_window(window_sec, df_all, subjects):
    print(f"\n===== LOSO evaluation for window {window_sec} sec =====")

    f1_scores = []
    n_windows_total = 0

    for test_subj in subjects:
        train_subjs = [s for s in subjects if s != test_subj]

        X_train, y_train = [], []
        X_test, y_test = [], []

        # --- Windowing for TRAIN subjects ---
        for s in train_subjs:
            df_s = df_all[df_all["subject"] == s].reset_index(drop=True)
            feats, labels = window_subject_with_purity(df_s, window_sec)
            X_train.extend(feats)
            y_train.extend(labels)

        # --- Windowing for TEST subject ---
        df_s_test = df_all[df_all["subject"] == test_subj].reset_index(drop=True)
        feats_test, labels_test = window_subject_with_purity(df_s_test, window_sec)
        X_test.extend(feats_test)
        y_test.extend(labels_test)

        # skip if no windows for this subject
        if len(y_test) == 0 or len(y_train) == 0:
            print(f"Skipping subject {test_subj} - no valid windows.")
            continue

        X_train_df = pd.DataFrame(X_train)
        X_test_df = pd.DataFrame(X_test)
        y_train = np.array(y_train)
        y_test = np.array(y_test)

        # --- Train classifier ---
        clf = RandomForestClassifier(
            n_estimators=300,
            random_state=42,
            class_weight="balanced",
            n_jobs=-1
        )

        clf.fit(X_train_df, y_train)
        y_pred = clf.predict(X_test_df)

        f1 = f1_score(y_test, y_pred, average="macro")
        f1_scores.append(f1)
        n_windows_total += len(y_test)

        print(f" - Test subject {test_subj}: F1_macro = {f1:.4f} (n={len(y_test)})")

    if len(f1_scores) == 0:
        print("No LOSO results for this window size!")
        return None

    return {
        "window_sec": window_sec,
        "f1_macro": np.mean(f1_scores),
        "n_windows": n_windows_total
    }


In [14]:
window_candidates = list(range(3, 16, 2))
results = []

for w in window_candidates:
    res = loso_evaluate_window(w, df_all, subjects)
    if res is not None:
        results.append(res)

results_df = pd.DataFrame(results)
results_df.sort_values("f1_macro", ascending=False)



===== LOSO evaluation for window 3 sec =====
 - Test subject S3: F1_macro = 0.1056 (n=4300)
 - Test subject S8: F1_macro = 0.1805 (n=3618)
 - Test subject S2: F1_macro = 0.1802 (n=4027)
 - Test subject S17: F1_macro = 0.1786 (n=3921)
 - Test subject S9: F1_macro = 0.2323 (n=3460)
 - Test subject S7: F1_macro = 0.1119 (n=3462)
 - Test subject S16: F1_macro = 0.1823 (n=3724)
 - Test subject S13: F1_macro = 0.1814 (n=3664)
 - Test subject S4: F1_macro = 0.0826 (n=4261)
 - Test subject S14: F1_macro = 0.1848 (n=3667)
 - Test subject S15: F1_macro = 0.1667 (n=3479)
 - Test subject S6: F1_macro = 0.1185 (n=4689)
 - Test subject S10: F1_macro = 0.1021 (n=3636)
 - Test subject S11: F1_macro = 0.1009 (n=3459)
 - Test subject S5: F1_macro = 0.1707 (n=4141)

===== LOSO evaluation for window 5 sec =====
 - Test subject S3: F1_macro = 0.1217 (n=2569)
 - Test subject S8: F1_macro = 0.2004 (n=2164)
 - Test subject S2: F1_macro = 0.1811 (n=2404)
 - Test subject S17: F1_macro = 0.1671 (n=2344)
 - Test

Unnamed: 0,window_sec,f1_macro,n_windows
6,15,0.22206,11179
5,13,0.219463,12952
4,11,0.206863,15387
3,9,0.18488,18911
2,7,0.181833,24391
1,5,0.162704,34368
0,3,0.151936,57508


In [15]:
from collections import defaultdict

# cache: window_size → subject_id → (features, labels)
window_cache = defaultdict(dict)

def get_cached_windows(df_all, subjects, window_sec):
    if window_sec in window_cache:
        return window_cache[window_sec]
    
    print(f"\n[ CACHE BUILD ] Computing windows for {window_sec} sec...")
    this_win = {}
    
    for s in subjects:
        df_s = df_all[df_all["subject"] == s].reset_index(drop=True)
        feats, labels = window_subject_with_purity(df_s, window_sec)
        this_win[s] = (feats, labels)
    
    window_cache[window_sec] = this_win
    return this_win


In [16]:
def loso_fast(window_sec, df_all, subjects):
    print(f"\n===== LOSO for window {window_sec} sec =====")

    win_data = get_cached_windows(df_all, subjects, window_sec)

    f1_scores = []

    for test_subj in subjects:
        train_subjs = [s for s in subjects if s != test_subj]

        # TRAIN
        X_train, y_train = [], []
        for s in train_subjs:
            feats, labels = win_data[s]
            X_train.extend(feats)
            y_train.extend(labels)

        # TEST
        X_test, y_test = win_data[test_subj]
        if len(y_test) == 0 or len(y_train) == 0:
            print(f"Skipping {test_subj} — no windows.")
            continue

        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        y_train = np.array(y_train)
        y_test = np.array(y_test)

        # MODEL
        clf = RandomForestClassifier(
            n_estimators=200,
            random_state=42,
            class_weight="balanced",
            n_jobs=-1
        )

        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        f1 = f1_score(y_test, y_pred, average="macro")
        f1_scores.append(f1)

        print(f" - Subject {test_subj}: F1 = {f1:.4f}")

    return {
        "window_sec": window_sec,
        "f1_macro": np.mean(f1_scores),
        "n_subjects": len(f1_scores)
    }


In [17]:
window_sizes = [15, 17, 19, 21, 23, 25, 27, 29]
results = []

for w in window_sizes:
    res = loso_fast(w, df_all, subjects)
    results.append(res)

results_df = pd.DataFrame(results)
results_df.sort_values("f1_macro", ascending=False)



===== LOSO for window 15 sec =====

[ CACHE BUILD ] Computing windows for 15 sec...
 - Subject S3: F1 = 0.1470
 - Subject S8: F1 = 0.2283
 - Subject S2: F1 = 0.1847
 - Subject S17: F1 = 0.2720
 - Subject S9: F1 = 0.2790
 - Subject S7: F1 = 0.2309
 - Subject S16: F1 = 0.2883
 - Subject S13: F1 = 0.1914
 - Subject S4: F1 = 0.0884
 - Subject S14: F1 = 0.2294
 - Subject S15: F1 = 0.2282
 - Subject S6: F1 = 0.3321
 - Subject S10: F1 = 0.2093
 - Subject S11: F1 = 0.2360
 - Subject S5: F1 = 0.2116

===== LOSO for window 17 sec =====

[ CACHE BUILD ] Computing windows for 17 sec...
 - Subject S3: F1 = 0.1506
 - Subject S8: F1 = 0.2327
 - Subject S2: F1 = 0.1824
 - Subject S17: F1 = 0.2734
 - Subject S9: F1 = 0.2859
 - Subject S7: F1 = 0.2313
 - Subject S16: F1 = 0.2886
 - Subject S13: F1 = 0.2244
 - Subject S4: F1 = 0.0928
 - Subject S14: F1 = 0.2691
 - Subject S15: F1 = 0.2116
 - Subject S6: F1 = 0.3595
 - Subject S10: F1 = 0.2330
 - Subject S11: F1 = 0.2608
 - Subject S5: F1 = 0.2224

=====

Unnamed: 0,window_sec,f1_macro,n_subjects
7,29,0.258431,15
4,23,0.251656,15
6,27,0.250792,15
5,25,0.246602,15
2,19,0.242947,15
3,21,0.240244,15
1,17,0.234572,15
0,15,0.223779,15


STATYSTYKI

In [18]:
def window_subject_with_purity(df_s, window_sec, fs=700, purity_threshold=0.9):
    window_size = int(window_sec * fs)
    stride = window_size // 2  # 50% overlap

    features = []
    labels_out = []

    label_arr = df_s["label"].values

    for start in range(0, len(df_s) - window_size, stride):
        stop = start + window_size

        w_df = df_s.iloc[start:stop].reset_index(drop=True)
        w_labels = label_arr[start:stop]

        purity, dominant_label = compute_purity(w_labels)
        if purity >= purity_threshold:
            feats = extract_features_from_window(w_df)
            features.append(feats)
            labels_out.append(dominant_label)

    return features, labels_out


In [19]:
window_cache = defaultdict(dict)

def get_cached_windows(df_all, subject_list, window_sec):
    if window_sec in window_cache:
        return window_cache[window_sec]

    print(f"\n[ CACHE BUILD ] Computing windows for {window_sec} sec...")
    cache_entry = {}

    for s in subject_list:
        df_s = df_all[df_all["subject"] == s].reset_index(drop=True)
        feats, labels = window_subject_with_purity(df_s, window_sec)
        cache_entry[s] = (feats, labels)

    window_cache[window_sec] = cache_entry
    return cache_entry


In [20]:
def loso_fast(window_sec, df_all, subject_list):
    print(f"\n===== LOSO for window {window_sec} sec =====")

    win_data = get_cached_windows(df_all, subject_list, window_sec)
    f1_scores = []

    for test_s in subject_list:
        train_s = [s for s in subject_list if s != test_s]

        X_train, y_train = [], []
        for s in train_s:
            feats, labels = win_data[s]
            X_train.extend(feats)
            y_train.extend(labels)

        X_test, y_test = win_data[test_s]

        if len(y_test) == 0 or len(y_train) == 0:
            print(f"Skipping {test_s}")
            continue

        X_train_df = pd.DataFrame(X_train)
        X_test_df = pd.DataFrame(X_test)

        clf = RandomForestClassifier(
            n_estimators=300,
            class_weight="balanced",
            random_state=42,
            n_jobs=-1
        )

        clf.fit(X_train_df, y_train)
        y_pred = clf.predict(X_test_df)

        f1 = f1_score(y_test, y_pred, average="macro")
        f1_scores.append(f1)
        print(f" - {test_s}: F1 = {f1:.4f}")

    return np.mean(f1_scores)


In [21]:
window_sizes = [5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29]
results = []

for w in window_sizes:
    f1 = loso_fast(w, df_all, train_subjects)
    results.append((w, f1))

results_df = pd.DataFrame(results, columns=["window_sec", "f1_macro"])
results_df.sort_values("f1_macro", ascending=False)



===== LOSO for window 5 sec =====

[ CACHE BUILD ] Computing windows for 5 sec...
 - S3: F1 = 0.0885
 - S8: F1 = 0.2067
 - S2: F1 = 0.1911
 - S17: F1 = 0.1004
 - S9: F1 = 0.1939
 - S7: F1 = 0.0833
 - S16: F1 = 0.1310
 - S13: F1 = 0.1761
 - S4: F1 = 0.0899
 - S14: F1 = 0.2433

===== LOSO for window 7 sec =====

[ CACHE BUILD ] Computing windows for 7 sec...
 - S3: F1 = 0.0903
 - S8: F1 = 0.2262
 - S2: F1 = 0.2052
 - S17: F1 = 0.1057
 - S9: F1 = 0.2257
 - S7: F1 = 0.1160
 - S16: F1 = 0.1684
 - S13: F1 = 0.1787
 - S4: F1 = 0.0902
 - S14: F1 = 0.2175

===== LOSO for window 9 sec =====

[ CACHE BUILD ] Computing windows for 9 sec...
 - S3: F1 = 0.0939
 - S8: F1 = 0.2338
 - S2: F1 = 0.2115
 - S17: F1 = 0.1253
 - S9: F1 = 0.2919
 - S7: F1 = 0.1249
 - S16: F1 = 0.1836
 - S13: F1 = 0.1768
 - S4: F1 = 0.0917
 - S14: F1 = 0.2354

===== LOSO for window 11 sec =====

[ CACHE BUILD ] Computing windows for 11 sec...
 - S3: F1 = 0.1083
 - S8: F1 = 0.2422
 - S2: F1 = 0.2031
 - S17: F1 = 0.1184
 - S9: 

Unnamed: 0,window_sec,f1_macro
11,27,0.229351
9,23,0.227866
8,21,0.227088
12,29,0.224465
10,25,0.223801
6,17,0.209207
7,19,0.206032
5,15,0.20157
4,13,0.192108
3,11,0.187149


In [22]:
best_window = results_df.iloc[results_df["f1_macro"].idxmax()]["window_sec"]
print("BEST WINDOW =", best_window)


BEST WINDOW = 27.0


In [25]:
def train_final_model(df_all, train_subjects, test_subjects, window_sec):
    print(f"\n=== FINAL MODEL at {window_sec}s ===")

    # build cache for train + test separately
    win_train = get_cached_windows(df_all, train_subjects, window_sec)
    win_test  = get_cached_windows(df_all, test_subjects, window_sec)

    X_train, y_train = [], []
    for s in train_subjects:
        feats, labels = win_train.get(s, ([], []))
        if len(labels) == 0:
            print(f"[WARNING] Train subject {s} has no valid windows. Skipping.")
            continue
        X_train.extend(feats)
        y_train.extend(labels)

    X_test, y_test = [], []
    for s in test_subjects:
        feats, labels = win_test.get(s, ([], []))
        if len(labels) == 0:
            print(f"[WARNING] Test subject {s} has no valid windows. Skipping.")
            continue
        X_test.extend(feats)
        y_test.extend(labels)

    if len(y_train) == 0 or len(y_test) == 0:
        raise ValueError("No train or test windows – cannot train final model.")

    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)

    clf = RandomForestClassifier(
        n_estimators=300,
        class_weight="balanced",
        random_state=42,
        n_jobs=-1
    )

    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    f1 = f1_score(y_test, y_pred, average="macro")
    print(f"\n=== FINAL 70/30 F1_macro = {f1:.4f} ===")

    return f1


In [27]:
def window_valid_for_test(window_sec, df_all, test_subjects):
    """
    Returns True if every test subject has at least 1 valid window.
    """
    win_test = get_cached_windows(df_all, test_subjects, window_sec)

    for s in test_subjects:
        feats, labels = win_test.get(s, ([], []))
        if len(labels) == 0:
            print(f"[INFO] Window {window_sec}s INVALID for test subject {s} (0 windows)")
            return False

    return True


In [28]:
valid_windows = []

for w in window_sizes:
    if window_valid_for_test(w, df_all, test_subjects):
        valid_windows.append(w)

print("\nVALID WINDOWS:", valid_windows)


[INFO] Window 5s INVALID for test subject S15 (0 windows)
[INFO] Window 7s INVALID for test subject S15 (0 windows)
[INFO] Window 9s INVALID for test subject S15 (0 windows)
[INFO] Window 11s INVALID for test subject S15 (0 windows)
[INFO] Window 13s INVALID for test subject S15 (0 windows)
[INFO] Window 15s INVALID for test subject S15 (0 windows)
[INFO] Window 17s INVALID for test subject S15 (0 windows)
[INFO] Window 19s INVALID for test subject S15 (0 windows)
[INFO] Window 21s INVALID for test subject S15 (0 windows)
[INFO] Window 23s INVALID for test subject S15 (0 windows)
[INFO] Window 25s INVALID for test subject S15 (0 windows)
[INFO] Window 27s INVALID for test subject S15 (0 windows)
[INFO] Window 29s INVALID for test subject S15 (0 windows)

VALID WINDOWS: []


In [26]:
final_f1 = train_final_model(df_all, train_subjects, test_subjects, best_window)



=== FINAL MODEL at 27.0s ===


ValueError: No train or test windows – cannot train final model.

In [12]:
from scipy.stats import skew, kurtosis

def compute_stats(series):
    return pd.Series({
        "mean": series.mean(),
        "std": series.std(),
        "min": series.min(),
        "max": series.max(),
        "var": series.var(),
        "skew": skew(series, nan_policy="omit"),
        "kurtosis": kurtosis(series, nan_policy="omit")
    })


# Compute statistics per subject
stats_all = df_all.groupby("subject").apply(
    lambda g: g.drop(columns=["subject"]).apply(compute_stats).T
)

stats_all


  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(series, nan_policy="omit")
  "skew": skew(series, nan_policy="omit"),
  "kurtosis": kurtosis(seri

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,std,min,max,var,skew,kurtosis
subject,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
S10,ACC_x,0.811878,1.041634e-01,0.482000,1.476800,1.085001e-02,-0.675613,-0.799878
S10,ACC_y,0.007225,6.156533e-02,-0.431000,0.480800,3.790290e-03,-0.392816,0.016027
S10,ACC_z,-0.325397,2.849468e-01,-1.852000,1.533200,8.119470e-02,0.297307,-0.911466
S10,EDA,1.281778,4.552230e-01,0.416183,2.257919,2.072280e-01,0.262706,-1.610727
S10,RESP,0.050904,3.450750e+00,-26.930237,31.895447,1.190767e+01,0.279096,3.911248
...,...,...,...,...,...,...,...,...
S9,EDA,2.768166,6.799757e-01,1.320267,4.241943,4.623670e-01,-0.234411,-0.865254
S9,RESP,0.063847,4.397762e+00,-45.826721,32.972717,1.934031e+01,0.089212,6.994388
S9,ECG,0.001533,2.279383e-01,-1.499680,1.499954,5.195585e-02,1.642037,13.266309
S9,HR_mean,252.610795,2.842171e-14,252.610795,252.610795,8.077938e-28,,


teraz usuwamy cechy w ktorych wariancja jest za mala

In [13]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# -----------------------------
# 1. Select only numeric columns
# -----------------------------
numeric_cols = df_all.drop(columns=["subject"]).select_dtypes(include=[np.number]).columns
X = df_all[numeric_cols]



In [14]:
# -----------------------------
# 2. Standardize features
# -----------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=numeric_cols)



In [None]:
# -----------------------------
# 3. Compute variances
# -----------------------------
variances = X_scaled_df.var()

# absolute threshold (1e-5)
threshold_absolute = 1e-5

# relative threshold (10% of median variance)
threshold_relative = 0.1 * np.median(variances)

print("Absolute threshold:", threshold_absolute)
print("Relative threshold:", threshold_relative)



In [None]:
# -----------------------------
# 4. Identify low-variance features
# -----------------------------
low_variance_features = variances[
    (variances < threshold_absolute) |
    (variances < threshold_relative)
].index.tolist()

print("\nLow-variance features:")
for f in low_variance_features:
    print(" -", f)




In [None]:
# -----------------------------
# 5. Remove low-variance features from dataset
# -----------------------------
X_reduced = X_scaled_df.drop(columns=low_variance_features)

print("\nShape before:", X_scaled_df.shape)
print("Shape after:", X_reduced.shape)