In [6]:
import pandas as pd
import numpy as np
from scipy import stats
import os

# Read the train and test splits, then stack them into one frame whose
# index is renumbered from 0 (ignore_index=True).
train, test = (pd.read_csv(path) for path in ("../data/train.csv", "../data/test.csv"))
df = pd.concat([train, test], ignore_index=True)
print("Initial shape:", df.shape)

# Keep the two leading identifier/label columns plus the four signal columns.
physio_cols = ['datasetId', 'condition', 'HR', 'RMSSD', 'LF_HF', 'sampen']
df = df.loc[:, physio_cols].copy()

# Report per-column NaN counts, then drop any row missing one of the
# signal columns (everything in physio_cols after the first two entries).
print("\nMissing values before cleaning:")
print(df.isnull().sum())
df = df.dropna(subset=physio_cols[2:])

#Remove outliers using z-score (computed separately within each datasetId group)
def remove_outliers_groupwise(df, cols, z_thresh=3.0, group_col='datasetId'):
    """Drop rows whose value in any of ``cols`` is a within-group z-score outlier.

    Rows are grouped by ``group_col``. Inside each group, and for each column
    in ``cols``, absolute z-scores are computed with ``scipy.stats.zscore``
    (NaNs omitted from the statistics). A row is removed when its absolute
    z-score exceeds ``z_thresh`` in at least one column.

    Rows are tracked by position rather than by index label, so a frame with
    repeated index labels only loses the rows that were actually flagged.

    Args:
        df: Input DataFrame; it is not modified.
        cols: Names of the numeric columns to screen.
        z_thresh: Absolute z-score above which a value counts as an outlier.
        group_col: Column whose values define the groups.

    Returns:
        A new DataFrame holding the surviving rows, in their original order
        and with their original index labels.

    Raises:
        KeyError: If ``group_col`` or a name in ``cols`` is not a column.
    """
    # Work on a positionally indexed copy: each group's index then gives the
    # row positions directly, whatever labels the caller's index carries.
    work = df.reset_index(drop=True)
    is_outlier = np.zeros(len(work), dtype=bool)

    for _, group in work.groupby(group_col):
        positions = group.index.to_numpy()
        for col in cols:
            values = group[col]
            # A column with at most one distinct value has zero spread, so
            # its z-scores are undefined; skip it.
            if values.nunique() <= 1:
                continue
            z = np.abs(stats.zscore(values, nan_policy='omit'))
            # NaN z-scores compare False and are therefore never flagged.
            flagged = np.asarray(z > z_thresh)
            is_outlier[positions[flagged]] = True

    return df.loc[~is_outlier].copy()

# Screen the four signal columns for within-group outliers at |z| > 3.
clean_cols = ['HR', 'RMSSD', 'LF_HF', 'sampen']
df_clean = remove_outliers_groupwise(df, cols=clean_cols, z_thresh=3.0)
print("\nShape after cleaning:", df_clean.shape)

# Write the result as CSV without the index column, creating the target
# directory first if it does not exist yet.
output_path = "../outputs/cleaned_data.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_clean.to_csv(output_path, index=False)
print(f"Cleaned dataset saved at {output_path}")

Initial shape: (410322, 36)

Missing values before cleaning:
datasetId    0
condition    0
HR           0
RMSSD        0
LF_HF        0
sampen       0
dtype: int64

Shape after cleaning: (391137, 6)
Cleaned dataset saved at ../outputs/cleaned_data.csv
