In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Make the parent of the notebook's working directory importable so that
# project code living above this folder can be imported from the notebook.
# NOTE(review): this adds the parent directory itself, not a `src` subfolder —
# presumably modules are imported as `src.<module>`; confirm against the repo layout.
PROJECT_ROOT = os.path.abspath('..')

# Only append once: re-running this cell in the same kernel would otherwise
# keep adding duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

In [None]:
# --- Load the Master Dataset ---
# Path is relative to the notebook's working directory.
DATA_PATH = '../data/processed/features_master_dataset.csv'
df = pd.read_csv(DATA_PATH)

# --- Prepare Data for ML ---
# 1. Encode labels: 'calm' -> 0, 'stress' -> 1
LABEL_ENCODING = {'calm': 0, 'stress': 1}
encoded_labels = df['label'].map(LABEL_ENCODING)

# Series.map turns every value that is not a key of the mapping (including
# missing values) into NaN. Fail loudly here instead of letting NaN targets
# reach the train/test split and the classifiers.
unmapped_labels = df.loc[encoded_labels.isna(), 'label'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        f"Unexpected values in 'label' column: {list(unmapped_labels)}; "
        f"expected one of {list(LABEL_ENCODING)}"
    )

# All labels are mapped at this point, so the column can be a plain integer one.
df['label'] = encoded_labels.astype('int64')

# 2. Define features (X) and target (y)
# 'subject' is an identifier, not a feature for the model to learn, so it is
# dropped together with the target column.
X = df.drop(columns=['label', 'subject'])
y = df['label']

# Quick sanity report: sample count, relative class frequencies, and a preview
# of the feature matrix.
print("Master dataset loaded and prepared:")
print(f"Total number of samples: {len(df)}")
print(f"Class distribution:\n{df['label'].value_counts(normalize=True)}")
display(X.head())

Dataset loaded and prepared:
Number of samples: 145
Class distribution:
label
0    113
1     32
Name: count, dtype: int64


Unnamed: 0,scr_count,mean_scr_amplitude,gsr_mean,gsr_std,gsr_range,mean_hr,rmssd,sdnn,pnn50
0,5,0.043601,0.397778,0.037204,0.224417,74.818325,306.824533,240.57273,57.534247
1,3,0.025483,0.395624,0.035254,0.224417,77.362637,294.686576,234.772513,50.0
2,0,0.0,0.394509,0.034478,0.219094,80.799579,251.914888,213.196455,35.443038
3,0,0.0,0.380959,0.008688,0.044596,84.006326,198.304071,192.294531,28.04878
4,0,0.0,0.378745,0.002554,0.0148,86.450262,168.201041,161.388352,24.705882


In [None]:
# Hold out 20% of the samples for testing; the remaining 80% are used for training.
# Passing stratify=y keeps the calm/stress proportions of the full dataset in
# both subsets (it preserves the class ratio, it does not rebalance the classes).
# random_state is fixed so the split is reproducible across runs.
# NOTE(review): rows are split individually, so samples from the same subject
# may land in both train and test — confirm whether a subject-wise (grouped)
# split is needed for an honest generalization estimate.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Report the resulting (n_samples, n_features) of each subset.
print(f"Training set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")

Training set shape: (116, 9)
Testing set shape: (29, 9)


In [None]:
# Learn the scaling statistics from the training split only, so nothing about
# the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transformation to both splits.
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)