In [5]:
import numpy as np
import pandas as pd
import sys
import os

# Add the src directory to the Python path
sys.path.append(os.path.abspath(os.path.join('..')))

from src.data_loader import load_wesad_subject
from src.processing import wavelet_denoise
from src.feature_extraction import extract_gsr_features, extract_hrv_features

# Set pandas display options to see all columns
pd.set_option('display.max_columns', None)

In [6]:
# --- Parameters ---
WINDOW_SIZE_SEC = 60  # 60-second window
STEP_SIZE_SEC = 10    # 10-second step (creates 50 seconds of overlap)
RAW_DATA_DIR = '../data/raw/'

# --- Main list to hold DataFrames from all subjects ---
all_subjects_df_list = []

# --- Loop through all .pkl files in the raw data directory ---
for filename in sorted(os.listdir(RAW_DATA_DIR)):
    if filename.endswith(".pkl"):
        subject_id = filename.split('.')[0]
        print(f"--- Processing Subject: {subject_id} ---")
        
        # --- Load Data ---
        data_path = os.path.join(RAW_DATA_DIR, filename)
        data = load_wesad_subject(data_path)
        
        if not data:
            print(f"Could not load data for {subject_id}. Skipping.")
            continue

        # --- The rest of the processing logic from before ---
        gsr_signal_raw = data['signal']['wrist']['EDA']
        bvp_signal_raw = data['signal']['wrist']['BVP']
        labels = data['label']
        
        subject_features = []

        # Process both "calm" (1) and "stress" (3) conditions
        for label_id, label_name in [(1, 'calm'), (3, 'stress')]:
            # This inner logic is the same as the function we wrote before
            label_rate = 700; gsr_rate = 4; bvp_rate = 64
            condition_indices = np.where(labels == label_id)[0]

            if len(condition_indices) == 0:
                print(f"No data for condition '{label_name}' in {subject_id}. Skipping.")
                continue

            start_idx_label = condition_indices[0]; end_idx_label = condition_indices[-1]
            start_idx_gsr = int(start_idx_label * (gsr_rate/label_rate)); end_idx_gsr = int(end_idx_label * (gsr_rate/label_rate))
            gsr_segment = gsr_signal_raw[start_idx_gsr:end_idx_gsr].flatten()
            start_idx_bvp = int(start_idx_label * (bvp_rate/label_rate)); end_idx_bvp = int(end_idx_label * (bvp_rate/label_rate))
            bvp_segment = bvp_signal_raw[start_idx_bvp:end_idx_bvp].flatten()
            
            gsr_denoised = wavelet_denoise(gsr_segment); bvp_denoised = wavelet_denoise(bvp_segment)
            
            window_samples_gsr = WINDOW_SIZE_SEC * gsr_rate; step_samples_gsr = STEP_SIZE_SEC * gsr_rate
            window_samples_bvp = WINDOW_SIZE_SEC * bvp_rate; step_samples_bvp = STEP_SIZE_SEC * bvp_rate
            
            num_windows = int((len(gsr_denoised) - window_samples_gsr) / step_samples_gsr) + 1
            for i in range(num_windows):
                start_gsr = i * step_samples_gsr; end_gsr = start_gsr + window_samples_gsr
                start_bvp = i * step_samples_bvp; end_bvp = start_bvp + window_samples_bvp
                gsr_window = gsr_denoised[start_gsr:end_gsr]; bvp_window = bvp_denoised[start_bvp:end_bvp]
                
                gsr_features = extract_gsr_features(gsr_window, gsr_rate)
                hrv_features = extract_hrv_features(bvp_window, bvp_rate)
                
                combined_features = {**gsr_features, **hrv_features}
                combined_features['label'] = label_name
                combined_features['subject'] = subject_id # Add subject ID for tracking
                subject_features.append(combined_features)

        if subject_features:
            all_subjects_df_list.append(pd.DataFrame(subject_features))

# --- Combine all data into a single master DataFrame ---
master_df = pd.concat(all_subjects_df_list, ignore_index=True)

print("\n\n--- Master Dataset Creation Complete! ---")
display(master_df)
print(f"Total records created: {len(master_df)}")

--- Processing Subject: S10 ---
--- Processing Subject: S11 ---
--- Processing Subject: S13 ---
--- Processing Subject: S14 ---
--- Processing Subject: S15 ---
--- Processing Subject: S16 ---
--- Processing Subject: S17 ---
--- Processing Subject: S2 ---
--- Processing Subject: S3 ---
--- Processing Subject: S4 ---
--- Processing Subject: S5 ---
--- Processing Subject: S6 ---
--- Processing Subject: S7 ---
--- Processing Subject: S8 ---
--- Processing Subject: S9 ---


--- Master Dataset Creation Complete! ---


Unnamed: 0,scr_count,mean_scr_amplitude,gsr_mean,gsr_std,gsr_range,mean_hr,rmssd,sdnn,pnn50,label,subject
0,5,0.043601,0.397778,0.037204,0.224417,74.818325,306.824533,240.572730,57.534247,calm,S10
1,3,0.025483,0.395624,0.035254,0.224417,77.362637,294.686576,234.772513,50.000000,calm,S10
2,0,0.000000,0.394509,0.034478,0.219094,80.799579,251.914888,213.196455,35.443038,calm,S10
3,0,0.000000,0.380959,0.008688,0.044596,84.006326,198.304071,192.294531,28.048780,calm,S10
4,0,0.000000,0.378745,0.002554,0.014800,86.450262,168.201041,161.388352,24.705882,calm,S10
...,...,...,...,...,...,...,...,...,...,...,...
2155,0,0.000000,0.391336,0.001702,0.007638,87.409733,58.874690,49.851223,8.139535,stress,S9
2156,0,0.000000,0.391706,0.001607,0.007638,87.869542,73.074390,56.117888,12.790698,stress,S9
2157,0,0.000000,0.391630,0.001566,0.007638,87.365079,73.170078,53.541617,12.941176,stress,S9
2158,0,0.000000,0.391272,0.001491,0.007638,86.831604,72.490250,48.730227,8.333333,stress,S9


Total records created: 2160


In [7]:
# Create the processed data directory if it doesn't exist
output_dir = '../data/processed'
os.makedirs(output_dir, exist_ok=True)

# Save the DataFrame to a CSV file
output_path = f'{output_dir}/features_master_dataset.csv'
master_df.to_csv(output_path, index=False)

print(f"Master dataset saved successfully to: {output_path}")

Master dataset saved successfully to: ../data/processed/features_master_dataset.csv
