# **2) Preprocessing**

In [6]:
import os
import csv
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import mode

from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, lit, pandas_udf
from pyspark.sql.functions import pandas_udf, PandasUDFType
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

## Combining the self report questionnaires: Generating the all_questionnaires.csv file, which comes from every subjects S'X'quest.csv

In [2]:
base_path = "../ialtamirano/raw_data/WESAD"

In [5]:
# check order of experimental conditions in S2_quest.csv
quest_path = '../ialtamirano/raw_data/WESAD/S2/S2_quest.csv'

# map questionnaire column numbers to experimental conditions
with open(quest_path) as f:
    for line in f:
        if line.startswith('# ORDER'):
            order_tokens = line.strip().split(';') 
            conditions = order_tokens[1:6] # takes 5 conditions
            for idx, cond in enumerate(conditions, start=1):  # pair each condition with 1-based index
                print(f'{idx}: {cond}')
            break

    # read the rest of the file into a list, keeping  non-blank lines
    raw_lines = [line.strip() for line in f if line.strip()]

# Ge'# TAGS from raw_lines list
tags = set()
for line in raw_lines:
    if line.startswith('#'):
        tag = line.split(';')[0]
        tags.add(tag)
print(f'Tags Present: {tags}')   

1: Base
2: TSST
3: Medi 1
4: Fun
5: Medi 2
Tags Present: {'# DIM', '# PANAS', '# END', '# START', '# SSSQ', '# STAI'}


In [8]:
# combining all questionnaries and storing in new csv
QUESTIONNAIRE_CSV_PATH = './all_questionnaires.csv'

# Defining names of each of the items based on pdf data file 

# PANAS: 24 items
PANAS_QUESTIONS = [q.lower() for q in [
    'Active', 'Distressed', 'Interested', 'Inspired', 
    'Annoyed', 'Strong', 'Guilty', 'Scared', 'Hostile', 'Excited', 
    'Proud', 'Irritable', 'Enthusiastic', 'Ashamed', 'Alert', 'Nervous', 
    'Determined', 'Attentive', 'Jittery', 'Afraid', 'Stressed', 'Frustrated',
    'Happy', 'Sad']]

# STAI: 6 items
STAI_QUESTIONS = [q.lower() for q in ['I_feel_at_ease', 'I_feel_nervous', 'I_am_jittery', 'I_am_relaxed',
              'I_am_worried', 'I_feel_pleasant']]
    
# SAM: 2 items
SAM_QUESTIONS = [q.lower() for q in ['Valence', 'Arousal']]

#SSSQ: 6 items
SSSQ_QUESTIONS = [q.lower() for q in [
    'Committed_to_goals', 'Wanted_to_succeed', 'Motivated', 
    'Reflected_about_self', 'Worried_about_others', 'Concerned_about_impression']]


# map each of 4 target conditions to questionnaire indix
condition_index_map = {1:0, 2:1, 3:3, 4:2}
condition_name_map = {1:'baseline', 2:'stress', 3:'amusement', 4:'meditation'}

# initialize dictionary for each (subject, condition) pair. 
questionnaire_records = []

# process ea. subjects questionnaire CSV (exclude invalid)
for subject_id in sorted(os.listdir(base_path)):
    if not subject_id.startswith('S') or subject_id in {'S1','S12'}:
        continue

    #setting questionaire file path, read all non-empty lines 
    csv_path = os.path.join(base_path, subject_id, f'{subject_id}_quest.csv')
    with open(csv_path) as f:
        raw_line = [line.strip() for line in f if line.strip()]

    # filter each raw line into 1 of the 4 possible lists:
    panas_lines = [line for line in raw_line if line.startswith('# PANAS')]
    stais_lines = [line for line in raw_line if line.startswith('# STAI')]
    sams_lines = [line for line in raw_line if line.startswith('# DIM')]
    sssq_lines = [line for line in raw_line if line.startswith('# SSSQ')]

    #fcn return tokens of items w/out tag
    def split_answer_tokens(line):
        return line.split(';')[1:] # return only numeric string-no tag


    #one record per condition
    for condition_id in range(1,5):
        record = {
            'subject_id': subject_id,
            'condition_id': condition_id,
            'condition_name': condition_name_map[condition_id]}
            
        idx = condition_index_map[condition_id]

    #for all 4 questionnaires: lets get line, split off tag, get tokens & convert into int
        # PANAS: 
        answers = split_answer_tokens(panas_lines[idx])
        
        for j, questions in enumerate(PANAS_QUESTIONS):
            tokens = answers[j]
            try:
                record[f'panas_{questions}'] = int(tokens)
            except ValueError:
                record[f'panas_{questions}'] = pd.NA

        # STAI: 
        answers = split_answer_tokens(stais_lines[idx])
        for j, questions in enumerate(STAI_QUESTIONS):
            tokens = answers[j]
            try:
                record[f'stai_{questions}'] = int(tokens)
            except ValueError:
                record[f'stai_{questions}'] = pd.NA
        
        # SAM:
        answers = split_answer_tokens(sams_lines[idx])

        for j, questions in enumerate(SAM_QUESTIONS):
            tokens = answers[j]
            try:
                record[f'sam_{questions}'] = int(tokens)
            except ValueError:
                record[f'sam_{questions}'] = pd.NA

        # SSSQ: ONLY for stress(condition 2)
        if condition_id == 2:
            answers = split_answer_tokens(sssq_lines[0])
            for j, questions in enumerate(SSSQ_QUESTIONS):
                tokens = answers[j]
                try:
                    record[f'sssq_{questions}'] = int(tokens)
                except ValueError:
                    record[f'sssq_{questions}'] = pd.NA
    # for all other conditions, fill SSSQ fields w/ NA nonapplicable                 
        else:                                           
            for questions in SSSQ_QUESTIONS:
                record[f'sssq_{questions}'] = pd.NA  

        questionnaire_records.append(record) # append subject-condition dictionary to list 


# combine & save all records into a DF
df_all = pd.DataFrame(questionnaire_records)
df_all.to_csv(QUESTIONNAIRE_CSV_PATH, index=False)

print('Saved combined to ', QUESTIONNAIRE_CSV_PATH)

Saved all questionnaires to ./all_questionnaires.csv


## Combining the sensor data: generating the df_['wrist/chest']_['modality'].csv files from every subjects S'X'.pkl

In [1]:
# made the process more clean and iterative:

In [1]:
import os
import pickle
import numpy as np
import pandas as pd

# C
subject_ids = [f"S{i}" for i in range(2, 18) if i != 12]
base_path = "../ialtamirano/raw_data/WESAD"
export_path = "combined_pkl_csv"
os.makedirs(export_path, exist_ok=True)
valid_labels = [1, 2, 3, 4]

# functions

def load_data(subject):
    file_path = os.path.join(base_path, subject, f"{subject}.pkl")
    with open(file_path, "rb") as f:
        return pickle.load(f, encoding="latin1")

def downsample_labels(labels, target_len):
    factor = len(labels) // target_len
    return np.array([np.bincount(labels[i*factor:(i+1)*factor]).argmax() for i in range(target_len)])

def extract_signal_rows(subject, data, signal_path, is_acc=False, downsample=False):
    signal = get_nested(data['signal'], signal_path)
    labels = data['label']
    if downsample:
        labels = downsample_labels(labels, len(signal))
    N = min(len(signal), len(labels))

    rows = []
    for i in range(N):
        label = int(labels[i])
        if label not in valid_labels:
            continue
        if is_acc:
            rows.append({
                "subject": subject, "label": label,
                f"{signal_path[-1]}_x": float(signal[i][0]),
                f"{signal_path[-1]}_y": float(signal[i][1]),
                f"{signal_path[-1]}_z": float(signal[i][2]),
                "sample": i
            })
        else:
            rows.append({
                "subject": subject, "label": label,
                "value": float(signal[i]), "sample": i
            })
    return rows

def get_nested(d, keys):
    for key in keys:
        d = d[key]
    return d

def process_modalities(modalities, downsample=False):
    for name, signal_path in modalities.items():
        print(f"Processing {name}...")
        is_acc = signal_path[-1] == 'ACC'
        all_rows = []
        for subject in subject_ids:
            data = load_data(subject)
            rows = extract_signal_rows(subject, data, signal_path, is_acc=is_acc, downsample=downsample)
            all_rows.extend(rows)
        df = pd.DataFrame(all_rows)
        df.to_csv(os.path.join(export_path, f"df_{name}.csv"), index=False)
        print(f"{name} shape: {df.shape}")
        display(df.head())

# config.

chest_modalities = {
    "chest_ecg": ["chest", "ECG"],
    "chest_emg": ["chest", "EMG"],
    "chest_eda": ["chest", "EDA"],
    "chest_temp": ["chest", "Temp"],
    "chest_resp": ["chest", "Resp"],
    "chest_acc": ["chest", "ACC"]
}

wrist_modalities = {
    "wrist_bvp": ["wrist", "BVP"],
    "wrist_eda": ["wrist", "EDA"],
    "wrist_temp": ["wrist", "TEMP"],
    "wrist_acc": ["wrist", "ACC"]
}

### process modalities

process_modalities(chest_modalities, downsample=False)
process_modalities(wrist_modalities, downsample=True)


Processing chest_ecg...
chest_ecg shape: (31470603, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,0.030945,214583
1,S2,1,0.033646,214584
2,S2,1,0.033005,214585
3,S2,1,0.031815,214586
4,S2,1,0.03035,214587


Processing chest_emg...
chest_emg shape: (31470603, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,-0.003708,214583
1,S2,1,-0.014145,214584
2,S2,1,0.010208,214585
3,S2,1,0.012634,214586
4,S2,1,0.00206,214587


Processing chest_eda...
chest_eda shape: (31470603, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,5.710983,214583
1,S2,1,5.719376,214584
2,S2,1,5.706406,214585
3,S2,1,5.712509,214586
4,S2,1,5.727005,214587


Processing chest_temp...
chest_temp shape: (31470603, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,29.083618,214583
1,S2,1,29.122437,214584
2,S2,1,29.115234,214585
3,S2,1,29.126709,214586
4,S2,1,29.100861,214587


Processing chest_resp...
chest_resp shape: (31470603, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,1.191711,214583
1,S2,1,1.139832,214584
2,S2,1,1.141357,214585
3,S2,1,1.15509,214586
4,S2,1,1.133728,214587


Processing chest_acc...
chest_acc shape: (31470603, 6)


Unnamed: 0,subject,label,ACC_x,ACC_y,ACC_z,sample
0,S2,1,0.8914,-0.1102,-0.2576,214583
1,S2,1,0.8926,-0.1086,-0.2544,214584
2,S2,1,0.893,-0.1094,-0.258,214585
3,S2,1,0.8934,-0.1082,-0.2538,214586
4,S2,1,0.893,-0.1096,-0.257,214587


Processing wrist_bvp...
wrist_bvp shape: (2857002, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,28.52,21458
1,S2,1,-47.98,21459
2,S2,1,-113.26,21460
3,S2,1,-157.08,21461
4,S2,1,-183.7,21462


Processing wrist_eda...
wrist_eda shape: (179832, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,1.645664,1226
1,S2,1,1.640539,1227
2,S2,1,1.634132,1228
3,S2,1,1.614912,1229
4,S2,1,1.591848,1230


Processing wrist_temp...
wrist_temp shape: (179832, 4)


Unnamed: 0,subject,label,value,sample
0,S2,1,35.81,1226
1,S2,1,35.81,1227
2,S2,1,35.81,1228
3,S2,1,35.81,1229
4,S2,1,35.81,1230


Processing wrist_acc...
wrist_acc shape: (1481709, 6)


Unnamed: 0,subject,label,ACC_x,ACC_y,ACC_z,sample
0,S2,1,42.0,-21.0,39.0,10218
1,S2,1,43.0,-22.0,39.0,10219
2,S2,1,43.0,-22.0,41.0,10220
3,S2,1,44.0,-21.0,39.0,10221
4,S2,1,44.0,-21.0,40.0,10222


In [3]:
#### showing that all labels and all subjects are present for each modality

import os
import glob
import pandas as pd

# csv from previous cell
export_path = "combined_pkl_csv"

modality_dfs = {
    os.path.basename(f).replace(".csv", ""): pd.read_csv(f)
    for f in glob.glob(os.path.join(export_path, "*.csv"))
}

# Show label counts per subject for each modality
for name, df in modality_dfs.items():
    print(f"\nLabel counts for {name}:")
    label_counts = df.groupby(['subject', 'label']).size().reset_index(name='count')
    print(label_counts)



Label counts for df_wrist_acc:
   subject  label  count
0      S10      1  39334
1      S10      2  24166
2      S10      3  12400
3      S10      4  25143
4      S11      1  39333
5      S11      2  22667
6      S11      3  12267
7      S11      4  24236
8      S13      1  39334
9      S13      2  22133
10     S13      3  12733
11     S13      4  25521
12     S14      1  39334
13     S14      2  22500
14     S14      3  12400
15     S14      4  24929
16     S15      1  39167
17     S15      2  22866
18     S15      3  12400
19     S15      4  25189
20     S16      1  39333
21     S16      2  22433
22     S16      3  12267
23     S16      4  26213
24     S17      1  39367
25     S17      2  24100
26     S17      3  12400
27     S17      4  24367
28      S2      1  38134
29      S2      2  20500
30      S2      3  12067
31      S2      4  24343
32      S3      1  38000
33      S3      2  21333
34      S3      3  12500
35      S3      4  26000
36      S4      1  38600
37      S4      2 

### Model-specific preprocessing is including at the top of each model