# Step 1: Imports

In [177]:
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import os 

# Step 2: Load Files

In [179]:
# Numbers of the participants included in the analysis; each ID is "P" followed
# by the number zero-padded to two digits.
_participant_numbers = (
    7, 10, 11, 13, 14, 16, 17, 18, 22, 23,
    24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
    34, 35, 36,
)
demographics = [f"P{number:02d}" for number in _participant_numbers]

## Step 2a: Load LSL Files

In [181]:
def _read_lsl_triggers(subject):
    """Read one subject's LSL trigger file.

    The file is semicolon-separated without a header row; its three columns are
    named 'time', 'onset' and 'code'. The 'time' column is parsed as a datetime
    and re-formatted as an HH:MM:SS string.
    """
    trigger_file = os.path.join("data", "lsl", subject + "_lsl.tri")
    triggers = pd.read_csv(trigger_file, sep=";", header=None, names=['time', 'onset', 'code'])
    triggers['time'] = pd.to_datetime(triggers['time']).dt.strftime('%H:%M:%S')
    return triggers


# One trigger table per subject, in the same order as `demographics`.
timings = [_read_lsl_triggers(subject) for subject in demographics]
print(timings[0].shape)
print(timings[0].head())


(362, 3)
       time  onset  code
0  16:54:02   1583    30
1  16:54:16   1638    40
2  16:54:26   1675    90
3  16:54:35   1709    20
4  16:54:48   1757    50


## Step 2b: Load fNIRS Files

In [182]:
def _read_fnirs(subject):
    """Read one subject's fNIRS CSV and return it without its 'Time' column."""
    csv_path = os.path.join("data", "data_csvs", subject + ".csv")
    return pd.read_csv(csv_path, sep=",", header=0).drop(columns=['Time'])


# fNIRS channel tables keyed by subject ID.
data = {subject: _read_fnirs(subject) for subject in demographics}
print(data['P07'].head())

         Ch1       Ch2        Ch3       Ch4        Ch5       Ch6       Ch7  \
0 -30.737259  4.217945  -9.949267  7.276555 -25.171749  0.132245 -6.699986   
1 -29.338383  4.719302 -12.477225  6.479262 -25.561359 -0.462352 -4.677611   
2 -27.876869  5.192062 -14.931240  5.624817 -25.899460 -0.941451 -2.413918   
3 -26.310300  5.614543 -17.221782  4.683419 -26.157453 -1.199534  0.300224   
4 -24.616998  5.967590 -19.276144  3.636096 -26.324703 -1.152255  3.624535   

        Ch8        Ch9      Ch10  ...      Ch169     Ch170      Ch171  \
0  3.965802 -38.122370 -9.459359  ... -19.952898 -6.985181 -11.016354   
1  3.911549 -34.141558 -7.101328  ... -14.057330 -3.168199  -5.667243   
2  3.984652 -30.318783 -4.837616  ...  -8.359472  0.555269  -0.175635   
3  4.304991 -26.864080 -2.788756  ...  -3.120284  4.050139   5.529127   
4  4.968803 -23.961938 -1.059800  ...   1.411449  7.182750  11.466906   

       Ch172        Ch173       Ch174      Ch175      Ch176       Ch177  \
0  -2.944079 -191

## Step 2c: Load Workload File

In [186]:
# Load the workload labels.
workload = pd.read_csv("data/load/workload.csv")
# Swap the '.' separators in the timestamps for ':' so the strings can be
# compared with the HH:MM:SS 'time' strings built from the LSL files.
# regex=False makes '.' a literal dot on every pandas version; interpreted as a
# regular expression, '.' would match (and replace) every character.
workload['timestamp'] = workload['timestamp'].str.replace('.', ':', regex=False)

workload.head()

Unnamed: 0,timestamp,participant_number,trial,condition,accuracy,extrinsic_load,intrinsic_load,condition_factor,last_trial,trial_clipped,response,pred_prob,pred_side,load_label
0,17:06:55,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal
1,17:07:01,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal
2,17:07:13,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal
3,17:07:22,7,1,A3,0,0,1,A3,0,1,0,0.842584,right,overload
4,17:07:29,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal


In [187]:
# Split the workload table into one frame per subject.
# The numeric part of the subject ID ("P07" -> 7) is matched against
# 'participant_number'; a subject whose number does not occur in the workload
# file gets an empty frame.
workload_dict = {
    subject: workload[workload['participant_number'] == int(subject[1:])]
    for subject in demographics
}
workload_dict['P07']


Unnamed: 0,timestamp,participant_number,trial,condition,accuracy,extrinsic_load,intrinsic_load,condition_factor,last_trial,trial_clipped,response,pred_prob,pred_side,load_label
0,17:06:55,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal
1,17:07:01,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal
2,17:07:13,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal
3,17:07:22,7,1,A3,0,0,1,A3,0,1,0,0.842584,right,overload
4,17:07:29,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
157,17:36:12,7,7,S2,0,1,0,S2,1,6,0,0.533406,right,overload
158,17:36:41,7,7,S2,0,1,0,S2,1,6,0,0.533406,right,overload
159,17:36:48,7,7,S2,0,1,0,S2,1,6,0,0.533406,right,overload
160,17:36:52,7,7,S2,0,1,0,S2,1,6,0,0.533406,right,overload


## Sanity Checks

In [185]:
# Spot check: look up the fNIRS row at the first trigger onset of the first subject.
P07, onsets = data['P07'], timings[0]
print(P07.shape)
# First entry of the 'onset' column (column position 1 of the trigger table).
sample = onsets['onset'].iloc[0]
fnirs_sample = P07.iloc[sample]
fnirs_sample.shape

(11521, 178)


(178,)

In [183]:
# Print the number of entries in each per-subject container, one per line.
for container in (timings, data, demographics):
    print(len(container))

23
23
23


In [188]:
# Distinct participant numbers in the workload file, in order of first appearance.
unique_participant_numbers = pd.unique(workload['participant_number'])
print(unique_participant_numbers)

[ 7  1 15 16 17 18 23 24 25]


In [184]:
# NOTE(review): presumably the fNIRS sampling rate in Hz -- confirm against the recording setup.
sampling_rate = 3.8147


def get_onset(timing, sampling_rate):
    """Return `timing` divided by `sampling_rate`.

    Parameters
    ----------
    timing : number or array-like supporting `/`
        Value(s) to rescale (presumably sample indices -- confirm).
    sampling_rate : number
        Divisor applied to `timing`.

    Returns
    -------
    The quotient `timing / sampling_rate`.
    """
    return timing / sampling_rate

# Step 3: Filter LSL --> Align Workload Timestamps with LSL Timestamps --> Output is Combined Dictionary

In [189]:
# Attach the LSL trigger columns ('time', 'onset', 'code') to every workload row
# whose timestamp equals a trigger 'time' string.
# - `timings` is ordered like `demographics`, so the two are walked together.
# - A left merge keeps every workload row; rows without a matching trigger get
#   NaN in the trigger columns. Pre-filtering the triggers is unnecessary because
#   non-matching trigger rows never appear in a left merge.
# - Trigger columns left over from a previous run of this cell are dropped first,
#   so re-running does not merge into an already merged frame (which would
#   produce suffixed 'time_x'/'onset_x' columns).
for subject, triggers in zip(demographics, timings):
    subject_workload = workload_dict[subject].drop(columns=['time', 'onset', 'code'], errors='ignore')
    workload_dict[subject] = subject_workload.merge(
        triggers, left_on='timestamp', right_on='time', how='left'
    )
workload_dict['P07'].head()

Unnamed: 0,timestamp,participant_number,trial,condition,accuracy,extrinsic_load,intrinsic_load,condition_factor,last_trial,trial_clipped,response,pred_prob,pred_side,load_label,time,onset,code
0,17:06:55,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal,17:06:55,4533,103
1,17:07:01,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal,17:07:01,4556,53
2,17:07:13,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal,17:07:13,4600,153
3,17:07:22,7,1,A3,0,0,1,A3,0,1,0,0.842584,right,overload,17:07:22,4634,163
4,17:07:29,7,1,A3,1,0,1,A3,0,1,1,0.842584,right,optimal,17:07:29,4661,153


In [190]:
# Collate fNIRS samples and their load labels into train/test arrays.

# Subjects held out entirely for testing; all other subjects go to training.
TEST_SUBJECTS = ('P07', 'P18')

# fNIRS data
X_test = []
X_train = []
# labels
y_test = []
y_train = []

# iterate through the subjects
for sub in demographics:
    num_samples = 0
    sub_metadata = workload_dict[sub]
    sub_data = data[sub].to_numpy()
    # Pick the destination lists once per subject.
    if sub in TEST_SUBJECTS:
        features, labels = X_test, y_test
    else:
        features, labels = X_train, y_train

    for row in sub_metadata.itertuples():
        # Workload rows without a matching LSL trigger carry a NaN onset after
        # the left merge; NaN cannot be used as an index, so skip them.
        if pd.isna(row.onset):
            continue

        # onset - 1: the onsets are assumed to be 1-based (MATLAB-style) -- TODO confirm.
        sample_idx = int(row.onset) - 1
        # Skip indices outside the recording; a negative index would otherwise
        # silently wrap around to the end of the array.
        if not 0 <= sample_idx < len(sub_data):
            continue

        fnirs_sample = sub_data[sample_idx]
        # Drop samples with any missing channel value.
        if np.isnan(fnirs_sample).any():
            continue

        features.append(fnirs_sample)
        labels.append(row.load_label)
        num_samples += 1
    print(f"Subject {sub} has {num_samples} samples")

X_train = np.array(X_train)
X_test = np.array(X_test)
y_train = np.array(y_train)
y_test = np.array(y_test)

print(X_train.shape)
print(X_test.shape)




Subject P07 has 190 samples
Subject P10 has 0 samples
Subject P11 has 0 samples
Subject P13 has 0 samples
Subject P14 has 0 samples
Subject P16 has 0 samples
Subject P17 has 0 samples
Subject P18 has 198 samples
Subject P22 has 0 samples
Subject P23 has 0 samples
Subject P24 has 230 samples
Subject P25 has 366 samples
Subject P26 has 0 samples
Subject P27 has 0 samples
Subject P28 has 0 samples
Subject P29 has 0 samples
Subject P30 has 0 samples
Subject P31 has 0 samples
Subject P32 has 0 samples
Subject P33 has 0 samples
Subject P34 has 0 samples
Subject P35 has 0 samples
Subject P36 has 0 samples
(596, 178)
(388, 178)


In [191]:
# Change from string labels to ints.
def encode_load_labels(labels):
    """Encode load labels as binary ints: 0 for 'optimal', 1 for anything else.

    Values that are already integers are passed through unchanged, so running
    this cell a second time does not turn every label into 1.

    Parameters
    ----------
    labels : iterable
        Load labels (strings) or previously encoded integers.

    Returns
    -------
    list of int
    """
    encoded = []
    for label in labels:
        if isinstance(label, (int, np.integer)):
            encoded.append(int(label))
        else:
            encoded.append(0 if label == 'optimal' else 1)
    return encoded


y_train = encode_load_labels(y_train)
y_test = encode_load_labels(y_test)


In [192]:
# Train and test feature-matrix shapes, one per line.
print(X_train.shape, X_test.shape, sep="\n")

(596, 178)
(388, 178)


In [193]:
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.model_selection import StratifiedKFold

# The imbalanced-learn Pipeline applies samplers such as SMOTE during fit only,
# which is what allows oversampling to happen inside each cross-validation fold.
from imblearn.pipeline import Pipeline

# Shuffle rows (features and labels together) with a fixed seed.
X_train, y_train = shuffle(X_train, y_train, random_state=1)
X_test, y_test = shuffle(X_test, y_test, random_state=1)

# Train and test sets are separated by participant when the data is collated,
# so no random train_test_split is done here.

# SMOTE lives inside the pipeline rather than being applied to X_train up front:
# GridSearchCV then oversamples only the training part of every fold, so no
# synthetic point derived from a validation sample leaks into model selection.
# https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html
# NOTE(review): features are not standardized. SVMs are sensitive to feature
# scale; a StandardScaler step could be added in front of 'smote' -- TODO decide.
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
model = Pipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('svc', SVC(class_weight='balanced')),
])

# Define parameter options. gamma is ignored by the linear kernel, so it is only
# searched for rbf; this avoids fitting identical linear models several times.
parameters = [
    {'svc__kernel': ['rbf'], 'svc__C': [1, 10], 'svc__gamma': [0.1, 1, 'scale']},
    {'svc__kernel': ['linear'], 'svc__C': [1, 10]},
]
# KFold
cv = StratifiedKFold(n_splits=5, random_state=1, shuffle=True)
# GridSearch to find best hyperparameters
clf = GridSearchCV(model, param_grid=parameters, scoring='accuracy', cv=cv)

# A RandomForestClassifier(max_depth=2, random_state=0) was tried temporarily and
# performed worse (at time of code - status may change).
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
clf.fit(X_train, y_train)

print("Best hyperparameters: ", clf.best_params_)

# Training accuracy is measured on the original (not oversampled) training rows;
# the sampler step is skipped at predict time.
y_train_pred = clf.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))

y_pred = clf.predict(X_test)
print(y_pred)

print("Testing Accuracy:", accuracy_score(y_test, y_pred))

print(classification_report(y_test, y_pred))


