# Sample data preprocessing script

This script prepares the sample data used for debugging. The data is the EEG dataset hosted on [Kaggle](https://www.kaggle.com/datasets/wanghaohan/confused-eeg).

**References:**
Wang, H., Li, Y., Hu, X., Yang, Y., Meng, Z., & Chang, K. M. (2013, June). Using EEG to Improve Massive Open Online Courses Feedback Interaction. In AIED Workshops.

In [122]:
import pandas as pd
import numpy as np

# Input: the CSV downloaded from Kaggle. Output: where the processed copy is written.
DATA_PATH = '~/Documents/Thesis/sample_data/EEG_data.csv'
SAVE_PATH = '~/Git/thesis-repo/data/sample/data.csv'

# How many segments each participant/video recording is split into
N_SEGMENTS = 4

# Columns that get summarised per segment (names spelled exactly as in the CSV header)
NUMERIC_FEATURES = [
    'Attention', 'Mediation', 'Raw',
    'Delta', 'Theta',
    'Alpha1', 'Alpha2',
    'Beta1', 'Beta2',
    'Gamma1', 'Gamma2',
]

In [123]:
# Read the raw CSV found at DATA_PATH into a DataFrame
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
# Preview the first five rows
df.head(n=5)

Unnamed: 0,SubjectID,VideoID,Attention,Mediation,Raw,Delta,Theta,Alpha1,Alpha2,Beta1,Beta2,Gamma1,Gamma2,predefinedlabel,user-definedlabeln
0,0.0,0.0,56.0,43.0,278.0,301963.0,90612.0,33735.0,23991.0,27946.0,45097.0,33228.0,8293.0,0.0,0.0
1,0.0,0.0,40.0,35.0,-50.0,73787.0,28083.0,1439.0,2240.0,2746.0,3687.0,5293.0,2740.0,0.0,0.0
2,0.0,0.0,47.0,48.0,101.0,758353.0,383745.0,201999.0,62107.0,36293.0,130536.0,57243.0,25354.0,0.0,0.0
3,0.0,0.0,47.0,57.0,-5.0,2012240.0,129350.0,61236.0,17084.0,11488.0,62462.0,49960.0,33932.0,0.0,0.0
4,0.0,0.0,44.0,53.0,-8.0,1005145.0,354328.0,37102.0,88881.0,45307.0,99603.0,44790.0,29749.0,0.0,0.0


In [124]:
# Write the dataset to SAVE_PATH without the 'predefinedlabel' column
# NOTE(review): to_csv also writes the row index as an unnamed first column by
# default - confirm that whatever loads this file expects it.
df.drop(labels='predefinedlabel', axis='columns').to_csv(path_or_buf=SAVE_PATH)

## Plan for preprocessing

1. Take the data and group it by participant / video
2. Split each group into n segments (n is set by `N_SEGMENTS`)
3. Compute 5 second-order parameters (min, max, mean, std, median) for each numeric feature in each segment (if n = 1, compute them over the entire watched video)
4. Make the train/validation split

The idea is to do that in the `dataloader` file instead of here, so that the preprocessing can be integrated into the pipeline and configured within the program rather than in this script.

In [77]:
# One group per (participant, video, user-defined label) combination
df1 = df.groupby(['SubjectID', 'VideoID', 'user-definedlabeln'], as_index=False)

<bound method GroupBy.head of <pandas.core.groupby.generic.DataFrameGroupBy object at 0x12306c760>>

In [111]:
# Build one row of summary statistics per group and per segment.
# Reads df1, N_SEGMENTS and NUMERIC_FEATURES defined in earlier cells.
STATISTICS = ['min', 'max', 'mean', 'std', 'median']

# Rows are gathered in a plain list and turned into a DataFrame once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies every earlier row
# on each iteration.
segment_rows = []

for group in df1.groups:
    group_frame = df1.get_group(group)

    # Split the row *positions* rather than the DataFrame itself: np.array_split
    # on a DataFrame goes through DataFrame.swapaxes, which is deprecated in
    # recent pandas releases. The chunk boundaries are the same either way.
    for positions in np.array_split(np.arange(len(group_frame)), N_SEGMENTS):
        segment = group_frame.iloc[positions]

        # Result has one row per statistic and one column per feature
        stats = segment[NUMERIC_FEATURES].agg(STATISTICS)

        # Flatten to a single Series labelled '<statistic>_<feature>'
        row = stats.stack()
        row.index = ['_'.join(pair) for pair in row.index.values]

        # group is the (SubjectID, VideoID, user-definedlabeln) key, so position 2 is the label
        row = pd.concat([row, pd.Series([group[2]], index = ['user-definedlabeln'])])
        segment_rows.append(row)

df2 = pd.DataFrame(segment_rows).reset_index(drop = True)

df2

Unnamed: 0,min_Attention,min_Mediation,min_Raw,min_Delta,min_Theta,min_Alpha1,min_Alpha2,min_Beta1,min_Beta2,min_Gamma1,...,median_Raw,median_Delta,median_Theta,median_Alpha1,median_Alpha2,median_Beta1,median_Beta2,median_Gamma1,median_Gamma2,user-definedlabeln
0,20.0,21.0,-156.0,33290.0,7200.0,1439.0,809.0,1277.0,3186.0,3266.0,...,25.0,622906.5,131288.5,35418.5,21702.5,17617.5,40962.5,42807.5,20183.0,0.0
1,51.0,34.0,-156.0,2290.0,1945.0,512.0,1337.0,1257.0,4455.0,2097.0,...,42.5,90538.0,21862.5,7753.0,12375.5,11447.0,30614.0,20850.0,10438.0,0.0
2,48.0,38.0,-197.0,2444.0,3495.0,1813.0,774.0,1710.0,3596.0,1915.0,...,41.0,160402.5,46055.0,8541.5,12171.0,10291.5,28849.0,23470.0,13145.0,0.0
3,13.0,17.0,-186.0,93861.0,6062.0,374.0,1389.0,1904.0,3279.0,6189.0,...,71.0,1019599.5,137944.0,34598.5,22235.0,22888.5,32780.5,43892.5,14921.5,0.0
4,7.0,13.0,-89.0,33577.0,121.0,25.0,317.0,648.0,519.0,214.0,...,33.0,398425.0,100926.0,18535.0,13611.0,12836.0,16821.0,20912.0,6949.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,35.0,21.0,-167.0,39865.0,3939.0,438.0,611.0,1240.0,2873.0,1495.0,...,36.0,580937.0,91337.0,20016.0,11108.0,12512.0,36884.0,27715.0,10922.0,1.0
396,23.0,34.0,-669.0,18318.0,7658.0,2419.0,419.0,684.0,1120.0,756.0,...,13.0,733329.0,88159.0,21898.0,26124.0,12792.0,39615.0,8322.0,2073.0,0.0
397,14.0,21.0,-722.0,113373.0,2386.0,507.0,810.0,1094.0,3844.0,209.0,...,52.0,805048.0,47221.0,13440.0,17890.0,6864.0,21679.0,12050.0,1194.0,0.0
398,20.0,16.0,-518.0,25283.0,3941.0,618.0,1624.0,492.0,1252.0,1077.0,...,-138.5,726881.0,76627.0,23137.0,22216.0,11156.5,28916.0,16065.0,2411.5,0.0


In [117]:
# With a single segment no splitting is needed: aggregate each group directly.
# NOTE(review): the flattened names here are '<feature>_<statistic>' and the label
# column ends up as 'user-definedlabeln_', which differs from the
# '<statistic>_<feature>' / 'user-definedlabeln' naming used for df2 - confirm
# which convention the dataloader should follow.
summary_stats = ['min', 'max', 'mean', 'std', 'median']

df3 = df1[NUMERIC_FEATURES].agg(summary_stats)
df3 = df3.reset_index()

# Collapse the two-level column labels into single strings joined by '_'
df3.columns = df3.columns.map('_'.join)

# The participant and video identifiers are not kept in the result
df3 = df3.drop(columns = ['SubjectID_', 'VideoID_'])

df3

Unnamed: 0,user-definedlabeln_,Attention_min,Attention_max,Attention_mean,Attention_std,Attention_median,Mediation_min,Mediation_max,Mediation_mean,Mediation_std,...,Gamma1_min,Gamma1_max,Gamma1_mean,Gamma1_std,Gamma1_median,Gamma2_min,Gamma2_max,Gamma2_mean,Gamma2_std,Gamma2_median
0,0.0,13.0,100.0,55.256944,22.838058,57.0,17.0,91.0,53.826389,13.480538,...,1915.0,222111.0,40729.284722,37816.854855,29328.5,1175.0,84108.0,16817.062500,13992.645154,13104.5
1,1.0,7.0,77.0,43.621429,14.953870,47.0,13.0,81.0,48.621429,12.456236,...,214.0,141042.0,36758.700000,28912.503649,29606.5,81.0,84001.0,14519.407143,12589.073301,10958.5
2,1.0,3.0,88.0,43.978873,14.095152,43.5,8.0,77.0,47.316901,13.732175,...,709.0,164217.0,33908.873239,29147.639460,27109.0,524.0,66255.0,14545.845070,12391.644533,10977.5
3,0.0,1.0,100.0,51.057377,21.949717,52.0,16.0,81.0,51.844262,12.607239,...,3562.0,227196.0,41438.213115,38480.599044,31548.0,1018.0,112579.0,16558.631148,16304.126506,11812.5
4,0.0,1.0,100.0,55.224138,21.865234,53.0,17.0,81.0,47.474138,12.186565,...,3623.0,181573.0,36024.818966,28085.205743,28606.0,947.0,49188.0,14752.655172,10755.786413,11745.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,0.0,100.0,49.813008,20.175602,51.0,0.0,100.0,41.048780,24.734940,...,3099.0,340048.0,54069.455285,57115.599397,32448.0,937.0,138218.0,22656.227642,23485.282390,15022.0
96,0.0,1.0,56.0,29.620690,12.581322,27.0,20.0,84.0,53.293103,13.461712,...,204.0,179242.0,17276.931034,24510.165244,8854.0,117.0,26351.0,3854.594828,4627.463411,2260.0
97,0.0,4.0,57.0,27.633929,11.180739,26.0,8.0,81.0,48.366071,16.646291,...,367.0,365456.0,18249.267857,37434.650176,8817.5,163.0,54829.0,2697.937500,5542.286492,1402.5
98,1.0,10.0,100.0,55.548387,19.858587,56.0,4.0,87.0,52.879032,14.145925,...,1495.0,184525.0,45424.588710,40247.066360,32191.5,685.0,106159.0,17554.927419,16472.015893,11894.5
