# Preprocessing

In [10]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
# Importing the dataset
train_validate = pd.read_csv("../Data/Train_and_Validate_EEG.csv")

# One-hot encoding of y-values
train_validate = pd.get_dummies(train_validate, columns=['main.disorder'], dtype=int)

# Drop columns that are not needed, then every column that still contains a NaN.
# The chain returns a new frame instead of mutating in place.
train_validate = (
    train_validate
    .drop(columns=['specific.disorder', 'sex', 'eeg.date', 'age', "ID"])
    .dropna(axis=1)
)

# Normalize (z-score) the data in columns starting with AB and COH.
# A prefix match is used so that no other column (e.g. a one-hot label column whose
# name merely contains these letters) can be normalized by accident.
norm_cols = [col for col in train_validate.columns if col.startswith(('AB', 'COH'))]
col_mean = train_validate[norm_cols].mean()
# A constant column has std == 0; dividing by it would turn the whole column into NaN.
# Substituting 1 leaves such a column at 0 after centering.
col_std = train_validate[norm_cols].std().replace(0, 1)
# NOTE(review): mean/std are computed over every row of train_validate, so any rows
# later held out for validation contribute to these statistics — confirm this is intended.
train_validate[norm_cols] = (train_validate[norm_cols] - col_mean) / col_std

# Print the first 5 rows of the dataset
train_validate.head()

Unnamed: 0,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,AB.A.delta.d.F3,AB.A.delta.e.Fz,AB.A.delta.f.F4,AB.A.delta.g.F8,AB.A.delta.h.T3,AB.A.delta.i.C3,AB.A.delta.j.Cz,...,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2,main.disorder_Addictive disorder,main.disorder_Anxiety disorder,main.disorder_Healthy control,main.disorder_Mood disorder,main.disorder_Obsessive compulsive disorder,main.disorder_Schizophrenia,main.disorder_Trauma and stress related disorder
0,2.491668,2.031052,0.807025,1.214992,1.252584,1.893088,0.802875,0.45903,0.977225,1.867991,...,-0.400901,0.441381,-0.289018,0,0,0,1,0,0,0
1,1.203945,0.122913,0.284084,-0.087321,-0.321349,-0.348295,0.655359,-0.52692,-0.365307,-0.151768,...,0.050242,-0.590623,0.343119,0,0,1,0,0,0,0
2,0.939692,0.555656,1.257185,3.211095,0.764717,2.170673,0.884929,1.55796,1.252847,1.811592,...,0.683425,0.025166,1.188447,0,0,0,0,0,1,0
3,-0.526101,-0.593076,-0.695376,-0.559602,-0.596109,-0.771567,-0.908414,-0.533576,-0.711187,-0.675008,...,-0.442123,-0.026292,-0.11434,0,0,0,0,1,0,0
4,0.125161,0.211484,0.172217,0.633887,0.443298,0.290648,0.256593,0.954082,0.622803,0.519063,...,-1.171757,-1.888873,-0.810426,0,0,1,0,0,0,0


In [12]:
# Splitting the dataset into the Training set and Validation set
# Targets are the one-hot columns produced from 'main.disorder'; features are everything else.
y = train_validate.filter(regex='main.disorder.*')
X = train_validate.drop(y.columns, axis=1)

# Sequential (unshuffled) split: the first 80% of rows train, the remaining rows validate.
split_at = int(0.8 * len(X))
X_train, X_validate = X.iloc[:split_at], X.iloc[split_at:]
y_train, y_validate = y.iloc[:split_at], y.iloc[split_at:]
X.head()

Unnamed: 0,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,AB.A.delta.d.F3,AB.A.delta.e.Fz,AB.A.delta.f.F4,AB.A.delta.g.F8,AB.A.delta.h.T3,AB.A.delta.i.C3,AB.A.delta.j.Cz,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
0,2.491668,2.031052,0.807025,1.214992,1.252584,1.893088,0.802875,0.45903,0.977225,1.867991,...,0.569227,0.364459,-0.401448,0.044529,0.665815,-0.427752,0.324128,-0.400901,0.441381,-0.289018
1,1.203945,0.122913,0.284084,-0.087321,-0.321349,-0.348295,0.655359,-0.52692,-0.365307,-0.151768,...,0.09862,-0.343466,0.892017,0.34858,-1.14645,0.582823,-0.004108,0.050242,-0.590623,0.343119
2,0.939692,0.555656,1.257185,3.211095,0.764717,2.170673,0.884929,1.55796,1.252847,1.811592,...,0.992761,-0.154239,0.308933,0.842628,-0.752374,0.824264,0.695554,0.683425,0.025166,1.188447
3,-0.526101,-0.593076,-0.695376,-0.559602,-0.596109,-0.771567,-0.908414,-0.533576,-0.711187,-0.675008,...,0.254283,-0.643903,-0.026028,-0.355933,-0.014185,-0.329801,-0.022343,-0.442123,-0.026292,-0.11434
4,0.125161,0.211484,0.172217,0.633887,0.443298,0.290648,0.256593,0.954082,0.622803,0.519063,...,-2.387585,-1.725329,-1.317361,-1.530363,-1.825926,-1.059648,-1.642119,-1.171757,-1.888873,-0.810426


In [13]:
# Write each split to its own CSV file; index=False keeps the row index out of the files.
for frame, path in (
    (X_train, "../Data/X_train.csv"),
    (X_validate, "../Data/X_validate.csv"),
    (y_train, "../Data/y_train.csv"),
    (y_validate, "../Data/y_validate.csv"),
):
    frame.to_csv(path, index=False)