In [3]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import classification_report

from imblearn.over_sampling import SMOTE
import lightgbm as lgb


In [4]:
data = pd.read_csv("../Data/Train_and_Validate_EEG.csv")

In [5]:
data.dropna(inplace=True, axis=1) # Remove columns with NaN values

main_disorder_names = np.unique(data[['main.disorder']]).tolist()
specific_disorder_names = np.unique(data[['specific.disorder']]).tolist()

In [6]:
#encode disorders, and sex
encoder = OrdinalEncoder()

encoder.fit(data[['sex']])
data['sex']= encoder.transform(data[['sex']])
encoder.fit(data[['main.disorder']])
data['main.disorder']= encoder.transform(data[['main.disorder']])

encoder.fit(data[['specific.disorder']])
data['specific.disorder']= encoder.transform(data[['specific.disorder']])

In [7]:
data.head()

Unnamed: 0,ID,sex,age,eeg.date,main.disorder,specific.disorder,AB.A.delta.a.FP1,AB.A.delta.b.FP2,AB.A.delta.c.F7,AB.A.delta.d.F3,...,COH.F.gamma.o.Pz.p.P4,COH.F.gamma.o.Pz.q.T6,COH.F.gamma.o.Pz.r.O1,COH.F.gamma.o.Pz.s.O2,COH.F.gamma.p.P4.q.T6,COH.F.gamma.p.P4.r.O1,COH.F.gamma.p.P4.s.O2,COH.F.gamma.q.T6.r.O1,COH.F.gamma.q.T6.s.O2,COH.F.gamma.r.O1.s.O2
0,1,0.0,31.55,2016.4.7,3.0,5.0,48.84389,46.533704,25.924618,30.221841,...,84.825697,62.094883,49.639937,61.347225,81.63063,39.464753,72.16858,30.893891,74.038603,51.237679
1,2,1.0,25.0,2018.3.23,2.0,6.0,34.108015,22.838567,20.646824,18.203362,...,77.398455,48.282117,73.185133,66.742485,49.466312,59.270167,66.639938,40.297206,55.547526,63.630547
2,3,1.0,26.2,2015.4.10,5.0,10.0,31.084064,28.212342,30.467865,48.643,...,91.510025,51.974239,62.571143,75.509166,56.460421,64.001966,78.424703,53.49483,66.581021,80.202968
3,4,1.0,21.0,2016.5.13,4.0,7.0,14.310468,13.947459,10.7616,13.84488,...,79.855172,42.420095,56.473788,54.241183,69.561898,41.384408,66.33278,30.034691,65.659003,54.662177
4,5,0.0,28.15,2018.7.14,2.0,6.0,21.763093,23.938428,19.517805,24.859077,...,38.160508,21.319727,32.967416,33.401386,37.406879,27.08076,39.050046,14.826695,32.28594,41.015604


In [8]:
#drop, ID, date, and specific disorder
data = data.drop(columns =['ID', 'eeg.date','specific.disorder'], axis =1)
norm_cols = [col for col in data.columns if 'AB' in col or 'COH' in col]
data[norm_cols] = (data[norm_cols] - data[norm_cols].mean()) / data[norm_cols].std()

#split X and y datasets
X= data.drop(columns=['main.disorder'], axis=1)
y = data['main.disorder']

In [9]:
#PCA feature extraction
pca = PCA()
pca.fit(X)

pca = PCA(n_components=300)
X = pca.fit_transform(X)


In [10]:
#smote to balance disorder classes
smote = SMOTE(sampling_strategy ='all', random_state = 42)
X_resampled, y_resampled = smote.fit_resample(X, y)


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled , test_size =.2)

In [12]:
def run_model(model):
    model.fit(X_train, y_train)
    test_data_prediction = model.predict(X_test)

    cr= classification_report(y_test, test_data_prediction, target_names=main_disorder_names, digits=4)

    print(cr)


In [13]:
model = lgb.LGBMClassifier()
run_model(model)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002854 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76500
[LightGBM] [Info] Number of data points in the train set: 1400, number of used features: 300
[LightGBM] [Info] Start training from score -1.931022
[LightGBM] [Info] Start training from score -1.971228
[LightGBM] [Info] Start training from score -1.971228
[LightGBM] [Info] Start training from score -2.002481
[LightGBM] [Info] Start training from score -1.971228
[LightGBM] [Info] Start training from score -1.901893
[LightGBM] [Info] Start training from score -1.878252
                                    precision    recall  f1-score   support

                Addictive disorder     0.6200    0.6596    0.6392        47
                  Anxiety disorder     0.9375    0.8182    0.8738        55
                   Healthy control     0.8254    0.9455    0.8814        55
                     Mood d