# Step 0. Installing and Loading Modules

## Step 0.1. Installing Modules
We will now download and install the EEG-GAN package


In [None]:
%%capture

!pip install eeggan


## Step 0.2. Loading Modules

In [None]:
#Load EEG-GAN module
from eeggan import train_gan, visualize_gan, generate_samples, setup_tutorial

#Load other modules specific to this notebook
import numpy as np
import matplotlib.pyplot as plt
import shutil
import os
import random as rnd
from scipy import signal
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
import torch

#Text styling helper for console output
class printFormat:
    """ANSI escape codes for styling printed text.

    Prefix a string with a style code and append ``end`` to switch the
    styling off again, e.g. ``printFormat.bold + 'text' + printFormat.end``.
    """
    end = '\033[0m'     #Reset all styling
    italic = '\033[3m'  #Italic text
    bold = '\033[1m'    #Bold text

# Step 1. EEG Data

## Step 1.1. Load Data
We will load the provided EEG training data and print some information about what this contains.

In [None]:
#Load the data
trainPath = '/content/train.csv'

#Column names come from the header line alone, so read just the header plus one data row
#(max_rows=1) instead of parsing the whole file a second time
empiricalHeaders = np.genfromtxt(trainPath, delimiter=',', names=True, max_rows=1).dtype.names

#Numeric contents of the file, skipping the header line
empiricalEEG = np.genfromtxt(trainPath, delimiter=',', skip_header=1)

#Print the head of the data
print(printFormat.bold + 'Display Header and first few rows/columns of data\n \033[0m' + printFormat.end)
print(empiricalHeaders[:6])     #First six column names
print(empiricalEEG[0:3,:6])     #First three rows of those six columns
print(empiricalEEG.shape)       #(number of rows, number of columns)

# Step 2. GAN

## Step 2.1. Training the GAN

To train the GAN, we will be using the following arguments:
- <b>path_dataset="path_to_dataset"</b> : Determines the training dataset
- <b>n_epochs=500</b> : Determines the number of epochs the GAN is trained for<br>

*Note: If the **ddp** argument is provided, GANs will be trained using GPUs rather than CPUs*

In [None]:
#Collect the training options in a dictionary
argv = {
    "ddp": True,                    #See the note above on the ddp argument
    "path_dataset": "train.csv",    #Training dataset
    "batch_size": 32,
    "n_epochs": 500,
}

#Train the GAN with these options
train_gan(argv)

## Step 2.2. Generating Synthetic Data

We will be using the following arguments:
- <b> file="model.pt" </b> : Determines which model to use<br>
- <b> path_samples="target_csv" </b> : Where and what to save the generated samples as
- <b> num_samples_total=5000 </b> : Number of samples to generate (half per condition)

In [None]:
#Locate the trained model.
#The model file name appears to end in a training timestamp (YYYYMMDD_HHMMSS), so a fixed
#name only matches one particular training run. Prefer the newest 500-epoch model found in
#/content and fall back to the originally referenced file when none is found.
modelDir = "/content"
modelFile = "/content/gan_ddp_500ep_20240523_142218.pt"
if os.path.isdir(modelDir):
    trainedModels = sorted(
        name for name in os.listdir(modelDir)
        if name.startswith("gan_ddp_500ep_") and name.endswith(".pt")
    )
    if trainedModels:
        #A YYYYMMDD_HHMMSS suffix sorts chronologically, so the last entry is the newest
        modelFile = os.path.join(modelDir, trainedModels[-1])
print('Generating samples with model: ' + modelFile)

argv = dict(
    file = modelFile,                                   #Trained model to generate from
    path_samples = "/content/500epoch-synthetic.csv",   #Where the synthetic samples are written
    num_samples_total = 5000                            #Total number of samples to generate
)

generate_samples(argv)

# Step 3. Synthetic Data

## Step 3.1. Load Data
We will now load the synthetic data we just produced, and confirm the number of samples per condition

In [None]:
#Load Data (the first line of the CSV is a header and is skipped)
syntheticEEG = np.genfromtxt('/content/500epoch-synthetic.csv', delimiter=',', skip_header=1)

#Print head of the data
headerPreview = ['Condition','Time1','Time2','Time3','Time4','Time5']
print(printFormat.bold + 'Display first few rows/columns of data' + printFormat.end)
print(headerPreview)
#Show the first three rows of the same columns that are named in the header preview
#(previously only three values of a single row were printed under a six-column header)
print(syntheticEEG[0:3, 0:len(headerPreview)])

#Print condition sample counts (column 0 holds the condition code)
print('\n' + printFormat.bold + 'Display trial counts for each condition' + printFormat.end)
print(printFormat.bold +'Win: ' + printFormat.end + str(np.sum(syntheticEEG[:,0]==0)))
print(printFormat.bold +'Lose: ' + printFormat.end + str(np.sum(syntheticEEG[:,0]==1)))

# Step 4. Classification

## Step 4.1. Preparing Validation Data
We also provide a validation dataset with samples not contained in the empirical dataset. Here, we prepare them for classification.

In [None]:
#Set seeds for a bit of reproducibility
#rnd drives the sample shuffling in the data preparation cells; NumPy's global generator is
#what scikit-learn estimators fall back on when no random_state is given
rnd.seed(1618)
np.random.seed(1618)


#Load test data to predict (data that neither the GAN nor the classifier will ever see in training)
EEGDataTest = np.genfromtxt('/content/test.csv', delimiter=',', skip_header=1)

#Extract test outcome (column 0) and predictor data (remaining columns)
y_test = EEGDataTest[:,0]
x_test = EEGDataTest[:,1:]

#Scale each sample (row) on its own, as axis=1 standardizes along the columns of a row
x_test = scale(x_test,axis = 1)
print(x_test.shape)

## Step 4.2. Preparing Empirical Data
We now prepare the empirical training set.

In [None]:
#Reload the empirical training set (the header line is skipped)
empiricalEEG = np.genfromtxt('train.csv', delimiter=',', skip_header=1)

#Draw a random ordering of the sample (row) indices
nEmpirical = empiricalEEG.shape[0]
trainShuffle = rnd.sample(range(nEmpirical), nEmpirical)

#Outcomes are in column 0; apply the random ordering
Emp_Y_train = empiricalEEG[:,0][trainShuffle]

#Predictors are the remaining columns, scaled within each sample (axis=1), then reordered
Emp_X_train = scale(empiricalEEG[:,1:], axis=1)[trainShuffle,:]
print(Emp_X_train.shape)

## Step 4.3. Preparing Augmented Data
We will prepare the augmented dataset by first processing the synthetic data as we did with the empirical data, then combining the empirical and synthetic datasets to create an augmented dataset.

In [None]:
#Use the synthetic samples loaded earlier as the synthetic training set
Syn_train = syntheticEEG

#Outcomes are in column 0; predictors are the remaining columns, scaled within each sample
Syn_Y_train = Syn_train[:,0]
Syn_X_train = scale(Syn_train[:,1:], axis=1)

#Build the augmented dataset: empirical samples first, synthetic samples appended after them
Aug_Y_train = np.concatenate([Emp_Y_train, Syn_Y_train])
Aug_X_train = np.concatenate([Emp_X_train, Syn_X_train])

#Apply one random ordering to outcomes and predictors alike
nAugmented = len(Aug_X_train)
trainShuffle = rnd.sample(range(nAugmented), nAugmented)
Aug_Y_train, Aug_X_train = Aug_Y_train[trainShuffle], Aug_X_train[trainShuffle,:]

# Step 5. Support Vector Machine

## Step 5.1. Define Search Space

In [None]:
#Hyperparameter values the SVM grid search will try in every combination
param_grid_SVM = [dict(
    C=[0.1, 1, 10, 100],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf', 'poly', 'sigmoid'],
)]

## Step 5.2. Classify Empirical Data

In [None]:
#Start a fresh list of SVM accuracies (the empirical score is stored first)
predictionScores_SVM = []

#Grid search over the SVM search space; refit=True retrains the best setting on all training data
optimal_params = GridSearchCV(estimator=SVC(), param_grid=param_grid_SVM, refit=True, verbose=False)

#Train on the empirical samples, then predict the held-out test samples
optimal_params.fit(Emp_X_train, Emp_Y_train)
SVMOutput = optimal_params.predict(x_test)

#Record the test accuracy as a whole-number percentage
predictResults = classification_report(y_test, SVMOutput, output_dict=True)
empiricalAccuracy_SVM = predictResults['accuracy']
predictionScores_SVM.append(round(empiricalAccuracy_SVM*100))

## Step 5.3. Classify Augmented Data

In [None]:
#Setup SVM grid search
optimal_params = GridSearchCV(
    SVC(),
    param_grid_SVM,
    refit = True,
    verbose = False)

#Conduct classification: train on the augmented samples, predict the held-out test samples
optimal_params.fit(Aug_X_train, Aug_Y_train)
SVMOutput = optimal_params.predict(x_test)

#Determine performance
predictResults = classification_report(y_test, SVMOutput, output_dict=True)
#Store the augmented score at index 1, replacing anything left there by an earlier run of
#this cell. With a plain append, re-running the cell would leave the stale value at index 1,
#and that stale value is what gets reported below.
predictionScores_SVM[1:] = [round(predictResults['accuracy']*100)]

#Report results
print('Empirical Classification Accuracy: ' + str(predictionScores_SVM[0]) + '%')
print('Augmented Classification Accuracy: ' + str(predictionScores_SVM[1]) + '%')

# Step 6. Neural Network

## Step 6.1. Define Search Space

In [None]:
#Hyperparameter values the neural network grid search will try in every combination
param_grid_NN = [dict(
    hidden_layer_sizes=[(25,), (50,), (25, 25), (50, 50), (50, 25, 50)],
    activation=['logistic', 'tanh', 'relu'],
    solver=['sgd', 'adam'],
    alpha=[0.0001, 0.05],
    learning_rate=['constant', 'invscaling', 'adaptive'],
    max_iter=[10000],
)]

## Step 6.2. Classify Empirical Data

In [None]:
#Signify computational time
print('This may take a few minutes...')

#Setup tracking variable (the empirical score is stored first)
predictionScores_NN = []

#Setup neural network grid search
optimal_params = GridSearchCV(
    MLPClassifier(),
    param_grid_NN,
    verbose = True,
    n_jobs = -1)

#Conduct classification
optimal_params.fit(Emp_X_train, Emp_Y_train);
#GridSearchCV refits the best configuration on the full training set once the search is done
#(refit defaults to True), so reuse that fitted network instead of building and training a
#second one with the same parameters. This matches how the SVM and logistic regression cells
#obtain their predictions.
neuralNetOutput = optimal_params.best_estimator_
y_true, y_pred = y_test , neuralNetOutput.predict(x_test)

#Determine performance as a whole-number percentage accuracy
predictResults = classification_report(y_true, y_pred, output_dict=True)
predictScore = round(predictResults['accuracy']*100)
predictionScores_NN.append(predictScore)

## Step 6.3. Classify Augmented Data

In [None]:
#Signify computational time
print('This may take twice as long as the empirical neural network classification...')

#Setup neural network grid search
optimal_params = GridSearchCV(
    MLPClassifier(),
    param_grid_NN,
    verbose = True,
    n_jobs = -1)

#Conduct classification
optimal_params.fit(Aug_X_train, Aug_Y_train);
#GridSearchCV refits the best configuration on the full training set once the search is done
#(refit defaults to True), so reuse that fitted network instead of building and training a
#second one with the same parameters.
neuralNetOutput = optimal_params.best_estimator_
y_true, y_pred = y_test , neuralNetOutput.predict(x_test)

#Determine performance as a whole-number percentage accuracy
predictResults = classification_report(y_true, y_pred, output_dict=True)
predictScore = round(predictResults['accuracy']*100)
#Store the augmented score at index 1, replacing anything left there by an earlier run of
#this cell (a plain append would leave the stale value at index 1 on a re-run)
predictionScores_NN[1:] = [predictScore]

#Report results
print('Empirical Classification Accuracy: ' + str(predictionScores_NN[0]) + '%')
print('Augmented Classification Accuracy: ' + str(predictionScores_NN[1]) + '%')

# Step 7. Logistic Regression

## Step 7.1. Define Search Space

In [None]:
# Hyperparameter values the logistic regression grid search will try in every combination
param_grid_LR = [dict(
    penalty=['l1', 'l2'],
    C=np.logspace(-4, 3, num=20),   # 20 log-spaced values from 1e-4 to 1e3
    solver=['liblinear'],
)]

## Step 7.2. Classify Empirical Data

In [None]:
# Start a fresh list of logistic regression accuracies (the empirical score is stored first)
predictionScores_LR = []

# Grid search over the logistic regression search space; refit=True retrains the best setting
optimal_params_emp = GridSearchCV(estimator=LogisticRegression(), param_grid=param_grid_LR, refit=True, verbose=False)

# Train on the empirical samples, then predict the held-out test samples
optimal_params_emp.fit(Emp_X_train, Emp_Y_train)
LROutput_emp = optimal_params_emp.predict(x_test)

# Record the test accuracy as a whole-number percentage
predictResults_emp = classification_report(y_test, LROutput_emp, output_dict=True)
empiricalAccuracy_LR = predictResults_emp['accuracy']
predictionScores_LR.append(round(empiricalAccuracy_LR*100))

## Step 7.3. Classify Augmented Data

In [None]:
# Setup Logistic Regression grid search for augmented data
optimal_params_aug = GridSearchCV(
    LogisticRegression(),
    param_grid_LR,
    refit=True,
    verbose=False
)

# Conduct classification for augmented data: train on augmented samples, predict the test samples
optimal_params_aug.fit(Aug_X_train, Aug_Y_train)
LROutput_aug = optimal_params_aug.predict(x_test)

# Determine performance for augmented data
predictResults_aug = classification_report(y_test, LROutput_aug, output_dict=True)
# Store the augmented score at index 1, replacing anything left there by an earlier run of
# this cell (a plain append would leave the stale value at index 1 on a re-run)
predictionScores_LR[1:] = [round(predictResults_aug['accuracy']*100)]

# Report results
print('Empirical Classification Accuracy: ' + str(predictionScores_LR[0]) + '%')
print('Augmented Classification Accuracy: ' + str(predictionScores_LR[1]) + '%')

# Step 8. Final Report

## Step 8.1. Present Classification Performance
We present the performance accuracies in text.

In [None]:
#Report results: one section per classifier, each listing empirical then augmented accuracy
#Each entry is (text printed before the title, section title, [empirical score, augmented score])
reportSections = [
    ('', 'SVM Classification Results:', predictionScores_SVM),
    ('\n', 'Neural Network Classification Results:', predictionScores_NN),
    ('\n', 'Logistic Regression Classification Results:', predictionScores_LR),
]

for sectionLead, sectionTitle, sectionScores in reportSections:
    print(sectionLead + printFormat.bold + sectionTitle + printFormat.end)
    print('Empirical Classification Accuracy: ' + str(sectionScores[0]) + '%')
    print('Augmented Classification Accuracy: ' + str(sectionScores[1]) + '%')

#Closing caveat
print('\n' + printFormat.italic + 'Note: Due to randomization in this process, these accuracies will vary.'+ printFormat.end)


## Step 8.2. Plot Classification Performance
We present the performance accuracies in a plot.

In [None]:
# Plot the accuracies computed in Steps 5-7.
# The score lists are used as computed; hard-coded placeholder values previously overwrote
# them here, so the figure never showed the real results.

# Combine all prediction scores in the order
# [SVM empirical, SVM augmented, NN empirical, NN augmented, LR empirical, LR augmented].
# Only the first two entries of each list are used, so the text labels stay aligned with the bars.
predictionScores = predictionScores_SVM[:2] + predictionScores_NN[:2] + predictionScores_LR[:2]

# Plotting: empirical bars sit just left of each tick, augmented bars just right
ax = plt.subplot(111)
plt.bar([.9, 1.9, 2.9], [predictionScores_SVM[0], predictionScores_NN[0], predictionScores_LR[0]], width=.2)
plt.bar([1.1, 2.1, 3.1], [predictionScores_SVM[1], predictionScores_NN[1], predictionScores_LR[1]], width=.2)
# Leave headroom above the tallest bar for the text labels and legend
plt.ylim([0, round((np.max(predictionScores) + 20) / 10) * 10])

# Adding text labels for the bars (x positions follow the order of predictionScores)
for xi, x in enumerate([.86, 1.06, 1.86, 2.06, 2.86, 3.06]):
    plt.text(x, predictionScores[xi] + 1, str(predictionScores[xi]) + '%')

# Setting the axis labels
plt.xticks([1, 2, 3], labels=['SVM', 'Neural Network', 'Logistic Regression'])
ax.set_ylabel('Classification Accuracy (%)')

# Adding the legend (order matches the two plt.bar calls above)
plt.legend(['Empirical', 'Augmented'], loc='upper right', frameon=False)

# Hiding the right and top spines
ax.spines[['right', 'top']].set_visible(False)

# Show plot
plt.show()
