In [3]:
"""
Written by, 
Sriram Ravindran, sriram@ucsd.edu

Original paper - https://arxiv.org/abs/1611.08024

Please reach out to me if you spot an error.
"""

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
from sklearn.preprocessing import StandardScaler

In [4]:
# load the dataframe from the pickle file
import pickle
dir = "C:/Users/gusta/OneDrive/Skrivebord/KI & Data/Bachelor/LegeData"
with open(f"{dir}/dataframe.pkl", "rb") as f:
    df = pickle.load(f)

# keep these channels only, these are the indexes: 
""" 
Fp1 -> 0
Fp2 -> 33
F3  -> 4
F4  -> 38
C3  -> 12
C4  -> 48
P3  -> 20
P4  -> 55
O1  -> 26
O2  -> 61
F7  -> 6
F8  -> 40
T7  -> 14
T8  -> 50
P7  -> 22
P8  -> 57
Fz  -> 36
Cz  -> 46
Pz  -> 30 

but add 1 to each index, since the first channel is channel_1
"""

df = df[["channel_1", "channel_34", "channel_5", "channel_39", "channel_13", "channel_49", "channel_21", "channel_56", "channel_27", "channel_62", "channel_7", "channel_41", "channel_15", "channel_51", "channel_23", "channel_58", "channel_37", "channel_47", "channel_31", "label"]]

patient_ids = np.repeat([1,2,3,4,5,6,7,8,9,10],76288)  # Make sure to have this aligned with your epochs/labels

# Normalize per patient (within training and test sets)
data = df.drop("label", axis=1).values
data_norm = []
for patient_id in np.unique(patient_ids):
    patient_data = data[patient_ids == patient_id]
    scaler = StandardScaler()
    patient_data_scaled = scaler.fit_transform(patient_data)
    data_norm.append(patient_data_scaled)

data_norm = np.concatenate(data_norm, axis=0)
# add labels back
y = df["label"].values
data = data_norm

# make data a dataframe again
df = pd.DataFrame(data)
print(f"size of df: {df.shape}")

# split the data into epochs of 256 datapoints each
epochs = []
for i in range(0, len(df), 256):
    epochs.append(df.iloc[i:i+256].values)

# convert the list of epochs to a numpy array
epochs = np.array(epochs)
print(f"size of epochs: {epochs.shape}")

# save the SCALED epochs
with open(f"{dir}/scaled_rawEEG_epochs.pkl", "wb") as f:
    pickle.dump(epochs, f)




size of df: (762880, 19)
size of epochs: (2980, 256, 19)


In [5]:
labels = []
for _ in range(10):
    labels.append(np.repeat([1, 0], 149))
labels = np.concatenate(labels)

print(f"size of labels: {labels.shape}")


size of labels: (2980,)


In [6]:
channels = 19
sample_len = 256

class EEGNet(nn.Module):
    def __init__(self):
        super(EEGNet, self).__init__()
        self.T = 120
        
        # Layer 1
        self.conv1 = nn.Conv2d(1, 16, (1, channels), padding = 0)
        self.batchnorm1 = nn.BatchNorm2d(16, False)
        
        # Layer 2
        self.padding1 = nn.ZeroPad2d((16, 17, 0, 1))
        self.conv2 = nn.Conv2d(1, 4, (2, 32))
        self.batchnorm2 = nn.BatchNorm2d(4, False)
        self.pooling2 = nn.MaxPool2d(2, 4)
        
        # Layer 3
        self.padding2 = nn.ZeroPad2d((2, 1, 4, 3))
        self.conv3 = nn.Conv2d(4, 4, (8, 4))
        self.batchnorm3 = nn.BatchNorm2d(4, False)
        self.pooling3 = nn.MaxPool2d((2, 4))
        
        # FC Layer
        # NOTE: This dimension will depend on the number of timestamps per sample in your data.
        # I have 120 timepoints. 
        self.fc1 = nn.Linear(8*2 * (sample_len // 32), 1)
        

    def forward(self, x):
        # Layer 1
        x = F.elu(self.conv1(x))
        x = self.batchnorm1(x)
        x = F.dropout(x, 0.25)
        x = x.permute(0, 3, 1, 2)
        
        # Layer 2
        x = self.padding1(x)
        x = F.elu(self.conv2(x))
        x = self.batchnorm2(x)
        x = F.dropout(x, 0.25)
        x = self.pooling2(x)
        
        # Layer 3
        x = self.padding2(x)
        x = F.elu(self.conv3(x))
        x = self.batchnorm3(x)
        x = F.dropout(x, 0.25)
        x = self.pooling3(x)
        
        # FC Layer
        x = x.reshape(-1, 128)
        x = F.sigmoid(self.fc1(x))
        return x


net = EEGNet()
print(net.forward(Variable(torch.Tensor(1,1,sample_len,channels))))
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters())

tensor([[0.6791]], grad_fn=<SigmoidBackward0>)


#### Evaluate function returns values of different criteria like accuracy, precision etc. 
In case you face memory overflow issues, use batch size to control how many samples get evaluated at one time. Use a batch_size that is a factor of length of samples. This ensures that you won't miss any samples.

In [7]:
import torch
from torch.autograd import Variable
import numpy as np
from sklearn.metrics import accuracy_score

def evaluate(model, data_loader, sample_len, channels):
    model.eval()
    
    all_predictions = []
    all_targets = []
    
    with torch.no_grad():  # Disable gradient calculation for evaluation
        for data in data_loader:
            # get the inputs; data is a list of [inputs, labels]
            inputs, test_targets = data
            # permute inputs from (batch_size, channels, sample_len) to (batch_size, sample_len, channels)
            inputs = inputs.permute(0, 2, 1)
            inputs = inputs.reshape(-1,1,sample_len,channels)

            # wrap them in Variable
            inputs, test_targets = Variable(inputs), Variable(test_targets)
            
            # Get model predictions
            outputs = model(inputs)
            predictions = outputs.data.cpu().numpy()
            targets = test_targets.data.cpu().numpy()
            
            all_predictions.extend(np.round(predictions))
            all_targets.extend(targets)
    
    # Calculate accuracy
    acc = accuracy_score(all_targets, all_predictions)
    return f"accuracy = {acc * 100:.2f}%"

#### Generate random data

##### Data format:
Datatype - float32 (both X and Y) <br>
X.shape - (#samples, 1, #timepoints,  #channels) <br>
Y.shape - (#samples)

In [11]:
channels = 19
sample_len = 256
no_samples = 2980

X_train = np.random.rand(no_samples, 1, sample_len, channels).astype('float32') # np.random.rand generates between [0, 1)
y_train = np.round(np.random.rand(no_samples).astype('float32')) # binary data, so we round it to 0 or 1.

X_val = np.random.rand(no_samples, 1, sample_len, channels).astype('float32')
y_val = np.round(np.random.rand(no_samples).astype('float32'))

X_test = np.random.rand(no_samples, 1, sample_len, channels).astype('float32')
y_test = np.round(np.random.rand(no_samples).astype('float32'))

In [8]:
import torch
from torch.utils.data import DataLoader, Dataset

# Dataset class definition
class EEGDataset(Dataset):
    def __init__(self, epochs, labels):
        self.df = epochs
        self.labels = labels

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Access the epoch and corresponding label
        data = self.df[idx]  # data shape: (256, 19)
        label = self.labels[idx]  # label shape: ()
        
        # Ensure the data is converted to float tensor
        data_tensor = torch.tensor(data, dtype=torch.float32)
        
        # Convert label to tensor (assumed to be scalar)
        label_tensor = torch.tensor(label, dtype=torch.float32) 
        
        return data_tensor, label_tensor




#### Run

In [11]:
from sklearn.model_selection import LeaveOneGroupOut
batch_size = 16


# Initialize Leave-One-Group-Out cross-validator
logo = LeaveOneGroupOut()

# This is an array where each entry corresponds to a patient ID (e.g., [0, 0, 0, 1, 1, 1, ..., 9])
patient_ids = np.repeat([1,2,3,4,5,6,7,8,9,10],298)  # Make sure to have this aligned with your epochs/labels

for train_index, test_index in logo.split(epochs, labels, groups=patient_ids):
    
    # Train and test split
    X_train, X_test = epochs[train_index,:,:], epochs[test_index,:,:]
    y_train, y_test = labels[train_index], labels[test_index]
    
    # Create DataLoader for training and testing
    train_dataset = EEGDataset(X_train, y_train)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    test_dataset = EEGDataset(X_test, y_test)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
    
    # Training loop
    net.train()
    running_loss = 0.0

    for i, data in enumerate(train_loader, 0):
        if i % 16 == 0:
            print(i, end=", ")
        
        # Get the inputs; data is a list of [inputs, labels]
        inputs, targets = data
        inputs = inputs.reshape(-1, 1, sample_len, channels)  # Reshape for network input

        # Convert inputs and targets to variables
        inputs, targets = Variable(inputs), Variable(targets)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = net(inputs).reshape(-1)
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Print training loss
    print(f"Training Loss: {running_loss:.4f}")
    
    # Check performance on training and testing sets
    train_accuracy = evaluate(net, train_loader, sample_len, channels)
    test_accuracy = evaluate(net, test_loader, sample_len, channels)
    
    print(f"Train Accuracy: {float(train_accuracy.split('=')[1].strip('%')):.2f}%")
    print(f"Test Accuracy: {float(test_accuracy.split('=')[1].strip('%')):.2f}%")


0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 78.1130
Train Accuracy: 50.07%
Test Accuracy: 50.00%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 77.1403
Train Accuracy: 50.11%
Test Accuracy: 50.00%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 77.2018
Train Accuracy: 50.11%
Test Accuracy: 50.00%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 77.7883
Train Accuracy: 49.96%
Test Accuracy: 50.00%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 73.4538
Train Accuracy: 49.81%
Test Accuracy: 57.72%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 76.3760
Train Accuracy: 50.07%
Test Accuracy: 50.00%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 69.4236
Train Accuracy: 49.81%
Test Accuracy: 50.00%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 74.1980
Train Accuracy: 50.37%
Test Accuracy: 45.30%
0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, Training Loss: 71.1298
Train Accu