In [3]:
"""
Written by, 
Sriram Ravindran, sriram@ucsd.edu

Original paper - https://arxiv.org/abs/1611.08024

Please reach out to me if you spot an error.
"""

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score
import torch
import torch.nn as nn
import torch.optim as optim
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim

In [25]:
# load the dataframe from the pickle file
import pickle
dir = "C:/Users/gusta/OneDrive/Skrivebord/KI & Data/Bachelor/LegeData"
with open(f"{dir}/dataframe.pkl", "rb") as f:
    df = pickle.load(f)

# keep these channels only, these are the indexes: 
""" 
Fp1 -> 0
Fp2 -> 33
F3  -> 4
F4  -> 38
C3  -> 12
C4  -> 48
P3  -> 20
P4  -> 55
O1  -> 26
O2  -> 61
F7  -> 6
F8  -> 40
T7  -> 14
T8  -> 50
P7  -> 22
P8  -> 57
Fz  -> 36
Cz  -> 46
Pz  -> 30 

but add 1 to each index, since the first channel is channel_1
"""

df = df[["channel_1", "channel_34", "channel_5", "channel_39", "channel_13", "channel_49", "channel_21", "channel_56", "channel_27", "channel_62", "channel_7", "channel_41", "channel_15", "channel_51", "channel_23", "channel_58", "channel_37", "channel_47", "channel_31", "label"]]
df = df.T
df.head()
# print(df.iloc[:, :3])





Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,762870,762871,762872,762873,762874,762875,762876,762877,762878,762879
channel_1,14128830.0,14103800.0,14108510.0,14113430.0,-4909065.0,-4948905.0,-4949609.0,-4952393.0,3253510.0,3231366.0,...,-5643658.0,-5641706.0,-3462086.0,-3462502.0,-3461254.0,-3461862.0,-5158634.0,-5159946.0,-5155018.0,-5154762.0
channel_34,14112280.0,14105080.0,14116830.0,14119130.0,-4927497.0,-4954985.0,-4941001.0,-4947593.0,3241126.0,3229286.0,...,-5643402.0,-5647914.0,-3457190.0,-3457830.0,-3459910.0,-3461766.0,-5158218.0,-5154698.0,-5154378.0,-5156010.0
channel_5,14124310.0,14088470.0,14110330.0,14110140.0,-4917001.0,-4967753.0,-4954601.0,-4961673.0,3247974.0,3218182.0,...,-5644170.0,-5642218.0,-3459174.0,-3458790.0,-3462886.0,-3461894.0,-5157930.0,-5153930.0,-5157130.0,-5158090.0
channel_39,14105530.0,14119870.0,14107830.0,14115740.0,-4932649.0,-4927177.0,-4953513.0,-4954217.0,3228486.0,3243974.0,...,-5645386.0,-5643914.0,-3459238.0,-3460870.0,-3459878.0,-3461062.0,-5156106.0,-5157866.0,-5154922.0,-5156394.0
channel_13,14121590.0,14107030.0,14091710.0,14117980.0,-4914921.0,-4943785.0,-4961129.0,-4948905.0,3250566.0,3230438.0,...,-5647434.0,-5641610.0,-3461734.0,-3461158.0,-3460966.0,-3461670.0,-5155690.0,-5155498.0,-5158602.0,-5156458.0


In [3]:
labels = []
for _ in range(10):
    labels.append(np.repeat([1, 0], 149))
labels = np.concatenate(labels)


In [17]:
from torch.utils.data import DataLoader, Dataset

class EEGDataset(Dataset):
    def __init__(self, df, labels):
        self.df = df
        self.labels = labels
        assert len(df.T) == len(labels) * 256, "Mismatch between df rows and expected length from labels"

    def __len__(self):
        return len(self.df.T) // 256  # Ensure we have complete batches

    def __getitem__(self, idx):
        start_idx = idx * 256
        end_idx = start_idx + 256
        data = self.df.iloc[:,start_idx:end_idx].values
        label = self.labels[idx]
        return torch.tensor(data).float(), torch.tensor(label).float()

# Check dataset length
dataset = EEGDataset(df, labels)
print(f"Length of dataset: {len(dataset)}")

# Check DataLoader
dataloader = DataLoader(dataset, batch_size=1)

for batch_ndx, (batch, label) in enumerate(dataloader):
    print(f"Batch {batch_ndx + 1}: Data shape {batch.shape}, Label {label}")
    break  # To print just the first batch


Length of dataset: 2980
Batch 1: Data shape torch.Size([1, 19, 256]), Label tensor([1.])


In [12]:
print(len(df.T))
print(len(labels)*256)

762880
762880


In [19]:
channels = 19
sample_len = 256

class EEGNet(nn.Module):
    def __init__(self):
        super(EEGNet, self).__init__()
        self.T = 120
        
        # Layer 1
        self.conv1 = nn.Conv2d(1, 16, (1, channels), padding = 0)
        self.batchnorm1 = nn.BatchNorm2d(16, False)
        
        # Layer 2
        self.padding1 = nn.ZeroPad2d((16, 17, 0, 1))
        self.conv2 = nn.Conv2d(1, 4, (2, 32))
        self.batchnorm2 = nn.BatchNorm2d(4, False)
        self.pooling2 = nn.MaxPool2d(2, 4)
        
        # Layer 3
        self.padding2 = nn.ZeroPad2d((2, 1, 4, 3))
        self.conv3 = nn.Conv2d(4, 4, (8, 4))
        self.batchnorm3 = nn.BatchNorm2d(4, False)
        self.pooling3 = nn.MaxPool2d((2, 4))
        
        # FC Layer
        # NOTE: This dimension will depend on the number of timestamps per sample in your data.
        # I have 120 timepoints. 
        self.fc1 = nn.Linear(8*2 * (sample_len // 32), 1)
        

    def forward(self, x):
        # Layer 1
        x = F.elu(self.conv1(x))
        x = self.batchnorm1(x)
        x = F.dropout(x, 0.25)
        x = x.permute(0, 3, 1, 2)
        
        # Layer 2
        x = self.padding1(x)
        x = F.elu(self.conv2(x))
        x = self.batchnorm2(x)
        x = F.dropout(x, 0.25)
        x = self.pooling2(x)
        
        # Layer 3
        x = self.padding2(x)
        x = F.elu(self.conv3(x))
        x = self.batchnorm3(x)
        x = F.dropout(x, 0.25)
        x = self.pooling3(x)
        
        # FC Layer
        x = x.reshape(-1, 128)
        x = F.sigmoid(self.fc1(x))
        return x


net = EEGNet()
print(net.forward(Variable(torch.Tensor(1,1,sample_len,channels))))
criterion = nn.BCELoss()
optimizer = optim.Adam(net.parameters())

tensor([[0.1571]], grad_fn=<SigmoidBackward0>)


#### Evaluate function returns values of different criteria like accuracy, precision etc. 
In case you face memory overflow issues, use batch size to control how many samples get evaluated at one time. Use a batch_size that is a factor of length of samples. This ensures that you won't miss any samples.

In [63]:
def evaluate(model, data_loader, params = ["acc"]):
    results = []
    model.eval()
        
    predicted = []
    
    for i, data in enumerate(data_loader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # make shape (samples, 1, sample_len, channels)
        inputs = inputs.reshape(-1, 1, sample_len, channels)

        # wrap them in Variable
        inputs, labels = Variable(inputs), Variable(labels)
        
        predicted = model(inputs)
        
        predicted = predicted.data.cpu().numpy()
    
    for param in params:
        if param == 'acc':
            results.append(accuracy_score(labels, np.round(predicted)))
        if param == "auc":
            results.append(roc_auc_score(labels, predicted))
        if param == "recall":
            results.append(recall_score(labels, np.round(predicted)))
        if param == "precision":
            results.append(precision_score(labels, np.round(predicted)))
        if param == "fmeasure":
            precision = precision_score(labels, np.round(predicted))
            recall = recall_score(labels, np.round(predicted))
            results.append(2*precision*recall/ (precision+recall))
    return results

#### Generate random data

##### Data format:
Datatype - float32 (both X and Y) <br>
X.shape - (#samples, 1, #timepoints,  #channels) <br>
Y.shape - (#samples)

In [47]:
channels = 19
sample_len = 256
no_samples = 2980//10

X_train = np.random.rand(no_samples, 1, sample_len, channels).astype('float32') # np.random.rand generates between [0, 1)
y_train = np.round(np.random.rand(no_samples).astype('float32')) # binary data, so we round it to 0 or 1.

X_val = np.random.rand(no_samples, 1, sample_len, channels).astype('float32')
y_val = np.round(np.random.rand(no_samples).astype('float32'))

X_test = np.random.rand(no_samples, 1, sample_len, channels).astype('float32')
y_test = np.round(np.random.rand(no_samples).astype('float32'))

In [59]:
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split

# Dataset class definition
class EEGDataset(Dataset):
    def __init__(self, df, labels):
        self.df = df
        self.labels = labels

    def __len__(self):
        return len(self.df.T) // 256  # Ensure we have complete batches

    def __getitem__(self, idx):
        start_idx = idx * 256
        end_idx = start_idx + 256
        data = self.df[:, start_idx:end_idx]
        label = self.labels[idx*256]
        return torch.tensor(data).float(), torch.tensor(label).float()

# Assume df is a DataFrame where last row is labels, others are features
# Transpose the dataframe to align features and labels correctly
X = df.iloc[:-1, :]  # Features
y = df.iloc[-1, :]   # Labels

# Split into training and test sets using sklearn's train_test_split
X_train, X_test, y_train, y_test = train_test_split(X.T, y, test_size=0.2, random_state=42)

# Reset index for both X and y to avoid KeyError
X_train = X_train.reset_index(drop=True).T.values
X_test = X_test.reset_index(drop=True).T.values
y_train = y_train.reset_index(drop=True).values
y_test = y_test.reset_index(drop=True).values


# Create Dataset and DataLoader for train and test
train_dataset = EEGDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)

test_dataset = EEGDataset(X_test, y_test)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

# Check DataLoader and print sizes of datasets and labels
for batch_ndx, (batch, label) in enumerate(train_loader):
    print(f"Train Batch {batch_ndx + 1}: Data shape {batch.shape}, Label shape {label.shape}")
    break  # To print just the first batch

print(f"Training set length: {len(train_dataset)}")
print(f"Test set length: {len(test_dataset)}")


Train Batch 1: Data shape torch.Size([1, 19, 256]), Label shape torch.Size([1])
Training set length: 2384
Test set length: 596


#### Run

In [68]:
batch_size = 16
loops = 1

for epoch in range(loops):  # loop over the dataset multiple times
    print("\nEpoch ", epoch)
    net.train()
    
    running_loss = 0.0

    for i, data in enumerate(train_loader, 0):
        #make progress print every 200 samples
        if i % 200 == 0:
            print(i, end=", ")
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data
        # make shape (samples, 1, sample_len, channels)
        inputs = inputs.reshape(-1, 1, sample_len, channels)

        # wrap them in Variable
        inputs, labels = Variable(inputs), Variable(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs).reshape(-1)
        loss = criterion(outputs, labels)
        loss.backward()
        
        optimizer.step()
        
        running_loss += loss.item()
    
    # Validation accuracy
    params = ["acc"]
    print(params)
    print("Training Loss ", running_loss)
    print("Train - ", evaluate(net, train_loader, params))
    print("Test - ", evaluate(net, test_loader, params))


Epoch  0
0, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000, 2200, ['acc']
Training Loss  1632.6879984140396
Train -  [1.0]
Test -  [0.0]
