# BCIC data analysis

<p>
This notebook contains code for preparing the BCIC data for fine-tuning step.
</p>

---
> Author:    Mahmoud Zeydabadinezhad    
> Contact:   zeydabadi@gmail.com   
> Version:   10/25/2023

In [6]:
# Standard library imports
import copy
import os
import shutil
from math import ceil
import sys
sys.path.append('C:\\Users\\shreyas\\Documents\\GitHub\\NeuroGPT\\src')
# Third-party library imports
import mne
import numpy as np
import pandas as pd
import torch
from torch import optim
import torch.nn as nn
import torch.nn.functional as F
from rich import print
from torch.utils.data import DataLoader
from tqdm import tqdm

from src.model import Model


In [7]:
ch_map = {
    "EEG-Fz": "FZ",
    "EEG-0": "FC3",
    "EEG-1": "FC1",
    "EEG-2": "FCZ",
    "EEG-3": "FC2",
    "EEG-4": "FC4",
    "EEG-5": "C5",
    "EEG-C3": "C3",
    "EEG-6": "C1",
    "EEG-Cz": "CZ",
    "EEG-7": "C2",
    "EEG-C4": "C4",
    "EEG-8": "C6",
    "EEG-9": "CP3",
    "EEG-10": "CP1",
    "EEG-11": "CPZ",
    "EEG-12": "CP2",
    "EEG-13": "CP4",
    "EEG-14": "P1",
    "EEG-Pz": "PZ",
    "EEG-15": "P2",
    "EEG-16": "POZ",
}
ch_list = [
    "FP1",
    "FP2",
    "F7",
    "F3",
    "FZ",
    "F4",
    "F8",
    "T3",
    "C3",
    "CZ",
    "C4",
    "T4",
    "T5",
    "P3",
    "PZ",
    "P4",
    "T6",
    "O1",
    "O2",
]

In [8]:
keys_with_values_in_list = [key for key, value in ch_map.items() if value in ch_list]

In [9]:
print(keys_with_values_in_list[0])

In [10]:
def analyze_eeg_data(folder_path, ch_map):
    files_for_investigation = []
    sex_list = []
    age_list = []
    min_values = []
    max_values = []

    # Task 2: List all files with '-PSG.edf' in their names
    files = [
        f for f in os.listdir(folder_path) if ".gdf" in f
    ]  # T for Train, E for Evaluation
    print(files)
    print(f"Number of files: {len(files)}")

    # Task 3: Read each file and verify certain properties
    for file in files:
        print(file)
        try:
            tmp = mne.io.read_raw_gdf(
                os.path.join(folder_path, file),
                exclude=["EOG-left", "EOG-central", "EOG-right"],
                preload=True,
            )
            # Task 4: Get min and max values
            data = tmp.get_data()
            print(data.shape)
            min_values.append(np.min(data))
            max_values.append(np.max(data))

        except Exception as e:
            print(f"Error processing file {file}: {e}")
            files_for_investigation.append(file)

    return files_for_investigation, min_values, max_values

In [23]:
source_dir = "src/BCICIV_2a_gdf"
files_for_investigation, min_values, max_values = analyze_eeg_data(source_dir, ch_map)

Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A01E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 686999  =      0.000 ...  2747.996 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A01T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 672527  =      0.000 ...  2690.108 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A02E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 662665  =      0.000 ...  2650.660 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A02T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 677168  =      0.000 ...  2708.672 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A03E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 648774  =      0.000 ...  2595.096 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A03T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 660529  =      0.000 ...  2642.116 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A04E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 660046  =      0.000 ...  2640.184 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A04T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 600914  =      0.000 ...  2403.656 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A05E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 679862  =      0.000 ...  2719.448 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A05T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 686119  =      0.000 ...  2744.476 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A06E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 666372  =      0.000 ...  2665.488 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A06T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 678979  =      0.000 ...  2715.916 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A07E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 673134  =      0.000 ...  2692.536 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A07T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 681070  =      0.000 ...  2724.280 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A08E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 687791  =      0.000 ...  2751.164 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A08T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 675269  =      0.000 ...  2701.076 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A09E.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 675097  =      0.000 ...  2700.388 secs...


  next(self.gen)


Extracting EDF parameters from c:\Users\shreyas\Documents\GitHub\NeuroGPT\src\BCICIV_2a_gdf\A09T.gdf...
GDF file detected
Setting channel info structure...
Could not determine channel type of the following channels, they will be set as EEG:
EEG-Fz, EEG, EEG, EEG, EEG, EEG, EEG, EEG-C3, EEG, EEG-Cz, EEG, EEG-C4, EEG, EEG, EEG, EEG, EEG, EEG, EEG, EEG-Pz, EEG, EEG
Creating raw.info structure...
Reading 0 ... 673327  =      0.000 ...  2693.308 secs...


  next(self.gen)


In [24]:
ds_max, ds_min = 100, -100

def scaler(x):
    """
    Scales the input array x to the range [-1, 1].

    Parameters:
    - x (numpy.ndarray): The input array to be scaled.

    Returns:
    - numpy.ndarray: The scaled array.

    Raises:
    - ValueError: If the input is not a numpy array.
    - ValueError: If the input array is empty.
    - ZeroDivisionError: If the max and min values of the array are the same.
    """

    # Check if input is a numpy array
    if not isinstance(x, np.ndarray):
        raise ValueError("Input must be a numpy array.")

    # Check if the array is empty
    if x.size == 0:
        raise ValueError("Input array must not be empty.")

    # Calculate min and max
    x_min = np.min(x)
    x_max = np.max(x)

    # Check for division by zero
    if x_max == x_min:
        x_scaled = x / x_max if x_max != 0 else np.zeros_like(x)
        return x_scaled

    # Perform scaling
    x_std = (x - x_min) / (x_max - x_min)
    x_scaled = (x_std * 2) - 1

    return x_scaled


def process_file(raw, ch_map, ch_list, ds_max, ds_min):
    # selects 19 standard channels and adds a 20th
    raw = raw.copy()
    try:
        raw = raw.pick(ch_list)
    except ValueError as v:
        pl = v.args[0].split("[")[1].split("]")[0].split(",")
        pl = [p.strip(" ' ") for p in pl]
        new_pick = list(set(ch_list) - set(pl))
        raw = raw.pick(new_pick)

    if len(raw.ch_names) != len(ch_list):
        missing_channels = [ch for ch in ch_list if ch not in raw.ch_names]

        new_channel_data = np.vstack(
            [np.full((1, raw.n_times), 0)] * len(missing_channels)
        )
        new_channel_info = mne.create_info(
            ch_names=missing_channels,
            sfreq=raw.info["sfreq"],
            ch_types=["eeg"] * len(missing_channels),
        )
        new_channel_raw = mne.io.RawArray(
            data=new_channel_data, info=new_channel_info, first_samp=raw.first_samp
        )
        raw.load_data().add_channels([new_channel_raw], force_update_info=True)

    try:
        # raw = raw.rename_channels(ch_map)
        raw = raw.reorder_channels(ch_list)
    except Exception as e:
        print(f"Error in renaming or reordering channels: {e}")
        return None

    # scale
    trial_min = np.min(raw.get_data())
    trial_max = np.max(raw.get_data())
    raw = raw.load_data().apply_function(scaler, channel_wise=False)

    # add compensation channel
    compensation = (trial_max - trial_min) / (ds_max - ds_min)
    comp_ch_data = np.full((1, raw.n_times), compensation)
    comp_ch_info = mne.create_info(
        ch_names=["compensation"], sfreq=raw.info["sfreq"], ch_types="misc"
    )
    comp_ch_raw = mne.io.RawArray(
        data=comp_ch_data, info=comp_ch_info, first_samp=raw.first_samp
    )
    raw.add_channels([comp_ch_raw], force_update_info=True)
    
    return raw


def process_gdf_file(gdf_file):
    print("the file to be processed is: ", gdf_file)
    try:
        f = mne.io.read_raw_gdf(
            gdf_file, eog=["EOG-left", "EOG-central", "EOG-right"], preload=True
        )
        f.drop_channels(["EOG-left", "EOG-central", "EOG-right"])
    except Exception as e:
        print(f"Error reading EDF file {gdf_file}: {e}")
        return

    assert "lowpass" in f.info, "lowpass information is not available in f.info"
    assert f.info["lowpass"] > 0, "lowpass frequency should be greater than 0"
    assert f.info["sfreq"] > 0, "Sampling frequency should be greater than 0"

    if f.info["bads"]:
        print(f"Warning: The following channels are marked as bad: {f.info['bads']}")
        print(gdf_file)
        # input("Press Enter to continue or Ctrl+C to abort.")

    if 256 >= 2 * f.info.get("lowpass", 0):
        try:
            f = f.resample(sfreq=256)
            f = f.rename_channels(ch_map)
            f = process_file(
                f,
                ch_map=ch_map,
                ch_list=ch_list,
                ds_max=ds_max,
                ds_min=ds_min,
            )
        except Exception as e:
            print(
                f"An error occurred while processing the file {gdf_file}: {e} or while resampling"
            )
            # continue

        event_id = {"769": 0, "770": 1, "771": 2, "772": 3}
        events = mne.events_from_annotations(f, event_id=event_id)
        epochs = mne.Epochs(
            f, events[0], [0, 1, 2, 3], tmin=-2, tmax=4, on_missing="warn"
        )
        # print("here", np.max(f.get_data()), np.min(f.get_data()))
        df = epochs.to_data_frame(scalings=dict(eeg=1, mag=1, grad=1))
        # print("df", df.iloc[:, 3:].values.max(), df.iloc[:, 3:].values.min())
        df["person"] = f.info["subject_info"]["his_id"]
        indices = [(f.info["subject_info"]["his_id"], ep) for ep in df.epoch.unique()]

        return df, indices

In [25]:
mne.set_log_level("WARNING")

In [26]:
class EEGDatasetCls(torch.utils.data.Dataset):
    def __init__(self, df, idxs):
        self.df = df.sort_index()
        self.idxs = idxs

    def __len__(self):
        return len(self.idxs)

    def __getitem__(self, idx):
        data = self.df.loc[self.idxs[idx]].iloc[:, 1:]
        label = self.df.loc[self.idxs[idx]].iloc[:, 0].unique().astype(int)

        return torch.Tensor(data.values.T), torch.tensor(label, dtype=torch.long)

In [27]:
# Define the source and destination directories

dest_dir = "train"

# Ensure the destination directory exists, if not create it
os.makedirs(dest_dir, exist_ok=True)

# Iterate through the files in the source directory
for ff in os.listdir(source_dir):
    # Check if the file name (without extension) ends with 't', case-insensitive
    if ff.split(".")[0].lower().endswith("t"):
        # Construct the full path of the source and destination files
        source_file = os.path.join(source_dir, ff)
        dest_file = os.path.join(dest_dir, ff)
        
        # Check if the destination file already exists to avoid overwriting
        if os.path.exists(dest_file):
            print(f"File {dest_file} already exists, skipping.")
            continue
        
        try:
            # Copy the file from the source to destination directory
            shutil.copy(source_file, dest_file)
            print(f"Successfully copied {source_file} to {dest_file}")
        except Exception as e:
            # Handle any exceptions that occur during the copy
            print(f"Failed to copy {source_file} to {dest_file}. Error: {e}")

In [28]:
len(os.listdir('train'))

9

#Leave-One-Out Cross Validation (LOOCV):

In [82]:
def split_chunks(data, length=512, ovlp=51, num_chunks=34, start_point=-1): 
        '''2 seconds, 0.2 seconds overlap'''
        all_chunks = []
        total_len = data.shape[1]
        actual_num_chunks = num_chunks
        
        if start_point == -1:
            if num_chunks * length > total_len - 1:
                start_point = 0
                actual_num_chunks = total_len // length
            else:
                start_point = np.random.randint(0, total_len - num_chunks * length)
        
        for i in range(actual_num_chunks):
            chunk = data[:, start_point: start_point + length]
            all_chunks.append(np.array(chunk))
            start_point = start_point + length - ovlp
        return np.array(all_chunks), start_point

In [97]:
def training(data_loader, model, criterion, optimizer, scheduler=None):
    model.train()
    total_loss = 0
    for batch in data_loader:
        inputs, targets = batch  # Assuming batch returns inputs and targets
       # inputs = split_chunks(inputs)
       # inputs = torch.tensor(inputs, dtype=torch.float64)
        print(inputs.shape)
        print(inputs.unsqueeze(1).shape)
        inputs = inputs[:, :, :-1]
        # Prepare the batch in the required dictionary format
        print(inputs.unsqueeze(1).shape)
        model_batch = {'inputs': inputs.unsqueeze(1)}  # Unsqueezing along the third dimension

        optimizer.zero_grad()
        outputs = model(model_batch)  # Pass the dictionary to the model
        logits = outputs['outputs']  # Adjust this if your model output structure is different

        loss = criterion(logits, targets)
        loss.backward()
        optimizer.step()

        if scheduler:
            scheduler.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    return avg_loss


In [45]:
def validation(data_loader, model, criterion):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0
    with torch.no_grad():  # Disable gradient calculation
        for batch in data_loader:
            inputs, targets = batch  # Adjust according to your data format

            outputs = model(inputs)  # Forward pass
            loss = criterion(outputs, targets)  # Compute the loss

            total_loss += loss.item()

    avg_loss = total_loss / len(data_loader)
    return avg_loss


In [98]:
# Define the source and destination directory for the data
from decoder.make_decoder import make_decoder
from embedder.make import make_embedder
from encoder.conformer_braindecode import EEGConformer


source_dir = "train"
test_dir = "test"
# Example default configuration
model_config = {
  "train_data_path": "C:\\Users\\shreyas\\Documents\\GitHub\\NeuroGPT\\tuh_tensors\\",
  "parcellation_dim": 1120,
  "pretrained_model": None,
  "embedding_dim": 1024,
  "num_hidden_layers_embedding_model": 1,
  "freeze_embedder": False,
  "num_hidden_layers_unembedding_model": 1,
  "freeze_unembedder": False,
  "architecture": "GPT",
  "num_hidden_layers": 6,
  "num_attention_heads": 16,
  "intermediate_dim_factor": 4,
  "hidden_activation": "gelu_new",
  "freeze_decoder": False,
  "freeze_decoder_without_pooler_heads": None,
  "resume_from": None,
  "training_style": "CSM",
  "decoding_target": None,
  "num_decoding_classes": 4,
  "training_steps": 5,
  "validation_steps": 1000,
  "test_steps": 1000,
  "per_device_training_batch_size": 1,
  "per_device_validation_batch_size": 1,
  "optim": "adamw_hf",
  "learning_rate": 0.0001,
  "warmup_ratio": 0.01,
  "weight_decay": 0.1,
  "adam_beta_1": 0.9,
  "adam_beta_2": 0.999,
  "adam_epsilon": 1e-08,
  "max_grad_norm": 1.0,
  "lr_scheduler": "linear",
  "dropout": 0.1,
  "log_dir": "results/models/upstream\\32clen2_embed1024",
  "log_every_n_steps": 4,
  "run_name": "32clen2_embed1024",
  "wandb_mode": "disabled",
  "wandb_project_name": "learning-from-brains",
  "seed": 1234,
  "set_seed": True,
  "fp16": True,
  "deepspeed": None,
  "local_rank": -1,
  "num_workers": 1,
  "plot_model_graph": False,
  "smoke_test": False,
  "bold_dummy_mode": False,
  "do_train": True,
  "n_positions": 512,
  "chunk_len": 512,
  "num_chunks": 34,
  "chunk_ovlp": 51,
  "sampling_rate": 250,
  "fold_i": 0,
  "use_encoder": True,
  "do_normalization": True,
  "filter_time_length": 25,
  "pool_time_length": 75,
  "stride_avg_pool": 15,
  "n_filters_time": 40,
  "num_encoder_layers": 6,
  "eval_every_n_steps": 4,
  "freeze_encoder": False,
  "ft_only_encoder": 'True'
}


# Ensure the destination directory exists, if not create it
os.makedirs(test_dir, exist_ok=True)

encoder = EEGConformer(n_outputs=model_config["num_decoding_classes"], n_chans=20, n_times=model_config['chunk_len'], ch_pos=None, is_decoding_mode=model_config["ft_only_encoder"])
embedder = make_embedder(
        training_style=model_config["training_style"],
        architecture=model_config["architecture"],
        in_dim=model_config["parcellation_dim"], # flattened, channel x chunk length
        embed_dim=model_config["embedding_dim"],
        num_hidden_layers=model_config["num_hidden_layers_embedding_model"],
        dropout=model_config["dropout"],
        n_positions=model_config["n_positions"]
    )
decoder = make_decoder(
        architecture=model_config["architecture"],
        num_hidden_layers=model_config["num_hidden_layers"],
        embed_dim=model_config["embedding_dim"],
        num_attention_heads=model_config["num_attention_heads"],
        n_positions=model_config["n_positions"],
        intermediate_dim_factor=model_config["intermediate_dim_factor"],
        hidden_activation=model_config["hidden_activation"],
        dropout=model_config["dropout"]
    )
model = Model(encoder,embedder,decoder)
model.from_pretrained("src/pytorch_model.bin")

# Configure the model for encoder-only fine-tuning
model.switch_ft_mode(ft_encoder_only=True)


# List and sort all files in the source directory
all_data = os.listdir(source_dir)
all_data.sort()

# Initialize lists to store validation and training results
val_results = []
trn_results = []
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
  # Fraction of the cycle (total training steps) spent increasing the learning rate


# Loop through all files, treating each file as the test data in turn
for i in range(len(all_data)):
    # Pop the test data file from the list
    test_data = all_data.pop(i)
    print(f"Using {test_data} as test data, {len(all_data)} files remain for training.")
    
    # Empty the test directory if it contains any files
    for file in os.listdir(test_dir):
        file_path = os.path.join(test_dir, file)
        if os.path.isfile(file_path):
            os.unlink(file_path)
            print(f"Removed {file_path}")
    
    # Copy the test data file to the test directory
    shutil.copy(f"{source_dir}/{test_data}", f"{test_dir}/{test_data}")
    print(f"Copied {test_data} to {test_dir}/{test_data}")
    
    # Initialize empty lists to store DataFrames and indices
    dfs = []
    idxs = []
    # Process each training file and accumulate the data and indices
    for trn_file in all_data:
        dd, idd = process_gdf_file(f"{source_dir}/{trn_file}")
        dfs.append(dd)
        idxs += idd
    
    # Concatenate all training data into a single DataFrame
    train_df = pd.concat(dfs, axis=0)
    # Set multi-index for the training DataFrame
    train_df.set_index(["person", "epoch"], inplace=True)
    # Drop the 'time' column as it's not needed
    train_df.drop("time", inplace=True, axis=1)
    
    # Process the test data file
    test_df, test_idxs = process_gdf_file(f"{test_dir}/{test_data}")
    # Drop the 'time' column from the test DataFrame as well
    test_df.drop("time", axis=1, inplace=True)
    # Set multi-index for the test DataFrame
    test_df.set_index(["person", "epoch"], inplace=True)
    
    # Initialize the pre-trained model and the linear classifier
    # Code needs to be added
    
    # Set up the optimizer for training
    # Code needs to be added
    
    # Create DataLoader instances for training and testing data
    trn_dataset = EEGDatasetCls(train_df, idxs)
    train_loader = DataLoader(trn_dataset, 60, pin_memory=True)
    test_dataset = EEGDatasetCls(test_df, test_idxs)
    test_loader = DataLoader(test_dataset, 60, pin_memory=True)
    
    # Define the number of epochs and set up the learning rate scheduler
    epochs = 15
    pct_start = 0.3
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=5e-5,
        steps_per_epoch=len(train_loader),
        epochs=epochs,
        pct_start= pct_start,
    )
    # Define the loss function
    criterion = nn.CrossEntropyLoss()
    
    # Initialize the best validation accuracy to zero
    best_val_acc = 0.0
    
    # Training and validation loop
    for e in tqdm(range(epochs)):
        # Perform training and validation for the current epoch
        trn_res = training(train_loader, model, criterion, optimizer, scheduler)
        val_res = validation(test_loader, model, criterion) 
        
        # Update the best validation result if necessary
        if val_res["acc"] >= best_val_acc:
            best_val = val_res
            best_val_acc = val_res["acc"]
        
        # Print the training and validation results for the current epoch
        print(
            "Epoch %d: trn_loss %.4f val_loss %.4f val_acc %.4f trn_acc %.4f"
            % (e, trn_res["loss"], val_res["loss"], val_res["acc"], trn_res["acc"])
        )
    
    # Append the training and validation results to the results lists
    trn_results.append(trn_res)
    val_results.append(best_val)
    
    # Delete the linear classifier to free up GPU memory
    
    
    # Reinsert the test_data back into all_data for the next iteration
    all_data.insert(i, test_data)
    print(f"Reinserted {test_data} back into all_data for the next iteration")

FC Layer for Classification created.




Loading pretrained model from src/pytorch_model.bin




  next(self.gen)


  next(self.gen)


  next(self.gen)


  next(self.gen)


  next(self.gen)


  next(self.gen)


  next(self.gen)


  next(self.gen)


  next(self.gen)
  0%|          | 0/15 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (30x7680 and 2240x256)

In [121]:
k = torch.rand([20,1537])
v = split_chunks(k)
n = torch.tensor(v[0], dtype=torch.float64)
print(v[1])
print("Input tensor:")
encoder(v[0])



TypeError: 'int' object is not callable

In [None]:
# Leave-One-Out Cross Validation (LOOCV):
val_results = []
trn_results = []
for i in range(len(os.listdir("train"))):
    all_data = os.listdir("train")
    all_data.sort()

    test_data = all_data.pop(i)
    print(test_data, len(all_data))
    shutil.rmtree("test")
    os.mkdir("test")
    shutil.copy(f"train/{test_data}", f"test/{test_data}")

    dfs = []
    idxs = []

    for trn_file in all_data:
        dd, idd = process_gdf_file(f"train/{trn_file}")
        dfs.append(dd)
        idxs += idd

    train_df = pd.concat(dfs, axis=0)
    train_df.set_index(["person", "epoch"], inplace=True)
    train_df.drop("time", inplace=True, axis=1)

    test_df, test_idxs = process_gdf_file(f"test/{test_data}")
    test_df.drop("time", axis=1, inplace=True)
    test_df.set_index(["person", "epoch"], inplace=True)


    trn_dataset = EEGDatasetCls(train_df, idxs)
    train_loader = DataLoader(trn_dataset, 60, pin_memory=True)

    test_dataset = EEGDatasetCls(test_df, test_idxs)
    test_loader = DataLoader(test_dataset, 60, pin_memory=True)

    epochs = 15

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=5e-5,
        steps_per_epoch=len(train_loader),
        epochs=epochs,
        pct_start= pct_start,
    )
    criterion = nn.CrossEntropyLoss()

    best_val_acc = 0.0
    for e in tqdm(range(epochs)):
        trn_res = training(
            train_loader, linear_classifier, criterion, optimizer, scheduler
        )
        val_res = validation(test_loader, linear_classifier, criterion)

        if val_res["acc"] >= best_val_acc:
            best_val = val_res
            best_val_acc = val_res["acc"]
        print(
            "trn_loss %.4f val_loss %.4f val_acc %.4f trn_acc %.4f"
            % (trn_res["loss"], val_res["loss"], val_res["acc"], trn_res["acc"])
        )

    trn_results.append(trn_res)
    val_results.append(best_val)
