# Dataset
We will explore this dataset: https://archive.ics.uci.edu/ml/datasets/EEG+Eye+State#

> All data is from one continuous EEG measurement with the Emotiv EEG Neuroheadset. The duration of the measurement was 117 seconds. The eye state was detected via a camera during the EEG measurement and added later manually to the file after analysing the video frames. '1' indicates the eye-closed and '0' the eye-open state. All values are in chronological order with the first measured value at the top of the data.

In [1]:
%load_ext autoreload
%autoreload 2
from sys import path
import torch
import torch.utils.data.dataloader
import numpy as np
path.append('..')

In [2]:
import tensorflow as tf
data_dir = "../../data/raw"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00264/EEG%20Eye%20State.arff"
datapath = tf.keras.utils.get_file(
        "eeg", origin=url, untar=False, cache_dir=data_dir
    )

from scipy.io import arff
data = arff.loadarff(datapath)
    

2022-06-07 14:16:16.537310: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-06-07 14:16:16.537338: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
type(data)

tuple

In [4]:
len(data)

2

In [5]:
type(data[0]) , type (data[1])

(numpy.ndarray, scipy.io.arff._arffread.MetaData)

In [6]:
newdata = data[0].tolist()
type(newdata)

list

In [7]:
from __future__ import annotations
from typing import Tuple
Tensor = torch.Tensor

class BaseListDataset():
    """Base class for loading list data
    """
    def __init__(self, data: list):
        self.data = data
        self.dataset = []
        self.process_data()
    
    def process_data(self) -> None:
        # abstract function which needs to be inherited
        raise NotImplementedError

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, idx: int) -> Tuple[Tensor, int]:
        return self.dataset[idx]

class EEGListDataset(BaseListDataset):
    """Processes data for EEG Lists

    Args:
        BaseListDataset (_type_): base class for list data
    """
    def process_data(self) -> None:
        for record in self.data:
            x = torch.tensor([record[0],record[1],record[2],record[3],
                             record[4],record[5],record[6],record[7],
                             record[8],record[9],record[10],record[11],
                             record[12],record[13]],dtype=float
                            )
            y= torch.tensor(int(record[14]))
            self.dataset.append((x, y))

In [8]:
dsNew = EEGListDataset(data=data[0].tolist())
x,y = dsNew[12000]
y

tensor(1)

In [9]:
from torch.nn.utils.rnn import pad_sequence

class BaseDataIterator:
    def __init__(self, dataset: BaseListDataset, batchsize: int):
        self.dataset = dataset
        self.batchsize = batchsize
        self.curindex = 0

    def __len__(self) -> int:
        # the lenght is the amount of batches
        return int(len(self.dataset) / self.batchsize)

    def __iter__(self) -> BaseDataIterator:
        # initialize index
        self.index = 0
        self.index_list = torch.randperm(len(self.dataset))
        return self
    
    def batchloop(self) -> Tuple[Tensor, Tensor]:
        X = []  # noqa N806
        Y = []  # noqa N806
        # fill the batch
        for _ in range(self.batchsize):
            x, y = self.dataset[int(self.index_list[self.index])]
            X.append(x)
            Y.append(y)
            self.index += 1
        return X, Y

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.dataset) - self.batchsize):
            X, Y = self.batchloop()
            return X, Y
        else:
            raise StopIteration


class PaddedDatagenerator(BaseDataIterator):
    # again, we inherit everything from the baseclass
    def __init__(self, dataset: BaseListDataset	, batchsize: int) -> None:
        # we initialize the super class BaseDataIterator
        # we now have everything the BaseDataIterator can do, for free
        super().__init__(dataset, batchsize)
    
    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.dataset) - self.batchsize):
            X, Y = self.batchloop()
            # I do not have a clue why this function returns a tensort and torch.tensor(X) gives me an error
            X_ = pad_sequence(X, batch_first=True, padding_value=0)  # noqa N806
            return X_, torch.tensor(Y)
        else:
            raise StopIteration

In [10]:
trainloader = PaddedDatagenerator(dsNew, batchsize=32)
testloader = PaddedDatagenerator(dsNew, batchsize=32)

In [11]:
x, y = next(iter(trainloader))
x.shape, y.shape


(torch.Size([32, 14]), torch.Size([32]))

In [12]:

class SwitchIterator:
    def __init__(self, dataset: BaseListDataset):
        self.dataset = dataset
        self.curindex = 0
        self.index = 0

    def __len__(self) -> int:
        # the lenght is the amount of batches
        return int(len(self.dataset) / self.batchsize)

    def __iter__(self) -> BaseDataIterator:
        # initialize index
        
        return self
    
    def batchloop(self) -> Tuple[Tensor, Tensor]:
        X = []  # noqa N806
        Y = []  # noqa N806
        # fill the batch
        x, y = self.dataset[int(self.index)]
        X.append(x)
        Y.append(y)
        self.index = self.index + 1 
        currentY = y
        while y == currentY:
            if self.index +1 < len(self.dataset):
                X.append(x)
                Y.append(y)  
                self.index = self.index + 1      
                x, y = self.dataset[int(self.index)]                   
            else:
                self.index = 0
                break                               
     
        return X, Y

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index < (len(self.dataset)):            
            X, Y = self.batchloop()
            return X, Y
        else:
            raise StopIteration


class SwitchPaddedDatagenerator(SwitchIterator):
    # again, we inherit everything from the baseclass
    def __init__(self, dataset: BaseListDataset) -> None:
        # we initialize the super class BaseDataIterator
        # we now have everything the BaseDataIterator can do, for free
        super().__init__(dataset)
    
    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index < (len(self.dataset)):
            X, Y = self.batchloop()
            # we just want to add padding
            X_ = pad_sequence(X, batch_first=True, padding_value=0)  # noqa N806
            return X_, torch.tensor(Y)
        else:
            raise StopIteration

In [13]:
trainloader = SwitchPaddedDatagenerator(dsNew)
testloader = SwitchPaddedDatagenerator(dsNew)



In [14]:
for i in range(1,23):
    x, y = next(iter(trainloader))

In [15]:
x, y = next(iter(trainloader))
x.shape



torch.Size([670, 14])

In [16]:
x

tensor([[4424.1000, 4087.6900, 4311.2800,  ..., 4314.8700, 4675.9000,
         4466.6700],
        [4424.1000, 4087.6900, 4311.2800,  ..., 4314.8700, 4675.9000,
         4466.6700],
        [4417.9500, 4081.5400, 4310.7700,  ..., 4318.4600, 4693.8500,
         4462.5600],
        ...,
        [4322.0500, 4034.8700, 4280.0000,  ..., 4291.7900, 4617.4400,
         4375.9000],
        [4315.9000, 4031.7900, 4275.3800,  ..., 4285.6400, 4618.4600,
         4376.9200],
        [4315.9000, 4027.6900, 4275.9000,  ..., 4282.0500, 4618.4600,
         4374.3600]], dtype=torch.float64)

In [17]:
class EEGBatchIterator:
    def __init__(self, dataset: BaseListDataset, batchsize: int):
        self.dataset = dataset
        self.batchsize = batchsize
        self.curindex = 0
        self.index = 0

    def __len__(self) -> int:
        # the lenght is the amount of batches
        return int(len(self.dataset) / self.batchsize)

    def __iter__(self) -> BaseDataIterator:
        # initialize index
        
        return self
    
    def batchloop(self) -> Tuple[Tensor, Tensor]:
        X = []  # noqa N806
        Y = []  # noqa N806
        # fill the batch
        x, y = self.dataset[int(self.index)]
        count = 1
        X.append(x)
        Y.append(y)
        self.index = self.index + 1 
        currentY = y
        while y == currentY and count < self.batchsize :
            if self.index == self.__len__:
                break
            else:                                 
                X.append(x)
                Y.append(y)
                count = count +1
                self.index = self.index + 1      
                x, y = self.dataset[int(self.index)]        
        return X, Y

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.dataset)):
            X, Y = self.batchloop()
            # we just want to add padding
            X_ = pad_sequence(X, batch_first=True, padding_value=0)  # noqa N806
            return X_, torch.tensor(Y)
        else:
            raise StopIteration




In [18]:
trainloader2 = EEGBatchIterator(dsNew, batchsize=64)


In [19]:
x, y = next(iter(trainloader2))
x.shape, y

(torch.Size([64, 14]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [20]:
x

tensor([[4329.2300, 4009.2300, 4289.2300, 4148.2100, 4350.2600, 4586.1500,
         4096.9200, 4641.0300, 4222.0500, 4238.4600, 4211.2800, 4280.5100,
         4635.9000, 4393.8500],
        [4329.2300, 4009.2300, 4289.2300, 4148.2100, 4350.2600, 4586.1500,
         4096.9200, 4641.0300, 4222.0500, 4238.4600, 4211.2800, 4280.5100,
         4635.9000, 4393.8500],
        [4327.6900, 4006.6700, 4295.3800, 4156.4100, 4336.9200, 4583.5900,
         4096.9200, 4630.2600, 4207.6900, 4222.0500, 4206.6700, 4282.0500,
         4628.7200, 4389.2300],
        [4328.7200, 4011.7900, 4296.4100, 4155.9000, 4343.5900, 4582.5600,
         4097.4400, 4630.7700, 4217.4400, 4235.3800, 4210.7700, 4287.6900,
         4632.3100, 4396.4100],
        [4326.1500, 4011.7900, 4292.3100, 4151.2800, 4347.6900, 4586.6700,
         4095.9000, 4627.6900, 4210.7700, 4244.1000, 4212.8200, 4288.2100,
         4632.8200, 4398.4600],
        [4321.0300, 4004.6200, 4284.1000, 4153.3300, 4345.6400, 4587.1800,
         4093.3