# Dataset
We will explore this dataset: https://archive.ics.uci.edu/ml/datasets/EEG+Eye+State#

> All data is from one continuous EEG measurement with the Emotiv EEG Neuroheadset. The duration of the measurement was 117 seconds. The eye state was detected via a camera during the EEG measurement and added later manually to the file after analysing the video frames. '1' indicates the eye-closed and '0' the eye-open state. All values are in chronological order with the first measured value at the top of the data.

In [1]:
%load_ext autoreload
%autoreload 2
from sys import path
import torch
import torch.utils.data.dataloader
import numpy as np
path.append('..')

In [2]:
import tensorflow as tf
data_dir = "../../data/raw"
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00264/EEG%20Eye%20State.arff"
datapath = tf.keras.utils.get_file(
        "eeg", origin=url, untar=False, cache_dir=data_dir
    )

from scipy.io import arff
data = arff.loadarff(datapath)
    

In [3]:
type(data)

tuple

In [4]:
len(data)

2

In [5]:
type(data[0]) , type (data[1])

(numpy.ndarray, scipy.io.arff._arffread.MetaData)

In [6]:
newdata = data[0].tolist()
type(newdata)

list

In [7]:
from __future__ import annotations
from typing import Tuple
Tensor = torch.Tensor

class BaseListDataset():
    """Base class for loading list data
    """
    def __init__(self, data: list):
        self.data = data
        self.dataset = []
        self.process_data()
    
    def process_data(self) -> None:
        # abstract function which needs to be inherited
        raise NotImplementedError

    def __len__(self) -> int:
        return len(self.dataset)

    def __getitem__(self, idx: int) -> Tuple[Tensor, int]:
        return self.dataset[idx]

class EEGListDataset(BaseListDataset):
    """Processes data for EEG Lists

    Args:
        BaseListDataset (_type_): base class for list data
    """
    def process_data(self) -> None:
        for record in self.data:
            x = torch.tensor([record[0],record[1],record[2],record[3],
                             record[4],record[5],record[6],record[7],
                             record[8],record[9],record[10],record[11],
                             record[12],record[13]],dtype=float
                            )
            y= torch.tensor(int(record[14]))
            self.dataset.append((x, y))

In [8]:
dsNew = EEGListDataset(data=data[0].tolist())
x,y = dsNew[12000]
y

tensor(1)

In [9]:
from torch.nn.utils.rnn import pad_sequence

class BaseDataIterator:
    def __init__(self, dataset: BaseListDataset, batchsize: int):
        self.dataset = dataset
        self.batchsize = batchsize
        self.curindex = 0

    def __len__(self) -> int:
        # the lenght is the amount of batches
        return int(len(self.dataset) / self.batchsize)

    def __iter__(self) -> BaseDataIterator:
        # initialize index
        self.index = 0
        self.index_list = torch.randperm(len(self.dataset))
        return self
    
    def batchloop(self) -> Tuple[Tensor, Tensor]:
        X = []  # noqa N806
        Y = []  # noqa N806
        # fill the batch
        for _ in range(self.batchsize):
            x, y = self.dataset[int(self.index_list[self.index])]
            X.append(x)
            Y.append(y)
            self.index += 1
        return X, Y

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.dataset) - self.batchsize):
            X, Y = self.batchloop()
            return X, Y
        else:
            raise StopIteration


class PaddedDatagenerator(BaseDataIterator):
    # again, we inherit everything from the baseclass
    def __init__(self, dataset: BaseListDataset	, batchsize: int) -> None:
        # we initialize the super class BaseDataIterator
        # we now have everything the BaseDataIterator can do, for free
        super().__init__(dataset, batchsize)
    
    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.dataset) - self.batchsize):
            X, Y = self.batchloop()
            # I do not have a clue why this function returns a tensort and torch.tensor(X) gives me an error
            X_ = pad_sequence(X, batch_first=True, padding_value=0)  # noqa N806
            return X_, torch.tensor(Y)
        else:
            raise StopIteration

In [10]:
trainloader = PaddedDatagenerator(dsNew, batchsize=32)
testloader = PaddedDatagenerator(dsNew, batchsize=32)

In [27]:
x, y = next(iter(trainloader))
x.shape, y.shape

y

3342-14980


tensor([1, 1, 1,  ..., 1, 1, 1])

In [12]:

class SwitchIterator:
    def __init__(self, dataset: BaseListDataset):
        self.dataset = dataset
        self.curindex = 0
        self.index = 0

    def __len__(self) -> int:
        # the lenght is the amount of batches
        return int(len(self.dataset) / self.batchsize)

    def __iter__(self) -> BaseDataIterator:
        # initialize index
        
        return self
    
    def batchloop(self) -> Tuple[Tensor, Tensor]:
        X = []  # noqa N806
        Y = []  # noqa N806
        # fill the batch
        x, y = self.dataset[int(self.index)]
        X.append(x)
        Y.append(y)
        self.index = self.index + 1 
        currentY = y
        while y == currentY:
            if self.index +1 < len(self.dataset):
                X.append(x)
                Y.append(y)  
                self.index = self.index + 1      
                x, y = self.dataset[int(self.index)]                   
            else:
                self.index = 0
                break                               
     
        return X, Y

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index < (len(self.dataset)):            
            X, Y = self.batchloop()
            return X, Y
        else:
            raise StopIteration


class SwitchPaddedDatagenerator(SwitchIterator):
    # again, we inherit everything from the baseclass
    def __init__(self, dataset: BaseListDataset) -> None:
        # we initialize the super class BaseDataIterator
        # we now have everything the BaseDataIterator can do, for free
        super().__init__(dataset)
    
    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index < (len(self.dataset)):
            X, Y = self.batchloop()
            # we just want to add padding
            X_ = pad_sequence(X, batch_first=True, padding_value=0)  # noqa N806
            return X_, torch.tensor(Y)
        else:
            raise StopIteration

In [13]:
trainloader = SwitchPaddedDatagenerator(dsNew)
testloader = SwitchPaddedDatagenerator(dsNew)



In [28]:
for i in range(1,27):
    x, y = next(iter(trainloader))

4352-14980
5244-14980
5928-14980
6653-14980
9054-14980
11105-14980
12076-14980
12728-14980
12771-14980
12976-14980
13028-14980
14217-14980
14289-14980
14959-14980
0-14980
188-14980
871-14980
1336-14980
1638-14980
2176-14980
2633-14980
2900-14980
2927-14980
3342-14980
4352-14980
5244-14980


In [33]:
x, y = next(iter(trainloader))
x.shape



11105-14980


torch.Size([971, 14])

In [31]:
x

tensor([1, 1, 1,  ..., 1, 1, 1])

In [17]:
class EEGBatchIterator:
    def __init__(self, dataset: BaseListDataset, batchsize: int):
        self.dataset = dataset
        self.batchsize = batchsize
        self.curindex = 0
        self.index = 0

    def __len__(self) -> int:
        # the lenght is the amount of batches
        return int(len(self.dataset) / self.batchsize)

    def __iter__(self) -> BaseDataIterator:
        # initialize index
        
        return self
    
    def batchloop(self) -> Tuple[Tensor, Tensor]:
        X = []  # noqa N806
        Y = []  # noqa N806
        # fill the batch
        x, y = self.dataset[int(self.index)]
        count = 1
        X.append(x)
        Y.append(y)
        self.index = self.index + 1 
        currentY = y
        while y == currentY and count < self.batchsize :
            if self.index == self.__len__:
                break
            else:                                 
                X.append(x)
                Y.append(y)
                count = count +1
                self.index = self.index + 1      
                x, y = self.dataset[int(self.index)]        
        return X, Y

    def __next__(self) -> Tuple[Tensor, Tensor]:
        if self.index <= (len(self.dataset)):
            X, Y = self.batchloop()
            # we just want to add padding
            X_ = pad_sequence(X, batch_first=True, padding_value=0)  # noqa N806
            return X_, torch.tensor(Y)
        else:
            raise StopIteration




In [18]:
trainloader2 = EEGBatchIterator(dsNew, batchsize=64)


In [55]:
x, y = next(iter(trainloader2))
x.shape, y

(torch.Size([17, 14]),
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]))

In [35]:
x

tensor([[4296.4100, 4004.1000, 4263.5900, 4122.0500, 4342.0500, 4590.7700,
         4082.0500, 4606.1500, 4176.9200, 4211.2800, 4194.3600, 4267.1800,
         4602.0500, 4360.5100],
        [4296.4100, 4004.1000, 4263.5900, 4122.0500, 4342.0500, 4590.7700,
         4082.0500, 4606.1500, 4176.9200, 4211.2800, 4194.3600, 4267.1800,
         4602.0500, 4360.5100],
        [4313.3300, 4008.2100, 4268.7200, 4136.9200, 4342.5600, 4595.3800,
         4091.2800, 4623.0800, 4207.6900, 4222.0500, 4206.1500, 4283.0800,
         4607.6900, 4362.5600],
        [4316.4100, 4014.3600, 4278.9700, 4139.4900, 4337.9500, 4583.5900,
         4085.6400, 4614.8700, 4198.4600, 4234.8700, 4212.3100, 4285.6400,
         4620.0000, 4378.4600],
        [4320.0000, 4017.4400, 4284.1000, 4137.9500, 4337.4400, 4586.1500,
         4086.6700, 4612.3100, 4198.9700, 4242.5600, 4217.9500, 4284.6200,
         4626.6700, 4389.7400],
        [4328.2100, 4021.0300, 4283.0800, 4141.0300, 4332.8200, 4599.4900,
         4100.0