In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'brain-motor-imagery-classification:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F77659%2F8567221%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240516%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240516T131433Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4f4ff009c47fb4ce28e14c1d732038b948ab7113100a1bb4ce82a80fec2a3a982a0fc8da28c9e99d4e89ee69d9112ec1261bb9035a15eb8cdc7d46732eecf8a4a691c5c2755ccde1bd23b8c33cb40cb7fc3bdec549111a9a8885e901693722ac723b12b1d00eead748d2c29ada3a3b9f0723ec94ead5460ef81f5f520b5bf26973faa8980dbe22f65794585128bc21cbe990cc662facf5f3833eb643679dcc7cc983482aa8a95e72893239d899e006d56877cb1296680426e4f7f119ddf93c4f3297784807237af23c295efb3a8f1ea3c81f4a75ed237457d5bf77a6a29e1279e8a17ccf2507e06551f174fc1af0d4fb0d14547c71122da3bd899f72e2b69420'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING and
# unpack the archive into KAGGLE_INPUT_PATH/<directory>.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split once so the URL part stays whole even if it ever contains a colon.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional; without it only the byte counter is shown.
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                if total_length > 0:
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # Push buffered bytes to disk and rewind: the tar branch reopens the
            # file by name, the zip branch reads from this handle.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bound to `tar_archive`, not `tarfile`, so the module is not
                # rebound for later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, name) for name in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

# make dataframe

In [None]:
# Build the list of unique recording prefixes: every file name with its
# array-specific suffix removed, kept in first-seen order.
train_list = []

for _folder, _subdirs, names in os.walk('/kaggle/input/brain-motor-imagery-classification/train/train'):
    for name in names:
        prefix = name
        for suffix in ('data_time_series.npy', 'data_time_stamps.npy', 'label_time_series.npy', 'label_time_stamps.npy'):
            prefix = prefix.replace(suffix, '')
        if prefix in train_list:
            continue
        train_list.append(prefix)
print(len(train_list))

In [None]:
# Peek at the first ten recording prefixes.
train_list[:10]

In [None]:
# Map every recording prefix to the data-sample indices at which its labels start.
index_dict = {}
for file in tqdm(train_list):
    data_time = np.load('/kaggle/input/brain-motor-imagery-classification/train/train/' + file + 'data_time_stamps.npy')
    label_time = np.load('/kaggle/input/brain-motor-imagery-classification/train/train/' + file + 'label_time_stamps.npy')

    # For each label timestamp keep the index of the first data timestamp that
    # is >= it. One vectorised comparison per label replaces the per-sample
    # Python loop and the extra np.where lookup; the resulting indices are the same.
    # A label later than every data timestamp contributes no entry, as before.
    start_index = []
    for lt in label_time:
        at_or_after = np.flatnonzero(data_time >= lt)
        if at_or_after.size:
            start_index.append(at_or_after[0])

    index_dict[file] = start_index

In [None]:
# For every recording: one label / source-file entry per label row, and one
# signal segment (first 8 columns) per start index.
label_list = []
signal_list = []
dir_list = []
for file in tqdm(train_list):
    index_arr = index_dict[file]
    data_series = np.load('/kaggle/input/brain-motor-imagery-classification/train/train/' + file + 'data_time_series.npy')
    label_series = np.load('/kaggle/input/brain-motor-imagery-classification/train/train/' + file + 'label_time_series.npy')

    source_name = file + 'data_time_series.npy'
    for label in label_series:
        dir_list.append(source_name)
        label_list.append(label[0])

    # Segment k runs from start k up to start k+1; the last one runs to the end.
    segment_stops = list(index_arr[1:]) + [None]
    for start, stop in zip(index_arr, segment_stops):
        signal_list.append(data_series[start:stop, :8])

# EDA

## old train

- visualize each block

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the four arrays stored for recording s1_d2_p002_002.
path = "/kaggle/input/brain-motor-imagery-classification/train/train/s1_d2_p002_002_"
data_series, data_stamps, label_series, label_stamps = (
    np.load(path + suffix)
    for suffix in (
        'data_time_series.npy',
        'data_time_stamps.npy',
        'label_time_series.npy',
        'label_time_stamps.npy',
    )
)

In [None]:
# Report the shape of each array of the sample recording.
for caption, array in (
    ("data time series shape:", data_series),
    ("data time stamps shape:", data_stamps),
    ("label time series shape:", label_series),
    ("label time stamps shape:", label_stamps),
):
    print(caption, array.shape)

In [None]:
# Display the raw signal array of the sample recording.
data_series

In [None]:
import matplotlib.pyplot as plt

# One panel per column of data_series on a 9x2 grid; 17 panels are drawn and
# the 18th stays empty.
fig, axes = plt.subplots(9, 2, figsize=(10, 20))

for count, ax in zip(range(17), axes.flat):
    ax.plot(data_series[:, count])
    ax.set_title('Chanel '+str(count+1))
plt.tight_layout()

In [None]:
# Array loaded from data_time_stamps.npy.
data_stamps

In [None]:
# Array loaded from label_time_series.npy.
label_series

In [None]:
# Array loaded from label_time_stamps.npy.
label_stamps

- visualize each label

In [None]:
# First label timestamp and the matching label entry.
lbst_0 = label_stamps[0]
lbsr_0 = label_series[0]
lbst_0, lbsr_0

In [None]:
# Data timestamps strictly before the first label timestamp.
dtst_0 = data_stamps[data_stamps < lbst_0]
len(dtst_0)

In [None]:
# Leading rows of data_series, as many as there are timestamps in dtst_0
# (assumes rows of data_series line up with data_stamps — TODO confirm).
dtsr_0 = data_series[:len(dtst_0)]
dtsr_0

In [None]:
# Second label timestamp and the matching label entry.
lbst_1 = label_stamps[1]
lbsr_1 = label_series[1]
lbst_1, lbsr_1

In [None]:
# Data timestamps t with lbst_0 <= t < lbst_1.
dtst_1 = data_stamps[data_stamps < lbst_1]
dtst_1 = dtst_1[dtst_1 >= lbst_0]
len(dtst_1)

In [None]:
# Rows of data_series that come right after the dtsr_0 block, len(dtst_1) of them.
dtsr_1 = data_series[len(dtst_0):len(dtst_0) + len(dtst_1)]
dtsr_1

label 110

In [None]:
import matplotlib.pyplot as plt

# One panel per column of dtsr_1 on a 9x2 grid; 17 panels are drawn and the
# 18th stays empty.
fig, axes = plt.subplots(9, 2, figsize=(10, 20))

for count, ax in zip(range(17), axes.flat):
    ax.plot(dtsr_1[:, count])
    ax.set_title('Chanel '+str(count+1))
plt.tight_layout() # 110

In [None]:
# Third, fourth and fifth label timestamps with their label entries.
lbst_2 = label_stamps[2]
lbsr_2 = label_series[2]

lbst_3 = label_stamps[3]
lbsr_3 = label_series[3]

lbst_4 = label_stamps[4]
lbsr_4 = label_series[4]
lbst_4, lbsr_4

In [None]:
# Data timestamps t with lbst_1 <= t < lbst_2.
dtst_2 = data_stamps[data_stamps < lbst_2]
dtst_2 = dtst_2[dtst_2 >= lbst_1]
len(dtst_2)

In [None]:
# Data timestamps t with lbst_2 <= t < lbst_3.
dtst_3 = data_stamps[data_stamps < lbst_3]
dtst_3 = dtst_3[dtst_3 >= lbst_2]
len(dtst_3)

In [None]:
# Data timestamps t with lbst_3 <= t < lbst_4.
dtst_4 = data_stamps[data_stamps < lbst_4]
dtst_4 = dtst_4[dtst_4 >= lbst_3]
len(dtst_4)

In [None]:
# Row offset in data_series right after the dtsr_0 and dtsr_1 blocks.
ind = len(dtst_0) + len(dtst_1)
ind

In [None]:
# Next len(dtst_2) rows of data_series.
dtsr_2 = data_series[ind:ind + len(dtst_2)]
len(dtsr_2)

In [None]:
# Advance the offset past dtsr_2, then take the next len(dtst_3) rows.
# NOTE: `ind` is updated in place, so re-running this cell alone shifts the window again.
ind += len(dtsr_2)
dtsr_3 = data_series[ind:ind + len(dtst_3)]
len(dtsr_3)

In [None]:
# Advance the offset past dtsr_3, then take the next len(dtst_4) rows.
# NOTE: `ind` is updated in place, so re-running this cell alone shifts the window again.
ind += len(dtsr_3)
dtsr_4 = data_series[ind:ind + len(dtst_4)]
len(dtsr_4)

label 120

In [None]:
import matplotlib.pyplot as plt

# One panel per column of dtsr_0 on a 9x2 grid; 17 panels are drawn and the
# 18th stays empty.
fig, axes = plt.subplots(9, 2, figsize=(10, 20))

for count, ax in zip(range(17), axes.flat):
    ax.plot(dtsr_0[:, count])
    ax.set_title('Chanel '+str(count+1))
plt.tight_layout() # 120

other label 120

In [None]:
import matplotlib.pyplot as plt

# One panel per column of dtsr_4 on a 9x2 grid; 17 panels are drawn and the
# 18th stays empty.
fig, axes = plt.subplots(9, 2, figsize=(10, 20))

for count, ax in zip(range(17), axes.flat):
    ax.plot(dtsr_4[:, count])
    ax.set_title('Chanel '+str(count+1))
plt.tight_layout() # 120

label 150

In [None]:
import matplotlib.pyplot as plt

# One panel per column of dtsr_2 on a 9x2 grid; 17 panels are drawn and the
# 18th stays empty.
fig, axes = plt.subplots(9, 2, figsize=(10, 20))

for count, ax in zip(range(17), axes.flat):
    ax.plot(dtsr_2[:, count])
    ax.set_title('Chanel '+str(count+1))
plt.tight_layout() # 150

other label 150

In [None]:
import matplotlib.pyplot as plt

# One panel per column of dtsr_3 on a 9x2 grid; 17 panels are drawn and the
# 18th stays empty.
fig, axes = plt.subplots(9, 2, figsize=(10, 20))

for count, ax in zip(range(17), axes.flat):
    ax.plot(dtsr_3[:, count])
    ax.set_title('Chanel '+str(count+1))
plt.tight_layout() # 150

## new train

### load new data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt

# Collect the name of every file stored below the application training folder.
path = "/kaggle/input/brain-motor-imagery-classification/train_application/train/train/"
trainApp_list = [
    name
    for _folder, _subdirs, names in os.walk(path)
    for name in names
]
print(len(trainApp_list))

In [None]:
# Label table of the application training set, read from label_application.csv.
label = pd.read_csv('/kaggle/input/brain-motor-imagery-classification/train_application/label_application.csv')
label

In [None]:
# Load the array whose file name is the first `id` of the label table.
samp_data = np.load(path + label['id'][0] +'.npy')
samp_data, samp_data.shape

In [None]:
# Plot the first 8 columns of the sample on a 4x2 grid, then print its label.
fig, axes = plt.subplots(4, 2, figsize=(16, 16))

for count, ax in enumerate(axes.flat):
    ax.plot(samp_data[:, count])
    ax.set_title('Chanel '+str(count+1))
plt.tight_layout()
print(label['label'][0])

In [None]:
# Record the shape of each of the first 180 application samples so that
# differing lengths can be spotted. Only the shape tuple is stored: the list
# used to hold the full arrays, which made the later `[:, 0]` count operate on
# signal values instead of sample lengths.
ls_shape = []

for i in range(180):
    ls_shape.append(np.load(path + label['id'][i] +'.npy').shape)
ls_shape

In [None]:
# Frequency of each value found in column 0 of np.array(ls_shape).
pd.DataFrame(np.array(ls_shape)[:, 0]).value_counts()

In [None]:
# Shape of the first sample after keeping at most 1734 rows and 8 columns.
np.load(path + label['id'][0] +'.npy')[:1734, :8].shape

In [None]:
from tqdm import tqdm

# Build one [cropped signal, [label]] pair per application sample.
# Each iteration reads the i-th id and label; the loop previously always read
# row 0, so all 180 entries were copies of the first sample.
ls_success = []
for i in tqdm(range(180)):
    # Keep at most 1734 rows and the first 8 columns of the sample.
    cropped = np.load(path + label['id'][i] +'.npy')[:1734, :8]
    ls_success.append([cropped, [label['label'][i]]])

In [None]:
# First [signal, [label]] pair.
ls_success[0]

In [None]:
# Shape of the signal part of the first pair.
ls_success[0][0].shape

In [None]:
# Shapes of the cropped signals, then the frequency of each row count.
ls_shape = [ls_success[i][0].shape for i in tqdm(range(180))]
pd.DataFrame(np.array(ls_shape)[:, 0]).value_counts()

In [None]:
# Two-column frame built from ls_success: the signal chunk and its label list.
newtrain_app = pd.DataFrame(ls_success)
newtrain_app.columns = ['chunk_seq', 'label']
newtrain_app

In [None]:
# Shape of the first signal chunk.
newtrain_app.chunk_seq[0].shape

In [None]:
# Frequency of each label value.
# NOTE(review): every label cell is a one-element list; value_counts may reject
# unhashable lists — verify, or unwrap the label first.
pd.Series(newtrain_app.label).value_counts()

In [None]:
# NOTE(review): chunk_seq cells hold arrays, which CSV stores as text — confirm
# this file is only a convenience copy and is not reloaded as numeric data.
newtrain_app.to_csv('newtrain_app.csv', index=False)

In [None]:
!pip install datasets

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook. A token used to be written in
# a comment on this line: treat it as leaked and revoke it in the Hugging Face
# account settings. The token is now read from the HF_TOKEN environment
# variable; when it is unset, login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

### concat data

In [None]:
from datasets import load_dataset

# Load the "Expss4/chunk_train_seq" dataset.
dataset = load_dataset("Expss4/chunk_train_seq")
dataset

In [None]:
# Keep only the 'train' split.
train_dataset = dataset['train']
train_dataset

In [None]:
import pandas as pd
# Convert the split to a DataFrame.
df_train = pd.DataFrame(train_dataset)
df_train

In [None]:
# Shape of the first chunk_seq entry once converted to an array.
np.array(df_train.chunk_seq[0]).shape

In [None]:
# Append the hub rows below the locally built ones.
# NOTE: newtrain_app is reassigned from itself, so re-running this cell appends df_train again.
newtrain_app = pd.concat([newtrain_app, df_train], ignore_index=True)

### push to hub

In [None]:
from datasets import Dataset
# Wrap the two columns of newtrain_app in a Dataset.
ds = Dataset.from_dict({"chunk_seq": newtrain_app.chunk_seq,
                        "label": newtrain_app.label})
ds

In [None]:
# Upload as a private dataset repository.
ds.push_to_hub('herobyeKung/new_brainwave_train', private=True)

In [None]:
from datasets import load_dataset

# Load the dataset back from the repository that was just pushed to.
dataset = load_dataset("herobyeKung/new_brainwave_train")
dataset

## Boss from Ohm data

In [None]:
!pip install datasets

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook. A token used to be written in
# a comment on this line: treat it as leaked and revoke it in the Hugging Face
# account settings. The token is now read from the HF_TOKEN environment
# variable; when it is unset, login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

In [None]:
from datasets import load_dataset

# Load the "Expss4/EXP-Last-Place" dataset.
dataset = load_dataset("Expss4/EXP-Last-Place")
dataset

In [None]:
import pandas as pd

# Convert the 'train' split to a DataFrame.
df = pd.DataFrame(dataset['train'])
df

In [None]:
# Length of the first element of the first 'data' entry.
len(df['data'][0][0])

In [None]:
# Keep the first 'data' entry as a working example.
exp = df['data'][0]
exp

In [None]:
plt.plot(exp)

In [None]:
# First four elements of the example.
exp[:4]

In [None]:
# The same four elements as an array, transposed and converted back to nested lists.
(np.array(exp[:4]).T).tolist()

In [None]:
# Label of the first row wrapped in a one-element list.
[df['label'][0]]

In [None]:
from tqdm import tqdm
# Build one transposed 4-element slice and one [label] list per row of df.
# Each row now uses its own data (`dt_f`); the loop previously converted the
# example `exp` (row 0) on every iteration, so all entries were identical.
ls_data, ls_label = [], []
for i in tqdm(range(len(df))):
    dt_f = df['data'][i]
    lb_f = df['label'][i]
    # First four elements of the row's data, transposed, as nested lists.
    ls_data.append((np.array(dt_f[:4]).T).tolist())
    ls_label.append([lb_f])

In [None]:
# Shapes of the collected data and labels once stacked into arrays.
np.array(ls_data).shape, np.array(ls_label).shape

In [None]:
from datasets import Dataset
# Wrap both lists in a Dataset.
ds = Dataset.from_dict({"data": ls_data,
                        "label": ls_label})
ds

In [None]:
# Upload as a private dataset repository.
ds.push_to_hub('Expss4/EGG_style', private=True)

# PreData for EEG Deformer

In [None]:
!pip install datasets

In [None]:
import os

from huggingface_hub import login

# Never paste an access token into the notebook. A token used to be written in
# a comment on this line: treat it as leaked and revoke it in the Hugging Face
# account settings. The token is now read from the HF_TOKEN environment
# variable; when it is unset, login() falls back to its interactive prompt.
login(token=os.environ.get("HF_TOKEN"))

## duplicate data

In [None]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd


In [None]:
from datasets import load_dataset

# Load the "Expss4/EGG_style" dataset.
dataset_i = load_dataset("Expss4/EGG_style")
dataset_i

In [None]:
dataset_i['train']

In [None]:
# Convert the 'train' split to a DataFrame.
df_i = pd.DataFrame(dataset_i['train'])
df_i

In [None]:
# First 'data' entry as an array.
np.array(df_i.data[0])

In [None]:
# First element of the first 'data' entry.
df_i.data[0][0]

In [None]:
# Preview of the duplication on the first 'data' entry.
# NOTE(review): axis=0 repeats every entry along the first axis; the earlier
# comment called this "duplicated columns" — confirm which axis is intended.
new_data = np.repeat(df_i.data[0],2,axis = 0)  # each axis-0 entry repeated twice
new_data, new_data.shape

In [None]:
# Repeat every sample twice along axis 0, each row built from its own data.
# The loop previously always repeated df_i.data[0], which had itself been
# doubled by the first iteration, and wrote through chained assignment
# (df_i.data[i] = ...). Assigning the whole column avoids both problems.
# NOTE: re-running this cell doubles the already doubled data again.
df_i['data'] = [
    np.repeat(sample, 2, axis=0).tolist()
    for sample in tqdm(df_i['data'])
]

In [None]:
# Shape of the first 'data' entry after the duplication step.
np.array(df_i.data[0]).shape

In [None]:
from datasets import Dataset
# Wrap both columns of df_i in a Dataset.
ds_i = Dataset.from_dict({"data": df_i.data,
                        "label": df_i.label})
ds_i

In [None]:
# Upload to 'Expss4/EGG_style', the same repository the data was loaded from.
ds_i.push_to_hub('Expss4/EGG_style', private=True)

## debug data

In [None]:
# Gradient Accumulation Settings
# Set to 1 for no accumulation
train_batch_size = 8   # batch size passed to the training DataLoader
eval_batch_size = 16   # batch size passed to the validation DataLoader
num_accumulate = 4

In [None]:
# Cross validation: shuffled K-fold splitter with a fixed seed.
from sklearn.model_selection import KFold
k_splits = 4
kf = KFold(n_splits=k_splits, shuffle=True, random_state=42)
kf

In [None]:
# Print the train/validation indices of every fold. The total is taken from
# k_splits instead of a hard-coded 4 so the message follows the splitter.
# After the loop, train_idx / val_idx hold the indices of the last fold.
for fold, (train_idx, val_idx) in enumerate(kf.split(dataset_i['train'])):
    print(f"Fold {fold+1} of {k_splits}")
    print(train_idx, val_idx)

In [None]:
# Subsets of the 'train' split selected by train_idx / val_idx, i.e. whatever
# the fold loop assigned last.
train_set = torch.utils.data.Subset(dataset_i['train'], train_idx)
val_set = torch.utils.data.Subset(dataset_i['train'], val_idx)

In [None]:
# Index arrays of both subsets and their shapes.
train_set.indices, train_set.indices.shape, val_set.indices, val_set.indices.shape

In [None]:
# Unshuffled loaders; the lengths shown are the number of batches.
train_dataloader = DataLoader(train_set, batch_size=train_batch_size, shuffle=False)
val_dataloader = DataLoader(val_set, batch_size=eval_batch_size, shuffle=False)
len(train_dataloader), len(val_dataloader)

In [None]:
# Take the first validation batch only (the loop stops after one iteration).
for idx, batch in enumerate(tqdm(val_dataloader)):
    # NOTE(review): the cells below index batch[inputs] / batch[targets], so
    # `batch` is presumably a dict and these two names hold its keys, not
    # tensors — confirm.
    inputs, targets = batch
    print(inputs, targets)
    break

In [None]:
# First element of the entry stored under `targets`.
defg = batch[targets][0]
defg, defg.shape

In [None]:
# Stack the nested entry stored under `inputs` into a single tensor.
abc = torch.stack([torch.stack(b_i) for b_i in batch[inputs]])
abc, abc.shape

In [None]:
# Swap dimensions 1 and 2.
abc_t = torch.transpose(abc, 1, 2)
abc_t, abc_t.shape

In [None]:
# Cast to float32 (abc_t above is not affected).
abc = abc.to(torch.float32)
abc.dtype

# test

In [None]:
# Read sample_submission.csv.
spsb = pd.read_csv('/kaggle/input/brain-motor-imagery-classification/sample_submission.csv')
spsb

In [None]:
import os

# Collect the name of every file stored below the test folder.
test_list = [
    name
    for _folder, _subdirs, names in os.walk('/kaggle/input/brain-motor-imagery-classification/test')
    for name in names
]
print(len(test_list))

In [None]:
# Position of one specific file name in test_list (raises ValueError if absent).
test_list.index("a0dfb1f6-a76b-404f-a04a-b61e955ce1ef.npy")