## 1. Imports
Minimal imports needed for preprocessing (no model code).

In [None]:
import os, re, glob
import numpy as np
import matplotlib.pyplot as plt
import mne
import librosa

## 2. Dataset Root
Adjust the path below if your dataset is stored elsewhere.

In [None]:
# Root directory of the locally cached Kaggle dataset.
DATASET_ROOT = '/root/.cache/kagglehub/datasets/brianleung2020/eeg-motor-movementimagery-dataset/versions/1'
print('Dataset root:', DATASET_ROOT)
# Surface a wrong path right away: a recursive glob over a missing directory
# raises no error, it just finds zero files.
if not os.path.isdir(DATASET_ROOT):
    print('WARNING: dataset root does not exist:', DATASET_ROOT)

## 3. List EDF Files & Subject ID Utility
Recursively gathers EDF files and extracts the subject ID from each filename, falling back to the parent directory name when the filename contains no ID.

In [None]:
# Collect every EDF file anywhere below the dataset root, in sorted order.
edf_paths = sorted(
    glob.glob(os.path.join(DATASET_ROOT, '**', '*.edf'), recursive=True)
)
print('Found EDF files:', len(edf_paths))
def subject_id_from_path(p):
    """Extract an integer subject ID from an EDF file path.

    The file name is searched first for an ``s``/``S`` immediately followed
    by one to three digits. If that finds nothing, the first run of one to
    three digits in the name of the containing directory is used instead.

    Returns the ID as an ``int``, or ``None`` when neither lookup matches.
    """
    file_name = os.path.basename(p)
    from_file = re.search(r'[sS](\d{1,3})', file_name)
    if from_file is not None:
        return int(from_file.group(1))

    parent_name = os.path.basename(os.path.dirname(p))
    from_parent = re.search(r'\d{1,3}', parent_name)
    if from_parent is None:
        return None
    return int(from_parent.group(0))
# Preview the subject ID parsed for the first five files.
for sample_path in edf_paths[:5]:
    sample_sid = subject_id_from_path(sample_path)
    print(sample_path, '->', sample_sid)

## 4. Preprocessing Parameters

In [None]:
# Sampling rate after resampling, in Hz.
SAMPLE_RATE = 128
# Length of one analysis window, in seconds and in samples.
WINDOW_SECONDS = 2.0
WINDOW_SAMPLES = int(WINDOW_SECONDS * SAMPLE_RATE)
# Lower and upper edges of the band-pass filter.
FMIN = 1.0
FMAX = 40.0
# Mel-spectrogram settings: number of mel bins and hop between frames.
N_MELS = 64
HOP_LENGTH = 64

## 5. Processing Function
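Reads an EDF file, picks the EEG channels, averages them into a single signal, resamples it to `SAMPLE_RATE`, band-pass filters it, splits it into fixed-length windows, and produces a normalized mel-spectrogram for each window.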

In [None]:
def process_file_to_spectrograms(edf_path, max_windows_per_file=50):
    """Convert one EDF recording into normalized mel-spectrogram windows.

    Steps: read the file, average the EEG channels (every channel if none is
    typed as EEG) into a single signal, resample to ``SAMPLE_RATE``, band-pass
    filter between ``FMIN`` and ``FMAX``, cut into non-overlapping windows of
    ``WINDOW_SAMPLES`` samples, and turn each window into a standardized
    (zero-mean, unit-variance) dB-scaled mel-spectrogram.

    Args:
        edf_path: Path of the EDF file to read.
        max_windows_per_file: Stop after this many windows; a falsy value
            (``None`` or ``0``) disables the cap.

    Returns:
        A ``(specs, count)`` tuple where ``specs`` is a list of ``float32``
        arrays, one per window, and ``count`` is ``len(specs)``. A file that
        cannot be read yields ``([], 0)``.
    """
    try:
        raw = mne.io.read_raw_edf(edf_path, preload=True, verbose=False)
    except Exception as e:
        # Best effort: one unreadable file must not abort the whole run.
        print('Failed to read', edf_path, e)
        # Keep the second element an int so both return paths have the
        # same (list, int) shape.
        return [], 0

    # Collapse the recording to one signal by averaging across channels.
    picks = mne.pick_types(raw.info, eeg=True, meg=False)
    data = raw.get_data(picks) if len(picks) else raw.get_data()
    sig = np.mean(data, axis=0)

    # Wrap the averaged signal in a one-channel Raw object so MNE's resampler
    # can be used, then band-pass filter at the new sampling rate.
    info = mne.create_info(ch_names=['EEGavg'], sfreq=raw.info['sfreq'])
    raw_resamp = mne.io.RawArray(sig[np.newaxis, :], info)
    raw_resamp.resample(SAMPLE_RATE, npad='auto')
    sig_rs = raw_resamp.get_data().squeeze()
    sig_rs = mne.filter.filter_data(
        sig_rs, SAMPLE_RATE, l_freq=FMIN, h_freq=FMAX, verbose=False
    )

    n_samples = len(sig_rs)
    specs = []
    # Non-overlapping windows; a trailing partial window is dropped.
    for start in range(0, n_samples - WINDOW_SAMPLES + 1, WINDOW_SAMPLES):
        w = sig_rs[start:start + WINDOW_SAMPLES]
        # NOTE(review): n_fft is left at librosa's default, which appears to
        # be larger than one window with the current settings - consider
        # passing n_fft explicitly. TODO confirm the intended resolution.
        S = librosa.feature.melspectrogram(
            y=w.astype(float),
            sr=SAMPLE_RATE,
            n_mels=N_MELS,
            hop_length=HOP_LENGTH,
        )
        S_db = librosa.power_to_db(S, ref=np.max)
        # Standardize each window; the epsilon avoids dividing by zero when a
        # window is constant.
        S_db = (S_db - S_db.mean()) / (S_db.std() + 1e-8)
        specs.append(S_db.astype(np.float32))
        if max_windows_per_file and len(specs) >= max_windows_per_file:
            break
    return specs, len(specs)

## 6. Build Dataset
Iterates over the EDF files, generates spectrogram windows, collects them together with zero-based subject labels (subject ID minus 1), and saves the arrays to `X.npy` and `Y.npy` for downstream notebooks.

In [None]:
# Optional cap on how many files are processed; None (or 0) means no cap.
# Set to an int to limit during debugging.
MAX_FILES = None
# Upper bound on spectrogram windows taken from any single file.
MAX_WINDOWS_PER_FILE = 30

X = []  # spectrogram windows
Y = []  # zero-based subject label for each window
file_count = 0
for edf_path in edf_paths:
    sid = subject_id_from_path(edf_path)
    # Files without a recognizable subject ID are skipped and not counted.
    if sid is not None:
        file_specs, _count = process_file_to_spectrograms(
            edf_path, max_windows_per_file=MAX_WINDOWS_PER_FILE
        )
        X.extend(file_specs)
        Y.extend([sid - 1] * len(file_specs))
        file_count += 1
        if MAX_FILES and file_count >= MAX_FILES:
            break

X = np.array(X)
Y = np.array(Y)
print('Collected spectrograms:', X.shape, 'labels:', Y.shape)

# Persist both arrays in the current working directory.
np.save('X.npy', X)
np.save('Y.npy', Y)
print('Saved X.npy and Y.npy')

## 7. Sanity Plot
Visualize one spectrogram window to confirm preprocessing.

In [None]:
# Show the first spectrogram window as a quick visual check of the output.
if len(X):
    fig, ax = plt.subplots(figsize=(6, 3))
    image = ax.imshow(X[0], aspect='auto', origin='lower')
    # Labels are stored zero-based, so add 1 to display the subject ID.
    ax.set_title(f'Subject {Y[0]+1} example spectrogram')
    ax.set_xlabel('Time frames')
    ax.set_ylabel('Mel bins')
    fig.colorbar(image, ax=ax, label='Normalized dB')
    plt.show()