# A. Importing Libraries

In [34]:
from scipy.io import loadmat
import mne
import os
import glob
import h5py

import numpy as np
import matplotlib.pyplot as plt

# B. Importing Data

## 1. Data File Structure

- `data/`
  - `openendedloosely_cleaned/`
    - `Data_Design_Sub_x.mat`
  - `openendedloosely_raw/`
    - `Participant_x/`
      - `filename.vhdr` — header (metadata)
      - `filename.eeg` — EEG signal (binary samples)
      - `filename.vmrk` — event markers (timestamps)


## 2. Loading Raw Data

In [18]:
# Resolve the raw-data directory relative to the notebook's working directory.
mne_data_path = os.path.abspath("../../data/openendedloosely_raw/")

# glob.glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the load order (and therefore the dict order and the printed lists) reproducible.
vhdr_files = sorted(glob.glob(os.path.join(mne_data_path, "Participant_*", "*.vhdr")))

if not vhdr_files:
    # Make a wrong working directory / missing dataset visible instead of
    # silently continuing with an empty dictionary.
    print(f"WARNING: no .vhdr files found under {mne_data_path}")

# Maps participant folder name (e.g. "Participant_10") -> loaded recording.
eeg_raw_data = {}

for vhdr in vhdr_files:
    # The name of the folder containing the header file is used as the key.
    participant_name = os.path.basename(os.path.dirname(vhdr))
    print(f"Loading {participant_name}: {vhdr}")

    if participant_name in eeg_raw_data:
        # Several header files in one folder share the same key; the later one wins.
        print(f"WARNING: {participant_name} has more than one .vhdr file; replacing the earlier recording")

    raw = mne.io.read_raw_brainvision(vhdr, preload=True)
    eeg_raw_data[participant_name] = raw

# MNE Raw objects
print("\nLoaded EEG datasets:", list(eeg_raw_data.keys()))
print("Total loaded:", len(eeg_raw_data))

Loading Participant_1: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_raw\Participant_1\Feb_07(1)_2014.vhdr
Extracting parameters from c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_raw\Participant_1\Feb_07(1)_2014.vhdr...
Setting channel info structure...
Reading 0 ... 2612146  =      0.000 ...  5224.292 secs...
Loading Participant_10: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_raw\Participant_10\april_2(1).vhdr
Extracting parameters from c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_raw\Participant_10\april_2(1).vhdr...
Setting channel info structure...
Reading 0 ... 1562175  =      0.000 ...  3124.350 secs...
Loading Participant_15: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_raw\Participant_15\april_16(1).vhdr
Extracting parameters from c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_raw\Participant_15\april_16(1).vhdr...
Setting channel i

## 3. Loading Cleaned Data

### Inspect MATLAB Version

In [14]:
def inspect_signature(path):
    """Return the leading header bytes of a file for format identification.

    Opens ``path`` in binary mode, reads up to 128 bytes and returns a tuple
    ``(first 4 bytes, first 20 bytes)`` of that header.
    """
    with open(path, 'rb') as handle:
        header = handle.read(128)
    magic, banner = header[:4], header[:20]
    return magic, banner

# Build the file list in this cell: `mat_files` is otherwise only defined in the
# "Import data" cell further down, so on a fresh kernel (Restart & Run All) this
# loop would fail with a NameError.
cleaned_path = os.path.abspath("../../data/openendedloosely_cleaned/")
mat_files = sorted(glob.glob(os.path.join(cleaned_path, "Data_Design_Sub_*.mat")))

# Print the MAT-file header signature of every cleaned file.
for f in mat_files:
    filename = os.path.basename(f)
    print(f"\nFile: {filename}")
    print(inspect_signature(f))



File: Data_Design_Sub_1.mat
(b'MATL', b'MATLAB 5.0 MAT-file,')

File: Data_Design_Sub_10.mat
(b'MATL', b'MATLAB 7.3 MAT-file,')

File: Data_Design_Sub_15.mat
(b'MATL', b'MATLAB 7.3 MAT-file,')

File: Data_Design_Sub_20.mat
(b'MATL', b'MATLAB 7.3 MAT-file,')

File: Data_Design_Sub_25.mat
(b'MATL', b'MATLAB 7.3 MAT-file,')

File: Data_Design_Sub_5.mat
(b'MATL', b'MATLAB 7.3 MAT-file,')


### Import data

In [19]:
# Directory holding the cleaned per-subject MAT files, resolved relative to the
# notebook's working directory.
cleaned_path = os.path.abspath("../../data/openendedloosely_cleaned/")

# glob.glob gives no ordering guarantee; sort so the load order and the printed
# dataset list are the same on every machine.
mat_files = sorted(glob.glob(os.path.join(cleaned_path, "Data_Design_Sub_*.mat")))

# Maps file stem (e.g. "Data_Design_Sub_10") -> loaded MAT content.
eeg_cleaned_data = {}

def is_v73(path):
    """Tell whether the file at ``path`` announces itself as a MATLAB 7.3 file.

    The data contains two MAT-file versions (v5.0 and v7.3) and each needs its
    own loader. Only the first 20 bytes are read; undecodable bytes are dropped
    before the text is searched for "MATLAB 7.3".
    """
    with open(path, "rb") as handle:
        header_bytes = handle.read(20)
    header_text = header_bytes.decode(errors="ignore")
    return "MATLAB 7.3" in header_text

# Load every cleaned file with the reader that matches its MAT-file version.
for mat_file in mat_files:
    # Dictionary key is the file name without its extension.
    key = os.path.splitext(os.path.basename(mat_file))[0]
    print(f"Loading {key}: {mat_file}")

    # v7.3 -> HDF5 file object (opened read-only and kept open in the dict)
    # v5.0 -> plain Python dictionary returned by scipy's loadmat
    eeg_cleaned_data[key] = (
        h5py.File(mat_file, "r") if is_v73(mat_file) else loadmat(mat_file)
    )

print("\nLoaded MAT datasets:", list(eeg_cleaned_data.keys()))
print("Total:", len(eeg_cleaned_data))

Loading Data_Design_Sub_1: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_cleaned\Data_Design_Sub_1.mat
Loading Data_Design_Sub_10: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_cleaned\Data_Design_Sub_10.mat
Loading Data_Design_Sub_15: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_cleaned\Data_Design_Sub_15.mat
Loading Data_Design_Sub_20: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_cleaned\Data_Design_Sub_20.mat
Loading Data_Design_Sub_25: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_cleaned\Data_Design_Sub_25.mat
Loading Data_Design_Sub_5: c:\Users\Aryo\PersonalMade\Programming\GAN\repo\data\openendedloosely_cleaned\Data_Design_Sub_5.mat

Loaded MAT datasets: ['Data_Design_Sub_1', 'Data_Design_Sub_10', 'Data_Design_Sub_15', 'Data_Design_Sub_20', 'Data_Design_Sub_25', 'Data_Design_Sub_5']
Total: 6


# C. Exploratory Data Analysis

## 1. Raw Data Description

### Raw EEG

In [20]:
def inspect_raw_eeg(raw):
    """Print a short summary of a raw EEG recording.

    Prints the object's repr, its first ten channel names, the data shape
    (channels x samples) and the first ten samples of the first channel.

    Parameters
    ----------
    raw : object
        Recording exposing ``info['ch_names']``, ``n_times`` and
        ``get_data(picks=..., start=..., stop=...)`` (an MNE Raw object in
        this notebook).
    """
    print("=== EEG RAW INFO ===")
    print(raw)
    print("\n=== CHANNELS ===")
    print(raw.info['ch_names'][:10], "...")

    # The shape comes from metadata, so the whole recording is not copied
    # just to read its dimensions.
    n_channels = len(raw.info['ch_names'])
    print("\n=== DATA SHAPE (channels x samples) ===")
    print((n_channels, raw.n_times))

    # Fetch only the slice that is actually displayed.
    preview = raw.get_data(picks=[0], start=0, stop=10)
    print("\n=== SAMPLE PREVIEW (first channel, first 10 samples) ===")
    print(preview[0])

# Print the summary for the recording stored under "Participant_1".
inspect_raw_eeg(eeg_raw_data["Participant_1"])

=== EEG RAW INFO ===
<RawBrainVision | Feb_07(1)_2014.eeg, 63 x 2612147 (5224.3 s), ~1.23 GiB, data loaded>

=== CHANNELS ===
['Fp1', 'Fz', 'F3', 'F7', 'FT9', 'FC5', 'FC1', 'C3', 'T7', 'TP9'] ...

=== DATA SHAPE (channels x samples) ===
(63, 2612147)

=== SAMPLE PREVIEW (first channel, first 10 samples) ===
[-0.00796543 -0.00796101 -0.00796801 -0.00797028 -0.00797124 -0.00797133
 -0.00797287 -0.00797358 -0.00796958 -0.00796489]


### Cleaned EEG

In [29]:
def safe_preview(x, limit=10):
    """Return the first ``limit`` values of ``x`` after flattening it.

    ``x`` is converted to a NumPy array first. If the conversion or the
    slicing raises, the string "<unpreviewable>" is returned instead.
    """
    try:
        return np.array(x).reshape(-1)[:limit]
    except Exception:
        return "<unpreviewable>"

def inspect_mat_data(mat, preview_limit=10):
    """Print the keys of a loaded MAT file and a short summary of each entry.

    Two container types are handled:

    * ``dict`` (SciPy loader, v5): keys starting with "__" are skipped;
      NumPy arrays show shape and preview, anything else shows its type.
    * ``h5py.File`` (HDF5, v7.3): datasets show shape and preview, groups
      list their sub-keys.

    Any other ``mat`` type prints nothing. Returns ``None``.
    """
    # SciPy case (v5)
    if isinstance(mat, dict):
        names = [name for name in mat.keys() if not name.startswith("__")]
        print("Keys:", names)

        for name in names:
            value = mat[name]
            print(f"\n-- {name} --")

            if not isinstance(value, np.ndarray):
                print("type:", type(value))
                continue

            print("shape:", value.shape)
            print("preview:", safe_preview(value, preview_limit))
        return

    # Anything that is not an HDF5 file object is ignored.
    if not isinstance(mat, h5py.File):
        return

    # HDF5 case (v7.3)
    names = list(mat.keys())
    print("Keys:", names)

    for name in names:
        value = mat[name]
        print(f"\n-- {name} --")

        if isinstance(value, h5py.Dataset):
            print("shape:", value.shape)
            print("preview:", safe_preview(value, preview_limit))
        elif isinstance(value, h5py.Group):
            print("Group with subkeys:", list(value.keys()))

# Show the same banner + content preview for one v5.0 and one v7.3 file.
_banner = "=" * 80

for _position, _subject_key in enumerate(("Data_Design_Sub_1", "Data_Design_Sub_10")):
    if _position:
        # Blank line between consecutive previews.
        print()
    print(_banner)
    print(f"PREVIEW OF {_subject_key}")
    print(_banner)
    inspect_mat_data(eeg_cleaned_data[_subject_key])

PREVIEW OF Data_Design_Sub_1
Keys: ['Design_1_1_IE', 'Design_1_1_IG', 'Design_1_1_PU', 'Design_1_1_RIE', 'Design_1_1_RIG', 'Design_1_2_IE', 'Design_1_2_IG', 'Design_1_2_PU', 'Design_1_2_RIE', 'Design_1_2_RIG', 'Design_1_3_IE', 'Design_1_3_IG', 'Design_1_3_PU', 'Design_1_3_RIE', 'Design_1_3_RIG', 'Design_1_4_IE', 'Design_1_4_IG', 'Design_1_4_PU', 'Design_1_4_RIE', 'Design_1_4_RIG', 'Design_1_5_IE', 'Design_1_5_IG', 'Design_1_5_PU', 'Design_1_5_RIE', 'Design_1_5_RIG', 'Design_1_6_IE', 'Design_1_6_IG', 'Design_1_6_PU', 'Design_1_6_RIE', 'Design_1_6_RIG', 'Design_1_RST1', 'Design_1_RST2']

-- Design_1_1_IE --
shape: (63, 18000)
preview: [9.02585853e-06 1.60095187e-05 1.89526859e-05 1.58640779e-05
 1.19633963e-05 1.06680568e-05 1.42827485e-05 2.01476532e-05
 2.50142040e-05 2.66213462e-05]

-- Design_1_1_IG --
shape: (63, 41500)
preview: [-5.91825580e-07 -2.38090595e-06 -3.32150893e-06 -3.28518556e-06
 -3.50044570e-06 -4.19427790e-06 -5.18841580e-06 -5.44808762e-06
 -4.49535263e-06 -2.593680

## 2. Raw and Cleaned EEG Merge

In [31]:
# Pair each raw recording with the cleaned MAT data of the same participant.
unified = {}

for p_key in eeg_raw_data:
    # "Participant_10" -> 10 -> "Data_Design_Sub_10"
    p_num = int(p_key.split("_")[1])
    mat_key = f"Data_Design_Sub_{p_num}"

    if mat_key in eeg_cleaned_data:
        unified[p_key] = {
            "raw_eeg": eeg_raw_data[p_key],
            "clean_eg": eeg_cleaned_data[mat_key]
        }
    else:
        # Participants without cleaned data are left out of the merge.
        print(f"WARNING: Missing MAT data for {p_key}")

print("Unified participants:", list(unified.keys()))

Unified participants: ['Participant_1', 'Participant_10', 'Participant_15', 'Participant_20', 'Participant_25', 'Participant_5']


In [32]:
def safe_arr_shape(x):
    """Return the shape of ``x`` once converted to a NumPy array.

    Returns
    -------
    tuple or None
        ``np.array(x).shape``, or ``None`` when the conversion raises an
        ordinary exception.

    Only ``Exception`` subclasses are suppressed, so ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate (a bare ``except`` would hide them).
    """
    try:
        return np.array(x).shape
    except Exception:
        return None

def preview_unified(unified, limit=5):
    """Print a per-participant summary of the merged raw/cleaned dataset.

    Parameters
    ----------
    unified : dict
        Maps participant name to a dict with the keys ``"raw_eeg"`` (recording
        exposing ``info['nchan']``, ``info['sfreq']`` and ``n_times``) and
        ``"clean_eg"`` (either a ``dict`` or an object with ``keys()`` and item
        access, such as an HDF5 file).
    limit : int, optional
        Number of participants to describe in detail (default 5).
    """
    print("\n=== UNIFIED DATASET PREVIEW ===")
    participants = list(unified.keys())
    print("Participants:", participants)

    for p in participants[:limit]:
        entry = unified[p]
        raw = entry["raw_eeg"]
        clean = entry["clean_eg"]

        print(f"\n--- {p} ---")

        # RAW EEG SUMMARY
        print("RAW EEG:")
        print(f"  Channels: {raw.info['nchan']}")
        print(f"  Sampling rate: {raw.info['sfreq']} Hz")

        # Read the dimensions from metadata; calling raw.get_data() here would
        # copy the whole recording just to look at its shape.
        raw_shape = (raw.info['nchan'], raw.n_times)
        print(f"  Shape: {raw_shape} (channels × samples)")

        # CLEAN MAT SUMMARY
        print("CLEANED DATA:")
        clean_is_dict = isinstance(clean, dict)
        if clean_is_dict:
            # Skip the "__"-prefixed bookkeeping entries of the dict loader.
            keys = [k for k in clean.keys() if not k.startswith("__")]
        else:  # h5py.File
            keys = list(clean.keys())

        print(f"  Keys: {keys}")

        # Show shapes of first few fields
        for k in keys[:5]:
            v = clean[k]
            if clean_is_dict:
                shape = safe_arr_shape(v)
            else:
                shape = v.shape if hasattr(v, "shape") else None

            print(f"    {k}: shape={shape}")

In [33]:
# Summarise the merged dataset using the default limit of 5 participants.
preview_unified(unified)


=== UNIFIED DATASET PREVIEW ===
Participants: ['Participant_1', 'Participant_10', 'Participant_15', 'Participant_20', 'Participant_25', 'Participant_5']

--- Participant_1 ---
RAW EEG:
  Channels: 63
  Sampling rate: 500.0 Hz
  Shape: (63, 2612147) (channels × samples)
CLEANED DATA:
  Keys: ['Design_1_1_IE', 'Design_1_1_IG', 'Design_1_1_PU', 'Design_1_1_RIE', 'Design_1_1_RIG', 'Design_1_2_IE', 'Design_1_2_IG', 'Design_1_2_PU', 'Design_1_2_RIE', 'Design_1_2_RIG', 'Design_1_3_IE', 'Design_1_3_IG', 'Design_1_3_PU', 'Design_1_3_RIE', 'Design_1_3_RIG', 'Design_1_4_IE', 'Design_1_4_IG', 'Design_1_4_PU', 'Design_1_4_RIE', 'Design_1_4_RIG', 'Design_1_5_IE', 'Design_1_5_IG', 'Design_1_5_PU', 'Design_1_5_RIE', 'Design_1_5_RIG', 'Design_1_6_IE', 'Design_1_6_IG', 'Design_1_6_PU', 'Design_1_6_RIE', 'Design_1_6_RIG', 'Design_1_RST1', 'Design_1_RST2']
    Design_1_1_IE: shape=(63, 18000)
    Design_1_1_IG: shape=(63, 41500)
    Design_1_1_PU: shape=(63, 6000)
    Design_1_1_RIE: shape=(63, 9000)
   

### Raw EEG is the full continuous experiment

The raw EEG covers the full experiment timeline.

For example, for Participant 25:
- Raw EEG shape: (63 channels, 2,806,105 samples)
- Sampling rate: 500 Hz
- Duration ≈ 2,806,105 / 500 = 5,612 seconds ≈ 93.5 minutes

This includes everything:
- instructions
- breaks
- all 6 design problems
- all 5 tasks per design problem
- eye blinks
- noise
- irrelevant data
- experimenter speaking
- rest periods
- etc.

### Cleaned EEG contains only the experiment task segments

- Downsampled (raw: 500 Hz → clean: 250 Hz)
- Processed (ICA, filtering)
- Clipped (only includes specific task periods)
- Extracted using VIDEO timestamps, not raw markers (.vmrk)

So the clean IE chunk does NOT have the same indexing as raw EEG.

Example:
`Design_25_1_IE: (63 × 84,000)`

This means:
- 63 channels
- 84,000 samples
- cleaned + downsampled to 250 Hz
- Duration = 84,000 / 250 = 336 seconds ≈ 5.6 minutes

This is the IE task of design problem 1 for participant 25 (keys follow the pattern `Design_<participant>_<problem>_<task>`).


### Cleaned Label Meanings
| **Label** | **Meaning (according to the experiment description)**                       |
| --------- | --------------------------------------------------------------------------- |
| **PU**    | *Problem Understanding* — participant studies the design task               |
| **IG**    | *Idea Generation* — participant generates design ideas                      |
| **IE**    | *Idea Evaluation* — participant evaluates previously generated ideas        |
| **RIG**   | *Rating Generated Ideas* — participant rates the quality of generated ideas |
| **RIE**   | *Rating Idea Evaluations* — participant rates their evaluations             |
| **RST1**  | Resting state before the experiment (eyes closed, 3 minutes)                |
| **RST2**  | Resting state after the experiment (eyes closed, 3 minutes)                 |


## 3. Raw and Cleaned Alignment

In [41]:
# Select one participant and look up its raw and cleaned recordings.
participant = "Participant_10"
p_num = int(participant.split("_")[1])          # "Participant_10" -> 10
mat_key = "Data_Design_Sub_{}".format(p_num)    # key used for the cleaned MAT data

# MNE Raw object
raw = eeg_raw_data[participant]
# dict or HDF5
clean_mat = eeg_cleaned_data[mat_key]

In [42]:
# ----- 1. Get raw data and basic info -----
raw_data = raw.get_data()                  # shape: (63, N_raw)
sfreq_raw = raw.info["sfreq"]              # should be 500 Hz
n_channels, n_raw_samples = raw_data.shape
duration_raw_sec = n_raw_samples / sfreq_raw

print(f"RAW EEG ({participant}):")
print(f"  channels      : {n_channels}")
print(f"  samples       : {n_raw_samples}")
print(f"  sfreq         : {sfreq_raw} Hz")
print(f"  duration      : {duration_raw_sec:.2f} s (~{duration_raw_sec/60:.2f} min)")

# ----- 2. Collect and concatenate all cleaned segments for this participant -----
# Sorted only to get a stable (lexicographic) order, not a chronological one.
clean_keys = sorted(k for k in clean_mat.keys() if k.startswith(f"Design_{p_num}_"))

if not clean_keys:
    # np.concatenate on an empty list fails with an unhelpful message; say what is missing.
    raise ValueError(f"No cleaned segments named 'Design_{p_num}_*' found in {mat_key}")

clean_segments = []

for k in clean_keys:
    seg = np.array(clean_mat[k])

    # A segment must be 2-D with one axis equal to the raw channel count;
    # otherwise transposing cannot produce a channels-first array.
    if seg.ndim != 2 or n_channels not in seg.shape:
        raise ValueError(
            f"Segment {k} has shape {seg.shape}; expected a 2-D array with a {n_channels}-channel axis"
        )

    # ensure shape is (n_channels, T); time-first segments are transposed
    seg_ch_first = seg if seg.shape[0] == n_channels else seg.T

    clean_segments.append(seg_ch_first)

# Concatenate along time axis
clean_concat = np.concatenate(clean_segments, axis=1)  # shape: (n_channels, total_T_clean)
n_clean_samples = clean_concat.shape[1]
sfreq_clean = 250.0   # from the paper
duration_clean_sec = n_clean_samples / sfreq_clean

print(f"\nCLEANED EEG CONCAT ({mat_key}):")
print(f"  channels      : {clean_concat.shape[0]}")
print(f"  samples       : {n_clean_samples}")
print(f"  assumed sfreq : {sfreq_clean} Hz")
print(f"  duration      : {duration_clean_sec:.2f} s (~{duration_clean_sec/60:.2f} min)")

# ----- 3. Rough 'coverage' ratio (how much of experiment has clean segments) -----
# Convert both to seconds to compare fairly:
coverage_ratio = duration_clean_sec / duration_raw_sec
print(f"\nClean segments cover about {coverage_ratio*100:.1f}% of the entire raw recording (by time).")

# ----- 4. Very coarse 'energy' comparison per channel (sum of squares) -----
raw_energy = np.sum(raw_data**2, axis=1)          # shape: (n_channels,)
clean_energy = np.sum(clean_concat**2, axis=1)    # shape: (n_channels,)

# The totals above grow with the number of samples, and the two signals differ in
# length, so the mean power per sample is reported as well.
raw_power = raw_energy / n_raw_samples
clean_power = clean_energy / n_clean_samples

print("\nExample energy comparison (channel 0):")
print(f"  raw energy(ch0)   : {raw_energy[0]:.3e}")
print(f"  clean energy(ch0) : {clean_energy[0]:.3e}")
print(f"  raw mean power(ch0)   : {raw_power[0]:.3e}")
print(f"  clean mean power(ch0) : {clean_power[0]:.3e}")

RAW EEG (Participant_10):
  channels      : 63
  samples       : 1562176
  sfreq         : 500.0 Hz
  duration      : 3124.35 s (~52.07 min)

CLEANED EEG CONCAT (Data_Design_Sub_10):
  channels      : 63
  samples       : 724000
  assumed sfreq : 250.0 Hz
  duration      : 2896.00 s (~48.27 min)

Clean segments cover about 92.7% of the entire raw recording (by time).

Example energy comparison (channel 0):
  raw energy(ch0)   : 1.402e+02
  clean energy(ch0) : 1.292e-05


# D. Preparing Data