# Phase 3: Synthetic EEG Data Generation

## Objective

Build a robust foundation for synthetic EEG generation that accurately reflects the structure of the original Alcoholic vs Control EEG dataset, while respecting:

1. subject independence to prevent information leakage
2. condition specificity across S1, S2-match, S2-nomatch
3. class structure (control vs alcoholic)
4. realistic band-power relationships across Delta, Theta, Alpha, Beta, Gamma, and Total Power

## 3.0. Load package & Setup

In [459]:
import os
import random
from pathlib import Path
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.cluster import KMeans
from sklearn.covariance import LedoitWolf
from sklearn.neighbors import NearestNeighbors
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
plt.rcParams["figure.figsize"] = (6, 4)

DATA_DIR = Path("../output/band_extraction")
OUT_DIR = Path("../output/synthetic_generation")
OUT_DIR.mkdir(parents=True, exist_ok=True)

## 3.1. Processed data features check

### 3.1.1. Dataset overview

In [460]:
# Data structure
LABEL_COL = "subject_type"         
CONDITION_COL = "matching_condition"   
SPLIT_COL = "dataset_split"
META_COLS = ["dataset_split", "file_name", "subject_type", "subject_id", "channel", "trial", "matching_condition", "Delta", "Theta", "Alpha", "Beta", "Gamma", "total_power"]
BAND_COLS = ["Delta", "Theta", "Alpha", "Beta", "Gamma", "total_power"]

In [461]:
# load current feature dataset
FEATURE_FP = DATA_DIR / "band_features_segments.csv"
df_all = pd.read_csv(FEATURE_FP)
print("Full data shape:", df_all.shape)
df_all.head()

Full data shape: (60672, 13)


Unnamed: 0,dataset_split,file_name,subject_type,subject_id,channel,trial,matching_condition,Delta,Theta,Alpha,Beta,Gamma,total_power
0,train,Data1.csv,a,co2a0000364,FP1,0,S1 obj,20.048105,5.830134,0.854299,6.705598,6.848762,40.286898
1,train,Data1.csv,a,co2a0000364,FP2,0,S1 obj,21.769006,6.052321,1.013807,16.487621,15.773774,61.09653
2,train,Data1.csv,a,co2a0000364,F7,0,S1 obj,7.742259,6.272004,1.893497,39.119253,49.533282,104.560295
3,train,Data1.csv,a,co2a0000364,F8,0,S1 obj,11.400244,4.816262,2.360998,53.64694,44.50218,116.726624
4,train,Data1.csv,a,co2a0000364,AF1,0,S1 obj,13.188257,2.347635,0.54275,4.036543,2.914738,23.029923


In [462]:
# Basic sanity checks
print("Split counts:")
print(df_all[SPLIT_COL].value_counts())

Split counts:
dataset_split
test     30720
train    29952
Name: count, dtype: int64


In [463]:
print("Subject_type counts:")
print(df_all[LABEL_COL].value_counts())

Subject_type counts:
subject_type
a    30400
c    30272
Name: count, dtype: int64


In [464]:
print("Matching_condition counts:")
print(df_all[CONDITION_COL].value_counts())

Matching_condition counts:
matching_condition
S1 obj         20480
S2 match       20416
S2 nomatch,    19776
Name: count, dtype: int64


In [465]:
# Train / Test based on dataset_split
df_train = df_all[df_all[SPLIT_COL] == "train"].reset_index(drop=True)
df_test = df_all[df_all[SPLIT_COL] == "test"].reset_index(drop=True)
print("Train shape:", df_train.shape)
print("Test shape :", df_test.shape)

Train shape: (29952, 13)
Test shape : (30720, 13)


In [466]:
print("Train label x condition:")
print(pd.crosstab(df_train[LABEL_COL], df_train[CONDITION_COL]))

Train label x condition:
matching_condition  S1 obj  S2 match  S2 nomatch,
subject_type                                     
a                     5120      5120         4800
c                     5120      5056         4736


In [467]:
print("Test label x condition:")
print(pd.crosstab(df_test[LABEL_COL], df_test[CONDITION_COL]))

Test label x condition:
matching_condition  S1 obj  S2 match  S2 nomatch,
subject_type                                     
a                     5120      5120         5120
c                     5120      5120         5120


### 3.1.2. Basic check for missing values and infinite features

In [468]:
# Check missing and extra columns
missing = [c for c in META_COLS if c not in df_all.columns]
extra = [c for c in df_all.columns if c not in META_COLS]

if missing:
    print("MISSING columns:", missing)
if extra:
    print("EXTRA columns:", extra)
if not missing:
    df_all = df_all[META_COLS].copy()

In [469]:
# Force to numeric and report any conversion issues
for col in BAND_COLS:
    df_all[col] = pd.to_numeric(df_all[col], errors="coerce")

print("Data types:")
print(df_all.dtypes)

Data types:
dataset_split          object
file_name              object
subject_type           object
subject_id             object
channel                object
trial                   int64
matching_condition     object
Delta                 float64
Theta                 float64
Alpha                 float64
Beta                  float64
Gamma                 float64
total_power           float64
dtype: object


In [470]:
# Check missing values
print("Missing values per column:")
print(df_all.isna().sum())
mask_nan_features = df_all[BAND_COLS].isna().any(axis=1)
n_nan_rows = mask_nan_features.sum()
print(f"Rows with NaN in any feature: {n_nan_rows}")

Missing values per column:
dataset_split         0
file_name             0
subject_type          0
subject_id            0
channel               0
trial                 0
matching_condition    0
Delta                 0
Theta                 0
Alpha                 0
Beta                  0
Gamma                 0
total_power           0
dtype: int64
Rows with NaN in any feature: 0


In [471]:
# Check for non-finite values
mask_nonfinite = ~np.isfinite(df_all[BAND_COLS].to_numpy()).all(axis=1)
n_nonfinite = mask_nonfinite.sum()
print(f"Rows with non-finite feature values: {n_nonfinite}")

Rows with non-finite feature values: 0


### 3.1.3. Clip outlier check and process

In [472]:
# clip extreme outliers to stabilize covariance
q_low = df_all[BAND_COLS].quantile(0.01)
q_high = df_all[BAND_COLS].quantile(0.99)

print("1st percentile for features:\n", q_low)
print("99th percentile for features:\n", q_high)

1st percentile for features:
 Delta          0.254321
Theta          0.126250
Alpha          0.108306
Beta           0.191547
Gamma          0.057350
total_power    1.368929
Name: 0.01, dtype: float64
99th percentile for features:
 Delta          125.841570
Theta           37.735722
Alpha           42.706482
Beta            23.624554
Gamma           18.142717
total_power    187.203228
Name: 0.99, dtype: float64


In [473]:
# Clip outliers FIRST on full dataset
df_all[BAND_COLS] = df_all[BAND_COLS].clip(lower=q_low, upper=q_high, axis=1)
print("After clipping, feature summary (all):")
df_all[BAND_COLS].describe(percentiles=[0.01, 0.5, 0.99])

After clipping, feature summary (all):


Unnamed: 0,Delta,Theta,Alpha,Beta,Gamma,total_power
count,60672.0,60672.0,60672.0,60672.0,60672.0,60672.0
mean,14.194337,5.21076,5.2124,4.203085,1.73345,31.266163
std,19.77333,6.28368,7.203401,4.136839,2.747217,31.317794
min,0.254321,0.12625,0.108306,0.191547,0.05735,1.368929
1%,0.254393,0.126368,0.1084,0.191558,0.057352,1.368965
50%,7.512234,3.138961,2.681088,2.934805,0.805226,22.106393
99%,125.82675,37.729023,42.705766,23.622675,18.129812,187.112226
max,125.84157,37.735722,42.706482,23.624554,18.142717,187.203228


### 3.1.4. Subject-wise train/test split

In [474]:
# make sure subjects don't appear in both splits
train_subj = set(df_train["subject_id"])
test_subj = set(df_test["subject_id"])
overlap = train_subj & test_subj

print(f"# unique subjects in train: {len(train_subj)}")
print(f"# unique subjects in test: {len(test_subj)}")
print(f"# overlapping subjects: {len(overlap)}")

if overlap:
    print("WARNING: Some subject_ids appear in BOTH train and test!")

# unique subjects in train: 16
# unique subjects in test: 16
# overlapping subjects: 16


In [475]:
# subject-wise train/test split
subjects = df_all["subject_id"].unique()
rng = np.random.default_rng(42)
rng.shuffle(subjects)

In [476]:
# 50/50 split
n_train = len(subjects) // 2
train_subjects = set(subjects[:n_train])
test_subjects = set(subjects[n_train:])

df_train = df_all[df_all["subject_id"].isin(train_subjects)].reset_index(drop=True)
df_test = df_all[df_all["subject_id"].isin(test_subjects)].reset_index(drop=True)

print("Train subjects:", len(train_subjects))
print("Test subjects :", len(test_subjects))
print("Overlap       :", len(train_subjects & test_subjects))

Train subjects: 8
Test subjects : 8
Overlap       : 0


### 3.1.5. Check of cleaned data for synthetic generation

In [477]:
# Overwrite dataset_split using the NEW split
df_all[SPLIT_COL] = np.where(df_all["subject_id"].isin(train_subjects), "train", "test")
df_train = df_all[df_all["subject_id"].isin(train_subjects)].reset_index(drop=True)
df_test = df_all[df_all["subject_id"].isin(test_subjects)].reset_index(drop=True)
print("Split counts with new split:")
print(df_all[SPLIT_COL].value_counts())

Split counts with new split:
dataset_split
train    30336
test     30336
Name: count, dtype: int64


In [478]:
print("Train shape with new split:", df_train.shape)
print("Test shape with new split:", df_test.shape)

Train shape with new split: (30336, 13)
Test shape with new split: (30336, 13)


In [479]:
print("Train label x condition with new split:")
print(pd.crosstab(df_train[LABEL_COL], df_train["matching_condition"]))

print("Test label x condition with new split:")
print(pd.crosstab(df_test[LABEL_COL], df_test["matching_condition"]))

Train label x condition with new split:
matching_condition  S1 obj  S2 match  S2 nomatch,
subject_type                                     
a                     5120      5120         4992
c                     5120      5056         4928
Test label x condition with new split:
matching_condition  S1 obj  S2 match  S2 nomatch,
subject_type                                     
a                     5120      5120         4928
c                     5120      5120         4928


### 3.1.6. Encode labels & standardize features

In [480]:
label_map = {"c": 0, "a": 1}

y_train = df_train[LABEL_COL].map(label_map).values
y_test = df_test[LABEL_COL].map(label_map).values

X_train_raw = df_train[BAND_COLS].values
X_test_raw = df_test[BAND_COLS].values

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

conds_train = df_train[CONDITION_COL].values
conds_test = df_test[CONDITION_COL].values

print("Final matrices ready for synthetic generation:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train distribution:", Counter(y_train))
print("y_test distribution:", Counter(y_test))
print("Train condition counts:", Counter(conds_train))
print("Test condition counts:", Counter(conds_test))

Final matrices ready for synthetic generation:
X_train shape: (30336, 6)
X_test shape: (30336, 6)
y_train distribution: Counter({1: 15232, 0: 15104})
y_test distribution: Counter({1: 15168, 0: 15168})
Train condition counts: Counter({'S1 obj': 10240, 'S2 match': 10176, 'S2 nomatch,': 9920})
Test condition counts: Counter({'S1 obj': 10240, 'S2 match': 10240, 'S2 nomatch,': 9856})


### 3.1.7: Condition-conditional setup
- All the synthetic generation is done within each task condition separately (S1 obj / S2 match / S2 nomatch), instead of pooling them together.
1. All synthetic generation is restricted to one condition at a time:
- S1: use only S1 trials
- S2 match: use only S2 match trials
- S2 nomatch: use only S2 nomatch trials

2. The synthetic data you produce for S1 is not influenced by S2 distributions, and vice versa.

In [481]:
# Unique conditions in train
label_map = {"c": 0, "a": 1}

unique_conditions = sorted(df_train[CONDITION_COL].unique())
print("Conditions in train:", unique_conditions)

Conditions in train: ['S1 obj', 'S2 match', 'S2 nomatch,']


- P(X | condition = S1) is modelled separately from P(X | condition = S2 match) and P(X | condition = S2 nomatch).
- every synthetic sample is generated from a model that was trained only on data from the same task condition (S1 / S2 match / S2 nomatch), so each condition gets its own generative model and distribution.

In [482]:
def get_condition_slice(df, condition, band_cols, label_col):
    mask = df[CONDITION_COL] == condition
    df_cond = df.loc[mask]

    features_cond = df_cond[band_cols].to_numpy()
    labels_cond = df_cond[label_col].map(label_map).to_numpy()

    return features_cond, labels_cond

In [483]:
# Fit scaler on REAL training band features
real_features_train = df_train[BAND_COLS].to_numpy()
scaler = StandardScaler().fit(real_features_train)

# keep scaled real train for later comparison / saving
X_train = scaler.transform(real_features_train)
y_train = df_train[LABEL_COL].map(label_map).to_numpy()
conds_train = df_train[CONDITION_COL].to_numpy()

## 3.2. Synthetic Generation

### 3.2.1. Method 0: Mixup baseline Generator

In [484]:
def generate_mixup_baseline(real_features, n_synthetic=None, random_seed=RANDOM_SEED):
    """
    Mixup-style baseline for synthetic EEG feature generation.
    
    - Interpolates between two real samples
    - Adds small Gaussian noise proportional to feature std
    - Ensures same dimensionality as the input (6 bands)
    """
    np.random.seed(random_seed)
    
    if n_synthetic is None:
        n_synthetic = len(real_features)

    n_samples, n_features = real_features.shape
    synthetic_features = np.zeros((n_synthetic, n_features))

    # Noise scale
    noise_scale = 0.1 * np.std(real_features, axis=0)

    for i in range(n_synthetic):
        idx1, idx2 = np.random.choice(n_samples, 2, replace=False)

        alpha = np.random.uniform(0.3, 0.7)
        interpolated = alpha * real_features[idx1] + (1 - alpha) * real_features[idx2]

        # Add Gaussian noise
        noise = np.random.normal(0, noise_scale)
        synthetic = interpolated + noise

        # Ensure non-negative powers
        synthetic = np.clip(synthetic, 0, None)

        synthetic_features[i] = synthetic

    return synthetic_features

- Mixup: only mixes examples from the same condition.

In [485]:
def generate_mixup_by_condition(df_train, band_cols, label_col, scaler, random_seed=RANDOM_SEED):
    """
    Generate mixup-based synthetic samples *per condition* (S1 obj / S2 match / S2 nomatch,).

    For each condition in df_train:
      - take real band features only from that condition
      - generate the same number of synthetic samples via mixup
      - standardize using the global scaler
      - keep labels (a/c -> 1/0) and condition tokens

    Returns:
      X_syn_mixup : (N_train, n_bands) standardized synthetic features
      y_syn_mixup : (N_train,) labels (0/1)
      cond_syn    : (N_train,) condition tokens
    """
    unique_conditions = sorted(df_train[CONDITION_COL].unique())
    print("Conditions in train:", unique_conditions)

    X_syn_list, y_syn_list, cond_syn_list = [], [], []

    for i, cond in enumerate(unique_conditions):
        # slice by condition
        real_cond, labels_cond = get_condition_slice(
            df=df_train,
            condition=cond,
            band_cols=band_cols,
            label_col=label_col,
        )

        n_synth_cond = real_cond.shape[0]
        print(f"[Mixup] Condition={cond}, real={n_synth_cond}, synth={n_synth_cond}")

        # generate synthetic features for this condition only
        syn_raw = generate_mixup_baseline(
            real_features=real_cond,
            n_synthetic=n_synth_cond,
            random_seed=random_seed + i,
        )

        # standardize using global scaler fitted on REAL train
        X_syn_cond = scaler.transform(syn_raw)

        # reuse labels and attach condition token
        y_syn_cond = labels_cond.copy()
        cond_tokens = np.full(n_synth_cond, cond, dtype=object)

        X_syn_list.append(X_syn_cond)
        y_syn_list.append(y_syn_cond)
        cond_syn_list.append(cond_tokens)

    # concatenate all conditions
    X_syn_mixup = np.vstack(X_syn_list)
    y_syn_mixup = np.concatenate(y_syn_list)
    cond_syn_mixup = np.concatenate(cond_syn_list)

    # shuffle to remove any block structure
    perm = np.random.permutation(len(y_syn_mixup))
    X_syn_mixup = X_syn_mixup[perm]
    y_syn_mixup = y_syn_mixup[perm]
    cond_syn_mixup = cond_syn_mixup[perm]

    print("Final condition-conditional mixup shape:", X_syn_mixup.shape)
    print("Label dist:", Counter(y_syn_mixup))
    print("Condition dist:", Counter(cond_syn_mixup))

    return X_syn_mixup, y_syn_mixup, cond_syn_mixup

- Mixup is strictly condition-conditional; label and condition distributions match the real train set.

In [486]:
X_syn_mixup, y_syn_mixup, cond_syn_mixup = generate_mixup_by_condition(
    df_train=df_train,
    band_cols=BAND_COLS,
    label_col=LABEL_COL,
    scaler=scaler,
    random_seed=RANDOM_SEED,
)

Conditions in train: ['S1 obj', 'S2 match', 'S2 nomatch,']
[Mixup] Condition=S1 obj, real=10240, synth=10240
[Mixup] Condition=S2 match, real=10176, synth=10176
[Mixup] Condition=S2 nomatch,, real=9920, synth=9920
Final condition-conditional mixup shape: (30336, 6)
Label dist: Counter({1: 15232, 0: 15104})
Condition dist: Counter({'S1 obj': 10240, 'S2 match': 10176, 'S2 nomatch,': 9920})


### 3.2.2 Method 1: Correlation Sampling Generator

In [487]:
def generate_correlation_based_eeg(real_features, band_names, n_synthetic=None, random_seed=RANDOM_SEED):
    """
    Generate synthetic EEG band features using a correlation sampling method.

    real_features: np.ndarray of shape (n_samples, n_bands)
    band_names    : list of band names (for logging, same order as columns)
    n_synthetic   : number of synthetic samples to generate. If None,
                    uses n_synthetic = real_features.shape[0]
    """
    np.random.seed(random_seed)
    
    if n_synthetic is None:
        n_synthetic = real_features.shape[0]

    # Correlation and summary stats
    correlation_matrix = np.corrcoef(real_features.T)
    mean_features = np.mean(real_features, axis=0)
    std_features = np.std(real_features, axis=0)

    print("Correlation Matrix of Frequency Bands:")
    for i, band1 in enumerate(band_names):
        for j, band2 in enumerate(band_names):
            if j >= i:
                print(f"{band1:11s} - {band2:11s}: {correlation_matrix[i, j]:7.3f}")
    
    # Covariance = D * Corr * D
    covariance_matrix = np.outer(std_features, std_features) * correlation_matrix
    
    # Multivariate normal sampling
    synthetic_features = np.random.multivariate_normal(
        mean_features,
        covariance_matrix,
        size=n_synthetic
    )

    # Ensure non-negative band powers
    synthetic_features = np.clip(synthetic_features, a_min=0.0, a_max=None)

    print(f"Generated {n_synthetic} synthetic feature vectors")
    print("Correlation structure preserved (in expectation)")

    return synthetic_features, correlation_matrix, covariance_matrix

- computes the correlation matrix only on that condition’s subset.

In [488]:
def generate_corr_by_condition(df_train, band_cols, label_col, scaler, random_seed=RANDOM_SEED):
    """
    Condition-conditional correlation sampling.

    For each condition (S1 obj / S2 match / S2 nomatch,):
      - slice df_train to that condition
      - compute correlation & stats on that subset
      - generate the same number of synthetic samples for that condition
      - standardize with global scaler (fit on real train)
      - keep labels (a/c -> 1/0) and condition tokens

    Returns:
      X_syn_corr    : (N_train, n_bands) standardized synthetic features
      y_syn_corr    : (N_train,) labels 0/1
      cond_syn_corr : (N_train,) condition tokens
    """
    unique_conditions = sorted(df_train[CONDITION_COL].unique())
    print("Conditions in train:", unique_conditions)

    X_syn_list, y_syn_list, cond_syn_list = [], [], []

    for i, cond in enumerate(unique_conditions):
        # Slice by condition using the shared helper
        real_cond, labels_cond = get_condition_slice(
            df=df_train,
            condition=cond,
            band_cols=band_cols,
            label_col=label_col,
        )

        n_synth_cond = real_cond.shape[0]
        print(f"[Corr] Condition={cond}, real={n_synth_cond}, synth={n_synth_cond}")

        if n_synth_cond < band_cols.__len__():
            # Edge case: very few samples for this condition (should not happen here, but safe)
            print(f"  WARNING: very few samples for {cond}; skipping correlation sampling.")
            continue

        # Per-condition correlation-based generation
        syn_raw, corr_mat, cov_mat = generate_correlation_based_eeg(
            real_features=real_cond,
            band_names=band_cols,
            n_synthetic=n_synth_cond,
            random_seed=random_seed + i,  # small offset per condition
        )

        # Standardize using the global scaler (fit on REAL train)
        X_syn_cond = scaler.transform(syn_raw)

        # Attach labels and condition tokens
        y_syn_cond = labels_cond.copy()
        cond_tokens = np.full(n_synth_cond, cond, dtype=object)

        X_syn_list.append(X_syn_cond)
        y_syn_list.append(y_syn_cond)
        cond_syn_list.append(cond_tokens)

    # Concatenate across conditions
    X_syn_corr = np.vstack(X_syn_list)
    y_syn_corr = np.concatenate(y_syn_list)
    cond_syn_corr = np.concatenate(cond_syn_list)

    # Shuffle to break condition-wise block ordering
    perm = np.random.permutation(len(y_syn_corr))
    X_syn_corr = X_syn_corr[perm]
    y_syn_corr = y_syn_corr[perm]
    cond_syn_corr = cond_syn_corr[perm]

    print("Final condition-conditional CORR shape:", X_syn_corr.shape)
    print("Label dist:", Counter(y_syn_corr))
    print("Condition dist:", Counter(cond_syn_corr))

    return X_syn_corr, y_syn_corr, cond_syn_corr

In [489]:
X_syn_corr, y_syn_corr, cond_syn_corr = generate_corr_by_condition(
    df_train=df_train,
    band_cols=BAND_COLS,
    label_col=LABEL_COL,
    scaler=scaler,
    random_seed=RANDOM_SEED,
)

Conditions in train: ['S1 obj', 'S2 match', 'S2 nomatch,']
[Corr] Condition=S1 obj, real=10240, synth=10240
Correlation Matrix of Frequency Bands:
Delta       - Delta      :   1.000
Delta       - Theta      :   0.590
Delta       - Alpha      :   0.230
Delta       - Beta       :   0.274
Delta       - Gamma      :   0.116
Delta       - total_power:   0.804
Theta       - Theta      :   1.000
Theta       - Alpha      :   0.332
Theta       - Beta       :   0.328
Theta       - Gamma      :   0.127
Theta       - total_power:   0.680
Alpha       - Alpha      :   1.000
Alpha       - Beta       :   0.268
Alpha       - Gamma      :   0.066
Alpha       - total_power:   0.509
Beta        - Beta       :   1.000
Beta        - Gamma      :   0.717
Beta        - total_power:   0.652
Gamma       - Gamma      :   1.000
Gamma       - total_power:   0.480
total_power - total_power:   1.000
Generated 10240 synthetic feature vectors
Correlation structure preserved (in expectation)
[Corr] Condition=S2 match, 

### 3.2.3. Method 2: WGAN-GP Synthetic Generator

In [490]:
def generate_wgangp_eeg(real_features, n_synthetic=100, noise_dim=16, hidden_dim=64, n_critic=5, gp_lambda=10.0, lr=1e-4, batch_size=128, epochs=300, random_seed=RANDOM_SEED):
    """
    Train a compact WGAN-GP on band-power features and return synthetic samples.
    real_features: np.ndarray of shape (n_samples, n_features) – here (N_cond, 6)
    """
    torch.manual_seed(random_seed)
    np.random.seed(random_seed)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # real_features already clipped and cleaned
    data = torch.from_numpy(real_features.astype(np.float32))
    dataset = TensorDataset(data)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)

    feature_dim = real_features.shape[1]

    class Generator(nn.Module):
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(noise_dim, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, feature_dim),
            )

        def forward(self, z):
            x = self.net(z)
            # softplus keeps outputs positive but smooth
            return torch.nn.functional.softplus(x)

    class Critic(nn.Module):
        def __init__(self):
            super().__init__()
            self.net = nn.Sequential(
                nn.Linear(feature_dim, hidden_dim),
                nn.LayerNorm(hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, hidden_dim),
                nn.GELU(),
                nn.Linear(hidden_dim, 1),
            )
        def forward(self, x):
            return self.net(x)

    def gradient_penalty(critic, real, fake):
        batch_size = real.size(0)
        epsilon = torch.rand(batch_size, 1, device=real.device)
        epsilon = epsilon.expand_as(real)
        interpolated = epsilon * real + (1 - epsilon) * fake
        interpolated.requires_grad_(True)
        mixed_scores = critic(interpolated)
        grad = torch.autograd.grad(
            outputs=mixed_scores,
            inputs=interpolated,
            grad_outputs=torch.ones_like(mixed_scores),
            create_graph=True,
            retain_graph=True,
        )[0]
        grad = grad.view(batch_size, -1)
        gp = ((grad.norm(2, dim=1) - 1) ** 2).mean()
        return gp

    G = Generator().to(device)
    D = Critic().to(device)

    opt_G = optim.Adam(G.parameters(), lr=lr, betas=(0.5, 0.9))
    opt_D = optim.Adam(D.parameters(), lr=lr, betas=(0.5, 0.9))

    for epoch in range(epochs):
        for i, (real_batch,) in enumerate(loader):
            real_batch = real_batch.to(device)

            # Critic updates
            for _ in range(n_critic):
                z = torch.randn(real_batch.size(0), noise_dim, device=device)
                fake_batch = G(z).detach()

                opt_D.zero_grad()
                critic_real = D(real_batch).mean()
                critic_fake = D(fake_batch).mean()
                gp = gradient_penalty(D, real_batch, fake_batch)
                loss_D = -(critic_real - critic_fake) + gp_lambda * gp
                loss_D.backward()
                opt_D.step()

            # Generator update
            z = torch.randn(real_batch.size(0), noise_dim, device=device)
            opt_G.zero_grad()
            fake_batch = G(z)
            loss_G = -D(fake_batch).mean()
            loss_G.backward()
            opt_G.step()

        # simple monitoring every 50 epochs
        if (epoch + 1) % 50 == 0:
            with torch.no_grad():
                z = torch.randn(batch_size, noise_dim, device=device)
                preview = G(z).cpu().numpy()
            preview_mean = preview.mean(axis=0)
            print(
                f"Epoch {epoch + 1:03d}/{epochs} | "
                f"D: {loss_D.item():.4f} | G: {loss_G.item():.4f} | "
                f"preview mean={np.round(preview_mean, 3)}"
            )

    # Generate n_synthetic samples
    G.eval()
    with torch.no_grad():
        synth_chunks = []
        remaining = n_synthetic
        while remaining > 0:
            current = min(batch_size, remaining)
            z = torch.randn(current, noise_dim, device=device)
            synth = G(z).cpu().numpy()
            synth_chunks.append(synth)
            remaining -= current

    synthetic = np.vstack(synth_chunks)
    synthetic = np.clip(synthetic, a_min=0.0, a_max=None)
    return synthetic

- trains a separate GAN per condition.

In [None]:
def generate_wgangp_by_condition(df_train, band_cols, label_col, scaler, epochs=300, random_seed=RANDOM_SEED):
    
    unique_conditions = sorted(df_train[CONDITION_COL].unique())
    print("Conditions in train:", unique_conditions)

    X_syn_list, y_syn_list, cond_syn_list = [], [], []

    for i, cond in enumerate(unique_conditions):
        # Slice real data by condition
        real_cond, labels_cond = get_condition_slice(
            df=df_train,
            condition=cond,
            band_cols=band_cols,
            label_col=label_col,
        )

        n_synth_cond = real_cond.shape[0]
        print(f"[WGAN-GP] Condition={cond}, real={n_synth_cond}, synth={n_synth_cond}")

        if n_synth_cond < 2 * band_cols.__len__():
            # If you ever had too few samples for a condition, skip or fallback
            print(f"WARNING: very few samples for {cond}; skipping WGAN-GP.")
            continue

        # Train WGAN-GP on this condition only
        synth_raw = generate_wgangp_eeg(
            real_features=real_cond.astype(np.float32),
            n_synthetic=n_synth_cond,
            epochs=epochs,
            random_seed=random_seed + i
        )

        # Standardize with global scaler (already fit on REAL X_train)
        X_syn_cond = scaler.transform(synth_raw)

        # Attach labels + condition tokens
        y_syn_cond = labels_cond.copy()
        cond_tokens = np.full(n_synth_cond, cond, dtype=object)

        X_syn_list.append(X_syn_cond)
        y_syn_list.append(y_syn_cond)
        cond_syn_list.append(cond_tokens)

    # Concatenate all conditions
    X_syn_wgangp = np.vstack(X_syn_list)
    y_syn_wgangp = np.concatenate(y_syn_list)
    cond_syn_wgangp = np.concatenate(cond_syn_list)

    # Shuffle to break block structure
    perm = np.random.permutation(len(y_syn_wgangp))
    X_syn_wgangp = X_syn_wgangp[perm]
    y_syn_wgangp = y_syn_wgangp[perm]
    cond_syn_wgangp = cond_syn_wgangp[perm]

    print("Final condition-conditional WGAN-GP shape:", X_syn_wgangp.shape)
    print("Label dist:", Counter(y_syn_wgangp))
    print("Condition dist:", Counter(cond_syn_wgangp))

    return X_syn_wgangp, y_syn_wgangp, cond_syn_wgangp

In [492]:
print("Training WGAN-GP generators per condition on band-power features")

real_features = df_train[BAND_COLS].to_numpy()
X_syn_wgangp, y_syn_wgangp, cond_syn_wgangp = generate_wgangp_by_condition(
    df_train=df_train,
    band_cols=BAND_COLS,
    label_col=LABEL_COL,
    scaler=scaler,
    epochs=300,
    random_seed=RANDOM_SEED,
)

Training WGAN-GP generators per condition on band-power features
Conditions in train: ['S1 obj', 'S2 match', 'S2 nomatch,']
[WGAN-GP] Condition=S1 obj, real=10240, synth=10240
Epoch 050/300 | D: -1.8628 | G: 88.3494 | preview mean=[10.048  3.788  5.271  5.396  2.427 26.904]
Epoch 100/300 | D: -0.7408 | G: 66.3920 | preview mean=[14.188  4.425  4.945  5.697  2.56  31.712]
Epoch 150/300 | D: -1.5953 | G: 73.5776 | preview mean=[14.251  4.537  5.312  6.156  2.869 34.027]
Epoch 200/300 | D: -1.8032 | G: 79.3281 | preview mean=[12.784  4.14   6.374  5.542  2.875 32.975]
Epoch 250/300 | D: -1.4544 | G: 85.8254 | preview mean=[14.405  4.745  5.834  5.783  2.324 33.927]
Epoch 300/300 | D: 0.0453 | G: 89.6252 | preview mean=[12.375  3.811  5.596  5.387  2.517 30.318]
[WGAN-GP] Condition=S2 match, real=10176, synth=10176
Epoch 050/300 | D: -18.2941 | G: 123.7942 | preview mean=[16.172  4.67   4.781  5.155  1.678 31.74 ]
Epoch 100/300 | D: -6.5992 | G: 73.8731 | preview mean=[18.242  5.103  4.663

### 3.2.4. Method 3: Gaussian Copula Sampling Generator

In [493]:
def _allocate_samples_by_class(labels, n_total):
    """
    Allocate synthetic samples per class, preserving empirical ratios.
    Returns a dict: {class_label: n_synth_for_that_class}
    """
    classes, counts = np.unique(labels, return_counts=True)
    ratios = counts / counts.sum()
    expected = ratios * n_total

    allocated = np.floor(expected).astype(int)
    remainder = n_total - allocated.sum()

    if remainder > 0:
        remainders = expected - allocated
        order = np.argsort(remainders)[::-1]
        for idx in order[:remainder]:
            allocated[idx] += 1

    return dict(zip(classes, allocated))

In [494]:
def generate_gaussian_copula_eeg(real_features, labels, n_synthetic=100, random_seed=RANDOM_SEED):
    """
    Gaussian copula sampling (class-conditional):

    1. For each class (0/1), fit a quantile transformer to map marginals -> N(0,1)
    2. Estimate regularised covariance (Ledoit-Wolf) in that latent space
    3. Sample multivariate normal per class and invert the transform
    4. Clip to non-negative band powers
    5. Return synthetic features + synthetic labels
    """
    rng = np.random.default_rng(random_seed)
    allocation = _allocate_samples_by_class(labels, n_synthetic)
    synthetic_blocks = []
    label_blocks = []

    print("Generating Gaussian copula samples per class...")
    for cls, n_cls_samples in allocation.items():
        class_features = real_features[labels == cls]
        if len(class_features) == 0 or n_cls_samples == 0:
            continue

        # quantile transformer to approximate Gaussian marginals
        n_quantiles = min(len(class_features), 1000)
        transformer = QuantileTransformer(
            n_quantiles=n_quantiles,
            output_distribution="normal",
            random_state=random_seed,
        )

        latent = transformer.fit_transform(class_features)

        # Ledoit-Wolf for stable covariance
        cov_estimator = LedoitWolf().fit(latent)
        latent_mean = cov_estimator.location_
        latent_cov = cov_estimator.covariance_

        # sample in latent Gaussian space
        latent_samples = rng.multivariate_normal(
            latent_mean,
            latent_cov,
            size=n_cls_samples,
        )

        # invert back to band-power space
        samples = transformer.inverse_transform(latent_samples)

        # enforce non-negativity for power features
        samples = np.clip(samples, a_min=0, a_max=None)
        synthetic_blocks.append(samples)

        # create corresponding label block
        label_blocks.append(np.full(n_cls_samples, cls, dtype=labels.dtype))

        print(f"  Class {cls}: real={len(class_features)}, synthetic={n_cls_samples}")

    if not synthetic_blocks:
        raise ValueError("No synthetic samples were generated. Check class labels.")

    synthetic_features = np.vstack(synthetic_blocks)
    synthetic_labels = np.concatenate(label_blocks)

    return synthetic_features, synthetic_labels

- fits marginals + covariance on a given condition’s data.

In [495]:
def generate_gaussian_copula_by_condition(df_train, band_cols, label_col, scaler, random_seed=RANDOM_SEED):
    unique_conditions = sorted(df_train[CONDITION_COL].unique())
    print("Conditions in train:", unique_conditions)

    X_syn_list, y_syn_list, cond_syn_list = [], [], []

    for i, cond in enumerate(unique_conditions):
        # slice real data by condition
        real_cond, labels_cond = get_condition_slice(
            df=df_train,
            condition=cond,
            band_cols=band_cols,
            label_col=label_col,
        )
        n_cond = real_cond.shape[0]
        print(f"[Copula] Condition={cond}, real={n_cond}, synth={n_cond}")

        if n_cond == 0:
            print(f"WARNING: no samples for condition={cond}, skipping.")
            continue

        # run Gaussian Copula on this condition slice (class-conditional inside)
        synth_raw_cond, labels_syn_cond = generate_gaussian_copula_eeg(
            real_features=real_cond,
            labels=labels_cond,
            n_synthetic=n_cond,
            random_seed=random_seed + i,   # offset per condition
        )

        # Standardize using global scaler (fit on REAL df_train band features)
        X_syn_cond = scaler.transform(synth_raw_cond)

        # condition tokens
        cond_tokens = np.full(n_cond, cond, dtype=object)

        X_syn_list.append(X_syn_cond)
        y_syn_list.append(labels_syn_cond)
        cond_syn_list.append(cond_tokens)

    # concatenate over conditions
    X_syn_copula = np.vstack(X_syn_list)
    y_syn_copula = np.concatenate(y_syn_list)
    cond_syn_copula = np.concatenate(cond_syn_list)

    # shuffle
    perm = np.random.permutation(len(y_syn_copula))
    X_syn_copula = X_syn_copula[perm]
    y_syn_copula = y_syn_copula[perm]
    cond_syn_copula = cond_syn_copula[perm]

    print("Final condition-conditional Copula shape:", X_syn_copula.shape)
    print("Label dist:", Counter(y_syn_copula))
    print("Condition dist:", Counter(cond_syn_copula))

    return X_syn_copula, y_syn_copula, cond_syn_copula

In [496]:
print("Training Gaussian Copula generators per condition")

X_syn_copula, y_syn_copula, cond_syn_copula = generate_gaussian_copula_by_condition(
    df_train=df_train,
    band_cols=BAND_COLS,
    label_col=LABEL_COL,
    scaler=scaler,
    random_seed=RANDOM_SEED,
)

Training Gaussian Copula generators per condition
Conditions in train: ['S1 obj', 'S2 match', 'S2 nomatch,']
[Copula] Condition=S1 obj, real=10240, synth=10240
Generating Gaussian copula samples per class...
  Class 0: real=5120, synthetic=5120
  Class 1: real=5120, synthetic=5120
[Copula] Condition=S2 match, real=10176, synth=10176
Generating Gaussian copula samples per class...
  Class 0: real=5056, synthetic=5056
  Class 1: real=5120, synthetic=5120
[Copula] Condition=S2 nomatch,, real=9920, synth=9920
Generating Gaussian copula samples per class...
  Class 0: real=4928, synthetic=4928
  Class 1: real=4992, synthetic=4992
Final condition-conditional Copula shape: (30336, 6)
Label dist: Counter({1: 15232, 0: 15104})
Condition dist: Counter({'S1 obj': 10240, 'S2 match': 10176, 'S2 nomatch,': 9920})


### 3.2.5. Method 4: Classwise Interpolation Generator

In [497]:
def generate_classwise_interpolation_eeg(real_features, labels, n_synthetic=100, random_seed=RANDOM_SEED, k_neighbors=8, noise_scale=0.02):
    """
    Class-conditional interpolation inspired by SMOTE.
    Operates in log-power space to better capture multiplicative structure.

    real_features: np.ndarray, shape (N, n_bands)
    labels       : np.ndarray, shape (N,) with class labels (0/1)

    Returns:
        synthetic_features: (n_synthetic, n_bands)
        synthetic_labels  : (n_synthetic,)
    """
    rng = np.random.default_rng(random_seed)
    allocation = _allocate_samples_by_class(labels, n_synthetic)

    synthetic_blocks = []
    label_blocks = []

    # work in log-power space
    log_features = np.log1p(real_features)

    for cls, n_cls_samples in allocation.items():
        class_mask = labels == cls
        class_features_log = log_features[class_mask]
        if len(class_features_log) == 0 or n_cls_samples == 0:
            continue

        # effective neighbors (avoid k > n-1)
        n_neighbors_eff = min(k_neighbors, len(class_features_log) - 1)

        if n_neighbors_eff <= 0:
            # fallback: jitter existing samples
            base_samples = np.repeat(
                class_features_log,
                repeats=max(1, n_cls_samples // max(1, len(class_features_log))),
                axis=0,
            )
            base_samples = base_samples[:n_cls_samples]
            jitter = rng.normal(0, noise_scale, size=base_samples.shape)
            augmented = base_samples + jitter
            syn_block = np.expm1(augmented)
        else:
            from sklearn.neighbors import NearestNeighbors  # ensure imported

            nbrs = NearestNeighbors(n_neighbors=n_neighbors_eff + 1)
            nbrs.fit(class_features_log)

            class_std = np.std(class_features_log, axis=0, ddof=1)
            class_std[class_std == 0] = 1e-6

            syn_list = []
            for _ in range(n_cls_samples):
                # pick anchor
                idx = rng.integers(len(class_features_log))
                # get neighbors (excluding itself)
                neighbors = nbrs.kneighbors(
                    class_features_log[idx].reshape(1, -1),
                    return_distance=False,
                )[0]
                neighbors = neighbors[neighbors != idx]

                if len(neighbors) == 0:
                    neighbor_idx = idx
                else:
                    neighbor_idx = rng.choice(neighbors)

                # interpolate in log-space
                alpha = rng.uniform(0.2, 0.8)
                interpolated = (
                    alpha * class_features_log[idx]
                    + (1 - alpha) * class_features_log[neighbor_idx]
                )

                # add small Gaussian noise in log-space
                noise = rng.normal(
                    0,
                    noise_scale,
                    size=class_features_log.shape[1],
                ) * class_std
                synthetic_log = interpolated + noise

                syn_list.append(np.expm1(synthetic_log))

            syn_block = np.vstack(syn_list)

        syn_block = np.clip(syn_block, a_min=0, a_max=None)
        synthetic_blocks.append(syn_block)
        label_blocks.append(np.full(n_cls_samples, cls, dtype=labels.dtype))

    if not synthetic_blocks:
        raise ValueError("Interpolation generator did not create any samples.")

    synthetic_features = np.vstack(synthetic_blocks)
    synthetic_labels = np.concatenate(label_blocks)
    return synthetic_features, synthetic_labels

- interpolates points within the same condition and class.

In [498]:
def generate_classwise_interpolation_by_condition(df_train, band_cols, label_col, scaler, random_seed=RANDOM_SEED, k_neighbors=10, noise_scale=0.015):
    """
    Condition-conditional classwise interpolation:

      - For each condition (S1 obj / S2 match / S2 nomatch,)
        * slice df_train to that condition
        * run classwise interpolation (class-conditional) on that slice
        * generate the SAME number of synthetic samples as real
      - Standardize using global scaler fit on REAL df_train

    Returns:
      X_syn_interp    : (N_train, n_bands)
      y_syn_interp    : (N_train,)
      cond_syn_interp : (N_train,)
    """
    unique_conditions = sorted(df_train[CONDITION_COL].unique())
    print("Conditions in train:", unique_conditions)

    X_syn_list, y_syn_list, cond_syn_list = [], [], []

    for i, cond in enumerate(unique_conditions):
        # 1) slice by condition
        real_cond, labels_cond = get_condition_slice(
            df=df_train,
            condition=cond,
            band_cols=band_cols,
            label_col=label_col,
        )

        n_cond = real_cond.shape[0]
        print(f"[Interp] Condition={cond}, real={n_cond}, synth={n_cond}")

        if n_cond == 0:
            print(f"  WARNING: no samples for condition={cond}, skipping.")
            continue

        # 2) run classwise interpolation on this condition slice
        synth_raw_cond, labels_syn_cond = generate_classwise_interpolation_eeg(
            real_features=real_cond,
            labels=labels_cond,
            n_synthetic=n_cond,
            random_seed=random_seed + i,
            k_neighbors=k_neighbors,
            noise_scale=noise_scale,
        )

        # 3) Standardize using global scaler fit on REAL train
        X_syn_cond = scaler.transform(synth_raw_cond)

        cond_tokens = np.full(n_cond, cond, dtype=object)

        X_syn_list.append(X_syn_cond)
        y_syn_list.append(labels_syn_cond)
        cond_syn_list.append(cond_tokens)

    X_syn_interp = np.vstack(X_syn_list)
    y_syn_interp = np.concatenate(y_syn_list)
    cond_syn_interp = np.concatenate(cond_syn_list)

    # Shuffle
    perm = np.random.permutation(len(y_syn_interp))
    X_syn_interp = X_syn_interp[perm]
    y_syn_interp = y_syn_interp[perm]
    cond_syn_interp = cond_syn_interp[perm]

    print("Final condition-conditional Interp shape:", X_syn_interp.shape)
    print("Label dist:", Counter(y_syn_interp))
    print("Condition dist:", Counter(cond_syn_interp))

    return X_syn_interp, y_syn_interp, cond_syn_interp

In [499]:
print("Training classwise interpolation generators per condition")

X_syn_interp, y_syn_interp, cond_syn_interp = generate_classwise_interpolation_by_condition(
    df_train=df_train,
    band_cols=BAND_COLS,
    label_col=LABEL_COL,
    scaler=scaler,
    random_seed=RANDOM_SEED,
    k_neighbors=10,
    noise_scale=0.015,
)

Training classwise interpolation generators per condition
Conditions in train: ['S1 obj', 'S2 match', 'S2 nomatch,']
[Interp] Condition=S1 obj, real=10240, synth=10240
[Interp] Condition=S2 match, real=10176, synth=10176
[Interp] Condition=S2 nomatch,, real=9920, synth=9920
Final condition-conditional Interp shape: (30336, 6)
Label dist: Counter({1: 15232, 0: 15104})
Condition dist: Counter({'S1 obj': 10240, 'S2 match': 10176, 'S2 nomatch,': 9920})


## 3.3. Save model output

In [503]:
def save_model_dataset(model_name, X_real, y_real, cond_real, X_syn,  y_syn,  cond_syn, out_dir=OUT_DIR):

    model_dir = out_dir / model_name
    model_dir.mkdir(parents=True, exist_ok=True)

    real_df = pd.DataFrame(X_real, columns=BAND_COLS)
    real_df["label"] = y_real
    real_df["condition"] = cond_real
    real_df["source"] = "real"

    syn_df = pd.DataFrame(X_syn, columns=BAND_COLS)
    syn_df["label"] = y_syn
    syn_df["condition"] = cond_syn
    syn_df["source"] = "synthetic"

    pool_df = pd.concat([real_df, syn_df], axis=0).reset_index(drop=True)

    real_path = model_dir / f"{model_name}_real.csv"
    syn_path  = model_dir / f"{model_name}_syn.csv"
    pool_path = model_dir / f"{model_name}_pool.csv"

    real_df.to_csv(real_path, index=False)
    syn_df.to_csv(syn_path, index=False)
    pool_df.to_csv(pool_path, index=False)

    print(f"[Saved] {model_name}")
    print(" ", real_path)
    print(" ", syn_path)
    print(" ", pool_path)

    return real_df, syn_df, pool_df

In [504]:
# 1. MIXUP
save_model_dataset(
    "mixup",
    X_real=X_train,
    y_real=y_train,
    cond_real=conds_train,
    X_syn=X_syn_mixup,
    y_syn=y_syn_mixup,
    cond_syn=cond_syn_mixup
)

# 2. CORRELATION SAMPLING
save_model_dataset(
    "corr",
    X_real=X_train,
    y_real=y_train,
    cond_real=conds_train,
    X_syn=X_syn_corr,
    y_syn=y_syn_corr,
    cond_syn=cond_syn_corr
)

# 3. WGAN-GP
save_model_dataset(
    "wgangp",
    X_real=X_train,
    y_real=y_train,
    cond_real=conds_train,
    X_syn=X_syn_wgangp,
    y_syn=y_syn_wgangp,
    cond_syn=cond_syn_wgangp
)

# 4. GAUSSIAN COPULA
save_model_dataset(
    "copula",
    X_real=X_train,
    y_real=y_train,
    cond_real=conds_train,
    X_syn=X_syn_copula,
    y_syn=y_syn_copula,
    cond_syn=cond_syn_copula
)

# 5. CLASSWISE INTERPOLATION
save_model_dataset(
    "interp",
    X_real=X_train,
    y_real=y_train,
    cond_real=conds_train,
    X_syn=X_syn_interp,
    y_syn=y_syn_interp,
    cond_syn=cond_syn_interp
)

[Saved] mixup
  ../output/synthetic_generation/mixup/mixup_real.csv
  ../output/synthetic_generation/mixup/mixup_syn.csv
  ../output/synthetic_generation/mixup/mixup_pool.csv
[Saved] corr
  ../output/synthetic_generation/corr/corr_real.csv
  ../output/synthetic_generation/corr/corr_syn.csv
  ../output/synthetic_generation/corr/corr_pool.csv
[Saved] wgangp
  ../output/synthetic_generation/wgangp/wgangp_real.csv
  ../output/synthetic_generation/wgangp/wgangp_syn.csv
  ../output/synthetic_generation/wgangp/wgangp_pool.csv
[Saved] copula
  ../output/synthetic_generation/copula/copula_real.csv
  ../output/synthetic_generation/copula/copula_syn.csv
  ../output/synthetic_generation/copula/copula_pool.csv
[Saved] interp
  ../output/synthetic_generation/interp/interp_real.csv
  ../output/synthetic_generation/interp/interp_syn.csv
  ../output/synthetic_generation/interp/interp_pool.csv


(          Delta     Theta     Alpha      Beta     Gamma  total_power  label  \
 0      0.292732  0.249902 -0.610662  0.411409  1.345733     0.284397      1   
 1      0.375690  0.290176 -0.585701  2.490218  3.991401     0.927912      1   
 2     -0.300488  0.329996 -0.448042  4.006910  4.693634     2.271983      1   
 3     -0.124150  0.066128 -0.374884  4.006910  4.693634     2.648213      1   
 4     -0.037956 -0.381335 -0.659416 -0.155800  0.179559    -0.249257      1   
 ...         ...       ...       ...       ...       ...          ...    ...   
 30331 -0.661455 -0.783984 -0.727400 -0.972912 -0.667465    -0.919100      1   
 30332 -0.661455 -0.783984 -0.727400 -0.972912 -0.667465    -0.919100      1   
 30333 -0.661455 -0.783984 -0.727400 -0.972912 -0.667465    -0.919100      1   
 30334 -0.661455 -0.783984 -0.727400 -0.972912 -0.667465    -0.919100      1   
 30335 -0.661455 -0.783984 -0.727400 -0.972912 -0.667465    -0.919100      1   
 
       condition source  
 0        S1