In [None]:
#!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
#!python rapidsai-csp-utils/colab/pip-install.py


In [None]:
#!pip install torch torchaudio --index-url https://download.pytorch.org/whl/cpu
#!pip install pandas pyarrow numpy tqdm soundfile cloudpickle scikit-learn joblib

In [None]:
!pip install --upgrade pip
!pip install --index-url https://download.pytorch.org/whl/cpu torch==2.5.1 torchaudio==2.5.1
!pip install pandas pyarrow numpy tqdm soundfile cloudpickle scikit-learn joblib


## Read the .wav files and load them into a DataFrame, using each file path to extract the speaker ID and emotion label

In [1]:
# ===== UNIFIED MANIFEST (drop-in) =====
from pathlib import Path
import re, pandas as pd
import numpy as np

# --- POINT THESE TO YOUR FOLDERS ---
# Maps each dataset name to the directory that the matching parse_* function scans.
ROOTS = {
    # CREMA-D: files sit directly in this folder, e.g. 1001_IEO_HAP_LO.wav
    "CREMA-D":   Path("data/CREMA-D"),
    # ESD: speaker folders sit directly in this folder, e.g. data/ESD/0011/Angry/0011_000351.wav
    "ESD":       Path("data/ESD"),
    # JL-CORPUS: files sit directly in this folder, e.g. female1_angry_1a_1.wav
    "JL-CORPUS": Path("data/JL-CORPUS"),
    # RAVDESS: parent folder that contains Actor_01..Actor_24
    "RAVDESS":   Path("data/RAVDESS"),
    # TESS: files sit directly in this folder, e.g. OAF_back_neutral.wav
    "TESS":      Path("data/TESS"),
}

CANON = {"angry","happy","neutral","sad"}

def _norm_emo(x:str|None):
    if not x: return None
    t = x.strip().lower()
    aliases = {
        "ang":"angry","anger":"angry",
        "hap":"happy","happiness":"happy",
        "neu":"neutral","calm":"neutral",   # safety fold
        "sadness":"sad",
    }
    t = aliases.get(t, t)
    return t if t in CANON else None

# ---------- CREMA-D (1001_IEO_HAP_LO.wav) ----------
# File name layout: <4-digit speaker>_<3-letter sentence>_<3-letter emotion>_<2-letter level>.wav
_pat_cremad = re.compile(r"^(?P<spk>\d{4})_[A-Z]{3}_(?P<emo>[A-Z]{3})_[A-Z]{2}\.(?:wav|WAV)$")
# CREMA-D emotion codes that belong to the four kept classes.
CREMA_MAP = {"ANG":"angry","HAP":"happy","NEU":"neutral","SAD":"sad"}

def parse_cremad(root:Path, diag):
    """Collect manifest rows for CREMA-D files found recursively under ``root``.

    Args:
        root: folder to scan for ``*.wav`` files.
        diag: list that receives ``(dataset, reason, path)`` tuples for a
            missing root, for file names that do not match the CREMA-D
            pattern, and for emotions outside the four kept classes.

    Returns:
        A list of dicts with keys ``dataset``, ``filepath``, ``speaker_id``
        and ``emotion``.
    """
    if not root.exists():
        diag.append(("CREMA-D","root_missing",str(root)))
        return []

    found = []
    for wav in root.rglob("*.wav"):
        hit = _pat_cremad.match(wav.name)
        if hit is None:
            diag.append(("CREMA-D","name_pattern_miss",str(wav)))
            continue

        label = _norm_emo(CREMA_MAP.get(hit.group("emo")))
        if label is None:
            diag.append(("CREMA-D","emo_not_in_4",str(wav)))
            continue

        found.append({
            "dataset": "CREMA-D",
            "filepath": str(wav),
            "speaker_id": hit.group("spk"),
            "emotion": label,
        })
    return found

# ---------- ESD (ESD/0011/Angry/0011_000351.wav) ----------
def parse_esd(root:Path, diag):
    """Collect manifest rows for ESD, laid out as ``<root>/<speaker>/<Emotion>/*.wav``.

    Only sub-folders of ``root`` whose name is 3 to 5 digits are treated as
    speakers. Emotion folders that do not normalise to one of the four kept
    classes are skipped without a diagnostic entry.

    Args:
        root: folder that directly contains the speaker folders.
        diag: list that receives ``(dataset, reason, path)`` tuples when the
            root is missing or when nothing at all was matched.

    Returns:
        A list of dicts with keys ``dataset``, ``filepath``, ``speaker_id``
        and ``emotion``.
    """
    if not root.exists():
        diag.append(("ESD","root_missing",str(root)))
        return []

    collected = []
    # speaker folders are numeric (3–5 digits), e.g., 0011..0020
    speaker_dirs = (
        d for d in root.iterdir()
        if d.is_dir() and re.fullmatch(r"\d{3,5}", d.name)
    )
    for speaker_dir in speaker_dirs:
        for emotion_dir in speaker_dir.iterdir():
            if not emotion_dir.is_dir():
                continue
            label = _norm_emo(emotion_dir.name)
            if not label:
                continue
            collected.extend(
                {"dataset": "ESD", "filepath": str(wav),
                 "speaker_id": speaker_dir.name, "emotion": label}
                for wav in emotion_dir.glob("*.wav")
            )

    if not collected:
        diag.append(("ESD","no_matches_found",str(root)))
    return collected

# ---------- JL-CORPUS (female1_angry_1a_1.wav) ----------
# Matches a file name ending in "_<emotion>.wav"; also used by parse_tess.
_pat_tess_suffix = re.compile(r"_(angry|happy|neutral|sad)\.(?:wav|WAV)$", re.IGNORECASE)

def parse_jl(root:Path, diag):
    """Collect manifest rows for JL-CORPUS files found recursively under ``root``.

    The emotion is taken from a trailing ``_<emotion>.wav`` suffix when there
    is one; otherwise the first token of the file stem (split on ``_``, ``-``
    or whitespace) that normalises to a kept emotion is used. The speaker is
    the first token of the stem, falling back to the parent folder name when
    that token is empty.

    Args:
        root: folder to scan for ``*.wav`` files.
        diag: list that receives ``(dataset, reason, path)`` tuples for a
            missing root and for files with no recognisable emotion.

    Returns:
        A list of dicts with keys ``dataset``, ``filepath``, ``speaker_id``
        and ``emotion``.
    """
    if not root.exists():
        diag.append(("JL-CORPUS","root_missing",str(root)))
        return []

    collected = []
    for wav in root.rglob("*.wav"):
        parts = re.split(r"[_\-\s]+", wav.stem)

        suffix_hit = _pat_tess_suffix.search(wav.name)
        if suffix_hit:
            label = _norm_emo(suffix_hit.group(1))
        else:
            # first token that maps onto one of the kept emotions, if any
            label = next((e for e in map(_norm_emo, parts) if e), None)

        if not label:
            diag.append(("JL-CORPUS","no_emo_in_name",str(wav)))
            continue

        collected.append({
            "dataset": "JL-CORPUS",
            "filepath": str(wav),
            "speaker_id": parts[0] or wav.parent.name,
            "emotion": label,
        })
    return collected

# ---------- RAVDESS (Actor_01/03-01-05-02-01-01-24.wav) ----------
# Seven two-digit fields separated by dashes:
# modality-vocal channel-emotion-intensity-statement-repetition-actor
_pat_rav = re.compile(r"^(?P<mod>\d{2})-(?P<vc>\d{2})-(?P<emo>\d{2})-(?P<int>\d{2})-(?P<stm>\d{2})-(?P<rep>\d{2})-(?P<act>\d{2})\.(?:wav|WAV)$")
# RAVDESS emotion codes that belong to the four kept classes.
RAV_EMO = {"01":"neutral","03":"happy","04":"sad","05":"angry"}

def parse_ravdess(root:Path, diag, restrict_audio_only=True, restrict_speech_only=True):
    """Collect manifest rows for RAVDESS files.

    ``Actor_*`` folders are looked up directly under ``root`` first, then
    recursively; if none exist, ``root`` itself is scanned.

    Args:
        root: RAVDESS parent folder.
        diag: list that receives ``(dataset, reason, path)`` tuples for a
            missing root and for file names that do not match the pattern.
        restrict_audio_only: keep only files whose modality field is ``"03"``.
        restrict_speech_only: keep only files whose vocal-channel field is ``"01"``.

    Returns:
        A list of dicts with keys ``dataset``, ``filepath``, ``speaker_id``
        (the actor field) and ``emotion``. Files filtered out by the two
        restrictions, or whose emotion code is not in ``RAV_EMO``, are
        skipped without a diagnostic entry.
    """
    if not root.exists():
        diag.append(("RAVDESS","root_missing",str(root)))
        return []

    actor_dirs = [d for d in root.glob("Actor_*") if d.is_dir()]
    if not actor_dirs:
        nested = [d for d in root.rglob("Actor_*") if d.is_dir()]
        actor_dirs = nested if nested else [root]

    collected = []
    for actor_dir in actor_dirs:
        for wav in actor_dir.rglob("*.wav"):
            hit = _pat_rav.match(wav.name)
            if hit is None:
                diag.append(("RAVDESS","name_pattern_miss",str(wav)))
                continue

            fields = hit.groupdict()
            wrong_modality = restrict_audio_only and fields["mod"] != "03"
            wrong_channel = restrict_speech_only and fields["vc"] != "01"
            label = RAV_EMO.get(fields["emo"])
            if wrong_modality or wrong_channel or not label:
                continue

            collected.append({
                "dataset": "RAVDESS",
                "filepath": str(wav),
                "speaker_id": fields["act"],
                "emotion": label,
            })
    return collected

# ---------- TESS (OAF_back_neutral.wav) ----------
def parse_tess(root:Path, diag):
    """Collect manifest rows for TESS files found recursively under ``root``.

    The emotion comes from the ``_<emotion>.wav`` suffix of the file name and
    the speaker is the part of the stem before the first underscore.

    Args:
        root: folder to scan for ``*.wav`` files.
        diag: list that receives ``(dataset, reason, path)`` tuples for a
            missing root and for files without a recognised emotion suffix.

    Returns:
        A list of dicts with keys ``dataset``, ``filepath``, ``speaker_id``
        and ``emotion``.
    """
    if not root.exists():
        diag.append(("TESS","root_missing",str(root)))
        return []

    collected = []
    for wav in root.rglob("*.wav"):
        hit = _pat_tess_suffix.search(wav.name)
        label = None if hit is None else _norm_emo(hit.group(1))
        if label is None:
            diag.append(("TESS","no_emo_in_suffix",str(wav)))
            continue

        speaker, _, _ = wav.stem.partition("_")  # OAF/YAF
        collected.append({
            "dataset": "TESS",
            "filepath": str(wav),
            "speaker_id": speaker,
            "emotion": label,
        })
    return collected

# ---------- Build unified manifest ----------
def build_unified_manifest(roots:dict,
                           restrict_ravdess_audio_only=True,
                           restrict_ravdess_speech_only=True):
    """Parse every configured dataset and merge the rows into one manifest.

    Args:
        roots: maps a dataset name ("CREMA-D", "ESD", "JL-CORPUS", "RAVDESS",
            "TESS") to its root folder (``Path`` or ``str``). A dataset whose
            key is absent or ``None`` is skipped and recorded in the
            diagnostics as ``root_not_configured``.
        restrict_ravdess_audio_only: forwarded to ``parse_ravdess``.
        restrict_ravdess_speech_only: forwarded to ``parse_ravdess``.

    Returns:
        ``(manifest, diag_df)``. ``manifest`` has one row per existing file
        with columns ``dataset``, ``filepath``, ``speaker_id``, ``emotion``
        and ``speaker_uid``; ``diag_df`` has columns ``dataset``, ``reason``
        and ``path`` describing what was skipped.

    Raises:
        RuntimeError: if no file at all could be parsed.
    """
    diag = []
    rows = []

    ravdess_opts = {
        "restrict_audio_only": restrict_ravdess_audio_only,
        "restrict_speech_only": restrict_ravdess_speech_only,
    }
    # Same dataset order as before, so the row order of the manifest is unchanged.
    parsers = [
        ("CREMA-D", parse_cremad, {}),
        ("ESD", parse_esd, {}),
        ("JL-CORPUS", parse_jl, {}),
        ("RAVDESS", parse_ravdess, ravdess_opts),
        ("TESS", parse_tess, {}),
    ]
    for name, parser, opts in parsers:
        root = roots.get(name)
        if root is None:
            # Do NOT fall back to Path(): that is "." which always exists, so the
            # parser would crawl the whole working directory and could pick up
            # (and mislabel) files that belong to other datasets.
            diag.append((name, "root_not_configured", ""))
            continue
        rows += parser(Path(root), diag, **opts)

    manifest = pd.DataFrame(rows)
    if manifest.empty:
        raise RuntimeError("No files parsed. Check ROOTS paths.")
    # keep only 4 classes (safety), ensure file exists, dedupe
    manifest = manifest[manifest["emotion"].isin(list(CANON))].copy()
    manifest = manifest[manifest["filepath"].apply(lambda s: Path(s).is_file())]
    manifest = manifest.drop_duplicates(subset=["filepath"]).reset_index(drop=True)

    # composite speaker to avoid cross-dataset leakage
    # (the same raw speaker id can occur in more than one dataset)
    manifest["speaker_uid"] = manifest["dataset"] + ":" + manifest["speaker_id"]

    diag_df = pd.DataFrame(diag, columns=["dataset","reason","path"])
    return manifest, diag_df

# ---------- Speaker-independent split (same as your code, but use speaker_uid) ----------
def speaker_independent_split(df, train=0.8, val=0.1, seed=42, speaker_col="speaker_uid"):
    """Assign whole speakers to train/val/test after a seeded shuffle.

    Args:
        df: frame containing ``speaker_col``.
        train: share of speakers (not rows) assigned to "train".
        val: share of speakers assigned to "val"; the remaining speakers go
            to "test".
        seed: seed for the ``numpy.random.RandomState`` used for the shuffle.
        speaker_col: column that identifies the speaker.

    Returns:
        A copy of ``df`` with an extra ``split`` column.
    """
    speakers = df[speaker_col].unique()
    np.random.RandomState(seed).shuffle(speakers)

    total = len(speakers)
    train_end = int(round(train * total))
    val_end = train_end + int(round(val * total))
    train_speakers = set(speakers[:train_end])
    val_speakers = set(speakers[train_end:val_end])

    def assign(speaker):
        # anything that is neither train nor val belongs to the test remainder
        if speaker in train_speakers:
            return "train"
        if speaker in val_speakers:
            return "val"
        return "test"

    return df.assign(split=df[speaker_col].map(assign))

# ---------- USE IT ----------
# Ensure this matches your actual tree: parse_esd expects the numeric speaker
# folders to sit directly under this path (here: data/ESD).
ROOTS["ESD"] = Path("data/ESD")

manifest, diag = build_unified_manifest(
    ROOTS,
    restrict_ravdess_audio_only=True,   # True = use only audio-only
    restrict_ravdess_speech_only=True   # True = use only speech (not song)
)

# Sanity report: overall size, then counts per dataset, per emotion and per dataset x emotion.
print("TOTAL parsed:", len(manifest))
print("\nBy dataset:\n", manifest["dataset"].value_counts())

print("\nBy emotion (all 4 expected):")
print(manifest["emotion"].value_counts().reindex(["angry","happy","neutral","sad"]).fillna(0).astype(int))

print("\nPer-dataset × emotion:")
print(pd.crosstab(manifest["dataset"], manifest["emotion"])
        .reindex(columns=["angry","happy","neutral","sad"])
        .fillna(0).astype(int))

# Show why files were skipped, most frequent reasons first.
if not diag.empty:
    print("\nSkip reasons (top 12):")
    print(diag.groupby(["dataset","reason"]).size().sort_values(ascending=False).head(12))

# The train/val/test split is deliberately not done here: a single stratified,
# group-aware split on `speaker_uid` is applied in a later cell.


TOTAL parsed: 22131

By dataset:
 dataset
ESD          14000
CREMA-D       4899
TESS          1600
JL-CORPUS      960
RAVDESS        672
Name: count, dtype: int64

By emotion (all 4 expected):
emotion
angry      5603
happy      5603
neutral    5323
sad        5602
Name: count, dtype: int64

Per-dataset × emotion:
emotion    angry  happy  neutral   sad
dataset                               
CREMA-D     1271   1271     1087  1270
ESD         3500   3500     3500  3500
JL-CORPUS    240    240      240   240
RAVDESS      192    192       96   192
TESS         400    400      400   400


### Split speakers into train/val/test (roughly 80/10/10) so that no speaker appears in more than one split, which keeps the evaluation speaker-independent.
### We also want the class distribution to stay roughly similar across splits.
### To do this, we use a two-stage `StratifiedGroupKFold` split (train+val vs. test, then train vs. val), grouped by speaker and stratified by emotion,
### and in each stage keep the fold whose size is closest to the desired proportion.
### For reproducibility, we use a fixed random seed (`seed=42`).


In [2]:
import numpy as np, pandas as pd
from sklearn.model_selection import StratifiedGroupKFold

def stratified_group_split(df: pd.DataFrame, train=0.8, val=0.1, seed=42,
                           label_col="emotion", group_col="speaker_uid"):
    """Split rows into train/val/test by group, roughly stratified by label.

    2-stage split using StratifiedGroupKFold:
      1) train+val vs test
      2) train vs val
    In each stage the fold whose held-out share is closest to the requested
    share is kept. Because whole groups are moved, the resulting proportions
    are approximate.

    Args:
        df: input frame; any index is accepted (rows are addressed by position).
        train: target share of rows for the training split.
        val: target share of rows for the validation split; the test split
            gets the remainder ``1 - train - val``.
        seed: random state of the outer split (the inner split uses ``seed + 1``).
        label_col: column used for stratification.
        group_col: column whose values must never be shared between splits.

    Returns:
        A copy of ``df`` with an extra ``split`` column containing
        "train", "val" or "test".

    Raises:
        ValueError: if the proportions do not leave room for a validation and
            a test split.
    """
    if train < 0 or val <= 0 or train + val >= 1:
        raise ValueError(
            f"need train >= 0, val > 0 and train + val < 1; got train={train}, val={val}"
        )

    # Number of folds ~ 1 / test share (e.g. 10 folds for a 10% test split),
    # clamped to the range 3..10.
    n_splits_outer = int(round(1/(1-train-val)))
    n_splits_outer = max(3, min(10, n_splits_outer))

    sgkf_outer = StratifiedGroupKFold(n_splits=n_splits_outer, shuffle=True, random_state=seed)
    # Work with row POSITIONS throughout. The fold indices returned by split()
    # are positional, so they must not be mixed with index labels.
    pos = np.arange(len(df))
    y = df[label_col].values
    g = df[group_col].values

    # Find a test split closest to target size
    target_test = 1 - (train + val)
    best = None
    for trval_pos, te_pos in sgkf_outer.split(pos, y, groups=g):
        score = abs(len(te_pos)/len(pos) - target_test)
        if (best is None) or (score < best[0]):
            best = (score, trval_pos, te_pos)
    _, trval_pos, te_pos = best

    # Now split train vs val on the tr+val portion
    y_tv = y[trval_pos]; g_tv = g[trval_pos]
    target_val = val/(train+val)
    n_splits_inner = int(round(1/target_val))
    n_splits_inner = max(3, min(10, n_splits_inner))

    sgkf_inner = StratifiedGroupKFold(n_splits=n_splits_inner, shuffle=True, random_state=seed+1)
    best2 = None
    for tr_rel, va_rel in sgkf_inner.split(trval_pos, y_tv, groups=g_tv):
        score = abs(len(va_rel)/len(trval_pos) - target_val)
        if (best2 is None) or (score < best2[0]):
            best2 = (score, tr_rel, va_rel)
    _, tr_rel, va_rel = best2

    # tr_rel / va_rel index into the train+val subset; map them back to
    # positions in the full frame before writing the labels.
    labels = np.empty(len(df), dtype=object)
    labels[trval_pos[tr_rel]] = "train"
    labels[trval_pos[va_rel]] = "val"
    labels[te_pos] = "test"
    split = pd.Series(labels, index=df.index, dtype="object")
    return df.assign(split=split)


In [3]:
# Run after `manifest` has been built and `speaker_uid` has been added.
manifest = stratified_group_split(manifest, train=0.8, val=0.1, seed=42,
                                  label_col="emotion", group_col="speaker_uid")

# Guard the two properties the rest of the notebook relies on:
# every row has a split, and no speaker is shared between splits.
assert manifest["split"].isin(["train", "val", "test"]).all(), "some rows have no split"
assert manifest.groupby("speaker_uid")["split"].nunique().eq(1).all(), "speaker appears in more than one split"

print(manifest["split"].value_counts())
print(pd.crosstab(manifest["split"], manifest["emotion"]))


split
train    17714
test      2394
val       2023
Name: count, dtype: int64
emotion  angry  happy  neutral   sad
split                               
test       604    604      582   604
train     4485   4485     4260  4484
val        514    514      481   514


## Extract features into arrays using several torchaudio functions

In [4]:
import json, torch, torchaudio
import torchaudio.transforms as T, torchaudio.functional as AF
import torch.nn.functional as Fnn

# Load your config.json
# Keys read in this notebook: sr, n_fft, win_ms, hop_ms, n_mels, n_mfcc,
# peak_target, pooling and (optionally) voiced_variant.
with open("config.json") as f:
    CFG = json.load(f)

SR = CFG["sr"]; N_FFT = CFG["n_fft"]
# window and hop length converted from milliseconds to samples
WIN = int(SR*CFG["win_ms"]/1000); HOP = int(SR*CFG["hop_ms"]/1000)
# small floor applied before taking logs / dividing
_EPS = 1e-10

class FeatureExtractor:
    """Turn one audio file into a fixed-length vector of pooled frame-level features.

    Per frame, ``from_path`` computes MFCCs with first and second deltas,
    log-mel energies, seven spectral descriptors (centroid, bandwidth, 85% and
    95% roll-off, flatness, flux, frame energy in dB) and an f0 track. The
    frames are then summarised with the statistics named in ``cfg["pooling"]``.

    Note: sample rate, FFT size, window and hop length are taken from the
    module-level ``SR`` / ``N_FFT`` / ``WIN`` / ``HOP`` constants, not from the
    ``cfg`` argument. ``cfg`` supplies ``n_mels``, ``n_mfcc``, ``peak_target``,
    ``win_ms``, ``pooling`` and the optional ``voiced_variant`` flag.
    """
    def __init__(self, cfg):
        """Store ``cfg`` and build the mel, MFCC and power-spectrogram transforms."""
        self.cfg = cfg
        self.mel  = T.MelSpectrogram(
            sample_rate=SR, n_fft=N_FFT, hop_length=HOP, win_length=WIN,
            n_mels=cfg["n_mels"], power=2.0
        )
        self.mfcc = T.MFCC(
            sample_rate=SR, n_mfcc=cfg["n_mfcc"],
            melkwargs={"n_fft":N_FFT, "hop_length":HOP, "win_length":WIN, "n_mels":cfg["n_mels"]}
        )
        self.spec = T.Spectrogram(n_fft=N_FFT, hop_length=HOP, win_length=WIN, power=2.0)

    @torch.no_grad()
    def from_path(self, path:str) -> np.ndarray:
        """Load ``path`` and return its pooled feature vector.

        Args:
            path: audio file readable by ``torchaudio.load``.

        Returns:
            A 1-D float32 numpy array. Its length is (number of enabled
            pooling statistics) x (number of frame-level feature columns),
            doubled when ``cfg["voiced_variant"]`` is true or absent. NaN and
            infinite values are replaced by 0.
        """
        # load & standardize
        y, sr0 = torchaudio.load(path)              # [C, T]
        y = y.mean(0, keepdim=True)                 # average the channels -> [1, T]
        if sr0 != SR:
            y = torchaudio.functional.resample(y, sr0, SR)
        # scale so the largest absolute sample equals cfg["peak_target"]; all-zero audio is left as is
        peak = float(y.abs().max())
        if peak > 0:
            y = y * (self.cfg["peak_target"] / peak)

        # frame features
        mel = self.mel(y).clamp_min(_EPS)                 # [1, M, F]
        logmel = torch.log(mel).squeeze(0).T              # [F, M]

        mfcc = self.mfcc(y).squeeze(0).T                  # [F, C]
        # compute_deltas is applied to the transposed ([C, F]) matrix and the result is transposed back
        d1   = AF.compute_deltas(mfcc.T).T
        d2   = AF.compute_deltas(d1.T).T

        spec = self.spec(y).squeeze(0).clamp_min(_EPS)    # [K, F]
        F_frames = spec.shape[1]
        # evenly spaced frequency value for each spectrogram row, from 0 to SR/2
        freqs = torch.linspace(0, SR/2, spec.shape[0], device=spec.device)
        ps = spec                                         # already >= eps

        # spectral shape: power-weighted mean frequency and spread around it, per frame
        cen = (freqs[:,None] * ps).sum(0) / ps.sum(0)
        bw  = torch.sqrt(((freqs[:,None] - cen[None,:])**2 * ps).sum(0) / ps.sum(0))

        # rolloffs (contiguous for searchsorted):
        # first frequency bin where the cumulative power reaches 85% / 95% of the frame total
        cs = torch.cumsum(ps, dim=0).contiguous()
        tot = cs[-1,:].contiguous()
        t85 = (0.85*tot).unsqueeze(1).contiguous()
        t95 = (0.95*tot).unsqueeze(1).contiguous()
        idx85 = torch.searchsorted(cs.T.contiguous(), t85).clamp(max=cs.shape[0]-1).squeeze(1)
        idx95 = torch.searchsorted(cs.T.contiguous(), t95).clamp(max=cs.shape[0]-1).squeeze(1)
        roll85, roll95 = freqs[idx85], freqs[idx95]

        # spectral flatness (geom/arith mean) — numerically safe
        geo = torch.exp(torch.log(ps).mean(0))
        arith = ps.mean(0).clamp_min(_EPS)
        flat = (geo / arith)

        # spectral flux (on L2-normalized magnitude); the first frame has no predecessor and stays 0
        mag = torch.sqrt(ps)
        mag = mag / (mag.norm(p=2, dim=0, keepdim=True).clamp_min(_EPS))
        flux = torch.zeros(mag.shape[1], device=mag.device)
        flux[1:] = (mag[:,1:] - mag[:,:-1]).pow(2).sum(0).sqrt()

        # frame energy in dB (finite by construction)
        frame_energy_db = 10.0 * torch.log10(ps.mean(0).clamp_min(_EPS))

        # pitch (no hop_length/win_length) + align to spectrogram frames
        f0_raw = AF.detect_pitch_frequency(
            y, sample_rate=SR, frame_time=self.cfg["win_ms"]/1000.0
        ).squeeze(0)                              # [F0_frames]
        if f0_raw.numel() == 0:
            f0_rs = torch.zeros(F_frames, device=spec.device)
        else:
            f0_in = f0_raw.clone()
            f0_in[f0_in <= 0] = 0.0              # unvoiced -> 0 for interpolation
            # linearly resample the pitch track to the number of spectrogram frames
            f0_rs = Fnn.interpolate(
                f0_in.view(1,1,-1), size=F_frames, mode="linear", align_corners=False
            ).view(-1)
        voiced = f0_rs > 0
        # NaN marks unvoiced frames here; the nan_to_num call below turns them into 0 before pooling
        f0 = torch.where(voiced, f0_rs, torch.nan)

        # stack frames x dims (KEEPING ALL FEATURES)
        F = torch.cat([
            mfcc, d1, d2,
            logmel,
            torch.stack([cen, bw, roll85, roll95, flat, flux, frame_energy_db], dim=1),
            f0.unsqueeze(1),
        ], dim=1)


        # replace NaN/inf so the pooling statistics stay finite
        F = torch.nan_to_num(F, nan=0.0, posinf=0.0, neginf=0.0)

        # pooling from config (all + voiced-only; fixed size even if no voiced)
        def pool(A: torch.Tensor) -> torch.Tensor:
            """Concatenate the statistics enabled in cfg["pooling"], each taken over the frame axis of ``A``."""
            parts = []
            if "mean"   in self.cfg["pooling"]: parts.append(A.mean(0))
            if "std"    in self.cfg["pooling"]: parts.append(A.std(0))
            if "median" in self.cfg["pooling"]: parts.append(A.median(0).values)
            if "p10"    in self.cfg["pooling"]: parts.append(torch.quantile(A, 0.10, dim=0))
            if "p90"    in self.cfg["pooling"]: parts.append(torch.quantile(A, 0.90, dim=0))
            if "slope"  in self.cfg["pooling"]:
                # least-squares slope of each column against time normalised to [0, 1]
                t = torch.linspace(0, 1, A.shape[0], device=A.device).unsqueeze(1)
                den = ((t - t.mean())**2).sum().clamp_min(1e-9)
                slope = (t * (A - A.mean(0))).sum(0) / den
                parts.append(slope)
            return torch.cat(parts, 0)

        v_all = pool(F)
        # voiced stats vector (same length as v_all); if no voiced frames, fill zeros to keep dimension
        if self.cfg.get("voiced_variant", True):
            if voiced.any():                v_vo = pool(F[voiced])
            else:
                v_vo = torch.zeros_like(v_all)
            v = torch.cat([v_all, v_vo], 0)
        else:
            v = v_all

        # final clean-up: pooled statistics that came out as NaN/inf are set to 0
        v = torch.nan_to_num(v, nan=0.0, posinf=0.0, neginf=0.0)
        return v.float().cpu().numpy()


## Run the extraction (raw features, no scaling yet; these can be saved as-is if we want)

In [7]:
# Parallel extraction helpers

import os
from joblib import Parallel, delayed
from tqdm import tqdm

# Prevent thread oversubscription when using process-level parallelism.
# If these are not set, every worker process can start its own thread pool;
# the threads then compete for the same cores and the job gets slower
# instead of faster.
# setdefault keeps any value that is already present in the environment.
os.environ.setdefault("OMP_NUM_THREADS", "1")
os.environ.setdefault("MKL_NUM_THREADS", "1")
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
# NOTE(review): confirm that PyTorch actually reads TORCH_NUM_THREADS.
os.environ.setdefault("TORCH_NUM_THREADS", "1")
# Recycle workers to mitigate native leaks in long jobs
# NOTE(review): confirm that loky honours LOKY_MAX_JOBS_BEFORE_RESTART.
os.environ.setdefault("LOKY_MAX_JOBS_BEFORE_RESTART", "256")

def _infer_dim_from_any(df_subset, cfg):
    """Return the feature-vector length of the first file that can be extracted.

    Files in ``df_subset["filepath"]`` are tried in order and failing files
    are skipped silently.

    Raises:
        RuntimeError: if extraction fails for every file in the subset.
    """
    extractor = FeatureExtractor(cfg)
    for wav_path in df_subset["filepath"]:
        try:
            vector = extractor.from_path(wav_path)
            return vector.shape[0]
        except Exception:
            # unreadable or broken file: move on to the next candidate
            continue
    raise RuntimeError("Cannot infer feature dimension from subset (all failures).")

def _process_row(row, cfg, feature_dim):
    """Extract the features of one manifest row.

    Args:
        row: object with ``filepath`` and ``emotion`` attributes.
        cfg: configuration passed to ``FeatureExtractor``.
        feature_dim: length of the zero vector returned when extraction fails.

    Returns:
        ``(features, emotion)``. On any extraction error a warning is printed
        and ``features`` is an all-zero float32 vector, so the row is kept.
    """
    try:
        extractor = FeatureExtractor(cfg)
        return extractor.from_path(row.filepath), row.emotion
    except Exception as err:
        print(f"[warn] {row.filepath}: {err}")
        fallback = np.zeros((feature_dim,), dtype=np.float32)
        return fallback, row.emotion

def build_Xy_parallel(df_subset, cfg, n_jobs=-1, desc="extract"):
    """Extract features for every row of ``df_subset`` with joblib workers.

    Args:
        df_subset: manifest rows to process (needs ``filepath`` and ``emotion``).
        cfg: configuration passed to ``FeatureExtractor``.
        n_jobs: forwarded to ``joblib.Parallel``.
        desc: label of the tqdm progress bar.

    Returns:
        ``(X, y)``: ``X`` is a float32 matrix with one row per input row and
        ``y`` is a pandas Series holding the emotion labels as strings.
    """
    records = list(df_subset.itertuples(index=False))
    feat_dim = _infer_dim_from_any(df_subset, cfg)

    runner = Parallel(n_jobs=n_jobs)
    tasks = (delayed(_process_row)(rec, cfg, feat_dim) for rec in tqdm(records, desc=desc))
    outputs = runner(tasks)

    if outputs:
        vectors, labels = zip(*outputs)
    else:
        vectors, labels = [], []

    if vectors:
        X = np.vstack(vectors).astype(np.float32)
    else:
        X = np.zeros((0, feat_dim), dtype=np.float32)

    # keep strings here; encode once from TRAIN
    y = pd.Series(labels).astype(str)
    return X, y


In [8]:

# 7) Extract splits (parallel) + encode labels consistently

# choose your parallelism; -1 = all cores
# Set to a fixed number i.e. 5 = 5 cores if too much work
N_JOBS = -1

def encode_labels(labels, class_to_id, split_name):
    """Map string labels to integer ids using the mapping learned from TRAIN.

    Labels missing from ``class_to_id`` are encoded as -1 and reported in a
    printed warning that names ``split_name``.
    """
    ids = np.array([class_to_id.get(s, -1) for s in labels], dtype=np.int64)
    if (ids < 0).any():
        missing = sorted({s for s, i in zip(labels, ids) if i < 0})
        print(f"[warn] {split_name} labels not seen in train:", missing)
    return ids

# TRAIN FIRST to lock class mapping
X_train, y_train_labels = build_Xy_parallel(manifest[manifest["split"] == "train"], CFG, n_jobs=N_JOBS, desc="train")

# choose a deterministic (alphabetical) class order; for the four classes used here this gives
# {'angry': 0, 'happy': 1, 'neutral': 2, 'sad': 3}
classes = sorted(pd.unique(y_train_labels))
class_to_id = {c:i for i,c in enumerate(classes)}
y_train = encode_labels(y_train_labels, class_to_id, "train")

# VAL
X_val, y_val_labels = build_Xy_parallel(manifest[manifest["split"] == "val"], CFG, n_jobs=N_JOBS, desc="val")
y_val = encode_labels(y_val_labels, class_to_id, "val")

# TEST
X_test, y_test_labels = build_Xy_parallel(manifest[manifest["split"] == "test"], CFG, n_jobs=N_JOBS, desc="test")
y_test = encode_labels(y_test_labels, class_to_id, "test")

print("dims:", X_train.shape, X_val.shape, X_test.shape)
print("classes:", classes)
print("NaNs in train:", np.isnan(X_train).sum())



train:   0%|          | 0/17714 [00:00<?, ?it/s][A
train:   0%|          | 22/17714 [00:00<03:09, 93.12it/s][A
train:   0%|          | 44/17714 [00:04<33:41,  8.74it/s][A
train:   0%|          | 66/17714 [00:04<19:24, 15.16it/s][A
train:   0%|          | 88/17714 [00:04<12:37, 23.27it/s][A
train:   1%|          | 110/17714 [00:04<08:37, 34.05it/s][A
train:   1%|          | 132/17714 [00:04<06:15, 46.84it/s][A
train:   1%|          | 154/17714 [00:05<04:40, 62.57it/s][A
train:   1%|          | 176/17714 [00:05<03:39, 79.82it/s][A
train:   1%|          | 198/17714 [00:05<02:59, 97.60it/s][A
train:   1%|          | 220/17714 [00:05<02:31, 115.33it/s][A
train:   1%|▏         | 242/17714 [00:05<02:13, 130.63it/s][A
train:   1%|▏         | 264/17714 [00:05<02:00, 145.24it/s][A
train:   2%|▏         | 286/17714 [00:05<01:53, 153.21it/s][A
train:   2%|▏         | 308/17714 [00:05<01:48, 159.72it/s][A
train:   2%|▏         | 330/17714 [00:06<01:46, 162.60it/s][A
train:   2%|▏  

dims: (17714, 1584) (2023, 1584) (2394, 1584)
classes: ['angry', 'happy', 'neutral', 'sad']
NaNs in train: 0


In [9]:
from sklearn.preprocessing import StandardScaler
import joblib

# 1. Initialize the scaler
scaler = StandardScaler()

# 2. Fit the scaler ONLY on the training data, to avoid data leakage
#    (val/test statistics must not influence the scaling).
print("\nFitting scaler on training data...")
scaler.fit(X_train)

# 3. Save the fitted scaler so inference can apply exactly the same scaling
joblib.dump(scaler, 'feature_scaler.joblib')
print("Scaler saved to 'feature_scaler.joblib'")

# 4. Transform all three datasets using the SAME fitted scaler
print("Transforming train, val, and test sets...")
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

print("New scaled dims:", X_train_scaled.shape, X_val_scaled.shape, X_test_scaled.shape)





Fitting scaler on training data...
Scaler saved to 'feature_scaler.joblib'
Transforming train, val, and test sets...
New scaled dims: (17714, 1584) (2023, 1584) (2394, 1584)


In [10]:
# Statistics for both raw and scaled features
def print_stats(name, X):
    """Print the overall max, min, mean and standard deviation of ``X`` under a ``name`` heading."""
    print(f"\nStats for {name}:")
    summary = (
        ("Max", np.max),
        ("Min", np.min),
        ("Mean", np.mean),
        ("Std Dev", np.std),
    )
    for label, reducer in summary:
        print(f"  {label}:", reducer(X))
# Compare the training features before and after scaling.
print_stats("Raw Train", X_train)
print_stats("Scaled Train", X_train_scaled)


Stats for Raw Train:
  Max: 7959.375
  Min: -8463.542
  Mean: 35.578327
  Std Dev: 354.76443

Stats for Scaled Train:
  Max: 17.163582
  Min: -11.583979
  Mean: 1.6705884e-09
  Std Dev: 1.0


# Saving features (both scaled and raw), for later experimenting

In [11]:
# Save features (generic helper used for both scaled and raw)
from pathlib import Path

def save_npz(out_dir: Path, split_name: str, X, y, classes):
    """Write one split to ``<out_dir>/<split_name>.npz`` (compressed).

    The archive holds three arrays: ``X`` (features as given), ``y`` (labels
    cast to int64) and ``classes`` (the class names as an array). ``out_dir``
    is created if it does not exist yet.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / f"{split_name}.npz"
    payload = {
        "X": X,
        "y": y.astype(np.int64),
        "classes": np.array(classes),
    }
    np.savez_compressed(target, **payload)

# Save SCALED -> ./features_scaled
# (one .npz per split: train / val / test)
OUT_DIR_SCALED = Path("./features_scaled")

save_npz(OUT_DIR_SCALED, "train", X_train_scaled, y_train, classes)
save_npz(OUT_DIR_SCALED, "val",   X_val_scaled,   y_val,   classes)
save_npz(OUT_DIR_SCALED, "test",  X_test_scaled,  y_test,  classes)

print("Saved scaled features:", sorted(p.name for p in OUT_DIR_SCALED.glob("*.npz")))

# Save RAW -> ./features_raw
# (same labels and class list, unscaled features)
OUT_DIR_RAW = Path("./features_raw")

save_npz(OUT_DIR_RAW, "train", X_train, y_train, classes)
save_npz(OUT_DIR_RAW, "val",   X_val,   y_val,   classes)
save_npz(OUT_DIR_RAW, "test",  X_test,  y_test,  classes)

print("Saved raw features:", sorted(p.name for p in OUT_DIR_RAW.glob("*.npz")))


Saved scaled features: ['test.npz', 'train.npz', 'val.npz']
Saved raw features: ['test.npz', 'train.npz', 'val.npz']


# Loading features back (Be careful with paths)

In [None]:
import numpy as np

# load train set (scaled)
# allow_pickle=False makes np.load refuse pickled object arrays.
train_data = np.load("./features_scaled/train.npz", allow_pickle=False)
# To load the raw set instead, use:
# train_data = np.load("./features_raw/train.npz", allow_pickle=False)

# The archive keys match what save_npz wrote: X, y and classes.
X_trainLoaded = train_data["X"]
y_trainLoaded = train_data["y"]
classesLoaded = train_data["classes"].tolist()



## Just checking loaded data

In [None]:
# Print dtype, Python type and shape of the loaded arrays, plus the class list.
print("\nLoaded Train set:")
print("X dtype:", X_trainLoaded.dtype)
print("y dtype:", y_trainLoaded.dtype)
print("Loaded X type:", type(X_trainLoaded), "shape:", X_trainLoaded.shape)
print("Loaded y type:", type(y_trainLoaded), "shape:", y_trainLoaded.shape)
print("Loaded classes:", classesLoaded)

In [None]:
# Summary statistics of the loaded features.
# Requires print_stats, which is defined in an earlier cell of this notebook.
print_stats("Loaded Train", X_trainLoaded)

# Loading scaler back for inference

In [None]:
import joblib

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load scaler files that you created yourself or otherwise trust.
scaler = joblib.load("feature_scaler.joblib")
# Check the statistics stored in the loaded scaler.
print("\nLoaded scaler mean:", scaler.mean_)
print("Loaded scaler scale:", scaler.scale_)
# Now you can scale new data the same way:
# X_new_scaled = scaler.transform(X_new)
