# Phase 2 — Data Cleaning & Feature Extraction

This notebook performs:
1) Data cleaning / preprocessing of raw audio clips
2) Optional augmentation (controlled)
3) Feature extraction to fixed-size numeric vectors (for later ML)

Dataset structure (expected):
dataset/
  German/ Male/*.wav, Female/*.wav
  Italian/ Male/*.wav, Female/*.wav
  Korean/ Male/*.wav, Female/*.wav
  Spanish/ Male/*.wav, Female/*.wav

Outputs:
- artifacts/metadata.csv
- artifacts/features.csv (or .parquet if you prefer)
- artifacts/X.npy, artifacts/y.npy
- (optional) processed_audio/ cleaned wav files
- (optional) processed_audio_aug/ augmented wav files


In [None]:
import os
import glob
import math
import warnings
from pathlib import Path

import numpy as np
import pandas as pd

import librosa
import soundfile as sf
from scipy.stats import skew, kurtosis

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)


## Configuration

- `DATASET_DIR`: path to your `dataset` folder
- `SR`: target sample rate for all audio
- `TARGET_SECONDS`: pad/trim each clip to exactly this length
- `SAVE_CLEANED_AUDIO`: if True, write cleaned audio to disk (recommended for reproducibility)
- `USE_AUGMENTATION`: if True, create augmented variants (saved to disk)


In [None]:
# --- Input / output locations (relative to the working directory) ---
DATASET_DIR = Path("dataset")
ARTIFACTS_DIR = Path("artifacts")
CLEAN_DIR = Path("processed_audio")
AUG_DIR = Path("processed_audio_aug")

# --- Audio geometry ---
SR = 16000  # sample rate passed to the loader for every clip
TARGET_SECONDS = 60.0  # every cleaned clip is padded/trimmed to this length
TARGET_SAMPLES = int(SR * TARGET_SECONDS)  # same length expressed in samples

# --- Silence trimming ---
TRIM_SILENCE = True
TOP_DB = 25  # forwarded as `top_db` to librosa.effects.trim

# --- Loudness normalisation ---
RMS_TARGET = 0.1  # RMS level that rms_normalize scales each clip to
PEAK_MAX = 0.99  # peak_guard rescales clips whose absolute peak exceeds this

SAVE_CLEANED_AUDIO = True

USE_AUGMENTATION = False
AUG_COPIES_PER_FILE = 1

# One shared generator so the shuffle seed and the augmentation draws are
# tied to a single, explicit seed.
RNG_SEED = 42
rng = np.random.default_rng(RNG_SEED)

# Output folders are created eagerly; the audio folders only when the
# corresponding feature is switched on.
ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
if SAVE_CLEANED_AUDIO:
    CLEAN_DIR.mkdir(parents=True, exist_ok=True)
if USE_AUGMENTATION:
    AUG_DIR.mkdir(parents=True, exist_ok=True)

# Folder names expected under DATASET_DIR (<language>/<gender>/...) and the
# file extensions that are indexed.
LANGUAGES = ["German", "Italian", "Korean", "Spanish"]
GENDERS = ["Male", "Female"]
AUDIO_EXTS = ["wav", "mp3", "flac", "m4a", "ogg"]


## 1) Index the dataset

We scan the folder tree and build a table with:
- file path
- language label
- gender label


In [None]:
def list_audio_files(dataset_dir: Path):
    """Index every audio file under ``dataset_dir/<language>/<gender>/``.

    Only the folders named in ``LANGUAGES`` and ``GENDERS`` are scanned
    (recursively), and only files whose extension is in ``AUDIO_EXTS`` are
    kept. Missing language/gender folders are skipped.

    Args:
        dataset_dir: Root folder of the dataset.

    Returns:
        A DataFrame with the columns ``path``, ``language`` and ``gender``.
        The columns are present even when no file is found, so downstream
        column access does not fail on an empty dataset.
    """
    columns = ["path", "language", "gender"]
    rows = []
    for lang in LANGUAGES:
        for gender in GENDERS:
            base = dataset_dir / lang / gender
            if not base.exists():
                continue
            found = []
            for ext in AUDIO_EXTS:
                found.extend(glob.glob(str(base / f"**/*.{ext}"), recursive=True))
            # glob order depends on the filesystem; sorting makes the index
            # (and therefore any seeded shuffle applied to it) reproducible.
            rows.extend(
                {"path": str(Path(fp)), "language": lang, "gender": gender}
                for fp in sorted(found)
            )
    return pd.DataFrame(rows, columns=columns)

# Build the file index, then shuffle it once with the fixed seed so the row
# order is decoupled from the folder layout.
df = (
    list_audio_files(DATASET_DIR)
    .sample(frac=1.0, random_state=RNG_SEED)
    .reset_index(drop=True)
)

df.head(), df.shape


(                                                path language  gender
 0  dataset\Italian\Female\810104250_female_italia...  Italian  Female
 1  dataset\Italian\Female\810101502_female_italia...  Italian  Female
 2  dataset\German\Male\810103040_male_german_voic...   German    Male
 3  dataset\Italian\Male\810101441_male_italian_vo...  Italian    Male
 4  dataset\Korean\Female\810100094_female_korean_...   Korean  Female,
 (720, 3))

## 2) Audio loading + cleaning utilities

Cleaning steps:
- mono + resample to SR
- optional trim silence
- remove DC offset
- RMS normalize + peak guard
- pad/trim to TARGET_SECONDS
- quality checks (nan/inf, all-zero, too-short, clipping ratio)


In [None]:
def safe_load_audio(path, sr=SR):
    """Load ``path`` as mono audio at ``sr`` without ever raising.

    Returns:
        ``(samples, None)`` with float32 samples on success, otherwise
        ``(None, code)`` where ``code`` is one of ``"load_failed"``,
        ``"empty_audio"``, ``"non_finite"`` or ``"load_exception"``.
    """
    try:
        samples, _loaded_sr = librosa.load(path, sr=sr, mono=True)
        if samples is None:
            failure = "load_failed"
        elif len(samples) == 0:
            failure = "empty_audio"
        elif not np.isfinite(samples).all():
            failure = "non_finite"
        else:
            return samples.astype(np.float32), None
        return None, failure
    except Exception:
        # Best effort by design: any decoding problem becomes an error code
        # so a single bad file cannot abort the whole run.
        return None, "load_exception"

def trim_silence(y, top_db=TOP_DB):
    """Strip leading/trailing silence via ``librosa.effects.trim``.

    An empty input is returned unchanged; otherwise the trimmed signal is
    returned as float32.
    """
    if len(y) > 0:
        trimmed, _interval = librosa.effects.trim(y, top_db=top_db)
        return trimmed.astype(np.float32)
    return y

def rms_normalize(y, target_rms=RMS_TARGET):
    """Scale ``y`` so that its RMS level equals ``target_rms`` (float32)."""
    mean_square = np.mean(y**2)
    # The small epsilon keeps the division finite for an all-zero signal.
    rms = float(np.sqrt(mean_square) + 1e-12)
    return (y * (target_rms / rms)).astype(np.float32)

def peak_guard(y, peak_max=PEAK_MAX):
    """Rescale ``y`` when its absolute peak exceeds ``peak_max``.

    Signals already within the limit are only cast to float32.
    """
    peak = float(np.max(np.abs(y)) + 1e-12)
    limited = y * (peak_max / peak) if peak > peak_max else y
    return limited.astype(np.float32)

def pad_or_trim(y, target_len=TARGET_SAMPLES):
    """Force ``y`` to exactly ``target_len`` samples.

    Short signals are zero-padded at the end (result is float32), long
    signals are cut after ``target_len`` samples, and signals that already
    have the right length are returned as-is.
    """
    missing = target_len - len(y)
    if missing > 0:
        return np.pad(y, (0, missing), mode="constant").astype(np.float32)
    if missing < 0:
        return y[:target_len]
    return y

def clipping_ratio(y, thr=0.99):
    """Return the fraction of samples whose magnitude is at least ``thr``."""
    at_or_above = np.abs(y) >= thr
    return float(np.mean(at_or_above))

def clean_audio(y):
    """Run the full cleaning chain on one mono signal.

    Steps, in order: remove the DC offset, optionally trim silence
    (``TRIM_SILENCE``), reject clips shorter than half a second, reject
    clips that contain no signal at all, RMS-normalise, apply the peak
    guard, and pad/trim to ``TARGET_SAMPLES``.

    Args:
        y: 1-D array of audio samples.

    Returns:
        ``(cleaned, None)`` on success, otherwise ``(None, code)`` where
        ``code`` is ``"too_short_after_trim"``, ``"all_zero"`` or
        ``"non_finite_after_clean"``.
    """
    y = y - float(np.mean(y))
    if TRIM_SILENCE:
        y = trim_silence(y, top_db=TOP_DB)
    if len(y) < int(0.5 * SR):
        return None, "too_short_after_trim"
    # A clip with no non-zero sample carries no information; normalising it
    # would just yield zeros that get labelled "ok". The check sits after the
    # length check so that existing error codes keep their meaning.
    if not np.any(y):
        return None, "all_zero"
    y = rms_normalize(y, target_rms=RMS_TARGET)
    y = peak_guard(y, peak_max=PEAK_MAX)
    y = pad_or_trim(y, target_len=TARGET_SAMPLES)
    if not np.isfinite(y).all():
        return None, "non_finite_after_clean"
    return y, None


## 3) Run cleaning on all files

We:
- load each file safely
- clean it
- optionally save cleaned audio
- store metadata (duration, clipping ratio, errors)


In [None]:
def make_out_path(in_path: str, out_root: Path, lang: str, gender: str):
    """Return ``out_root/<lang>/<gender>/<stem>.wav`` for ``in_path``.

    The target folder is created if needed. Only the file stem of
    ``in_path`` is kept, so the result always has a ``.wav`` extension.
    """
    target_dir = out_root / lang / gender
    target_dir.mkdir(parents=True, exist_ok=True)
    return target_dir / f"{Path(in_path).stem}.wav"

# Fixed schema for the metadata table. Every row (ok or bad) carries all of
# these keys, so the column set and order do not depend on which file happens
# to come first, and an empty dataset still yields a usable (empty) table.
meta_columns = [
    "path", "clean_path", "language", "gender", "status", "error",
    "samples", "seconds", "clipping_ratio",
]
meta_rows = []

for _, row in df.iterrows():
    p = row["path"]
    lang = row["language"]
    gender = row["gender"]

    # Stage 1: decode. Failures are recorded, not raised.
    y, err = safe_load_audio(p, sr=SR)
    if err is not None:
        meta_rows.append({
            "path": p, "clean_path": "",
            "language": lang, "gender": gender,
            "status": "bad", "error": err, "samples": 0,
            "seconds": 0.0, "clipping_ratio": np.nan
        })
        continue

    # Stage 2: clean. For rejected clips the stats describe the raw signal.
    y_clean, err2 = clean_audio(y)
    if err2 is not None:
        meta_rows.append({
            "path": p, "clean_path": "",
            "language": lang, "gender": gender,
            "status": "bad", "error": err2, "samples": len(y),
            "seconds": float(len(y)/SR), "clipping_ratio": clipping_ratio(y)
        })
        continue

    # Stage 3: optionally persist the cleaned clip.
    out_path = ""
    if SAVE_CLEANED_AUDIO:
        out_fp = make_out_path(p, CLEAN_DIR, lang, gender)
        sf.write(out_fp, y_clean, SR)
        out_path = str(out_fp)

    meta_rows.append({
        "path": p, "clean_path": out_path,
        "language": lang, "gender": gender,
        "status": "ok", "error": "",
        "samples": int(len(y_clean)),
        "seconds": float(len(y_clean)/SR),
        "clipping_ratio": clipping_ratio(y_clean)
    })

meta = pd.DataFrame(meta_rows, columns=meta_columns)
# Integer ids derived from the category codes of the label columns.
meta["language_id"] = meta["language"].astype("category").cat.codes
meta["gender_id"] = meta["gender"].astype("category").cat.codes

meta.head(), meta["status"].value_counts()


(                                                path  \
 0  dataset\Italian\Female\810104250_female_italia...   
 1  dataset\Italian\Female\810101502_female_italia...   
 2  dataset\German\Male\810103040_male_german_voic...   
 3  dataset\Italian\Male\810101441_male_italian_vo...   
 4  dataset\Korean\Female\810100094_female_korean_...   
 
                                           clean_path language  gender status  \
 0  processed_audio\Italian\Female\810104250_femal...  Italian  Female     ok   
 1  processed_audio\Italian\Female\810101502_femal...  Italian  Female     ok   
 2  processed_audio\German\Male\810103040_male_ger...   German    Male     ok   
 3  processed_audio\Italian\Male\810101441_male_it...  Italian    Male     ok   
 4  processed_audio\Korean\Female\810100094_female...   Korean  Female     ok   
 
   error  samples  seconds  clipping_ratio  language_id  gender_id  
 0         960000     60.0        0.000000            1          0  
 1         960000     60.0    

### Save metadata + quick cleaning diagnostics


In [None]:
# Persist the full metadata table (ok and bad rows alike).
meta_path = ARTIFACTS_DIR / "metadata.csv"
meta.to_csv(meta_path, index=False)

# Split once on the status flag; everything that is not "ok" counts as bad.
is_ok = meta["status"] == "ok"
ok_meta = meta[is_ok].copy()
bad_meta = meta[~is_ok].copy()

if len(ok_meta):
    mean_clip = float(ok_meta["clipping_ratio"].mean())
else:
    mean_clip = np.nan

summary = {
    "total_files_indexed": int(len(meta)),
    "ok_files": int(len(ok_meta)),
    "bad_files": int(len(bad_meta)),
    "mean_clipping_ratio_ok": mean_clip
}
summary, bad_meta["error"].value_counts().head(10)


({'total_files_indexed': 720,
  'ok_files': 720,
  'bad_files': 0,
  'mean_clipping_ratio_ok': 1.9675925925925927e-07},
 Series([], Name: count, dtype: int64))

## 4) Optional augmentation

If enabled, we create `AUG_COPIES_PER_FILE` augmented versions per cleaned clip.
Augmentations:
- time shift
- additive noise
- time stretch
- pitch shift
- gain


In [None]:
def aug_time_shift(y, max_shift_sec=0.5):
    """Circularly shift ``y`` by a random offset of at most ``max_shift_sec``.

    The offset is drawn from the shared ``rng``; samples rolled off one end
    re-enter at the other (``np.roll``). Returns float32.
    """
    limit = int(max_shift_sec * SR)
    offset = int(rng.integers(-limit, limit + 1))
    return np.roll(y, offset).astype(np.float32)

def aug_add_noise(y, snr_db_low=15, snr_db_high=30):
    """Add Gaussian noise at a random SNR (dB) in [snr_db_low, snr_db_high).

    The noise power is derived from the mean power of ``y``. Returns float32.
    """
    snr_db = float(rng.uniform(snr_db_low, snr_db_high))
    signal_power = float(np.mean(y**2) + 1e-12)
    noise_std = math.sqrt(signal_power / (10 ** (snr_db / 10)))
    noise = rng.normal(0, noise_std, size=y.shape).astype(np.float32)
    return (y + noise).astype(np.float32)

def aug_time_stretch(y, rate_low=0.95, rate_high=1.05):
    """Time-stretch ``y`` by a random rate, then restore the fixed length."""
    rate = float(rng.uniform(rate_low, rate_high))
    stretched = librosa.effects.time_stretch(y, rate=rate)
    # Stretching changes the sample count, so bring it back to the target.
    return pad_or_trim(stretched.astype(np.float32), TARGET_SAMPLES)

def aug_pitch_shift(y, steps_low=-1.0, steps_high=1.0):
    """Pitch-shift ``y`` by a random ``n_steps`` in [steps_low, steps_high)."""
    n_steps = float(rng.uniform(steps_low, steps_high))
    shifted = librosa.effects.pitch_shift(y, sr=SR, n_steps=n_steps)
    return pad_or_trim(shifted.astype(np.float32), TARGET_SAMPLES)

def aug_gain(y, db_low=-2.0, db_high=2.0):
    """Apply a random gain (in dB) and keep the peak within ``PEAK_MAX``."""
    gain_db = float(rng.uniform(db_low, db_high))
    factor = 10 ** (gain_db / 20)  # dB -> linear amplitude factor
    boosted = (y * factor).astype(np.float32)
    return peak_guard(boosted, PEAK_MAX)

def apply_random_augmentation(y):
    """Apply one to three distinct, randomly chosen augmentations to ``y``.

    The number of operations and the selection both come from the shared
    ``rng``. The result is peak-guarded and returned as float32.
    """
    candidates = [aug_time_shift, aug_add_noise, aug_time_stretch, aug_pitch_shift, aug_gain]
    n_ops = int(rng.integers(1, 4))
    selected = rng.choice(candidates, size=n_ops, replace=False)
    augmented = y.copy()
    for transform in selected:
        augmented = transform(augmented)
    return peak_guard(augmented, PEAK_MAX).astype(np.float32)


In [None]:
# Per-operation augmentation settings. For each operation:
#   - "enabled": whether the operation is a candidate at all
#   - "p":       probability threshold compared against a uniform draw when
#                deciding whether to apply the operation
#   - remaining keys: the sampling range forwarded to the matching aug_* helper
AUGMENT_CONFIG = {
    "time_shift": {"enabled": True, "max_shift_sec": 0.5, "p": 0.60},
    "add_noise":  {"enabled": True, "snr_db_low": 18, "snr_db_high": 35, "p": 0.70},
    "time_stretch":{"enabled": True, "rate_low": 0.97, "rate_high": 1.03, "p": 0.40},
    "pitch_shift":{"enabled": True, "steps_low": -0.5, "steps_high": 0.5, "p": 0.35},
    "gain":       {"enabled": True, "db_low": -1.5, "db_high": 1.5, "p": 0.60},
}

# Inclusive bounds for the per-clip cap on how many operations are applied.
AUGMENT_MIN_OPS = 1
AUGMENT_MAX_OPS = 3

def apply_random_augmentation(y):
    """Apply a random, config-driven subset of augmentations to ``y``.

    Candidates are the operations enabled in ``AUGMENT_CONFIG`` with ``p > 0``.
    A cap ``k`` is drawn between ``AUGMENT_MIN_OPS`` and ``AUGMENT_MAX_OPS``,
    the candidates are shuffled, and each is applied with its own probability
    until ``k`` operations have run. If none fired, one candidate is picked
    uniformly and applied. The result is peak-guarded, forced to
    ``TARGET_SAMPLES`` and returned as float32.
    """
    ya = y.astype(np.float32).copy()

    # (config key, helper, keyword arguments forwarded from the config)
    registry = (
        ("time_shift", aug_time_shift, ("max_shift_sec",)),
        ("add_noise", aug_add_noise, ("snr_db_low", "snr_db_high")),
        ("time_stretch", aug_time_stretch, ("rate_low", "rate_high")),
        ("pitch_shift", aug_pitch_shift, ("steps_low", "steps_high")),
        ("gain", aug_gain, ("db_low", "db_high")),
    )

    candidates = []
    for op_name, helper, arg_names in registry:
        settings = AUGMENT_CONFIG[op_name]
        if not settings["enabled"]:
            continue
        kwargs = {arg: settings[arg] for arg in arg_names}
        # Bind helper/kwargs as defaults so each closure keeps its own pair.
        runner = lambda x, _helper=helper, _kwargs=kwargs: _helper(x, **_kwargs)
        candidates.append((op_name, runner, settings["p"]))

    enabled_ops = [entry for entry in candidates if entry[2] > 0]
    if not enabled_ops:
        return ya

    k = int(rng.integers(AUGMENT_MIN_OPS, AUGMENT_MAX_OPS + 1))
    rng.shuffle(enabled_ops)

    applied = 0
    for _op_name, runner, prob in enabled_ops:
        if applied >= k:
            break
        if float(rng.random()) <= float(prob):
            ya = runner(ya)
            applied += 1

    # Guarantee that an "augmented" copy differs from its source.
    if applied == 0:
        pick = int(rng.integers(0, len(enabled_ops)))
        ya = enabled_ops[pick][1](ya)

    ya = peak_guard(ya, PEAK_MAX)
    ya = pad_or_trim(ya, TARGET_SAMPLES)
    return ya.astype(np.float32)


In [None]:
# Local switches for this cell; they override the values set in the
# configuration cell.
USE_AUGMENTATION = False
AUG_COPIES_PER_FILE = 1

aug_rows = []

if USE_AUGMENTATION:
    # Only create the output folder when augmentation actually runs, matching
    # the guarded mkdir in the configuration cell.
    AUG_DIR.mkdir(parents=True, exist_ok=True)

    ok_meta2 = ok_meta.copy()
    for _, r in ok_meta2.iterrows():
        # Prefer the cleaned file on disk; fall back to the raw file.
        src_path = r["clean_path"] if SAVE_CLEANED_AUDIO and r["clean_path"] else r["path"]
        y, err = safe_load_audio(src_path, sr=SR)
        if err is not None:
            continue
        y, err2 = clean_audio(y)
        if err2 is not None:
            continue

        for j in range(AUG_COPIES_PER_FILE):
            ya = apply_random_augmentation(y)
            out_fp = make_out_path(r["path"], AUG_DIR, r["language"], r["gender"])
            # <stem>_aug1.wav, <stem>_aug2.wav, ...
            out_fp = out_fp.with_name(out_fp.stem + f"_aug{j+1}.wav")
            sf.write(out_fp, ya, SR)
            aug_rows.append({
                "orig_path": r["path"],
                "aug_path": str(out_fp),
                "language": r["language"],
                "gender": r["gender"]
            })

# Explicit columns keep the frame usable when no augmented file was written.
aug_df = pd.DataFrame(aug_rows, columns=["orig_path","aug_path","language","gender"])

aug_df.head(), aug_df.shape


NameError: name 'AUG_DIR' is not defined

In [None]:
# Per-label counts of the augmented clips.
aug_language_counts = aug_df["language"].value_counts()
aug_gender_counts = aug_df["gender"].value_counts()
print("Unique augmented languages:", aug_language_counts)
print("Unique augmented genders:", aug_gender_counts)


Unique augmented languages: language
Italian    180
German     180
Korean     180
Spanish    180
Name: count, dtype: int64
Unique augmented genders: gender
Female    360
Male      360
Name: count, dtype: int64


## 5) Feature extraction

We compute:
- MFCC (n=20) + delta + delta2
- Log-mel spectrogram (n_mels=64)
- Spectral descriptors: zcr, rms, centroid, bandwidth, rolloff, contrast

Then we aggregate each feature over time frames with:
mean, std, min, max, median, skew, kurtosis

Result: one fixed-size vector per audio clip.


In [None]:
def stats_1d(x):
    """Summarise a 1-D sequence with seven statistics.

    Args:
        x: 1-D array-like of numbers (converted to float64).

    Returns:
        float64 array ``[mean, std, min, max, median, skew, kurtosis]``.
        Skew and kurtosis use the bias-corrected estimators
        (``bias=False``). For a constant input, where those moments are
        undefined, both are reported as ``0.0``; any other non-finite
        skew/kurtosis value is likewise replaced by ``0.0`` so that it does
        not leak NaN/inf into the feature vector.

    Raises:
        ValueError: If ``x`` is empty (min/max of an empty array).
    """
    x = np.asarray(x, dtype=np.float64)
    lo, hi = np.min(x), np.max(x)
    if lo == hi:
        # Zero variance: higher moments are undefined and the value returned
        # by SciPy differs between versions, so pin the result explicitly.
        shape_stats = (0.0, 0.0)
    else:
        shape_stats = (skew(x, bias=False), kurtosis(x, bias=False))
    out = np.array(
        [np.mean(x), np.std(x), lo, hi, np.median(x), *shape_stats],
        dtype=np.float64,
    )
    # Only the two shape statistics are sanitised; NaN in the input still
    # shows up in mean/std/min/max/median.
    tail = out[5:]
    tail[~np.isfinite(tail)] = 0.0
    return out

def stats_2d(M):
    """Summarise each row of ``M`` with ``stats_1d`` and concatenate.

    The result is a flat vector holding the statistics of row 0, then
    row 1, and so on.
    """
    per_row = [stats_1d(M[idx]) for idx in range(M.shape[0])]
    return np.concatenate(per_row, axis=0)

def extract_features(y, sr=SR, n_mfcc=20, n_mels=64):
    """Turn one audio signal into a fixed-size float32 feature vector.

    Frame-level descriptors are computed with librosa (MFCC with first and
    second deltas, log-mel spectrogram, zero-crossing rate, RMS, spectral
    centroid, bandwidth, rolloff and contrast). Each descriptor row is then
    summarised over time by ``stats_2d``, and the summaries are concatenated
    in that order.
    """
    signal = y.astype(np.float32)

    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
    mel_power = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=n_mels)

    # Order matters: it defines the layout of the output vector.
    frame_matrices = (
        mfcc,
        librosa.feature.delta(mfcc),
        librosa.feature.delta(mfcc, order=2),
        librosa.power_to_db(mel_power + 1e-12),
        librosa.feature.zero_crossing_rate(signal),
        librosa.feature.rms(y=signal),
        librosa.feature.spectral_centroid(y=signal, sr=sr),
        librosa.feature.spectral_bandwidth(y=signal, sr=sr),
        librosa.feature.spectral_rolloff(y=signal, sr=sr, roll_percent=0.85),
        librosa.feature.spectral_contrast(y=signal, sr=sr),
    )

    summaries = [stats_2d(matrix) for matrix in frame_matrices]
    return np.concatenate(summaries, axis=0).astype(np.float32)


### Build the feature dataset (cleaned + optional augmented)

We extract features from:
- cleaned clips (always)
- augmented clips (if `USE_AUGMENTATION=True`)

We store:
- X: features
- y: language label
- plus a feature table with identifiers (path, language, gender)


In [None]:
# Parallel accumulators: entry i of each list describes the same clip.
feature_rows = []  # identifier dicts (path, variant, language, gender)
X_list = []  # one feature vector per clip
y_list = []  # language label per clip

def load_for_features(r):
    """Load and clean the audio referenced by one metadata row.

    The cleaned file (``clean_path``) is used when cleaned audio was saved
    and the row holds a non-empty string there; otherwise the original
    ``path`` is read.

    Returns:
        ``(audio, None)`` on success, or ``(None, error_code)`` when loading
        or cleaning fails.
    """
    clean_path = r.get("clean_path", "")
    has_clean_copy = (
        SAVE_CLEANED_AUDIO and isinstance(clean_path, str) and len(clean_path) > 0
    )
    src = clean_path if has_clean_copy else r["path"]

    audio, load_err = safe_load_audio(src, sr=SR)
    if load_err is not None:
        return None, load_err

    audio, clean_err = clean_audio(audio)
    if clean_err is not None:
        return None, clean_err
    return audio, None

# Inputs that were skipped, as (path, variant, error_code) tuples, so that
# dropped files are visible instead of silently disappearing.
feature_errors = []


def _add_feature_row(audio, path, variant, language, gender):
    """Extract features from ``audio`` and append them to the accumulators."""
    X_list.append(extract_features(audio, SR))
    y_list.append(language)
    feature_rows.append({
        "path": path,
        "variant": variant,
        "language": language,
        "gender": gender
    })


# Cleaned clips are always included.
for _, r in ok_meta.iterrows():
    y, e = load_for_features(r)
    if e is not None:
        feature_errors.append((r["path"], "clean", e))
        continue
    _add_feature_row(y, r["path"], "clean", r["language"], r["gender"])

# Augmented clips are included only when augmentation produced any.
if USE_AUGMENTATION and len(aug_df) > 0:
    for _, r in aug_df.iterrows():
        y, err = safe_load_audio(r["aug_path"], sr=SR)
        if err is None:
            # NOTE(review): augmented files go through clean_audio again,
            # which re-normalises the level — confirm this is intended for
            # the gain augmentation.
            y, err = clean_audio(y)
        if err is not None:
            feature_errors.append((r["aug_path"], "aug", err))
            continue
        _add_feature_row(y, r["aug_path"], "aug", r["language"], r["gender"])

if feature_errors:
    print(f"Skipped {len(feature_errors)} file(s) during feature extraction; see feature_errors.")

if not X_list:
    # np.stack would fail with a generic message; say what actually happened.
    raise ValueError("No features were extracted: no usable audio clips were found.")

X = np.stack(X_list, axis=0)
y = np.array(y_list)

features_df = pd.DataFrame(feature_rows)
features_df["y"] = y

X.shape, features_df.head()


((1440, 952),
                                                 path variant language  gender  \
 0  dataset\Italian\Female\810104250_female_italia...   clean  Italian  Female   
 1  dataset\Italian\Female\810101502_female_italia...   clean  Italian  Female   
 2  dataset\German\Male\810103040_male_german_voic...   clean   German    Male   
 3  dataset\Italian\Male\810101441_male_italian_vo...   clean  Italian    Male   
 4  dataset\Korean\Female\810100094_female_korean_...   clean   Korean  Female   
 
          y  
 0  Italian  
 1  Italian  
 2   German  
 3  Italian  
 4   Korean  )

## 6) Save features for the next notebooks

We save:
- `X.npy`, `y.npy`
- `features.csv` containing metadata for each row in X

Next steps (in other notebooks):
- Classification.ipynb (train/test split + models)
- Clustering.ipynb
- Evaluation.ipynb


In [None]:
# Arrays for the modelling notebooks, plus the identifier table whose rows
# line up with the rows of X.
np.save(ARTIFACTS_DIR / "X.npy", X)
np.save(ARTIFACTS_DIR / "y.npy", y)

features_df.to_csv(ARTIFACTS_DIR / "features.csv", index=False)

language_counts = dict(pd.Series(y).value_counts())
{
    "X_shape": X.shape,
    "num_samples": int(len(y)),
    "languages": language_counts
}


{'X_shape': (1440, 952),
 'num_samples': 1440,
 'languages': {'Italian': np.int64(360),
  'German': np.int64(360),
  'Korean': np.int64(360),
  'Spanish': np.int64(360)}}

## 7) Quick sanity checks

We check:
- class balance
- feature NaNs
- basic per-language counts


In [None]:
# Clip counts per (variant, language); missing combinations show as 0.
variant_language_sizes = features_df.groupby(["variant", "language"]).size()
balance = variant_language_sizes.unstack(fill_value=0)

# Non-finite entries in the feature matrix would break most models.
nan_count = int(np.isnan(X).sum())
inf_count = int(np.isinf(X).sum())

balance, {"nan_count": nan_count, "inf_count": inf_count}


(language  German  Italian  Korean  Spanish
 variant                                   
 aug          180      180     180      180
 clean        180      180     180      180,
 {'nan_count': 0, 'inf_count': 0})