In [2]:
from IPython.utils import io
from IPython import get_ipython
# Execute the preprocessing notebook inside this kernel while collecting
# anything it prints or displays in `captured` instead of showing it here.
# NOTE(review): presumably this is where `df` used further down is defined — confirm.
_shell = get_ipython()
with io.capture_output() as captured:
    _shell.run_line_magic('run', '1.Preprocessing.ipynb')

# Imports

In [3]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Config

In [4]:
# Project-local helper imported from the `config` module.
from config import run_config

# Called with no arguments and its return value is discarded.
# NOTE(review): presumably applies notebook-wide settings as a side effect — confirm in config.py.
run_config()

# Feature Extraction

## Enhanced Feature Extraction Pipeline (Traditional + Prosodic)

In [14]:
def extract_features(y, sr=16000, n_mfcc=20, n_fft=2048, hop_length=512):
    """
    Summarise an audio signal as one fixed-length vector of frame-averaged features.

    Args:
        y (numpy.ndarray): Audio time series.
        sr (int): Sampling rate of `y`.
        n_mfcc (int): Number of MFCCs to compute.
        n_fft (int): Analysis window length in samples. Used as the FFT size
            for the MFCC, chroma and spectral-contrast features and as the
            frame length for RMS energy and zero-crossing rate.
        hop_length (int): Number of samples between successive frames for
            those same features.

    Returns:
        numpy.ndarray: 1-D vector of per-feature means over time, laid out as
            [mfcc, delta, delta-delta, chroma, spectral contrast, tonnetz,
            rms, zcr]. The first three groups have `n_mfcc` entries each.
    """
    # Framing arguments shared by the STFT-based features. The defaults match
    # librosa's own (2048 / 512), so callers relying on defaults are unaffected.
    stft_kwargs = {"n_fft": n_fft, "hop_length": hop_length}

    # MFCCs plus first- and second-order deltas (extra kwargs are forwarded by
    # librosa to the underlying mel spectrogram).
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, **stft_kwargs)
    delta = librosa.feature.delta(mfcc)
    delta2 = librosa.feature.delta(mfcc, order=2)

    # Chroma
    chroma = librosa.feature.chroma_stft(y=y, sr=sr, **stft_kwargs)

    # Spectral Contrast
    contrast = librosa.feature.spectral_contrast(y=y, sr=sr, **stft_kwargs)

    # Tonnetz, computed on the harmonic component only. It is derived from a
    # constant-Q chromagram rather than an STFT, so `n_fft` does not apply and
    # the framing is left at librosa's defaults. Only the time-average is
    # used below, so a different frame grid does not affect concatenation.
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=sr)

    # Energy & ZCR use the same window/hop, expressed as frame_length.
    rmse = librosa.feature.rms(y=y, frame_length=n_fft, hop_length=hop_length)
    zcr = librosa.feature.zero_crossing_rate(
        y, frame_length=n_fft, hop_length=hop_length
    )

    # Collapse the time axis of every feature matrix and concatenate.
    features = np.concatenate([
        mfcc.mean(axis=1),
        delta.mean(axis=1),
        delta2.mean(axis=1),
        chroma.mean(axis=1),
        contrast.mean(axis=1),
        tonnetz.mean(axis=1),
        [rmse.mean()],
        [zcr.mean()]
    ])

    return features


## `wav2vec` for Age & Gender Classification

In [6]:
# NOTE: every line of this cell is commented out, so running it defines nothing.
# import torchaudio
# import torch
# from transformers import Wav2Vec2Model, Wav2Vec2Processor

# # Load pretrained model (wav2vec base or large)
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# def extract_wav2vec_features(y, sr=16000):
#     # Resample to 16kHz if necessary
#     if sr != 16000:
#         y = librosa.resample(y, orig_sr=sr, target_sr=16000)
    
#     input_values = processor(y, return_tensors="pt", sampling_rate=16000).input_values
#     with torch.no_grad():
#         embeddings = model(input_values).last_hidden_state

#     # Mean pooling across time dimension
#     features = embeddings.mean(dim=1).squeeze().numpy()
#     return features

In [15]:
# Sanity-check the extractor on the first recording. `.iloc[0]` selects by
# position, so this works even when the DataFrame index does not contain the
# label 0 (e.g. after filtering or shuffling rows).
tmp = extract_features(df['audio_data'].iloc[0], n_mfcc=40)
tmp.shape, tmp[:10]  # Check the shape and first 10 values of the feature vector

((147,),
 array([-350.35125732,   20.87301636,   43.61044312,   10.20368767,
          -8.32745934,   -1.96336699,  -30.76938057,   -8.0605526 ,
         -10.44402122,   -2.19611549]))

# Extract features

In [None]:
# Compute one feature vector per recording (with a progress bar) and store it
# alongside the raw audio in a new `features` column.
df['features'] = df['audio_data'].progress_apply(
    lambda signal: extract_features(y=signal, sr=16000, n_mfcc=40)
)

# Stack the per-row vectors into a 2-D design matrix; labels stay as stored.
X = np.stack(df['features'].tolist())
y = df['label'].values

100%|██████████| 489/489 [00:03<00:00, 144.22it/s]


In [11]:
# Train/val/test split, done in two stages with a fixed seed.
# Stage 1: 70% of the samples go to training, the remaining 30% are set aside.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)
# Stage 2: the set-aside 30% is split 70/30 again into validation and test,
# i.e. roughly 21% and 9% of the full dataset respectively.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.3,
    random_state=0,
)