In [0]:
import boto3


import os

# Credentials and locations are read from the environment so that secrets are
# never stored in the notebook. Any value left unset stays "" and is passed to
# boto3 as None, which lets boto3 fall back to its default credential/region
# resolution instead of signing requests with blank keys.
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY_ID", "")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_ACCESS_KEY", "")
AWS_REGION = os.environ.get("AWS_DEFAULT_REGION", "")
S3_BUCKET = os.environ.get("S3_BUCKET", "")
AWS_SESSION_TOKEN = os.environ.get("AWS_SESSION_TOKEN", "")

# Shared S3 client used by the cells below. AWS_REGION is now forwarded to the
# client (previously it was defined but never used).
s3_client = boto3.client(
    "s3",
    aws_access_key_id=AWS_ACCESS_KEY or None,
    aws_secret_access_key=AWS_SECRET_KEY or None,
    aws_session_token=AWS_SESSION_TOKEN or None,
    region_name=AWS_REGION or None,
)


✅ Cliente S3 configurado
   Bucket: amzn-s3-maia-mesd-2026


In [0]:

# Key prefix under which the audio files live ("" means the whole bucket).
S3_PREFIX = ""

# Smoke test: request at most 10 objects under the prefix and show the .wav keys.
response = s3_client.list_objects_v2(Bucket=S3_BUCKET, Prefix=S3_PREFIX, MaxKeys=10)

if 'Contents' not in response:
    print("No se encontraron archivos")
else:
    print("Archivos encontrados:")
    wav_keys = [entry['Key'] for entry in response['Contents'] if entry['Key'].endswith('.wav')]
    for wav_key in wav_keys:
        print(f"  - {wav_key}")

✅ S3 funciona! Archivos encontrados:
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_abajo.wav
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_adios.wav
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_antes.wav
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_arriba.wav
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_ayer.wav
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_basta_ya.wav
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_de_nada.wav
  - raw/all-wavs/MexicanEmotionalSpeechDatabase/Anger_C_A_delante.wav


In [0]:
import mlflow
import mlflow.sklearn
import warnings
import numpy as np
import librosa
import random
from collections import Counter
import io
import soundfile as sf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    f1_score, 
    accuracy_score,
    classification_report,
    precision_recall_fscore_support,
    confusion_matrix,
    ConfusionMatrixDisplay
)
import matplotlib.pyplot as plt

# Silence warnings for cleaner console output.
# NOTE: with no category/module filter this ignores *every* Python warning for the
# rest of the session, not only the ones raised by librosa and sklearn.
warnings.filterwarnings('ignore')

# ============================================================================
# MLFLOW CONFIGURATION
# ============================================================================
# Experiment that all runs started below are recorded under.
# NOTE(review): the path is tied to a single user's workspace folder; consider
# making it configurable before sharing the notebook.
EXPERIMENT_NAME = "/Users/tique.yessicaadriana@gmail.com/emotion-s3-improved"
mlflow.set_experiment(EXPERIMENT_NAME)

# NOTE: Ensure s3_client, S3_BUCKET, and S3_PREFIX are properly initialized 
# before executing the pipeline.

# ============================================================================
# S3 UTILITIES
# ============================================================================

def load_audio_from_s3(bucket, key, sr=16000):
    """
    Downloads one audio object from S3 and decodes it into a float32 numpy array.

    Args:
        bucket: Name of the S3 bucket holding the object.
        key: Object key of the audio file.
        sr: Target sample rate; pass None to keep the file's own rate.

    Returns:
        The mono waveform, resampled to `sr` when it differs from the file's rate.
    """
    s3_object = s3_client.get_object(Bucket=bucket, Key=key)
    payload = io.BytesIO(s3_object['Body'].read())
    waveform, native_sr = sf.read(payload, dtype='float32')

    # Multi-channel audio is collapsed to mono by averaging the channels
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Nothing more to do when no target rate is requested or the rates already match
    if sr is None or native_sr == sr:
        return waveform

    return librosa.resample(waveform, orig_sr=native_sr, target_sr=sr)

def list_all_wav_files(bucket, prefix=""):
    """
    Lists every object under `prefix` in `bucket` whose key ends with '.wav'.

    Follows the list_objects_v2 continuation token until the listing is no
    longer truncated.

    Returns:
        A list of the object entries (dicts as returned by S3, each with a 'Key').
    """
    collected = []
    token = None

    while True:
        # Only send a continuation token once a previous page has supplied one
        paging = {"ContinuationToken": token} if token else {}
        page = s3_client.list_objects_v2(Bucket=bucket, Prefix=prefix, **paging)

        collected.extend(
            entry for entry in page.get('Contents', [])
            if entry['Key'].endswith('.wav')
        )

        if not page.get('IsTruncated'):
            return collected

        token = page['NextContinuationToken']

# ============================================================================
# FEATURE ENGINEERING & AUGMENTATION
# ============================================================================

def augment_audio(audio, sr=16000):
    """
    Builds augmented variants of a waveform.

    Returns:
        A list of 5 waveforms, in this order: the original, pitch shifted by +2
        steps, pitch shifted by -2 steps, time stretched at rate 1.1, and time
        stretched at rate 0.9.
    """
    pitch_up = librosa.effects.pitch_shift(audio, sr=sr, n_steps=2)
    pitch_down = librosa.effects.pitch_shift(audio, sr=sr, n_steps=-2)
    stretched_fast = librosa.effects.time_stretch(audio, rate=1.1)
    stretched_slow = librosa.effects.time_stretch(audio, rate=0.9)

    return [audio, pitch_up, pitch_down, stretched_fast, stretched_slow]

def extract_features_audio(y, sr=16000):
    """
    Turns an audio signal into a 288-dimensional float32 feature vector.

    Layout of the vector (in order):
        - 240: mean/std/min/max per coefficient of 20 MFCCs, their deltas and delta-deltas
        -  24: mean/std per chroma bin
        -  12: mean/std/min/max of spectral centroid, bandwidth and rolloff
        -   4: mean/std/min/max of the zero crossing rate
        -   4: mean/std/min/max of the RMS energy
        -   4: mean/std/min/max of the voiced pitch (zeros when no pitch is found)

    Returns:
        The feature vector, or None (after printing an error) if extraction fails.
    """
    def four_stats(values):
        """Global mean, std, min and max of an array as plain floats."""
        return [float(np.mean(values)), float(np.std(values)),
                float(np.min(values)), float(np.max(values))]

    try:
        n_mfcc, hop, n_fft = 20, 512, 2048
        min_samples = int(1.0 * sr)  # minimum duration of 1.0 second

        # Drop leading/trailing silence
        signal, _ = librosa.effects.trim(y, top_db=20)

        # Zero-pad at the end when the trimmed signal is shorter than the minimum
        shortfall = min_samples - len(signal)
        if shortfall > 0:
            signal = np.pad(signal, (0, shortfall), mode="constant")

        # Scale so the largest absolute sample is 1 (skipped for all-zero input)
        peak = np.max(np.abs(signal))
        if peak > 0:
            signal = signal / peak

        features = []

        # 1. MFCCs with first and second order deltas: per-coefficient statistics
        mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc, hop_length=hop, n_fft=n_fft)
        mfcc_d1 = librosa.feature.delta(mfcc)
        mfcc_d2 = librosa.feature.delta(mfcc, order=2)

        for matrix in (mfcc, mfcc_d1, mfcc_d2):
            for reducer in (np.mean, np.std, np.min, np.max):
                features.extend(reducer(matrix, axis=1))

        # 2. Chroma STFT: per-bin mean and std
        chroma = librosa.feature.chroma_stft(y=signal, sr=sr, hop_length=hop, n_fft=n_fft)
        for reducer in (np.mean, np.std):
            features.extend(reducer(chroma, axis=1))

        # 3. Spectral descriptors: global statistics of each track
        spectral_tracks = (
            librosa.feature.spectral_centroid(y=signal, sr=sr, hop_length=hop),
            librosa.feature.spectral_bandwidth(y=signal, sr=sr, hop_length=hop),
            librosa.feature.spectral_rolloff(y=signal, sr=sr, hop_length=hop),
        )
        for track in spectral_tracks:
            features.extend(four_stats(track))

        # 4. Zero Crossing Rate (ZCR)
        features.extend(four_stats(librosa.feature.zero_crossing_rate(signal, hop_length=hop)))

        # 5. Root Mean Square (RMS) energy
        features.extend(four_stats(librosa.feature.rms(y=signal, hop_length=hop)))

        # 6. Fundamental frequency: statistics over frames with a pitch estimate;
        #    zeros when there are none or when pitch tracking itself fails
        pitch_stats = [0.0, 0.0, 0.0, 0.0]
        try:
            f0, _, _ = librosa.pyin(signal, fmin=librosa.note_to_hz('C2'),
                                    fmax=librosa.note_to_hz('C7'), sr=sr)
            voiced_f0 = f0[~np.isnan(f0)]
            if len(voiced_f0) > 0:
                pitch_stats = four_stats(voiced_f0)
        except Exception:
            pitch_stats = [0.0, 0.0, 0.0, 0.0]
        features.extend(pitch_stats)

        return np.array(features, dtype=np.float32)

    except Exception as e:
        print(f"[ERROR] Feature extraction failed: {e}")
        return None

# ============================================================================
# DATASET GENERATION
# ============================================================================

def key_to_label(key: str) -> str:
    """
    Derives the emotion label from an S3 key.

    The label is the part of the file name (text after the last '/') that
    precedes the first underscore; the whole file name when it has none.
    """
    _, _, filename = key.rpartition("/")
    label, _, _ = filename.partition("_")
    return label

def build_xy_from_keys(bucket, keys, use_aug=False, sr=16000):
    """
    Downloads each audio file, extracts its features and collects them with labels.

    Args:
        bucket: S3 bucket the keys belong to.
        keys: Sequence of S3 object keys to process.
        use_aug: When True, every file contributes all variants returned by
            augment_audio (typically for the training set); otherwise only the
            original audio is used.
        sr: Sample rate used for loading and feature extraction.

    Returns:
        (X, y): X is a numpy array with one feature vector per successfully
        processed audio variant, y is the list of matching label strings.
        Files that raise an error are reported and skipped; variants whose
        feature extraction returns None are dropped.
    """
    X_list, y_list = [], []
    total_keys = len(keys)

    for i, key in enumerate(keys):
        try:
            audio = load_audio_from_s3(bucket, key, sr=sr)
            emotion = key_to_label(key)

            # Apply augmentation only if specified (typically for training set)
            audios = augment_audio(audio, sr=sr) if use_aug else [audio]

            for a in audios:
                feat = extract_features_audio(a, sr=sr)
                if feat is not None:
                    X_list.append(feat)
                    y_list.append(emotion)

        except Exception as e:
            print(f"[WARNING] Skipping file {key} due to error: {e}")

        # Progress logging lives outside the try block so that a failing file on a
        # reporting boundary (or the very last file) still produces its progress line.
        if (i + 1) % 50 == 0 or (i + 1) == total_keys:
            print(f"[INFO] Processing progress: {i+1}/{total_keys} files")

    return np.array(X_list), y_list

# ============================================================================
# MLFLOW LOGGING & EVALUATION
# ============================================================================

def log_metrics_block(y_train, y_pred_train, y_val, y_pred_val, le, model_tag="model"):
    """
    Computes classification metrics and logs them, a text report and a confusion
    matrix image to the active MLflow run.

    Args:
        y_train, y_pred_train: True and predicted encoded labels for the training set.
        y_val, y_pred_val: True and predicted encoded labels for the validation set.
        le: Fitted LabelEncoder; its classes_ provide the class names and the
            full set of encoded labels used for the report and the matrix.
        model_tag: Prefix for the artifact file names.

    Returns:
        (train_f1, val_f1): macro-averaged F1 on the training and validation sets.
    """
    import os
    import tempfile

    # Calculate global metrics
    train_acc = accuracy_score(y_train, y_pred_train)
    val_acc   = accuracy_score(y_val, y_pred_val)
    train_f1  = f1_score(y_train, y_pred_train, average='macro')
    val_f1    = f1_score(y_val, y_pred_val, average='macro')

    # Log global metrics
    mlflow.log_metric("train_acc", float(train_acc))
    mlflow.log_metric("val_acc", float(val_acc))
    mlflow.log_metric("train_f1_macro", float(train_f1))
    mlflow.log_metric("val_f1_macro", float(val_f1))
    mlflow.log_metric("gap_f1", float(train_f1 - val_f1))

    # Pin the label set to every class known to the encoder. Without this the
    # report and the confusion matrix only cover labels that happen to occur in
    # y_val / y_pred_val, which no longer lines up with le.classes_ when a class
    # is missing from the validation split.
    class_ids = le.transform(le.classes_)

    # Artifacts are written to a private temporary directory (portable, and safe
    # against clashes between concurrent runs); the file names are unchanged.
    with tempfile.TemporaryDirectory() as artifact_dir:
        # Generate and log classification report artifact
        report = classification_report(
            y_val, y_pred_val,
            labels=class_ids, target_names=le.classes_,
            digits=4, zero_division=0
        )
        report_path = os.path.join(artifact_dir, f"{model_tag}_classification_report.txt")
        with open(report_path, "w", encoding="utf-8") as f:
            f.write(report)
        mlflow.log_artifact(report_path)

        # Generate and log confusion matrix artifact
        cm = confusion_matrix(y_val, y_pred_val, labels=class_ids)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=le.classes_)
        cm_path = os.path.join(artifact_dir, f"{model_tag}_confusion_matrix.png")
        fig, ax = plt.subplots(figsize=(8,6))
        try:
            disp.plot(ax=ax, xticks_rotation=45)
            fig.tight_layout()
            fig.savefig(cm_path)
        finally:
            # Always release the figure, even if plotting or saving fails
            plt.close(fig)
        mlflow.log_artifact(cm_path)

    return train_f1, val_f1

# ============================================================================
# PIPELINE EXECUTION
# ============================================================================

def _train_and_log_model(run_name, title, short_name, pipeline, params, model_tag,
                         X_train, y_train, X_val, y_val, le):
    """Fits one pipeline inside a nested MLflow run and logs its params, metrics and model."""
    with mlflow.start_run(run_name=run_name, nested=True):
        print("\n" + "-"*60)
        print(f"[INFO] Training Model: {title}")
        print("-"*60)

        mlflow.log_params(params)

        pipeline.fit(X_train, y_train)
        train_f1, val_f1 = log_metrics_block(
            y_train, pipeline.predict(X_train),
            y_val, pipeline.predict(X_val),
            le, model_tag=model_tag
        )
        mlflow.sklearn.log_model(pipeline, "model")

        print(f"[INFO] {short_name} Training Completed.")
        print(f"[INFO] Train F1 (Macro): {train_f1:.4f} | Val F1 (Macro): {val_f1:.4f}")

    return train_f1, val_f1


def execute_training_pipeline(max_files=None, test_size=0.15, random_state=42):
    """
    Executes the comprehensive ML pipeline:
    - Data splitting (stratified, by file)
    - Feature extraction (with data augmentation for training only)
    - Model training (SVM, Highly Regularized Random Forest, Highly Regularized XGBoost)
    - MLflow logging (one parent run with one nested run per model)

    Reads the module-level S3_BUCKET and S3_PREFIX (and, through the helpers,
    s3_client), which must be defined before calling.

    Args:
        max_files: Optional cap on the number of audio files used (None/0 = all).
        test_size: Fraction of files held out for validation.
        random_state: Seed for the file shuffle, the split and the models. An int
            makes the whole run reproducible.

    Returns:
        True when the pipeline ran, None when it aborted (no files, a single
        class, or no features could be extracted).
    """

    # ------------------------------------------------------------------------
    # 1. DATA COLLECTION & SPLITTING
    # ------------------------------------------------------------------------
    print("[INFO] Initiating S3 file listing...")
    wav_files = list_all_wav_files(S3_BUCKET, S3_PREFIX)

    # Sort first, then shuffle with a seeded generator: the subset picked by
    # max_files and the resulting split are then reproducible for a given
    # random_state instead of depending on the global random module state.
    keys = sorted(o["Key"] for o in wav_files)
    shuffle_seed = random_state if isinstance(random_state, int) else None
    random.Random(shuffle_seed).shuffle(keys)

    if max_files:
        keys = keys[:max_files]

    labels = [key_to_label(k) for k in keys]

    print(f"[INFO] Total audio files located: {len(keys)}")
    print(f"[INFO] Class distribution: {dict(Counter(labels))}")

    if not keys:
        print("[ERROR] No .wav files found under the configured bucket/prefix. Training aborted.")
        return None

    # Encode labels (this single encoder is reused for train and validation)
    le = LabelEncoder()
    y_keys = le.fit_transform(labels)

    # Abort before the expensive feature extraction if there is nothing to classify
    if len(le.classes_) <= 1:
        print("[ERROR] Dataset contains only one class. Training aborted.")
        return None

    # Stratified split to preserve class distribution
    train_keys, val_keys, _, _ = train_test_split(
        keys, y_keys, test_size=test_size, stratify=y_keys, random_state=random_state
    )
    print(f"[INFO] Data split successful. Train subset: {len(train_keys)}, Validation subset: {len(val_keys)}")

    # ------------------------------------------------------------------------
    # 2. FEATURE EXTRACTION
    # ------------------------------------------------------------------------
    print("\n[INFO] Extracting features for TRAIN set (Data Augmentation Enabled)...")
    X_train, y_train_str = build_xy_from_keys(S3_BUCKET, train_keys, use_aug=True, sr=16000)

    print("\n[INFO] Extracting features for VALIDATION set (Data Augmentation Disabled)...")
    X_val, y_val_str = build_xy_from_keys(S3_BUCKET, val_keys, use_aug=False, sr=16000)

    # Every file may have been skipped (download or extraction errors)
    if len(X_train) == 0 or len(X_val) == 0:
        print("[ERROR] No features could be extracted for the train or validation set. Training aborted.")
        return None

    # Consistent label encoding across splits
    y_train = le.transform(y_train_str)
    y_val   = le.transform(y_val_str)

    print("\n[INFO] Feature Engineering Completed.")
    print(f"[INFO] X_train shape: {X_train.shape}")
    print(f"[INFO] X_val shape:   {X_val.shape}")
    print(f"[INFO] Detected Classes: {list(le.classes_)}")

    # ------------------------------------------------------------------------
    # 3. MLFLOW MODEL TRACKING
    # ------------------------------------------------------------------------
    evaluated_models = []  # display names of the models that actually finished
    data = (X_train, y_train, X_val, y_val, le)

    with mlflow.start_run(run_name="Classical_ML_Pipeline"):

        # Log global pipeline parameters
        mlflow.log_params({
            "split_strategy": "by_file",
            "augmentation": "train_only_5x",
            "max_files": str(max_files),
            "test_size": test_size,
            "random_state": random_state,
            "train_samples_augmented": int(len(X_train)),
            "val_samples_original": int(len(X_val)),
        })

        # --- MODEL 1: Support Vector Machine (Baseline) ---
        svm_pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('pca', PCA(n_components=0.80, random_state=random_state)),
            ('svm', SVC(C=0.676, gamma=0.00240, kernel='rbf', class_weight='balanced',
                        random_state=random_state))
        ])
        _train_and_log_model(
            "SVM_RBF", "Support Vector Machine (RBF)", "SVM", svm_pipeline,
            {
                "model": "SVC",
                "pca_variance_retained": 0.80,
                "svm_C": 0.676,
                "svm_gamma": 0.00240,
                "svm_kernel": "rbf",
                "svm_class_weight": "balanced",
            },
            "SVM_RBF", *data
        )
        evaluated_models.append("Support Vector Machine (Baseline)")

        # --- MODEL 2: Highly Regularized Random Forest ---
        rf_pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('rf', RandomForestClassifier(
                n_estimators=500,
                max_depth=6,              # Aggressive depth limit
                min_samples_split=60,     # Only split nodes that hold many samples
                min_samples_leaf=20,      # Enforces generalized leaf nodes
                max_features=0.1,         # Forces feature diversity across trees
                max_samples=0.6,          # Subsampling to prevent overfitting
                bootstrap=True,
                class_weight='balanced',
                random_state=random_state,
                n_jobs=-1
            ))
        ])
        _train_and_log_model(
            "RF_Highly_Regularized", "Random Forest (Highly Regularized)", "Random Forest", rf_pipeline,
            {
                "model": "RandomForest_Regularized",
                "rf_n_estimators": 500,
                "rf_max_depth": 6,
                "rf_min_samples_split": 60,
                "rf_min_samples_leaf": 20,
                "rf_max_features": 0.1,
                "rf_max_samples": 0.6,
            },
            "RF_Regularized", *data
        )
        evaluated_models.append("Random Forest (Highly Regularized)")

        # --- MODEL 3: Highly Regularized XGBoost ---
        # XGBoost is optional: a missing library or a training failure is reported
        # and the pipeline continues.
        try:
            from xgboost import XGBClassifier

            xgb_pipeline = Pipeline([
                ('scaler', StandardScaler()),
                ('pca', PCA(n_components=0.75, random_state=random_state)),
                ('xgb', XGBClassifier(
                    n_estimators=200,
                    learning_rate=0.03,        # Slower learning rate for stability
                    max_depth=3,               # Shallow trees
                    min_child_weight=20,       # High minimum weight to prevent specific splits
                    subsample=0.6,
                    colsample_bytree=0.5,
                    gamma=3.0,                 # High penalty for new node creation
                    reg_alpha=2.0,             # L1 regularization
                    reg_lambda=10.0,           # L2 regularization
                    objective="multi:softprob",
                    eval_metric="mlogloss",
                    random_state=random_state,
                    n_jobs=-1
                ))
            ])
            _train_and_log_model(
                "XGBoost_Highly_Regularized", "XGBoost (Highly Regularized)", "XGBoost", xgb_pipeline,
                {
                    "model": "XGBoost_Regularized",
                    "pca_variance_retained": 0.75,
                    "xgb_n_estimators": 200,
                    "xgb_learning_rate": 0.03,
                    "xgb_max_depth": 3,
                    "xgb_min_child_weight": 20,
                    "xgb_subsample": 0.6,
                    "xgb_colsample_bytree": 0.5,
                    "xgb_gamma": 3.0,
                    "xgb_reg_alpha": 2.0,
                    "xgb_reg_lambda": 10.0,
                },
                "XGBoost_Regularized", *data
            )
            evaluated_models.append("XGBoost (Highly Regularized)")

        except ImportError:
            print("\n[WARNING] XGBoost library not found. Skipping XGBoost model training.")
        except Exception as e:
            print(f"\n[ERROR] XGBoost training failed: {e}")

        # --------------------------------------------------------------------
        # 4. SUMMARY
        # --------------------------------------------------------------------
        print("\n" + "="*60)
        print("[INFO] PIPELINE EXECUTION COMPLETED")
        print("="*60)
        print("[INFO] Evaluated Models:")
        # Only list the models that were really trained (XGBoost may have been skipped)
        for model_name in evaluated_models:
            print(f"       - {model_name}")
        print("[INFO] Detailed metrics and artifacts are available in the MLflow UI.")

    return True

# ============================================================================
# ENTRY POINT
# ============================================================================
if __name__ == "__main__":
    # Run the pipeline on every listed file (max_files=None applies no cap).
    # S3_BUCKET and S3_PREFIX (and the s3_client used by the S3 helpers) must be
    # defined globally before this executes.
    execute_training_pipeline(max_files=None)

[INFO] Initiating S3 file listing...
[INFO] Total audio files located: 862
[INFO] Class distribution: {'Fear': 144, 'Anger': 143, 'Sadness': 144, 'Neutral': 143, 'Happiness': 144, 'Disgust': 144}
[INFO] Data split successful. Train subset: 732, Validation subset: 130

[INFO] Extracting features for TRAIN set (Data Augmentation Enabled)...
[INFO] Processing progress: 50/732 files
[INFO] Processing progress: 100/732 files
[INFO] Processing progress: 150/732 files
[INFO] Processing progress: 200/732 files
[INFO] Processing progress: 250/732 files
[INFO] Processing progress: 300/732 files
[INFO] Processing progress: 350/732 files
[INFO] Processing progress: 400/732 files
[INFO] Processing progress: 450/732 files
[INFO] Processing progress: 500/732 files
[INFO] Processing progress: 550/732 files
[INFO] Processing progress: 600/732 files
[INFO] Processing progress: 650/732 files
[INFO] Processing progress: 700/732 files
[INFO] Processing progress: 732/732 files

[INFO] Extracting features fo



[INFO] SVM Training Completed.
[INFO] Train F1 (Macro): 0.8636 | Val F1 (Macro): 0.7870

------------------------------------------------------------
[INFO] Training Model: Random Forest (Highly Regularized)
------------------------------------------------------------




[INFO] Random Forest Training Completed.
[INFO] Train F1 (Macro): 0.7167 | Val F1 (Macro): 0.6469

------------------------------------------------------------
[INFO] Training Model: XGBoost (Highly Regularized)
------------------------------------------------------------




[INFO] XGBoost Training Completed.
[INFO] Train F1 (Macro): 0.7386 | Val F1 (Macro): 0.5932

[INFO] PIPELINE EXECUTION COMPLETED
[INFO] Evaluated Models:
       - Support Vector Machine (Baseline)
       - Random Forest (Highly Regularized)
       - XGBoost (Highly Regularized)
[INFO] Detailed metrics and artifacts are available in the MLflow UI.
