# Phase 3: The Audio Gauntlet (MIMII Sound Dataset)

This notebook processes raw audio signal data from the MIMII sound dataset to build supervised anomaly detection models for industrial equipment.

## Objectives:
1. Load and process MIMII industrial machine sound data (.wav files)
2. Extract MFCC (Mel-Frequency Cepstral Coefficients) features from audio signals
3. Train supervised classification models to detect anomalous machine sounds
4. Register models in MLflow for production deployment

## Dataset: MIMII Sound Dataset
- **Source**: MIMII Dataset from Hitachi, Ltd. and Tokyo Institute of Technology
- **Type**: Industrial machine sound recordings (valve, pump, fan, slider)
- **Structure**: Normal and abnormal sound samples organized by machine type and ID
- **Format**: .wav files sampled at 16 kHz, recorded under various noise conditions
- **Gauntlet Focus**: 6dB SNR valve and pump sounds

## Methodology:
- **Feature Extraction**: 40 MFCCs per audio file, averaged over time (only the first 10 seconds of each file are read)
- **Model**: RandomForest classifier for normal vs abnormal classification
- **Evaluation**: Classification metrics focused on anomaly detection performance

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import mlflow
import librosa
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import os
import glob
from tqdm import tqdm
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Suppress UserWarnings raised from librosa modules
warnings.filterwarnings('ignore', category=UserWarning, module='librosa')

# MLflow configuration: use the `mlflow` host when DOCKER_ENV is "true",
# otherwise talk to a tracking server on localhost.
if os.getenv("DOCKER_ENV") == "true":
    tracking_uri = "http://mlflow:5000"
else:
    tracking_uri = "http://localhost:5000"

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("Audio Gauntlet (MIMII)")

print(f"MLflow tracking URI set to: {mlflow.get_tracking_uri()}")
print(f"Current working directory: {os.getcwd()}")

# Ensure output directory exists
os.makedirs('docs/ml', exist_ok=True)

In [None]:
# --- Debug: Check Dataset Accessibility ---
# Walks data/MIMII_sound_dataset/<machine_dir>/<machine_type>/id_*/normal and
# reports, level by level, what exists. Purely diagnostic: it only prints.

print("=== Dataset Accessibility Check ===")
print(f"Current working directory: {os.getcwd()}")
print(f"Contents of current directory: {os.listdir('.')}")

if not os.path.exists('data'):
    print(f"❌ 'data' directory not found")
else:
    print(f"✅ 'data' directory exists")
    print(f"Contents of data directory: {os.listdir('data')}")

    # Check MIMII dataset specifically
    mimii_path = 'data/MIMII_sound_dataset'
    if not os.path.exists(mimii_path):
        print(f"❌ MIMII dataset directory not found at: {mimii_path}")
    else:
        print(f"✅ MIMII dataset directory exists at: {mimii_path}")
        print(f"Contents: {os.listdir(mimii_path)}")

        # Check for the specific machine types we need
        for machine_dir in ('6_dB_valve', '6_dB_pump'):
            machine_path = os.path.join(mimii_path, machine_dir)
            if not os.path.exists(machine_path):
                print(f"❌ {machine_dir} directory not found")
                continue

            print(f"✅ {machine_dir} directory exists")
            print(f"Contents: {os.listdir(machine_path)}")

            # Check deeper structure: the machine type is the last
            # underscore-separated token of the directory name
            machine_type = machine_dir.rsplit('_', 1)[-1]
            full_path = os.path.join(machine_path, machine_type)
            if not os.path.exists(full_path):
                print(f"❌ Machine type directory not found: {full_path}")
                continue

            print(f"✅ {full_path} exists")
            id_dirs = [entry for entry in os.listdir(full_path) if entry.startswith('id_')]
            print(f"Found ID directories: {id_dirs}")

            # Check for a sample audio file (first listed ID only)
            if not id_dirs:
                continue

            sample_id = id_dirs[0]
            normal_path = os.path.join(full_path, sample_id, 'normal')
            if not os.path.exists(normal_path):
                print(f"❌ Normal directory not found: {normal_path}")
                continue

            wav_files = [name for name in os.listdir(normal_path) if name.endswith('.wav')]
            print(f"Sample normal audio files in {sample_id}: {len(wav_files)} files")
            if wav_files:
                print(f"First few files: {wav_files[:3]}")

print("=== End Debug Check ===\n")

In [None]:
# --- 1. Audio Feature Extraction ---

def extract_features(file_path, n_mfcc=40, sr=22050, duration=10.0,
                     res_type='kaiser_fast'):
    """
    Loads an audio file and extracts time-averaged MFCC features.

    The defaults reproduce the previous hard-coded behaviour, so existing
    single-argument callers get the same feature vectors as before.

    Args:
        file_path (str): Path to the .wav audio file
        n_mfcc (int): Number of MFCC coefficients to compute
        sr (int | None): Target sample rate handed to librosa.load. 22050 is
            librosa's own default; pass None to keep the file's native rate
            and skip resampling.
        duration (float | None): Only the first `duration` seconds are read
        res_type (str): Resampling method handed to librosa.load

    Returns:
        np.array: Array of `n_mfcc` MFCC features (mean across time), or
        None if the file could not be processed, contained no samples, or
        produced non-finite feature values.
    """
    try:
        # Load audio file with librosa
        audio, sample_rate = librosa.load(
            file_path, sr=sr, res_type=res_type, duration=duration
        )

        # An empty clip has no frames to average; reject it explicitly
        # instead of letting a NaN vector reach the feature table.
        if audio.size == 0:
            print(f"Error processing {file_path}: file contains no audio samples")
            return None

        # Extract MFCC coefficients
        mfccs = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

        # Take mean across time dimension to get fixed-size feature vector
        mfccs_processed = np.mean(mfccs, axis=1)

        if not np.all(np.isfinite(mfccs_processed)):
            print(f"Error processing {file_path}: non-finite MFCC values")
            return None

        return mfccs_processed
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the
        # whole extraction run, so report it and let the caller skip it.
        print(f"Error processing {file_path}: {e}")
        return None

# Define the path to the dataset and initialize containers
data_path = 'data/MIMII_sound_dataset/'
machine_types = ['6_dB_valve', '6_dB_pump']
features_list = []
file_count = 0     # files that yielded a feature vector
failed_count = 0   # files for which extract_features returned None

# (sub-directory name, class label) pairs handled identically for every ID
conditions = [('normal', 0), ('abnormal', 1)]

print("Starting audio feature extraction from MIMII dataset...")
print(f"Target machine types: {machine_types}")

# Loop through the directory structure to process audio files
for machine_dir in machine_types:
    # Get machine type from directory name (e.g., 'valve')
    machine_type = machine_dir.split('_')[-1]
    machine_path = os.path.join(data_path, machine_dir, machine_type)

    print(f"\nProcessing {machine_type} sounds from: {machine_path}")

    # Use glob to find all id directories
    id_dirs = sorted(glob.glob(os.path.join(machine_path, 'id_*')))
    print(f"Found {len(id_dirs)} machine IDs for {machine_type}")

    for id_dir in id_dirs:
        machine_id = os.path.basename(id_dir)

        for condition, label in conditions:
            # glob returns files in filesystem order; sort so the row order
            # of the DataFrame (and therefore the seeded train/test split)
            # is the same on every machine and every run.
            wav_files = sorted(glob.glob(os.path.join(id_dir, condition, '*.wav')))
            print(f"Processing {len(wav_files)} {condition} files for {machine_id}...")

            for file_path in tqdm(wav_files, desc=f"{condition.capitalize()} {machine_id}"):
                features = extract_features(file_path)
                if features is None:
                    failed_count += 1
                    continue
                features_list.append([*features, label, machine_type, machine_id])
                file_count += 1

if not features_list:
    print(f"\n⚠️ No audio features were extracted - check that the dataset exists under {data_path}")

# Create DataFrame with proper column names
feature_columns = [f'mfcc_{i}' for i in range(40)]
df = pd.DataFrame(features_list, columns=feature_columns + ['label', 'machine_type', 'machine_id'])

print(f"\n=== Feature Extraction Complete ===")
print(f"Total audio files processed: {file_count}")
print(f"Files skipped due to errors: {failed_count}")
print(f"DataFrame shape: {df.shape}")
print(f"Class distribution:\n{df['label'].value_counts()}")
print(f"Machine type distribution:\n{df['machine_type'].value_counts()}")

# Display sample of extracted features
print(f"\nSample of extracted features:")
df.head()

In [None]:
# --- 2. Data Preparation and Analysis ---
# Summarises the extracted feature table, one-hot encodes the machine type,
# splits into train/test sets and standardises the features.

print("=== Dataset Analysis ===")

# Fail early with an actionable message instead of a KeyError further down
if df.empty:
    raise ValueError(
        "No samples available for modeling - the feature extraction step "
        "produced an empty DataFrame."
    )

# Class balance analysis (.get avoids a KeyError when a class is absent)
print("Class Distribution:")
class_counts = df['label'].value_counts()
normal_total = class_counts.get(0, 0)
abnormal_total = class_counts.get(1, 0)
print(f"Normal (0): {normal_total} samples ({normal_total/len(df)*100:.1f}%)")
print(f"Abnormal (1): {abnormal_total} samples ({abnormal_total/len(df)*100:.1f}%)")

# Machine type analysis
print(f"\nMachine Type Distribution:")
for machine_type in df['machine_type'].unique():
    subset = df[df['machine_type'] == machine_type]
    normal_count = (subset['label'] == 0).sum()
    abnormal_count = (subset['label'] == 1).sum()
    print(f"{machine_type}: {len(subset)} total ({normal_count} normal, {abnormal_count} abnormal)")

# One-hot encode categorical features
print(f"\nPreparing features for modeling...")
df_encoded = pd.get_dummies(df, columns=['machine_type'], drop_first=False)

# Separate features and target (machine_id is kept out of the feature matrix)
feature_cols = [col for col in df_encoded.columns if col not in ['label', 'machine_id']]
X = df_encoded[feature_cols]
y = df_encoded['label']

print(f"Feature matrix shape: {X.shape}")
print(f"Features: {feature_cols[:5]}... (+{len(feature_cols)-5} more)")

# Split data stratified by both label and machine type to ensure representative splits.
# The joint key keeps each (label, machine_type) combination at the same
# proportion in train and test.
strata = df['label'].astype(str) + '_' + df['machine_type'].astype(str)
if strata.value_counts().min() < 2:
    # train_test_split needs at least 2 members per stratum
    print("⚠️ A (label, machine_type) group has fewer than 2 samples; stratifying by label only")
    strata = y

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=strata
)

# Scale the features (statistics are fitted on the training split only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"\nTraining set: {X_train_scaled.shape[0]} samples")
print(f"Test set: {X_test_scaled.shape[0]} samples")
print(f"Training class distribution: {pd.Series(y_train).value_counts().to_dict()}")
print(f"Test class distribution: {pd.Series(y_test).value_counts().to_dict()}")

print("Data preparation complete! ✅")

In [None]:
# --- 3. Model Training and Evaluation ---
# Trains a RandomForest on the scaled features inside a single MLflow run,
# logs parameters / metrics / the classification report, and registers both
# the classifier and the fitted scaler.

with mlflow.start_run(run_name="RandomForest_Audio_Anomaly_MIMII"):
    print("=== 🎵 Audio Gauntlet: Training RandomForest on MFCC Features ===")

    # Log dataset and experiment metadata
    run_metadata = {
        "dataset": "MIMII_6dB_Valve_Pump",
        "feature_type": "MFCC_40_coefficients",
        "audio_duration_limit": "10_seconds",
        "total_samples": len(df),
        "train_samples": len(X_train),
        "test_samples": len(X_test),
        "feature_dimensions": X_train_scaled.shape[1],
    }
    for param_name, param_value in run_metadata.items():
        mlflow.log_param(param_name, param_value)

    # Log class distribution of the training split
    train_normal = (y_train == 0).sum()
    train_abnormal = (y_train == 1).sum()
    class_balance = {
        "train_normal_samples": train_normal,
        "train_abnormal_samples": train_abnormal,
        "class_imbalance_ratio": train_abnormal / train_normal,
    }
    for param_name, param_value in class_balance.items():
        mlflow.log_param(param_name, param_value)

    # Configure and train the model
    rf_settings = dict(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        class_weight='balanced',  # Handle class imbalance
        max_depth=10,
        min_samples_split=5,
    )
    model = RandomForestClassifier(**rf_settings)

    # Log model hyperparameters (the full set reported by the estimator)
    mlflow.log_param("model_type", "RandomForestClassifier")
    mlflow.log_params(model.get_params())

    print("Training model...")
    model.fit(X_train_scaled, y_train)

    # Make predictions
    print("Making predictions...")
    y_pred = model.predict(X_test_scaled)
    # Probability of abnormal class
    # NOTE(review): not used further in this cell - confirm whether a later cell needs it
    y_pred_proba = model.predict_proba(X_test_scaled)[:, 1]

    # Generate comprehensive evaluation metrics
    report = classification_report(y_test, y_pred, output_dict=True)
    normal_stats = report['0']
    abnormal_stats = report['1']

    # Extract key metrics
    accuracy = report['accuracy']
    precision_normal = normal_stats['precision']
    recall_normal = normal_stats['recall']
    f1_normal = normal_stats['f1-score']

    precision_abnormal = abnormal_stats['precision']
    recall_abnormal = abnormal_stats['recall']
    f1_abnormal = abnormal_stats['f1-score']

    macro_avg_f1 = report['macro avg']['f1-score']
    weighted_avg_f1 = report['weighted avg']['f1-score']

    # Log all metrics to MLflow
    run_metrics = {
        "accuracy": accuracy,
        "precision_normal": precision_normal,
        "recall_normal": recall_normal,
        "f1_score_normal": f1_normal,
        "precision_abnormal": precision_abnormal,
        "recall_abnormal": recall_abnormal,
        "f1_score_abnormal": f1_abnormal,
        "macro_avg_f1": macro_avg_f1,
        "weighted_avg_f1": weighted_avg_f1,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Log classification report as artifact
    mlflow.log_dict(report, "classification_report.json")

    # Feature importance analysis, most important first
    importance_table = pd.DataFrame({
        'feature': feature_cols,
        'importance': model.feature_importances_
    })
    feature_importance = importance_table.sort_values('importance', ascending=False)

    # Log top 10 most important features
    top_features = feature_importance.head(10)
    for rank, (_, row) in enumerate(top_features.iterrows(), start=1):
        mlflow.log_metric(f"feature_importance_rank_{rank}", row['importance'])
        mlflow.log_param(f"top_feature_{rank}", row['feature'])

    # Print results
    print(f"\n=== 🎯 Audio Anomaly Detection Results ===")
    print(f"Overall Accuracy: {accuracy:.4f}")
    print(f"Abnormal Detection (Recall): {recall_abnormal:.4f}")
    print(f"Abnormal Precision: {precision_abnormal:.4f}")
    print(f"Abnormal F1-Score: {f1_abnormal:.4f}")
    print(f"Macro Average F1: {macro_avg_f1:.4f}")

    print(f"\n📊 Top 5 Most Important Features:")
    for rank, (_, row) in enumerate(top_features.head(5).iterrows(), start=1):
        print(f"  {rank}. {row['feature']}: {row['importance']:.4f}")

    # Log and register the model, then the scaler it was trained behind
    # (the classifier was fitted on scaled features, so both are stored)
    registrations = (
        (model, "audio_anomaly_classifier", "RandomForest_MIMII_Audio_Benchmark"),
        (scaler, "feature_scaler", "MIMII_Audio_Scaler"),
    )
    for estimator, artifact_name, registry_name in registrations:
        mlflow.sklearn.log_model(
            estimator,
            artifact_name,
            registered_model_name=registry_name
        )

    print(f"\n✅ Models registered:")
    print(f"   - RandomForest_MIMII_Audio_Benchmark")
    print(f"   - MIMII_Audio_Scaler")

print(f"\n🎉 === Audio Gauntlet Complete! === 🎉")
print(f"Successfully trained audio anomaly detection model on {len(df)} MIMII sound samples")
print(f"Model achieves {f1_abnormal:.1%} F1-score for abnormal sound detection")