# Baby Cry Classification Training Notebook

This notebook provides a complete training pipeline for classifying baby cries into 8 different categories using deep learning techniques.

## Dataset Overview

The dataset includes 1126 audio files divided into 8 directories, each representing a specific reason for crying:

- **Belly pain**: 133 files
- **Burping**: 124 files
- **Cold or hot**: 130 files
- **Discomfort**: 142 files
- **Hungry**: 397 files
- **Lonely**: 25 files
- **Scared**: 33 files
- **Tired**: 142 files

**Data Characteristics:**

- Format: Audio files (1039 .wav, 8 .ogg, 7 other formats)
- Language: Not applicable (infant cries only)
- Total files: 1126

## Requirements

- TensorFlow 2.x
- librosa for audio processing
- numpy, pandas for data manipulation
- matplotlib, seaborn for visualization
- scikit-learn for metrics and preprocessing

## Training Pipeline

1. **Data Loading & Exploration**
2. **Audio Preprocessing & Feature Extraction**
3. **Data Augmentation**
4. **Model Architecture Design**
5. **Training with Validation**
6. **Model Evaluation**
7. **Model Saving & Inference**


In [None]:
# Import necessary libraries
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import soundfile as sf
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Sklearn for preprocessing and evaluation
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.utils.class_weight import compute_class_weight

# Seed NumPy and TensorFlow so repeated runs draw the same random numbers
np.random.seed(42)
tf.random.set_seed(42)

# Report the library versions this run is using
print("Libraries imported successfully!")
print(f"TensorFlow version: {tf.__version__}")
print(f"Librosa version: {librosa.__version__}")
print(f"NumPy version: {np.__version__}")

# Query the visible GPUs once and report which device kind will do the training
gpu_devices = tf.config.list_physical_devices('GPU')
print(f"GPU available: {gpu_devices}")
print("GPU will be used for training" if len(gpu_devices) > 0 else "CPU will be used for training")

## 1. Dataset Configuration and Loading

First, let's set up the dataset path and load all audio files with their corresponding labels.


In [None]:
# Dataset configuration
DATASET_PATH = "data/baby_cries"  # Update this path to your dataset location
SAMPLE_RATE = 22050  # Sample rate (Hz) every clip is resampled to
DURATION = 3.0  # Every clip is padded/truncated to this many seconds
N_SAMPLES = int(SAMPLE_RATE * DURATION)  # Samples per standardized clip

# Expected file counts per class (from dataset description).
# The key order also defines the class order used throughout the notebook.
EXPECTED_COUNTS = dict(
    belly_pain=133,
    burping=124,
    cold_hot=130,
    discomfort=142,
    hungry=397,
    lonely=25,
    scared=33,
    tired=142,
)

# Class labels, one per dataset sub-directory
CLASS_LABELS = list(EXPECTED_COUNTS)

print(f"Dataset path: {DATASET_PATH}")
print(f"Sample rate: {SAMPLE_RATE} Hz")
print(f"Audio duration: {DURATION} seconds")
print(f"Number of samples per audio: {N_SAMPLES}")
print(f"Number of classes: {len(CLASS_LABELS)}")
print(f"Total expected files: {sum(EXPECTED_COUNTS.values())}")

In [None]:
def load_audio_files(dataset_path, class_labels=None):
    """
    Collect the audio files of every class directory under ``dataset_path``.

    Files are matched by extension without regard to case, hidden files
    (names starting with ".") are skipped, and each class's files are sorted
    so the returned order - and any seeded split built on it - does not
    depend on the filesystem's listing order.

    Args:
        dataset_path (str): Root directory containing one sub-directory per class
        class_labels (list[str] | None): Sub-directory names to scan, in order.
            Defaults to the module-level CLASS_LABELS.

    Returns:
        tuple: (file_paths, labels, class_counts) where ``file_paths`` and
        ``labels`` are parallel lists and ``class_counts`` maps each class
        directory that was found to its number of files. Missing class
        directories only produce a warning and are absent from ``class_counts``.
    """
    if class_labels is None:
        class_labels = CLASS_LABELS

    # Supported audio extensions (compared against the lower-cased file name)
    audio_suffixes = ('.wav', '.ogg', '.mp3', '.flac', '.m4a')

    file_paths = []
    labels = []
    class_counts = {}

    print("Loading audio files...")

    for class_label in class_labels:
        class_dir = os.path.join(dataset_path, class_label)
        if not os.path.isdir(class_dir):
            print(f"Warning: Directory {class_dir} not found!")
            continue

        # Sorted so the order is deterministic across machines and filesystems
        class_files = sorted(
            os.path.join(class_dir, name)
            for name in os.listdir(class_dir)
            if not name.startswith('.')
            and name.lower().endswith(audio_suffixes)
            and os.path.isfile(os.path.join(class_dir, name))
        )

        class_counts[class_label] = len(class_files)
        print(f"{class_label}: {len(class_files)} files")

        # Add to main lists
        file_paths.extend(class_files)
        labels.extend([class_label] * len(class_files))

    print(f"\nTotal files loaded: {len(file_paths)}")
    return file_paths, labels, class_counts

# Load the dataset
try:
    file_paths, labels, class_counts = load_audio_files(DATASET_PATH)

    # Create a DataFrame for easier manipulation
    df = pd.DataFrame({
        'file_path': file_paths,
        'label': labels
    })

    if df.empty:
        # load_audio_files only warns about missing class directories, so an
        # absent dataset ends up here with zero rows rather than in `except`.
        print(f"\nNo audio files found under '{DATASET_PATH}'.")
        print("Update DATASET_PATH; later cells will take their no-dataset branches.")
    else:
        print(f"\nDataset loaded successfully!")
        print(f"Shape: {df.shape}")
        print(f"Classes: {df['label'].unique()}")

except Exception as e:
    print(f"Error loading dataset: {e}")
    print("Creating sample data structure for demonstration...")

    # Create sample data for demonstration if loading failed unexpectedly.
    # These file paths do not exist on disk.
    df = pd.DataFrame({
        'file_path': [f"sample_{i}.wav" for i in range(100)],
        'label': np.random.choice(CLASS_LABELS, 100)
    })
    class_counts = df['label'].value_counts().to_dict()

In [None]:
# Dataset exploration and visualization
if df.empty:
    # The statistics below index the first and last entries of the class
    # counts, which raises IndexError when no files were loaded.
    print("No audio files loaded - skipping dataset exploration")
else:
    plt.figure(figsize=(15, 10))

    # 1. Class distribution
    plt.subplot(2, 2, 1)
    # value_counts() sorts descending: first entry = largest class, last = smallest
    class_counts_series = df['label'].value_counts()
    plt.bar(class_counts_series.index, class_counts_series.values, color='skyblue')
    plt.title('Distribution of Baby Cry Classes')
    plt.xlabel('Cry Type')
    plt.ylabel('Number of Files')
    plt.xticks(rotation=45)

    # Add count labels on bars
    for i, v in enumerate(class_counts_series.values):
        plt.text(i, v + 5, str(v), ha='center', va='bottom')

    # 2. Expected vs Actual counts comparison (if actual data is available)
    plt.subplot(2, 2, 2)
    expected_df = pd.DataFrame(list(EXPECTED_COUNTS.items()), columns=['label', 'expected'])
    actual_df = pd.DataFrame(list(class_counts_series.items()), columns=['label', 'actual'])
    # Outer merge keeps classes present on only one side; their missing count becomes 0
    comparison_df = pd.merge(expected_df, actual_df, on='label', how='outer').fillna(0)

    x = np.arange(len(comparison_df))
    width = 0.35

    plt.bar(x - width/2, comparison_df['expected'], width, label='Expected', alpha=0.7)
    plt.bar(x + width/2, comparison_df['actual'], width, label='Actual', alpha=0.7)
    plt.title('Expected vs Actual File Counts')
    plt.xlabel('Cry Type')
    plt.ylabel('Number of Files')
    plt.xticks(x, comparison_df['label'], rotation=45)
    plt.legend()

    # 3. Class imbalance visualization
    plt.subplot(2, 2, 3)
    plt.pie(class_counts_series.values, labels=class_counts_series.index, autopct='%1.1f%%', startangle=90)
    plt.title('Class Distribution (Percentage)')

    # 4. Class statistics
    plt.subplot(2, 2, 4)
    plt.axis('off')
    stats_text = f"""
Dataset Statistics:
• Total files: {len(df)}
• Number of classes: {len(class_counts_series)}
• Largest class: {class_counts_series.index[0]} ({class_counts_series.iloc[0]} files)
• Smallest class: {class_counts_series.index[-1]} ({class_counts_series.iloc[-1]} files)
• Imbalance ratio: {class_counts_series.iloc[0] / class_counts_series.iloc[-1]:.1f}:1
• Mean files per class: {class_counts_series.mean():.1f}
• Std files per class: {class_counts_series.std():.1f}
"""
    plt.text(0.1, 0.5, stats_text, fontsize=12, verticalalignment='center')

    plt.tight_layout()
    plt.show()

    # Print detailed statistics
    print("\nDetailed Class Statistics:")
    print(f"{'Class':<15} {'Count':<8} {'Percentage':<12}")
    print("-" * 35)
    for label, count in class_counts_series.items():
        percentage = (count / len(df)) * 100
        print(f"{label:<15} {count:<8} {percentage:<12.1f}%")

## 2. Audio Preprocessing and Feature Extraction

Now let's implement functions to load and preprocess audio files, extract features for training, and augment the audio.


In [None]:
def load_and_preprocess_audio(file_path, sample_rate=SAMPLE_RATE, duration=DURATION):
    """
    Read one audio file and return it as a fixed-length, peak-normalized signal.

    The file is loaded with librosa at ``sample_rate`` (at most ``duration``
    seconds), zero-padded at the end or cut so that it holds exactly
    ``int(sample_rate * duration)`` samples, then divided by its peak
    absolute amplitude (plus a small epsilon so silent clips do not divide
    by zero).

    Args:
        file_path (str): Path to the audio file
        sample_rate (int): Target sample rate
        duration (float): Target duration in seconds

    Returns:
        np.array: The preprocessed signal. If anything goes wrong while
        loading or processing, the error is printed and an all-zero signal
        of the target length is returned instead of raising.
    """
    try:
        signal, _ = librosa.load(file_path, sr=sample_rate, duration=duration)

        # Bring the clip to exactly n_target samples
        n_target = int(sample_rate * duration)
        n_missing = n_target - len(signal)
        if n_missing > 0:
            signal = np.pad(signal, (0, n_missing), mode='constant')
        else:
            signal = signal[:n_target]

        # Peak normalization; the epsilon guards against an all-zero clip
        peak = np.max(np.abs(signal) + 1e-6)
        return signal / peak

    except Exception as e:
        print(f"Error loading {file_path}: {e}")
        # Best-effort fallback: a silent clip of the expected length
        return np.zeros(int(sample_rate * duration))

def extract_mfcc_features(audio, sample_rate=SAMPLE_RATE, n_mfcc=13, n_fft=2048, hop_length=512):
    """
    Compute MFCCs plus their first- and second-order deltas for a signal.

    Args:
        audio (np.array): Audio signal
        sample_rate (int): Sample rate
        n_mfcc (int): Number of MFCC coefficients
        n_fft (int): FFT window size
        hop_length (int): Hop length for STFT

    Returns:
        np.array: Feature matrix laid out as (time, features), where the
        feature axis is the MFCCs followed by the delta and then the
        delta-delta coefficients (3 * n_mfcc columns).
    """
    base = librosa.feature.mfcc(
        y=audio,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length
    )

    # Static coefficients first, then the order-1 and order-2 derivatives
    derivatives = [librosa.feature.delta(base, order=k) for k in (1, 2)]
    stacked = np.concatenate([base] + derivatives, axis=0)

    # librosa returns (features, time); flip to (time, features)
    return stacked.T

def extract_spectrogram_features(audio, sample_rate=SAMPLE_RATE, n_fft=2048, hop_length=512):
    """
    Compute a 128-band log-scaled mel-spectrogram for a signal.

    Args:
        audio (np.array): Audio signal
        sample_rate (int): Sample rate
        n_fft (int): FFT window size
        hop_length (int): Hop length for STFT

    Returns:
        np.array: Log-mel spectrogram laid out as (time, frequency), in dB
        relative to the clip's own maximum power.
    """
    power_mel = librosa.feature.melspectrogram(
        y=audio,
        sr=sample_rate,
        n_fft=n_fft,
        hop_length=hop_length,
        n_mels=128
    )

    # dB scale referenced to this clip's peak, then flip to (time, frequency)
    return librosa.power_to_db(power_mel, ref=np.max).T

def augment_audio(audio, sample_rate=SAMPLE_RATE):
    """
    Build randomly perturbed copies of a signal.

    The returned list always starts with the untouched input, followed by a
    circularly time-shifted copy (up to +/-0.2 s), a time-stretched copy
    (rate in [0.8, 1.2], padded/cut back to the input length) and, last, a
    copy with additive Gaussian noise. A pitch-shifted copy (-2..+2
    semitones) is inserted before the noisy one only when the drawn shift is
    non-zero, so the list holds four or five signals.

    Args:
        audio (np.array): Original audio signal
        sample_rate (int): Sample rate

    Returns:
        list: List of augmented audio signals
    """
    n_samples = len(audio)
    variants = [audio]  # Include original

    # Time shifting: np.roll wraps samples around the clip boundary
    max_shift = int(0.2 * sample_rate)  # 0.2 seconds
    offset = np.random.randint(-max_shift, max_shift)
    variants.append(np.roll(audio, offset))

    # Speed change, then restore the original length
    rate = np.random.uniform(0.8, 1.2)
    resampled = librosa.effects.time_stretch(audio, rate=rate)
    shortfall = n_samples - len(resampled)
    if shortfall > 0:
        resampled = np.pad(resampled, (0, shortfall), mode='constant')
    else:
        resampled = resampled[:n_samples]
    variants.append(resampled)

    # Pitch shifting (skipped when the draw is 0 semitones)
    semitones = np.random.randint(-2, 3)
    if semitones != 0:
        variants.append(
            librosa.effects.pitch_shift(audio, sr=sample_rate, n_steps=semitones)
        )

    # Add noise
    noise_std = 0.005
    variants.append(audio + np.random.normal(0, noise_std, n_samples))

    return variants

# Confirmation that the preprocessing, feature-extraction and augmentation helpers are defined
print("Audio preprocessing functions defined successfully!")

In [None]:
# Demonstrate feature extraction with a sample audio file
def visualize_audio_features(file_path, label):
    """
    Plot one clip's waveform, MFCC matrix and mel-spectrogram with summary stats.

    The clip is loaded through ``load_and_preprocess_audio`` and both feature
    extractors are run on it; a 2x2 figure is then shown.

    Args:
        file_path (str): Path to the audio file
        label (str): Class name shown in the waveform title

    Returns:
        tuple: (audio, mfcc_features, spec_features) as computed for the plot
    """
    # Load audio and derive both feature representations
    audio = load_and_preprocess_audio(file_path)
    mfcc_features = extract_mfcc_features(audio)
    spec_features = extract_spectrogram_features(audio)

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    (ax_wave, ax_mfcc), (ax_mel, ax_stats) = axes

    # 1. Waveform over the nominal clip duration
    ax_wave.plot(np.linspace(0, DURATION, len(audio)), audio)
    ax_wave.set_title(f'Waveform - {label}')
    ax_wave.set_xlabel('Time (s)')
    ax_wave.set_ylabel('Amplitude')

    # 2./3. Feature heatmaps (features are (time, feature); transpose so time runs along x)
    heatmaps = (
        (ax_mfcc, mfcc_features, 'MFCC Features', 'MFCC Coefficient'),
        (ax_mel, spec_features, 'Mel-Spectrogram', 'Mel Frequency'),
    )
    for ax, feats, title, y_label in heatmaps:
        ax.imshow(feats.T, aspect='auto', origin='lower', cmap='viridis')
        ax.set_title(title)
        ax.set_xlabel('Time Frame')
        ax.set_ylabel(y_label)

    # 4. Text panel with shapes and amplitude statistics
    ax_stats.axis('off')
    stats_text = f"""
    Audio Statistics:
    • Duration: {DURATION} seconds
    • Sample Rate: {SAMPLE_RATE} Hz
    • Audio Shape: {audio.shape}
    • MFCC Shape: {mfcc_features.shape}
    • Spectrogram Shape: {spec_features.shape}
    • Audio Min/Max: {audio.min():.3f} / {audio.max():.3f}
    • Audio Mean/Std: {audio.mean():.3f} / {audio.std():.3f}
    """
    ax_stats.text(0.1, 0.5, stats_text, fontsize=12, verticalalignment='center')

    plt.tight_layout()
    plt.show()

    return audio, mfcc_features, spec_features

# Visualize features for the first available file (if dataset exists)
if len(df) == 0 or not os.path.exists(df.iloc[0]['file_path']):
    # Either nothing was loaded or the first listed file is not on disk
    print("Dataset not found - skipping visualization")
    print("Feature extraction functions are ready to use when dataset is available")
else:
    print("Visualizing features for first audio file...")
    sample_file = df.iloc[0]['file_path']
    sample_label = df.iloc[0]['label']
    visualize_audio_features(sample_file, sample_label)

## 3. Model Architecture Design

Let's define several deep learning architectures suitable for audio classification: 1D and 2D CNNs, an LSTM, and a hybrid CNN+LSTM model.


In [None]:
def create_conv1d_model(input_shape, num_classes=8):
    """
    Build a 1D CNN classifier for (time_steps, features) inputs such as MFCCs.

    Three Conv1D blocks (64, 128, 256 filters; each followed by batch
    normalization, max-pooling by 2 and 25% dropout) feed a global average
    pooling layer and a 512 -> 256 dense head ending in a softmax.

    Args:
        input_shape (tuple): Shape of input features (time_steps, features)
        num_classes (int): Number of output classes

    Returns:
        tf.keras.Model: The assembled Sequential model. It is NOT compiled
        here; the caller must call ``compile`` before training.
    """
    stack = [layers.Input(shape=input_shape)]

    # Convolutional feature extractor: filter count doubles per block
    for n_filters in (64, 128, 256):
        stack += [
            layers.Conv1D(n_filters, 3, activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling1D(2),
            layers.Dropout(0.25),
        ]

    # Collapse the time axis, then classify
    stack += [
        layers.GlobalAveragePooling1D(),
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]

    return keras.Sequential(stack)

def create_conv2d_model(input_shape, num_classes=8):
    """
    Build a 2D CNN classifier for image-like inputs such as spectrograms.

    Three Conv2D blocks (32, 64, 128 filters; each followed by batch
    normalization, 2x2 max-pooling and 25% dropout) are followed by a
    256-filter convolution with batch normalization, global average pooling
    and a 512 -> 256 dense head ending in a softmax.

    Args:
        input_shape (tuple): Shape of input features (height, width, channels)
        num_classes (int): Number of output classes

    Returns:
        tf.keras.Model: The assembled Sequential model. It is NOT compiled
        here; the caller must call ``compile`` before training.
    """
    stack = [layers.Input(shape=input_shape)]

    # Pooled convolutional blocks
    for n_filters in (32, 64, 128):
        stack += [
            layers.Conv2D(n_filters, (3, 3), activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling2D((2, 2)),
            layers.Dropout(0.25),
        ]

    # Final convolution is globally pooled instead of max-pooled
    stack += [
        layers.Conv2D(256, (3, 3), activation='relu', padding='same'),
        layers.BatchNormalization(),
        layers.GlobalAveragePooling2D(),
    ]

    # Dense classification head
    stack += [
        layers.Dense(512, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(256, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]

    return keras.Sequential(stack)

def create_lstm_model(input_shape, num_classes=8):
    """
    Build a stacked-LSTM classifier for (time_steps, features) inputs.

    Two LSTM layers (128 units returning the full sequence, then 64 units
    returning only the last state; both with 20% input and recurrent
    dropout) feed a 256 -> 128 dense head ending in a softmax.

    Args:
        input_shape (tuple): Shape of input features (time_steps, features)
        num_classes (int): Number of output classes

    Returns:
        tf.keras.Model: The assembled Sequential model. It is NOT compiled
        here; the caller must call ``compile`` before training.
    """
    inputs = [layers.Input(shape=input_shape)]

    # Recurrent encoder: only the second layer collapses the sequence
    recurrent = [
        layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
        layers.LSTM(64, return_sequences=False, dropout=0.2, recurrent_dropout=0.2),
    ]

    # Dense classification head
    head = [
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]

    return keras.Sequential(inputs + recurrent + head)

def create_hybrid_model(input_shape, num_classes=8):
    """
    Build a CNN+LSTM classifier for (time_steps, features) inputs.

    Two Conv1D blocks (64 and 128 filters; each with batch normalization,
    max-pooling by 2 and 25% dropout) extract local features, two LSTM
    layers (128 then 64 units, 20% input dropout) model the resulting
    sequence, and a 256 -> 128 dense head ends in a softmax.

    Args:
        input_shape (tuple): Shape of input features (time_steps, features)
        num_classes (int): Number of output classes

    Returns:
        tf.keras.Model: The assembled Sequential model. It is NOT compiled
        here; the caller must call ``compile`` before training.
    """
    stack = [layers.Input(shape=input_shape)]

    # Conv1D layers for feature extraction
    for n_filters in (64, 128):
        stack += [
            layers.Conv1D(n_filters, 3, activation='relu', padding='same'),
            layers.BatchNormalization(),
            layers.MaxPooling1D(2),
            layers.Dropout(0.25),
        ]

    # LSTM layers for temporal modeling (the second returns only its last state)
    stack += [
        layers.LSTM(128, return_sequences=True, dropout=0.2),
        layers.LSTM(64, return_sequences=False, dropout=0.2),
    ]

    # Dense classification head
    stack += [
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(0.5),
        layers.Dense(128, activation='relu'),
        layers.Dropout(0.3),
        layers.Dense(num_classes, activation='softmax'),
    ]

    return keras.Sequential(stack)

# Summarize which architecture builders this cell defined
print("Model architectures defined successfully!")
print("Available models:")
print("\n".join([
    "1. Conv1D model - for MFCC features",
    "2. Conv2D model - for spectrogram features",
    "3. LSTM model - for sequential features",
    "4. Hybrid CNN+LSTM model - combines both approaches",
]))

## 4. Data Preparation and Model Training

Now let's prepare the data and train our models.


In [None]:
def prepare_dataset(df, use_augmentation=True, test_size=0.2, val_size=0.2):
    """
    Encode labels and split the file list into train/validation/test sets.

    Both splits are stratified and use a fixed random_state of 42. When
    ``use_augmentation`` is true, the TRAINING paths are then oversampled by
    repetition to soften class imbalance: 'lonely' and 'scared' files appear
    4 times, 'hungry' files once, and every other class twice. This function
    only repeats paths; it does not modify any audio.

    Args:
        df (pd.DataFrame): DataFrame with file paths and labels
        use_augmentation (bool): Whether to oversample the training set
        test_size (float): Proportion of test set
        val_size (float): Proportion of validation set (from training data)

    Returns:
        tuple: ((X_train, X_val, X_test, y_train, y_val, y_test), label_encoder)
        where the X arrays hold file paths and the y arrays encoded labels.
    """
    print("Preparing dataset...")

    # Map class names to integer ids
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(df['label'])

    # First carve off the test set, then split the remainder into train/val
    X_temp, X_test, y_temp, y_test = train_test_split(
        df['file_path'].values, encoded_labels,
        test_size=test_size, stratify=encoded_labels, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp,
        test_size=val_size, stratify=y_temp, random_state=42
    )

    for split_name, split_paths in (("Train", X_train), ("Validation", X_val), ("Test", X_test)):
        print(f"{split_name} set: {len(split_paths)} samples")

    if use_augmentation:
        print("Applying data augmentation...")

        # Total number of times a training file of each class is listed
        copies_by_class = {'lonely': 4, 'scared': 4, 'hungry': 1}
        default_copies = 2

        # Decode all training labels in one call instead of once per sample
        train_class_names = label_encoder.inverse_transform(y_train)

        augmented_paths = []
        augmented_labels = []
        for path, label, class_name in zip(X_train, y_train, train_class_names):
            n_copies = copies_by_class.get(class_name, default_copies)
            augmented_paths.extend([path] * n_copies)
            augmented_labels.extend([label] * n_copies)

        X_train = np.array(augmented_paths)
        y_train = np.array(augmented_labels)
        print(f"Augmented train set: {len(X_train)} samples")

    return (X_train, X_val, X_test, y_train, y_val, y_test), label_encoder

def load_features_batch(file_paths, labels, feature_type='mfcc', apply_augmentation=False):
    """
    Turn a list of audio files into a feature array with matching labels.

    Each file is loaded with ``load_and_preprocess_audio``; a path that does
    not exist on disk is replaced by low-amplitude random noise (demo mode).
    With ``apply_augmentation`` every clip is expanded through
    ``augment_audio`` and each variant contributes one feature/label pair.
    A file whose processing raises is reported and skipped, so the returned
    arrays can be shorter than the inputs - always use the returned labels.

    Args:
        file_paths (array): Array of file paths
        labels (array): Array of labels
        feature_type (str): 'mfcc' for MFCC features; any other value
            selects mel-spectrogram features
        apply_augmentation (bool): Whether to apply augmentation during loading

    Returns:
        tuple: (features, labels) as NumPy arrays of equal length
    """
    # Pick the extractor once; anything other than 'mfcc' means spectrogram
    if feature_type == 'mfcc':
        extract = extract_mfcc_features
    else:
        extract = extract_spectrogram_features

    features = []
    processed_labels = []
    n_files = len(file_paths)

    print(f"Loading {n_files} files with {feature_type} features...")

    for idx, (file_path, label) in enumerate(zip(file_paths, labels)):
        # Progress report every 100 files
        if idx % 100 == 0:
            print(f"Processed {idx}/{n_files} files")

        try:
            if os.path.exists(file_path):
                audio = load_and_preprocess_audio(file_path)
            else:
                # Create dummy data if file doesn't exist (for demo)
                audio = np.random.randn(N_SAMPLES) * 0.1

            variants = augment_audio(audio) if apply_augmentation else [audio]

            # One feature/label pair per variant
            for variant in variants:
                features.append(extract(variant))
                processed_labels.append(label)

        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            continue

    return np.array(features), np.array(processed_labels)

# Prepare the dataset
if len(df) == 0:
    print("Creating demo dataset...")
    # No files were loaded: fabricate paths and labels so later cells can run
    label_encoder = LabelEncoder()
    label_encoder.fit(CLASS_LABELS)

    # Generate dummy file paths and labels
    n_samples = 100
    demo_paths = [f"demo_file_{i}.wav" for i in range(n_samples)]
    demo_labels = np.random.choice(range(len(CLASS_LABELS)), n_samples)

    # 80/20 train/test split, then 80/20 train/validation split of the remainder
    X_train, X_test, y_train, y_test = train_test_split(
        demo_paths, demo_labels, test_size=0.2, random_state=42
    )
    X_train, X_val, y_train, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42
    )
else:
    print("Preparing real dataset...")
    datasets, label_encoder = prepare_dataset(df, use_augmentation=True)
    X_train, X_val, X_test, y_train, y_val, y_test = datasets

print(f"Label classes: {label_encoder.classes_}")
print(f"Final dataset sizes - Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

In [None]:
# Training configuration
EPOCHS = 100  # Upper bound on epochs; train_model's EarlyStopping callback may stop sooner
BATCH_SIZE = 32  # Mini-batch size passed to model.fit
LEARNING_RATE = 0.001  # Initial Adam learning rate; ReduceLROnPlateau may lower it during training

def train_model(model, X_train, y_train, X_val, y_val, model_name="model"):
    """
    Compile and fit a model with early stopping, LR scheduling and checkpointing.

    The model is compiled with Adam (LEARNING_RATE) and sparse categorical
    cross-entropy, then trained for at most EPOCHS epochs with BATCH_SIZE.
    Balanced class weights are computed from ``y_train`` to counter class
    imbalance. The best model by validation accuracy is saved to
    ``models/best_<model_name>.h5``.

    Args:
        model: Keras model to train (compiled here)
        X_train, y_train: Training features and integer-encoded labels
        X_val, y_val: Validation features and integer-encoded labels
        model_name (str): Name used in log output and the checkpoint file name

    Returns:
        tuple: (trained_model, history)
    """
    print(f"Training {model_name}...")

    # Compile model
    model.compile(
        optimizer=Adam(learning_rate=LEARNING_RATE),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

    # Print model summary
    print(f"\n{model_name} Architecture:")
    model.summary()

    # ModelCheckpoint writes into this directory; make sure it exists
    os.makedirs('models', exist_ok=True)

    # Define callbacks
    callbacks = [
        # Stop when val_loss has not improved for 15 epochs and roll back to the best weights
        EarlyStopping(
            monitor='val_loss',
            patience=15,
            restore_best_weights=True,
            verbose=1
        ),
        # Halve the learning rate after 10 epochs without val_loss improvement
        ReduceLROnPlateau(
            monitor='val_loss',
            factor=0.5,
            patience=10,
            min_lr=1e-7,
            verbose=1
        ),
        # Keep only the checkpoint with the best validation accuracy
        ModelCheckpoint(
            f'models/best_{model_name}.h5',
            monitor='val_accuracy',
            save_best_only=True,
            verbose=1
        )
    ]

    # Calculate class weights to handle imbalance.
    # compute_class_weight returns one weight per entry of `classes`, in that
    # order, so each weight is keyed by its actual class id. Keying by
    # position would assign weights to the wrong classes whenever a class id
    # is absent from y_train.
    present_classes = np.unique(y_train)
    class_weights = compute_class_weight(
        'balanced',
        classes=present_classes,
        y=y_train
    )
    class_weight_dict = {
        int(class_id): float(weight)
        for class_id, weight in zip(present_classes, class_weights)
    }
    print(f"Class weights: {class_weight_dict}")

    # Train model
    history = model.fit(
        X_train, y_train,
        batch_size=BATCH_SIZE,
        epochs=EPOCHS,
        validation_data=(X_val, y_val),
        callbacks=callbacks,
        class_weight=class_weight_dict,
        verbose=1
    )

    return model, history

def plot_training_history(history, model_name):
    """
    Show side-by-side training/validation curves for accuracy and loss.

    Args:
        history: Object whose ``history`` dict holds the per-epoch
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' series
        model_name (str): Name used in the panel titles
    """
    fig, axes = plt.subplots(1, 2, figsize=(15, 5))

    # (history key, display name) for the left and right panels
    panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))

    for ax, (metric_key, metric_title) in zip(axes, panels):
        curves = history.history
        ax.plot(curves[metric_key], label=f'Training {metric_title}')
        ax.plot(curves[f'val_{metric_key}'], label=f'Validation {metric_title}')
        ax.set_title(f'{model_name} - {metric_title}')
        ax.set_xlabel('Epoch')
        ax.set_ylabel(metric_title)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

# Make sure the checkpoint directory exists before any training starts
Path('models').mkdir(parents=True, exist_ok=True)

# Echo the training hyperparameters
print("\n".join([
    "Training configuration set!",
    f"Epochs: {EPOCHS}",
    f"Batch size: {BATCH_SIZE}",
    f"Learning rate: {LEARNING_RATE}",
    "Ready to train models...",
]))

In [None]:
# Train multiple models and compare performance

# Model 1: Conv1D with MFCC features
print("="*50)
print("TRAINING CONV1D MODEL WITH MFCC FEATURES")
print("="*50)

if len(df) > 0:
    # Load MFCC features for training (each clip is expanded into augmented variants)
    X_train_mfcc, y_train_aug = load_features_batch(
        X_train, y_train, feature_type='mfcc', apply_augmentation=True
    )
    # load_features_batch skips files that fail to process, so the labels it
    # returns - not y_val - are the ones aligned with X_val_mfcc.
    X_val_mfcc, y_val_mfcc = load_features_batch(
        X_val, y_val, feature_type='mfcc', apply_augmentation=False
    )

    # Create and train Conv1D model
    conv1d_model = create_conv1d_model(X_train_mfcc.shape[1:], num_classes=len(CLASS_LABELS))
    conv1d_model, conv1d_history = train_model(
        conv1d_model, X_train_mfcc, y_train_aug, X_val_mfcc, y_val_mfcc, "Conv1D_MFCC"
    )

    # Plot training history
    plot_training_history(conv1d_history, "Conv1D MFCC Model")

else:
    print("Dataset not available - creating demo model...")
    # Create demo model with sample shapes
    sample_mfcc_shape = (130, 39)  # Typical MFCC shape
    conv1d_model = create_conv1d_model(sample_mfcc_shape, num_classes=len(CLASS_LABELS))
    print("Conv1D model created successfully!")
    conv1d_model.summary()

## 5. Model Evaluation and Testing

Let's evaluate our trained models and compare their performance.


In [None]:
def evaluate_model(model, X_test, y_test, label_encoder, model_name):
    """
    Evaluate a trained model on test data and plot its confusion matrix.

    The classification report and confusion matrix are always computed over
    every class known to ``label_encoder``, so their rows line up with the
    class names even when a class is missing from both ``y_test`` and the
    predictions.

    Args:
        model: Trained keras model
        X_test, y_test: Test features and integer-encoded labels
        label_encoder: Fitted label encoder; ``classes_`` supplies the class names
        model_name (str): Name of the model, used in output and plot titles

    Returns:
        dict: 'accuracy', 'predictions', 'probabilities', 'confusion_matrix'
        and 'class_accuracy' (per-class recall; NaN for a class with no test
        samples).
    """
    print(f"Evaluating {model_name}...")

    class_names = list(label_encoder.classes_)
    # Encoded labels are 0..n_classes-1 in the order of classes_
    class_ids = np.arange(len(class_names))

    # Make predictions
    y_pred_proba = model.predict(X_test, verbose=0)
    y_pred = np.argmax(y_pred_proba, axis=1)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)

    print(f"\n{model_name} Results:")
    print(f"Test Accuracy: {accuracy:.4f}")

    # Classification report
    print("\nClassification Report:")
    print(classification_report(
        y_test, y_pred,
        labels=class_ids,
        target_names=class_names,
        digits=4
    ))

    # Confusion matrix (rows = actual class, columns = predicted class)
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)

    plt.figure(figsize=(10, 8))
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names
    )
    plt.title(f'{model_name} - Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.xticks(rotation=45)
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.show()

    # Per-class accuracy: correct predictions / actual samples of that class.
    # Classes without any test sample are reported as NaN instead of
    # triggering a 0/0 division.
    support = cm.sum(axis=1)
    class_accuracy = np.divide(
        cm.diagonal(), support,
        out=np.full(len(class_ids), np.nan),
        where=support > 0
    )

    print("\nPer-class Accuracy:")
    for i, class_name in enumerate(class_names):
        print(f"{class_name:<15}: {class_accuracy[i]:.4f}")

    return {
        'accuracy': accuracy,
        'predictions': y_pred,
        'probabilities': y_pred_proba,
        'confusion_matrix': cm,
        'class_accuracy': class_accuracy
    }

def compare_models(results_dict, class_names=None):
    """
    Print and plot a comparison of several models' evaluation results.

    Args:
        results_dict (dict): Maps model name to a result dict holding at
            least 'accuracy' and 'class_accuracy' (one value per class;
            NaN entries are ignored when picking the best/worst class)
        class_names (sequence | None): Class names indexed like
            'class_accuracy'. Defaults to the module-level
            ``label_encoder.classes_``.
    """
    if not results_dict:
        print("No model results to compare")
        return

    if class_names is None:
        class_names = label_encoder.classes_

    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)

    # Create comparison DataFrame
    comparison_data = []
    for model_name, results in results_dict.items():
        per_class = np.asarray(results['class_accuracy'], dtype=float)

        if np.isnan(per_class).all():
            # No class has a usable accuracy
            best_name = worst_name = 'n/a'
            best_acc = worst_acc = np.nan
        else:
            # NaN-aware so a class without test samples is never picked
            best_idx = int(np.nanargmax(per_class))
            worst_idx = int(np.nanargmin(per_class))
            best_name, worst_name = class_names[best_idx], class_names[worst_idx]
            best_acc, worst_acc = per_class[best_idx], per_class[worst_idx]

        comparison_data.append({
            'Model': model_name,
            'Test Accuracy': results['accuracy'],
            'Best Class': best_name,
            'Worst Class': worst_name,
            'Best Class Acc': best_acc,
            'Worst Class Acc': worst_acc
        })

    comparison_df = pd.DataFrame(comparison_data)
    comparison_df = comparison_df.sort_values('Test Accuracy', ascending=False)

    print(comparison_df.to_string(index=False, float_format='%.4f'))

    # Plot comparison
    plt.figure(figsize=(12, 6))

    # Left panel: overall test accuracy per model
    plt.subplot(1, 2, 1)
    plt.bar(comparison_df['Model'], comparison_df['Test Accuracy'], color='skyblue')
    plt.title('Model Accuracy Comparison')
    plt.xlabel('Model')
    plt.ylabel('Test Accuracy')
    plt.xticks(rotation=45)

    # Add accuracy values on bars (enumerate gives the bar position after sorting)
    for i, v in enumerate(comparison_df['Test Accuracy']):
        plt.text(i, v + 0.01, f'{v:.3f}', ha='center', va='bottom')

    # Right panel: best vs worst per-class accuracy per model
    plt.subplot(1, 2, 2)
    models = comparison_df['Model']
    best_accs = comparison_df['Best Class Acc']
    worst_accs = comparison_df['Worst Class Acc']

    x = np.arange(len(models))
    width = 0.35

    plt.bar(x - width/2, best_accs, width, label='Best Class', alpha=0.7)
    plt.bar(x + width/2, worst_accs, width, label='Worst Class', alpha=0.7)
    plt.title('Per-Class Performance Range')
    plt.xlabel('Model')
    plt.ylabel('Accuracy')
    plt.xticks(x, models, rotation=45)
    plt.legend()

    plt.tight_layout()
    plt.show()

# Evaluate the trained model(s)
results_dict = {}

if len(df) > 0 and 'conv1d_model' in locals():
    print("Loading test features...")
    # load_features_batch skips files that fail to process, so the labels it
    # returns - not y_test - are the ones aligned with X_test_mfcc.
    X_test_mfcc, y_test_mfcc = load_features_batch(
        X_test, y_test, feature_type='mfcc', apply_augmentation=False
    )

    # Evaluate Conv1D model
    conv1d_results = evaluate_model(
        conv1d_model, X_test_mfcc, y_test_mfcc, label_encoder, "Conv1D MFCC"
    )
    results_dict['Conv1D MFCC'] = conv1d_results

else:
    print("Dataset not available or model not trained - skipping evaluation")
    print("Evaluation functions are ready to use when data and models are available")

# Compare models if we have results
if results_dict:
    compare_models(results_dict)