# Multi-Output CNN for Speech Recognition and Gender Classification

This notebook implements a **Multi-Output CNN** that predicts both the spoken digit (0–9) and the speaker's gender (male/female) from a single audio recording.

## 🧠 Methodology
1. **Feature Extraction**
   - Extract Mel-Frequency Cepstral Coefficients (MFCCs) from audio recordings using Librosa
   - Standardize input shape by padding/truncating sequences to a fixed length

2. **Data Preparation**
   - Convert audio into numpy arrays of shape (n_mfcc, time, 1)
   - Encode labels:
     - Gender → one-hot vector ([1,0] = male, [0,1] = female)
     - Digit → one-hot vector of length 10

3. **Model Architecture (Multi-Output CNN)**
   - Convolutional Neural Network (CNN) extracts time–frequency patterns
   - Shared convolutional layers → two output branches:
     - Gender classifier (softmax over 2 classes)
     - Digit classifier (softmax over 10 classes)

4. **Training & Evaluation**
   - Train/test split = 70/30
   - Loss function = categorical crossentropy (for both tasks)
   - Metrics = accuracy, precision, recall, F1-score
   - Visualization of training history (accuracy/loss curves)

## Dataset Structure
```
Dataset/
   d0/ (digit 0)
      male/   → male speakers saying "zero"
      female/ → female speakers saying "zero"
   d1/ (digit 1)
      male/   → male speakers saying "one"
      female/ → female speakers saying "one"
   ...
   d9/ (digit 9)
      male/   → male speakers saying "nine"
      female/ → female speakers saying "nine"
```

**Note**: We predict BOTH digit (0-9) AND gender (male/female) simultaneously using a Multi-Output CNN.


In [None]:
# Import necessary libraries
import os
import sys
import warnings
warnings.filterwarnings('ignore')

# Standard libraries
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

# Add src directory to path
sys.path.append('src')

# Import our Multi-Output CNN modules
from extract_features import create_cnn_dataset
from train_model import train_multi_output_cnn_pipeline
from evaluate_model import evaluate_multi_output_cnn_pipeline

# Set up plotting
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*".
# Fall back to the legacy name so the notebook also runs on older installs
# instead of failing on an unknown style.
PLOT_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

print("✅ All libraries imported successfully!")
print("🎯 Ready to run the Multi-Output CNN pipeline (digit + gender classification)")


## 1. Dataset Analysis and Multi-Output Structure

In [None]:
# Dataset configuration
DATASET_PATH = "Dataset"
OUTPUT_DIR = "output"
GENDERS = ('male', 'female')   # sub-folder names inside every d<digit>/ folder
NUM_DIGITS = 10                # digit folders d0 .. d9


def count_wav_files(directory):
    """Return the number of ``.wav`` files located directly inside ``directory``.

    A path that does not exist (or is not a directory) counts as 0, so a
    partially populated dataset can still be summarised.
    """
    if not os.path.isdir(directory):
        return 0
    return sum(1 for name in os.listdir(directory) if name.endswith('.wav'))


# Create output directory
os.makedirs(OUTPUT_DIR, exist_ok=True)

print("=== MULTI-OUTPUT DATASET ANALYSIS ===")
print("📁 Analyzing dataset structure for Multi-Output CNN (Digit + Gender classification)...")

# Count files by digit and gender
digit_counts = {}
gender_counts = {gender: 0 for gender in GENDERS}

for digit in range(NUM_DIGITS):
    digit_dir = os.path.join(DATASET_PATH, f'd{digit}')
    if not os.path.isdir(digit_dir):
        continue  # digits without a folder are left out of the per-digit summary

    files_per_gender = {
        gender: count_wav_files(os.path.join(digit_dir, gender)) for gender in GENDERS
    }
    for gender, n_files in files_per_gender.items():
        gender_counts[gender] += n_files
    digit_counts[f'Digit {digit}'] = sum(files_per_gender.values())

total_files = sum(digit_counts.values())

if total_files == 0:
    # Without this guard the percentage computation below divides by zero and
    # the pie chart would be drawn from all-zero data.
    print(f"❌ No .wav files found under '{DATASET_PATH}'. "
          "Check DATASET_PATH and the d0..d9/(male|female) folder layout.")
else:
    male_pct = gender_counts['male'] / total_files * 100
    female_pct = gender_counts['female'] / total_files * 100

    print(f"📊 Multi-Output Dataset Statistics:")
    print(f"   Total audio files: {total_files}")
    print(f"   Male samples: {gender_counts['male']}")
    print(f"   Female samples: {gender_counts['female']}")
    print(f"   Gender distribution: {male_pct:.1f}% Male, {female_pct:.1f}% Female")

    print(f"\n📊 Digit Distribution:")
    for digit, count in digit_counts.items():
        print(f"   {digit}: {count} files")

    # Visualize distributions
    fig, axes = plt.subplots(1, 3, figsize=(18, 6))
    gender_ax, digit_ax, summary_ax = axes

    # Gender distribution
    gender_ax.pie([gender_counts['male'], gender_counts['female']],
                  labels=['Male', 'Female'], autopct='%1.1f%%',
                  colors=['lightblue', 'lightpink'], startangle=90)
    gender_ax.set_title('Gender Distribution')

    # Digit distribution
    digits = list(digit_counts.keys())
    counts = list(digit_counts.values())
    bars = digit_ax.bar(digits, counts, alpha=0.7, color='lightgreen')
    digit_ax.set_title('Digit Distribution')
    digit_ax.set_ylabel('Number of Samples')
    digit_ax.tick_params(axis='x', rotation=45)
    for bar, count in zip(bars, counts):
        # Place the count just above the top of each bar.
        digit_ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 5,
                      str(count), ha='center', va='bottom')

    # Combined distribution (text-only summary panel)
    summary_lines = [
        (0.7, f'Total Files: {total_files}', {'fontsize': 16, 'fontweight': 'bold'}),
        (0.5, f'Male: {gender_counts["male"]}', {'fontsize': 14}),
        (0.3, f'Female: {gender_counts["female"]}', {'fontsize': 14}),
    ]
    for y_pos, text, font_kwargs in summary_lines:
        summary_ax.text(0.5, y_pos, text, ha='center', va='center',
                        transform=summary_ax.transAxes, **font_kwargs)
    summary_ax.set_title('Summary')
    summary_ax.axis('off')

    plt.tight_layout()
    plt.show()

    print("✅ Multi-output dataset analysis completed!")


## 2. Multi-Output CNN Pipeline: Feature Extraction, Training & Evaluation


In [None]:
print("=== MFCC FEATURE EXTRACTION ===")
print("🎵 Performing MFCC feature extraction from audio files...")

# Parameters of this demonstration. The MFCCs are computed directly with
# librosa so the cell does not depend on any class that the notebook never
# imports (the previous version referenced an undefined `GenderClassifier`).
SAMPLE_RATE = 22050
N_MFCC = 13


def find_sample_wav(dataset_path, gender='male', num_digits=10):
    """Return the path of one ``.wav`` file spoken by ``gender``, or ``None``.

    Digit folders ``d0`` .. ``d{num_digits - 1}`` are scanned in order and the
    alphabetically first file of the first non-empty folder is returned, so the
    chosen sample does not depend on the file system's listing order.
    """
    for digit in range(num_digits):
        gender_dir = os.path.join(dataset_path, f'd{digit}', gender)
        if not os.path.isdir(gender_dir):
            continue
        wav_files = sorted(f for f in os.listdir(gender_dir) if f.endswith('.wav'))
        if wav_files:
            return os.path.join(gender_dir, wav_files[0])
    return None


# Load and process a sample audio file to demonstrate MFCC extraction
sample_file = find_sample_wav(DATASET_PATH)

if sample_file:
    print(f"📁 Sample file: {sample_file}")

    # Extract MFCC features from sample; `mfcc` has one row per coefficient
    # and one column per analysis frame.
    y, sr = librosa.load(sample_file, sr=SAMPLE_RATE)
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=N_MFCC)

    print(f"🎯 MFCC Features extracted:")
    print(f"   MFCC matrix shape (coefficients x frames): {mfcc.shape}")
    print(f"   MFCC coefficients: {N_MFCC}")
    print(f"   Audio: {len(y)} samples at {sr} Hz ({len(y) / sr:.2f} s)")

    # Visualize MFCC features
    fig, axes = plt.subplots(2, 2, figsize=(12, 8))
    (mfcc_ax, wave_ax), (mean_ax, hist_ax) = axes

    # Plot MFCC (values are cepstral coefficients, not decibels)
    mfcc_img = librosa.display.specshow(mfcc, sr=sr, x_axis='time', ax=mfcc_ax)
    fig.colorbar(mfcc_img, ax=mfcc_ax)
    mfcc_ax.set_title('MFCC Features')
    mfcc_ax.set_ylabel('MFCC Coefficients')

    # Plot audio waveform
    wave_ax.plot(y)
    wave_ax.set_title('Audio Waveform')
    wave_ax.set_xlabel('Sample')
    wave_ax.set_ylabel('Amplitude')

    # Plot the time-average of every coefficient
    mean_ax.bar(range(mfcc.shape[0]), mfcc.mean(axis=1))
    mean_ax.set_title('Mean Value per MFCC Coefficient')
    mean_ax.set_xlabel('Coefficient Index')
    mean_ax.set_ylabel('Mean Value')

    # Plot distribution of all MFCC values
    hist_ax.hist(mfcc.ravel(), bins=30, alpha=0.7, edgecolor='black')
    hist_ax.set_title('MFCC Value Distribution')
    hist_ax.set_xlabel('MFCC Value')
    hist_ax.set_ylabel('Frequency')

    plt.tight_layout()
    plt.show()

    print("✅ MFCC feature extraction demonstrated successfully!")
else:
    print("❌ No sample audio file found for demonstration")


In [None]:
# Run the complete Multi-Output CNN pipeline
print("🚀 Starting Multi-Output CNN Pipeline...")

# Run configuration, defined once so that dataset creation, training and the
# evaluation split below cannot silently drift apart.
SAMPLE_SIZE = 100    # Use small sample for demo
EPOCHS = 50
BATCH_SIZE = 32
TEST_SIZE = 0.3      # 70/30 train/test split
RANDOM_STATE = 42

# Step 1: Create CNN dataset with MFCC features
print("\n📊 Step 1: Creating CNN dataset with MFCC features...")
mfcc_features, digit_labels, gender_labels = create_cnn_dataset(
    dataset_path=DATASET_PATH,
    output_dir=os.path.join(OUTPUT_DIR, 'cnn_data'),
    sample_size=SAMPLE_SIZE
)

# Every sample needs exactly one digit label and one gender label.
assert len(mfcc_features) == len(digit_labels) == len(gender_labels), \
    "features and labels must contain the same number of samples"

print(f"✅ Dataset created:")
print(f"   - MFCC features shape: {mfcc_features.shape}")
print(f"   - Digit labels shape: {digit_labels.shape}")
print(f"   - Gender labels shape: {gender_labels.shape}")

# Step 2: Train Multi-Output CNN
print("\n🧠 Step 2: Training Multi-Output CNN...")
trainer = train_multi_output_cnn_pipeline(
    dataset_path=DATASET_PATH,
    output_dir=OUTPUT_DIR,
    sample_size=SAMPLE_SIZE,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE
)

# Step 3: Load test data for evaluation
print("\n📈 Step 3: Loading test data for evaluation...")

# Re-create the hold-out split, stratified on the gender class index.
# NOTE(review): this only yields the samples the model was evaluated on if
# train_multi_output_cnn_pipeline splits with the same test_size, random_state
# and stratification on identically ordered data — confirm in src/train_model.py.
X_train, X_test, y_gender_train, y_gender_test, y_digit_train, y_digit_test = train_test_split(
    mfcc_features, gender_labels, digit_labels,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
    stratify=np.argmax(gender_labels, axis=1)
)

# Step 4: Evaluate the trained model
print("\n📊 Step 4: Evaluating Multi-Output CNN...")
evaluator = evaluate_multi_output_cnn_pipeline(
    model_results=trainer.training_history,
    y_gender_test=y_gender_test,
    y_digit_test=y_digit_test,
    output_dir=OUTPUT_DIR
)

train_pct = round((1 - TEST_SIZE) * 100)
test_pct = round(TEST_SIZE * 100)

print("\n✅ Multi-Output CNN Pipeline Completed!")
print("📊 Results Summary:")
print(f"   - Best overall accuracy: {trainer.best_accuracy:.4f}")
print(f"   - Model architecture: Multi-Output CNN with shared conv layers")
# Report the real per-sample feature shape rather than a hard-coded description.
print(f"   - Features: MFCC, per-sample shape {tuple(mfcc_features.shape[1:])}")
print(f"   - Train/Test split: {train_pct}/{test_pct}")
print(f"   - Loss function: Categorical crossentropy")
print(f"   - Evaluation metrics: Accuracy, Precision, Recall, F1-score")

print("\n🎉 Multi-Output CNN pipeline execution completed successfully!")
