# 🎤 Pronunciation Scoring Model Training

Train a neural network to predict pronunciation quality scores (0-100) from acoustic features.

**Pipeline:**
1. Load prepared dataset (CSV)
2. Preprocess & normalize features
3. Train neural network model
4. Evaluate performance
5. Export model for backend integration

**Requirements:**
- Upload the `training_data.csv` file produced by your data preparation step
- Run on a GPU runtime for faster training (optional; the notebook falls back to CPU when no GPU is available)

In [None]:
# Install dependencies
# The leading "!" runs the line as a shell command from the notebook; "-q" keeps pip's output quiet.
!pip install -q torch torchvision pandas numpy scikit-learn matplotlib seaborn tqdm

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
import json

# Fix the NumPy and PyTorch seeds so repeated runs draw the same random numbers
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
    print(f"Using device: {device}")
    print(f"GPU: {torch.cuda.get_device_name(0)}")
else:
    device = torch.device('cpu')
    print(f"Using device: {device}")

## 📁 Upload Training Data

Upload the `training_data.csv` file created by `prepare_training_data.py`. Keep the file name unchanged: the loading cell below reads it as `training_data.csv`.

In [None]:
# google.colab is only importable inside a Colab runtime.
from google.colab import files

print("Upload your training_data.csv file:")
# NOTE(review): files.upload() presumably opens Colab's file picker and writes the
# chosen files into the working directory; the loading cell reads
# 'training_data.csv' by that exact name, so a renamed upload will not be found — confirm.
uploaded = files.upload()

## 📊 Load and Explore Data

In [None]:
# Read the prepared CSV into a DataFrame
df = pd.read_csv('training_data.csv')

# Headline statistics, assembled first and printed in one go.
# The "- 4" discounts the four non-feature columns (word, participant_id,
# audio_file, score) that the preprocessing cell also excludes.
dataset_summary = [
    "Dataset Info:",
    f"  Total samples: {df.shape[0]}",
    f"  Features: {df.shape[1] - 4}",
    f"  Unique words: {df['word'].nunique()}",
    f"  Unique participants: {df['participant_id'].nunique()}",
    f"\nScore Statistics:",
]
print("\n".join(dataset_summary))
print(df['score'].describe())

# Last expression of the cell, so the notebook renders the first rows
df.head()

In [None]:
# Label distribution at a glance: histogram on the left, box plot on the right
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(10, 5))

hist_ax.hist(df['score'], bins=20, edgecolor='black')
hist_ax.set_xlabel('Pronunciation Score')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Score Distribution')

sns.boxplot(y=df['score'], ax=box_ax)
box_ax.set_ylabel('Pronunciation Score')
box_ax.set_title('Score Box Plot')

plt.tight_layout()
plt.show()

## 🔧 Data Preprocessing

In [None]:
# Columns that identify or label a sample; every remaining column is a model input
metadata_cols = ['word', 'participant_id', 'audio_file', 'score']
feature_cols = [name for name in df.columns if name not in metadata_cols]

X = df[feature_cols].to_numpy()
y = df['score'].to_numpy()

print(f"Feature matrix shape: {X.shape}")
print(f"Label vector shape: {y.shape}")

# Targets are rescaled from 0-100 to [0, 1] to match the model's sigmoid output
y_normalized = y / 100.0

# 70 / 15 / 15 split in two steps: carve off the test set first, then take
# 0.1765 of the remaining 85% (0.1765 * 0.85 ≈ 0.15) as the validation set.
# NOTE(review): rows are split at random, so if participant_id identifies
# speakers, the same speaker can land in both train and test — confirm that
# speaker-dependent evaluation is intended.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y_normalized, test_size=0.15, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.1765, random_state=42
)

print(f"\nTrain set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# Standardization statistics come from the training split only and are then
# applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

# Persist the statistics (and the feature order) so inference can repeat the scaling
scaler_params = {
    'mean': scaler.mean_.tolist(),
    'scale': scaler.scale_.tolist(),
    'feature_names': feature_cols
}

with open('scaler_params.json', 'w') as f:
    f.write(json.dumps(scaler_params, indent=2))

print("\n✅ Scaler parameters saved to scaler_params.json")

## 🧠 Define Neural Network Model

In [None]:
class PronunciationScorer(nn.Module):
    """Feed-forward regressor mapping a feature vector to a score in [0, 1].

    The network is a stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout``
    groups, one per entry of ``hidden_dims``, followed by a single-unit linear
    layer and a sigmoid.

    Args:
        input_dim: Number of features per sample.
        hidden_dims: Width of each hidden layer, in order. Any sequence of
            ints is accepted; an empty sequence yields ``Linear -> Sigmoid``.
        dropout: Dropout probability applied after every hidden activation.
    """

    # The default is a tuple rather than a list so the shared default object
    # can never be mutated between instantiations.
    def __init__(self, input_dim, hidden_dims=(128, 64, 32), dropout=0.3):
        super().__init__()

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim

        # Output layer: one unit squashed by a sigmoid, so predictions stay in [0, 1]
        layers.append(nn.Linear(prev_dim, 1))
        layers.append(nn.Sigmoid())

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return scores of shape ``(batch, 1)`` for ``x`` of shape ``(batch, input_dim)``."""
        return self.model(x)


# The input layer width is the number of columns in the scaled training matrix
input_dim = X_train_scaled.shape[-1]

# Build the network with its default architecture and place it on the chosen device
model = PronunciationScorer(input_dim=input_dim)
model = model.to(device)

print(f"Model architecture:")
print(model)

# Count every parameter element held by the module
parameter_sizes = [p.numel() for p in model.parameters()]
print(f"\nTotal parameters: {sum(parameter_sizes)}")

## 🔄 Training Setup

In [None]:
# Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.001
NUM_EPOCHS = 100
PATIENCE = 15  # Early stopping patience

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=1e-5)
# Halve the learning rate after 5 epochs without validation-loss improvement.
# `verbose` is deliberately not passed: the argument is deprecated in recent
# PyTorch releases and newer versions may reject it outright.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=5
)

# Convert to PyTorch tensors; targets get a trailing dimension so they line up
# with the model's single-column output.
X_train_tensor = torch.FloatTensor(X_train_scaled).to(device)
y_train_tensor = torch.FloatTensor(y_train).unsqueeze(1).to(device)
X_val_tensor = torch.FloatTensor(X_val_scaled).to(device)
y_val_tensor = torch.FloatTensor(y_val).unsqueeze(1).to(device)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)
y_test_tensor = torch.FloatTensor(y_test).unsqueeze(1).to(device)

# Create data loaders
train_dataset = torch.utils.data.TensorDataset(X_train_tensor, y_train_tensor)
# BatchNorm1d cannot compute batch statistics from a single sample in training
# mode and raises an error. Drop the trailing batch only when it would hold
# exactly one sample; with shuffling, a different sample is skipped each epoch.
drop_single_sample_batch = len(train_dataset) % BATCH_SIZE == 1
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=drop_single_sample_batch,
)

print(f"Training setup complete")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Learning rate: {LEARNING_RATE}")
print(f"  Max epochs: {NUM_EPOCHS}")

## 🏋️ Train Model

In [None]:
# Training history (losses on the [0, 1] target scale, MAE on the 0-100 scale)
history = {
    'train_loss': [],
    'val_loss': [],
    'train_mae': [],
    'val_mae': []
}

best_val_loss = float('inf')
patience_counter = 0
best_model_state = None

print("Starting training...\n")

for epoch in range(NUM_EPOCHS):
    # Training phase
    model.train()
    train_loss = 0.0
    train_mae = 0.0

    for batch_X, batch_y in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)

        # Backward pass
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        train_mae += torch.mean(torch.abs(outputs - batch_y)).item()

    # Average of the per-batch means
    train_loss /= len(train_loader)
    train_mae /= len(train_loader)

    # Validation phase: whole validation set in one forward pass, no gradients
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor).item()
        val_mae = torch.mean(torch.abs(val_outputs - y_val_tensor)).item()

    # Record history
    history['train_loss'].append(train_loss)
    history['val_loss'].append(val_loss)
    history['train_mae'].append(train_mae * 100)  # Convert to 0-100 scale
    history['val_mae'].append(val_mae * 100)

    # Learning rate scheduling
    scheduler.step(val_loss)

    # Print progress
    if (epoch + 1) % 5 == 0:
        print(f"Epoch {epoch+1}/{NUM_EPOCHS}:")
        print(f"  Train Loss: {train_loss:.6f}, MAE: {train_mae*100:.2f}")
        print(f"  Val Loss: {val_loss:.6f}, MAE: {val_mae*100:.2f}")

    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # state_dict() hands back tensors that share storage with the live
        # parameters, and dict.copy() is shallow, so the tensors themselves
        # must be cloned. Otherwise later optimizer steps overwrite this
        # snapshot and the "best" weights end up being the last ones.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"  ✓ New best model (val_loss: {val_loss:.6f})")
    else:
        patience_counter += 1
        if patience_counter >= PATIENCE:
            print(f"\nEarly stopping at epoch {epoch+1}")
            break

# Load best model
if best_model_state is None:
    # Happens when no epoch ran or the validation loss was never a finite
    # improvement (e.g. NaN); fail with an explicit message instead of
    # passing None to load_state_dict.
    raise RuntimeError(
        "Training produced no valid checkpoint: validation loss never improved "
        "on its initial value. Check the data for NaN/inf values."
    )
model.load_state_dict(best_model_state)
print("\n✅ Training complete! Best model loaded.")

## 📈 Visualize Training Progress

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One row per panel: axis, (history key, legend label) for train and val,
# y-axis label, title. Left panel is the loss, right panel the MAE.
panels = (
    (axes[0], ('train_loss', 'Train Loss'), ('val_loss', 'Val Loss'),
     'MSE Loss', 'Training & Validation Loss'),
    (axes[1], ('train_mae', 'Train MAE'), ('val_mae', 'Val MAE'),
     'Mean Absolute Error (0-100 scale)', 'Training & Validation MAE'),
)

for ax, train_curve, val_curve, y_label, panel_title in panels:
    for history_key, legend_label in (train_curve, val_curve):
        ax.plot(history[history_key], label=legend_label)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(y_label)
    ax.set_title(panel_title)
    ax.legend()
    ax.grid(True)

plt.tight_layout()
plt.show()

# Metrics of the last epoch that ran (last entries of the history lists)
last_train_mae = history['train_mae'][-1]
last_val_mae = history['val_mae'][-1]
print(f"Final Train MAE: {last_train_mae:.2f}")
print(f"Final Val MAE: {last_val_mae:.2f}")

## 🧪 Evaluate on Test Set

In [None]:
# Score the held-out split in evaluation mode with gradient tracking off
model.eval()
with torch.no_grad():
    test_outputs = model(X_test_tensor)
    test_loss = criterion(test_outputs, y_test_tensor).item()
    test_mae = (test_outputs - y_test_tensor).abs().mean().item() * 100

# Bring predictions and targets back to the 0-100 scale
y_test_pred = 100 * test_outputs.cpu().numpy()
y_test_actual = 100 * y_test

# The loss is on the [0, 1] scale, so RMSE is its square root times 100
print("Test Set Performance:")
print(f"  MSE Loss: {test_loss:.6f}")
print(f"  MAE: {test_mae:.2f}")
print(f"  RMSE: {np.sqrt(test_loss) * 100:.2f}")

# Predicted vs actual; the dashed diagonal marks perfect agreement
plt.figure(figsize=(10, 6))
plt.scatter(y_test_actual, y_test_pred, alpha=0.6)
plt.plot([0, 100], [0, 100], 'r--', label='Perfect prediction')
plt.title('Test Set: Predicted vs Actual Scores')
plt.xlabel('Actual Score')
plt.ylabel('Predicted Score')
plt.legend()
plt.grid(True)
plt.show()

# Signed errors (prediction minus target); the dashed line marks zero error
errors = y_test_pred.ravel() - y_test_actual
error_mean, error_std = np.mean(errors), np.std(errors)
plt.figure(figsize=(10, 5))
plt.hist(errors, bins=30, edgecolor='black')
plt.axvline(x=0, color='r', linestyle='--')
plt.title(f'Error Distribution (Mean: {error_mean:.2f}, Std: {error_std:.2f})')
plt.xlabel('Prediction Error')
plt.ylabel('Frequency')
plt.grid(True)
plt.show()

## 💾 Export Model for Backend

In [None]:
# Save model weights.
# Tensors are moved to the CPU first so the checkpoint also loads on a backend
# without CUDA, with no need for a map_location argument.
cpu_state_dict = {
    name: tensor.detach().cpu()
    for name, tensor in model.state_dict().items()
}
torch.save(cpu_state_dict, 'pronunciation_scorer.pth')

# Save model architecture info.
# Layer widths and the dropout rate are read back from the trained network
# rather than repeated as literals, so the JSON always matches the weights.
linear_layers = [layer for layer in model.model if isinstance(layer, nn.Linear)]
dropout_layers = [layer for layer in model.model if isinstance(layer, nn.Dropout)]

model_info = {
    'input_dim': input_dim,
    # Every Linear layer except the final single-unit output layer is a hidden layer
    'hidden_dims': [layer.out_features for layer in linear_layers[:-1]],
    'dropout': dropout_layers[0].p if dropout_layers else 0.0,
    'test_mae': float(test_mae),
    'test_rmse': float(np.sqrt(test_loss) * 100),
    'num_samples': len(df),
    'num_features': input_dim
}

with open('model_info.json', 'w') as f:
    json.dump(model_info, f, indent=2)

print("✅ Model exported:")
print("   - pronunciation_scorer.pth (model weights)")
print("   - model_info.json (architecture)")
print("   - scaler_params.json (feature normalization)")
print("\nDownload these 3 files to your backend/models/ directory")

In [None]:
# Send the three exported artefacts to the browser, one download per file
from google.colab import files

print("Downloading model files...")
for artefact_name in ('pronunciation_scorer.pth', 'model_info.json', 'scaler_params.json'):
    files.download(artefact_name)
print("✅ Download complete!")

## 🎯 Sample Predictions

In [None]:
# Show some sample predictions
model.eval()
# np.random.choice(..., replace=False) raises ValueError when asked for more
# indices than exist, so cap the request at the size of the test split.
num_samples = min(10, len(X_test))

with torch.no_grad():
    sample_indices = np.random.choice(len(X_test), num_samples, replace=False)

    print("Sample Predictions:\n")
    print(f"{'Actual':<10} {'Predicted':<10} {'Error':<10}")
    print("-" * 35)

    for idx in sample_indices:
        # Targets are stored on the [0, 1] scale; report on the 0-100 scale
        actual = y_test[idx] * 100
        # Slice idx:idx+1 keeps the batch dimension the model expects
        features = torch.FloatTensor(X_test_scaled[idx:idx+1]).to(device)
        predicted = model(features).item() * 100
        error = predicted - actual

        print(f"{actual:<10.1f} {predicted:<10.1f} {error:+.1f}")