# 🗣️ EpigrafIA - Accent Detection Model Training

This notebook trains a CNN model to detect 8 accents across 4 languages.

**Note:** This is a simplified version since the Common Voice dataset may not have detailed accent metadata.
We'll create a functional model structure that can be trained once proper accent-labeled data is available.

## Accents to detect:
- Spanish: España vs México
- English: UK vs USA  
- French: France vs Quebec
- German: Germany vs Austria

## 📦 Cell 1: Install Dependencies

In [None]:
# Install the libraries this notebook uses. Every package except tqdm is
# pinned to an exact version so that reruns resolve the same environment.
!pip install -q tensorflow==2.15.0
!pip install -q librosa==0.10.1  
!pip install -q pandas==2.1.3
!pip install -q scikit-learn==1.3.2
!pip install -q tensorflowjs==4.15.0
!pip install -q matplotlib==3.8.2
!pip install -q seaborn==0.13.0
# tqdm is installed without a version pin.
!pip install -q tqdm

print("‚úÖ Dependencies installed")

## ⚙️ Cell 2: Configuration

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from tqdm import tqdm

# --- Reproducibility -------------------------------------------------------
# One shared seed is handed to both NumPy and TensorFlow.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# --- Filesystem layout -----------------------------------------------------
DATA_DIR = '../data/Common Voice'
OUTPUT_DIR = '../frontend/public/models'

# --- Audio front-end (kept identical to the language model) -----------------
SAMPLE_RATE = 16_000   # Hz
DURATION = 3           # seconds of audio kept per clip
N_MFCC = 40
N_FFT = 2048
HOP_LENGTH = 512

# --- Training hyper-parameters ----------------------------------------------
BATCH_SIZE = 32
EPOCHS = 50
# Slightly lower learning rate: accent cues are more subtle than language cues.
LEARNING_RATE = 5e-4

# --- Target labels: two accents for each of the four languages --------------
ACCENT_CLASSES = [
    'spain',
    'mexico',   # Spanish
    'uk',
    'usa',      # English
    'france',
    'quebec',   # French
    'germany',
    'austria',  # German
]

print("‚úÖ Configuration loaded")
print(f"Accent classes: {len(ACCENT_CLASSES)}")

## 📝 Cell 3: Create Synthetic Accent Dataset

Since the dataset has no real accent labels, we create a synthetic dataset as follows:
1. Split each language's data into 2 groups (simulating 2 accents).
2. Train a model on this structure.
3. Expect the model to learn mainly language features, since the two groups within a language are arbitrary; the architecture will nevertheless be ready for real accent data.

In [None]:
def create_synthetic_accent_dataset(max_per_accent=250, data_dir=None):
    """
    Create a synthetic accent dataset by splitting each language's clips in two.

    The ``.mp3`` files found in ``<data_dir>/<language folder>/clips`` are
    sorted by name and cut at the midpoint: the first half is labelled with
    the language's first accent and the second half with its second accent.
    The labels therefore carry no real accent information; in production,
    replace this with real accent-labeled data.

    Args:
        max_per_accent: Maximum number of clips kept for each accent label.
        data_dir: Root directory containing the per-language folders.
            Defaults to the module-level ``DATA_DIR`` when ``None``.

    Returns:
        A tuple ``(audio_paths, labels)`` of two parallel lists: the full
        path of every selected clip and the accent label assigned to it.
        Language folders that are missing are reported and skipped.
    """
    if data_dir is None:
        data_dir = DATA_DIR

    audio_paths = []
    labels = []

    # Map languages to their accent pairs
    # NOTE(review): the first key looks mis-encoded (mojibake for
    # 'Audios Español'); confirm it matches the folder name on disk.
    language_accent_map = {
        'Audios Espa√±ol': ['spain', 'mexico'],
        'Audios Ingles': ['uk', 'usa'],
        'Audios Frances': ['france', 'quebec'],
        'Audios Aleman': ['germany', 'austria']
    }

    for folder, accents in language_accent_map.items():
        clips_path = os.path.join(data_dir, folder, 'clips')

        # isdir (not exists) so that a stray file with this name is skipped
        # instead of making os.listdir raise.
        if not os.path.isdir(clips_path):
            print(f"‚ö†Ô∏è Warning: {clips_path} not found")
            continue

        # os.listdir returns entries in arbitrary order; sorting makes the
        # split into the two simulated accents reproducible across runs.
        audio_files = sorted(
            f for f in os.listdir(clips_path) if f.endswith('.mp3')
        )

        # Split files into 2 groups (simulating 2 accents); with an odd
        # number of files the second group receives the extra one.
        mid_point = len(audio_files) // 2
        halves = (audio_files[:mid_point], audio_files[mid_point:])

        for accent, half in zip(accents, halves):
            files = half[:max_per_accent]
            audio_paths.extend(os.path.join(clips_path, f) for f in files)
            labels.extend([accent] * len(files))

            print(f"‚úÖ {accent}: {len(files)} audios")

    return audio_paths, labels

# Build the placeholder dataset and report how many clips each accent received.
print("üìÇ Creating synthetic accent dataset...")
print("‚ö†Ô∏è NOTE: Using synthetic labels. For production, use real accent metadata.\n")

audio_paths, labels = create_synthetic_accent_dataset(max_per_accent=250)

print(f"\nüìä Total: {len(audio_paths)} audios")
print(f"\nDistribution:")
# Tally in ACCENT_CLASSES order so that accents with no clips still show as 0.
clips_per_accent = {name: labels.count(name) for name in ACCENT_CLASSES}
for accent, count in clips_per_accent.items():
    print(f"  {accent}: {count}")

## 🎵 Cell 4: Feature Extraction (Reuse from Language Model)

In [None]:
def extract_mfcc_features(audio_path, sr=SAMPLE_RATE, duration=DURATION, n_mfcc=N_MFCC):
    """
    Load one audio clip and turn it into a normalized MFCC feature matrix.

    The clip is loaded at ``sr`` Hz, zero-padded or truncated to exactly
    ``duration`` seconds, and described by its MFCCs plus their first- and
    second-order deltas. Each feature column is then standardized to zero
    mean and unit variance over the time axis of the clip.

    Args:
        audio_path: Path of the audio file to load.
        sr: Sample rate the audio is resampled to when loading.
        duration: Clip length in seconds; may be fractional.
        n_mfcc: Number of MFCC coefficients to compute.

    Returns:
        Array of shape ``(n_frames, 3 * n_mfcc)``, or ``None`` when the clip
        could not be loaded or processed (the failure is printed so skipped
        files are visible).
    """
    try:
        # librosa.load already resamples to `sr`, so the returned rate is unused.
        y, _ = librosa.load(audio_path, sr=sr, duration=duration)

        # Cast to int: a fractional duration would otherwise produce a float
        # length, which np.pad and slicing both reject.
        target_length = int(sr * duration)
        if len(y) < target_length:
            y = np.pad(y, (0, target_length - len(y)), mode='constant')
        else:
            y = y[:target_length]

        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc, n_fft=N_FFT, hop_length=HOP_LENGTH)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        # Stack along the coefficient axis, then transpose to (frames, features).
        features = np.concatenate([mfcc, mfcc_delta, mfcc_delta2], axis=0).T

        # Normalize each feature column; the epsilon guards constant columns
        # (e.g. an all-silent clip) against division by zero.
        mean = np.mean(features, axis=0)
        std = np.std(features, axis=0)
        features = (features - mean) / (std + 1e-8)

        return features
    except Exception as e:
        # Best-effort by design: one unreadable clip must not abort the whole
        # extraction run, but the failure is reported instead of being hidden.
        print(f"Warning: could not extract features from {audio_path}: {e}")
        return None

print("‚úÖ Feature extraction function ready")

## 🏗️ Cell 5: Prepare Dataset

In [None]:
# Extract features clip by clip, keeping only clips that were processed
# successfully so that features and labels stay aligned.
feature_rows, kept_labels = [], []

print("üîÑ Extracting features...")
for clip_path, accent in tqdm(zip(audio_paths, labels), total=len(audio_paths)):
    clip_features = extract_mfcc_features(clip_path)
    if clip_features is None:
        continue
    feature_rows.append(clip_features)
    kept_labels.append(accent)

X = np.array(feature_rows)
y = np.array(kept_labels)

print(f"\n‚úÖ Dataset: {X.shape}")

# Map the accent names to integer class ids.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

print(f"\nüè∑Ô∏è Classes: {label_encoder.classes_}")

## 🔀 Cell 6: Train/Val/Test Split

In [None]:
# Stage 1: hold out 30% of the samples, stratified by class.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.3,
    random_state=RANDOM_SEED,
)

# Stage 2: cut the held-out part in half -> validation and test sets.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    stratify=y_temp,
    test_size=0.5,
    random_state=RANDOM_SEED,
)

print("üìä Split:")
print(f"   Train: {len(X_train)}")
print(f"   Val: {len(X_val)}")
print(f"   Test: {len(X_test)}")

## 🧠 Cell 7: Build Model (Deeper for Accent Subtleties)

In [None]:
def build_accent_model(input_shape, num_classes):
    """
    Assemble the (uncompiled) 1-D CNN used for accent classification.

    The network has three pooled convolution stages (64, 128 and 256
    filters), a final 512-filter convolution stage reduced by global average
    pooling, two dense layers with dropout, and a softmax output.

    Args:
        input_shape: Shape of one input sample, passed to the Input layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A ``keras.Sequential`` model.
    """
    # (filters, dropout rate) of each Conv -> ReLU -> BatchNorm -> MaxPool -> Dropout stage
    pooled_stages = ((64, 0.3), (128, 0.3), (256, 0.4))
    # (units, dropout rate) of each Dense -> Dropout pair in the classifier head
    dense_stages = ((256, 0.5), (128, 0.4))

    stack = [keras.layers.Input(shape=input_shape)]

    # Deeper architecture for subtle accent features
    for n_filters, drop_rate in pooled_stages:
        stack.extend([
            keras.layers.Conv1D(n_filters, kernel_size=3, padding='same'),
            keras.layers.ReLU(),
            keras.layers.BatchNormalization(),
            keras.layers.MaxPooling1D(pool_size=2),
            keras.layers.Dropout(drop_rate),
        ])

    # Last convolution stage: collapse the time axis instead of pooling by 2.
    stack.extend([
        keras.layers.Conv1D(512, kernel_size=3, padding='same'),
        keras.layers.ReLU(),
        keras.layers.BatchNormalization(),
        keras.layers.GlobalAveragePooling1D(),
    ])

    for n_units, drop_rate in dense_stages:
        stack.extend([
            keras.layers.Dense(n_units, activation='relu'),
            keras.layers.Dropout(drop_rate),
        ])

    stack.append(keras.layers.Dense(num_classes, activation='softmax'))

    return keras.Sequential(stack)

# One output unit per encoded accent label; the per-sample input shape is
# taken from axes 1 and 2 of the training array.
num_classes = len(label_encoder.classes_)
input_shape = (X_train.shape[1], X_train.shape[2])

model = build_accent_model(input_shape, num_classes)
model.summary()

# Labels are integer class ids, hence the sparse cross-entropy loss.
adam = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=adam,
    metrics=['accuracy'],
)

print("\n‚úÖ Model compiled")

## 🏋️ Cell 8: Train Model

In [None]:
# Stop once validation loss has not improved for 10 epochs and restore the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)
# Halve the learning rate after 5 epochs without improvement, down to 1e-6.
lr_on_plateau = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=5,
    min_lr=1e-6,
)
callbacks = [early_stopping, lr_on_plateau]

print("üèãÔ∏è Training accent model...")

history = model.fit(
    X_train,
    y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)

print("\n‚úÖ Training complete")

## 📊 Cell 9: Evaluate

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Overall loss and accuracy on the held-out test split.
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f"üìà Test Accuracy: {test_accuracy*100:.2f}%")
print(f"   Test Loss: {test_loss:.4f}")

# Per-class metrics: take the highest-probability class of each prediction.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)
class_names = label_encoder.classes_

print("\nüìã Classification Report:")
print(classification_report(y_test, y_pred_classes, target_names=class_names, digits=3))

# Confusion matrix drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred_classes)
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Purples',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Accent Detection - Confusion Matrix', fontsize=16, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

## 🌐 Cell 10: Convert to TensorFlow.js

In [None]:
import tensorflowjs as tfjs

# Everything for the accent model is written to one folder under OUTPUT_DIR.
accent_dir = os.path.join(OUTPUT_DIR, 'accent')
os.makedirs(accent_dir, exist_ok=True)

print("üîÑ Converting to TensorFlow.js...")

tfjs.converters.save_keras_model(model, accent_dir)

# Store the preprocessing settings and class order next to the model.
config = {
    'sample_rate': SAMPLE_RATE,
    'duration': DURATION,
    'n_mfcc': N_MFCC,
    'n_fft': N_FFT,
    'hop_length': HOP_LENGTH,
    'classes': label_encoder.classes_.tolist(),
    'input_shape': list(input_shape),
    'test_accuracy': float(test_accuracy),
    'note': 'Trained on synthetic accent labels. Replace with real accent data for production.',
}

config_path = os.path.join(accent_dir, 'config.json')
with open(config_path, 'w') as config_file:
    json.dump(config, config_file, indent=2)

print(f"\n‚úÖ Model saved to: {OUTPUT_DIR}/accent/")
print("\nüéâ Accent model ready!")
print("\n‚ö†Ô∏è Remember: This model was trained on synthetic labels.")
print("   For production, retrain with real accent-labeled data.")