# 🧠 EpigrafIA - Language Detection Model Training

This notebook trains a CNN model to detect 4 languages: Spanish, English, French, and German.

## Pipeline:
1. Load audio data from the Common Voice dataset (up to 2000 samples per language)
2. Extract MFCC features (40 coefficients + deltas + delta-deltas)
3. Build & train CNN architecture
4. Evaluate performance
5. Convert to TensorFlow.js format

## 📦 Cell 1: Install Dependencies

In [None]:
# Install the third-party packages imported by the later cells.
# The leading "!" is IPython shell syntax, so this cell only runs inside a
# notebook kernel, not as a plain Python script.
!pip install -q tensorflow==2.15.0
!pip install -q librosa==0.10.1
!pip install -q pandas==2.1.3
!pip install -q scikit-learn==1.3.2
!pip install -q tensorflowjs==4.15.0
!pip install -q matplotlib==3.8.2
!pip install -q seaborn==0.13.0
# NOTE: tqdm is the only package here without a version pin.
# NOTE(review): joblib is imported in the next cell but not installed here;
# presumably it arrives as a scikit-learn dependency - confirm.
!pip install -q tqdm

print("‚úÖ Dependencies installed")

## ⚙️ Cell 2: Configuration & Imports

In [None]:
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import json
from tqdm import tqdm

# Set random seeds
# Seeds NumPy's and TensorFlow's global RNGs; the same value is also passed
# as random_state to the train/val/test split.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# Audio configuration
# These values are the defaults of extract_mfcc_features and are written to
# the exported config JSON.
SAMPLE_RATE = 16000  # Hz; rate the audio is loaded at
DURATION = 3  # seconds
N_MFCC = 40  # MFCC coefficients per frame (deltas and delta-deltas are added on top)
N_FFT = 2048  # n_fft passed to librosa.feature.mfcc
HOP_LENGTH = 512  # hop_length passed to librosa.feature.mfcc

# Paths
# DATA_DIR holds one folder per language (see LANGUAGE_FOLDERS), each expected
# to contain a 'clips' subdirectory with the MP3 files.
DATA_DIR = '../data/Common Voice'
# The TensorFlow.js model is written to OUTPUT_DIR/language.
OUTPUT_DIR = '../frontend/public/models'

# Model parameters
BATCH_SIZE = 32  # batch_size passed to model.fit
EPOCHS = 50  # upper bound; early stopping may end training sooner
LEARNING_RATE = 0.001  # initial Adam learning rate

# Language mapping
# Maps the language code used as class label to its folder name under DATA_DIR.
# NOTE(review): the Spanish folder name contains what looks like mis-decoded
# UTF-8 (probably "Español") - confirm it matches the directory on disk.
LANGUAGE_FOLDERS = {
    'es': 'Audios Espa√±ol',
    'en': 'Audios Ingles',
    'fr': 'Audios Frances',
    'de': 'Audios Aleman'
}

print("‚úÖ Configuration loaded")
print(f"Sample Rate: {SAMPLE_RATE} Hz")
print(f"Duration: {DURATION} seconds")
print(f"MFCCs: {N_MFCC}")

## 📂 Cell 3: Load Dataset

In [None]:
def load_audio_files(max_samples_per_lang=2000):
    """
    Collect MP3 clip paths and language labels from the Common Voice folders.

    For every entry of LANGUAGE_FOLDERS the directory
    ``DATA_DIR/<folder>/clips`` is scanned for ``.mp3`` files. File names are
    sorted before truncation so the selected subset is the same on every run
    and machine (``os.listdir`` returns entries in arbitrary order).

    Args:
        max_samples_per_lang: Maximum number of clips kept per language.

    Returns:
        Tuple ``(audio_paths, labels)`` of two parallel lists: full file
        paths and the matching language codes. A language whose clips
        directory is missing is skipped after printing a warning.
    """
    audio_paths = []
    labels = []

    for lang_code, folder_name in LANGUAGE_FOLDERS.items():
        clips_path = os.path.join(DATA_DIR, folder_name, 'clips')

        # isdir rather than exists: a stray file named "clips" is reported as
        # missing instead of making os.listdir raise.
        if not os.path.isdir(clips_path):
            print(f"‚ö†Ô∏è Warning: {clips_path} not found")
            continue

        # Sort first, then cap, so the kept subset is deterministic.
        audio_files = sorted(
            f for f in os.listdir(clips_path) if f.endswith('.mp3')
        )[:max_samples_per_lang]

        audio_paths.extend(os.path.join(clips_path, f) for f in audio_files)
        labels.extend([lang_code] * len(audio_files))

        print(f"‚úÖ {lang_code}: {len(audio_files)} audios loaded")

    return audio_paths, labels

# Build the path/label lists for the whole dataset
print("üìÇ Loading dataset...")
audio_paths, labels = load_audio_files(max_samples_per_lang=2000)

print(f"\nüìä Total audios: {len(audio_paths)}")
print("\nDistribution by language:")
# Clip count and share of the total for each language present
for lang in set(labels):
    count = sum(1 for item in labels if item == lang)
    share = count / len(labels) * 100
    print(f"  {lang}: {count} ({share:.1f}%)")

## 🎵 Cell 4: MFCC Feature Extraction

In [None]:
def extract_mfcc_features(audio_path, sr=SAMPLE_RATE, duration=DURATION, n_mfcc=N_MFCC,
                          n_fft=N_FFT, hop_length=HOP_LENGTH):
    """
    Extract MFCC + deltas + delta-deltas from an audio file.

    The clip is loaded at ``sr`` Hz, zero-padded or truncated to exactly
    ``duration`` seconds, converted to MFCCs, stacked with first- and
    second-order deltas, and standardized per feature column over time.

    Args:
        audio_path: Path of the audio file to load.
        sr: Sampling rate the audio is loaded at.
        duration: Clip length in seconds; may be fractional.
        n_mfcc: Number of MFCC coefficients.
        n_fft: FFT window length passed to ``librosa.feature.mfcc``.
        hop_length: Hop length passed to ``librosa.feature.mfcc``.

    Returns:
        numpy array of shape (time_steps, n_mfcc * 3), or None if the file
        could not be processed (the error is printed).
    """
    try:
        # librosa.load already resamples to `sr`, so the returned rate is unused.
        y, _ = librosa.load(audio_path, sr=sr, duration=duration)

        # Fixed number of samples for every clip. int() keeps the length valid
        # for fractional durations, where sr * duration is a float.
        target_length = int(round(sr * duration))

        # Zero-pads short clips at the end and truncates long ones.
        y = librosa.util.fix_length(y, size=target_length)

        mfcc = librosa.feature.mfcc(
            y=y,
            sr=sr,
            n_mfcc=n_mfcc,
            n_fft=n_fft,
            hop_length=hop_length
        )

        # First- and second-order deltas, both derived from the MFCC matrix.
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)

        # Stack along the coefficient axis, then transpose to
        # (time_steps, features).
        features = np.concatenate([mfcc, mfcc_delta, mfcc_delta2], axis=0).T

        # Standardize each feature column over time; the epsilon avoids a
        # division by zero for constant columns.
        mean = np.mean(features, axis=0)
        std = np.std(features, axis=0)
        features = (features - mean) / (std + 1e-8)

        return features

    except Exception as e:
        # Deliberately broad: one unreadable clip must not abort the whole
        # dataset build. prepare_dataset skips None results.
        print(f"‚ùå Error processing {audio_path}: {e}")
        return None

# Smoke-test the extractor on the first clip of the dataset
print("üß™ Testing feature extraction...")
test_features = extract_mfcc_features(audio_paths[0])
if test_features is not None:
    shape = test_features.shape
    print(f"‚úÖ Feature shape: {shape}")
    print(f"   (time_steps={shape[0]}, features={shape[1]})")

## 🏗️ Cell 5: Prepare Full Dataset

In [None]:
def prepare_dataset(audio_paths, labels):
    """
    Turn the path/label lists into feature and label arrays.

    Every path is run through extract_mfcc_features; clips for which it
    returns None are dropped together with their label.

    Returns:
        Tuple (X, y) of numpy arrays built from the kept features and labels.
    """
    print("üîÑ Extracting features from all audios...")

    kept_features, kept_labels = [], []
    pairs = zip(audio_paths, labels)
    for path, lang in tqdm(pairs, total=len(audio_paths)):
        feats = extract_mfcc_features(path)
        if feats is None:
            # Extraction failed for this clip; skip it.
            continue
        kept_features.append(feats)
        kept_labels.append(lang)

    return np.array(kept_features), np.array(kept_labels)

# Run feature extraction over every clip
X, y = prepare_dataset(audio_paths, labels)

print(f"\n‚úÖ Dataset prepared:")
for array_name, array in (("X", X), ("y", y)):
    print(f"   {array_name} shape: {array.shape}")

# Map the language codes to integer class ids
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

encoded_ids = np.unique(y_encoded)
print(f"\nüè∑Ô∏è Classes: {label_encoder.classes_}")
print(f"   Encoded as: {encoded_ids}")

## 🔀 Cell 6: Train/Val/Test Split

In [None]:
# 70/15/15 split: hold out 30% first, then halve the holdout into
# validation and test. Both steps are stratified by class.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y_encoded, test_size=0.3, random_state=RANDOM_SEED, stratify=y_encoded
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_SEED, stratify=y_temp
)

print("üìä Dataset split:")
# The padded names keep the sample counts aligned in the output
for split_name, split_X in (("Train:", X_train), ("Val:  ", X_val), ("Test: ", X_test)):
    n_split = split_X.shape[0]
    print(f"   {split_name} {n_split} samples ({n_split/len(X)*100:.1f}%)")

print("\nüìà Distribution in Train set:")
unique, counts = np.unique(y_train, return_counts=True)
for cls, count in zip(unique, counts):
    class_name = label_encoder.classes_[cls]
    print(f"   {class_name}: {count}")

## 🧠 Cell 7: Build CNN Model

In [None]:
def build_cnn_model(input_shape, num_classes):
    """
    Build the 1D-CNN used for language detection.

    Three Conv1D -> ReLU -> BatchNormalization stages with 64, 128 and 256
    filters. The first two are followed by max pooling and dropout, the
    third by global average pooling. A dense head ends in a softmax over
    ``num_classes``.

    Args:
        input_shape: Shape of one sample, (time_steps, features).
        num_classes: Number of output classes.

    Returns:
        An uncompiled keras.Sequential model.
    """
    layers = keras.layers

    def _conv_stage(filters):
        # Convolution, activation and normalization shared by every block
        return [
            layers.Conv1D(filters, kernel_size=3, padding='same'),
            layers.ReLU(),
            layers.BatchNormalization(),
        ]

    stack = [layers.Input(shape=input_shape)]

    # Blocks 1 and 2: low- and mid-level features, downsampled by 2 each
    for filters in (64, 128):
        stack.extend(_conv_stage(filters))
        stack.append(layers.MaxPooling1D(pool_size=2))
        stack.append(layers.Dropout(0.3))

    # Block 3: high-level features, collapsed over the time axis
    stack.extend(_conv_stage(256))
    stack.append(layers.GlobalAveragePooling1D())

    # Classification head
    stack.append(layers.Dense(128, activation='relu'))
    stack.append(layers.Dropout(0.4))
    stack.append(layers.Dense(num_classes, activation='softmax'))

    return keras.Sequential(stack)

# Derive the model signature from the data: one sample is (time_steps, features)
input_shape = tuple(X_train.shape[1:3])
num_classes = len(label_encoder.classes_)

model = build_cnn_model(input_shape, num_classes)

print("üß† Model Architecture:")
model.summary()

# Integer class ids are used as targets, hence the sparse loss
optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE)
model.compile(
    optimizer=optimizer,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

print("\n‚úÖ Model compiled")

## 🏋️ Cell 8: Train Model

In [None]:
# Stop once validation loss has not improved for 10 epochs and roll back to
# the best weights seen
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=10, restore_best_weights=True, verbose=1
)
# Halve the learning rate after 5 stagnant epochs, never going below 1e-6
reduce_lr = keras.callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1
)
callbacks = [early_stopping, reduce_lr]

print("üèãÔ∏è Training model...")
print(f"Epochs: {EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")

# Fit on the training split, monitoring the validation split
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    callbacks=callbacks,
    verbose=1,
)

print("\n‚úÖ Training completed!")

## 📊 Cell 9: Visualize Training

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(15, 5))

# One panel per metric: accuracy on the left, loss on the right.
# Each panel overlays the training curve and the validation curve.
panels = (('accuracy', 'Accuracy'), ('loss', 'Loss'))
for ax, (metric, pretty) in zip(axes, panels):
    ax.plot(history.history[metric], label=f'Train {pretty}', linewidth=2)
    ax.plot(history.history[f'val_{metric}'], label=f'Val {pretty}', linewidth=2)
    ax.set_title(f'Model {pretty}', fontsize=14, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel(pretty)
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("üìä Training visualization complete")

## 🎯 Cell 10: Evaluate on Test Set

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Score the held-out test split
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=0)

print("üìà Test Set Results:")
print(f"   Loss: {test_loss:.4f}")
print(f"   Accuracy: {test_accuracy*100:.2f}%")

# Class probabilities -> index of the most likely class per sample
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

class_names = label_encoder.classes_

# Per-class precision / recall / F1
print("\nüìã Classification Report:")
report = classification_report(
    y_test, y_pred_classes, target_names=class_names, digits=3
)
print(report)

# Confusion matrix: rows are true labels, columns are predictions
cm = confusion_matrix(y_test, y_pred_classes)

plt.figure(figsize=(10, 8))
heatmap_options = dict(
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
    cbar_kws={'label': 'Count'},
)
sns.heatmap(cm, **heatmap_options)
plt.title('Confusion Matrix', fontsize=16, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

print("\n‚úÖ Evaluation complete")

## 💾 Cell 11: Save Model

In [None]:
# Make sure the target directory exists before writing anything: none of the
# writers below create missing parent directories, and a failure here would
# come only after the whole training run has finished.
os.makedirs('../models', exist_ok=True)

# Save the trained Keras model in HDF5 format
model.save('../models/language_model.h5')
print("‚úÖ Model saved: ../models/language_model.h5")

# Save the fitted label encoder so class ids can be mapped back to language codes
joblib.dump(label_encoder, '../models/language_label_encoder.pkl')
print("‚úÖ Label encoder saved")

# Save the preprocessing parameters and class order next to the model
config = {
    'sample_rate': SAMPLE_RATE,
    'duration': DURATION,
    'n_mfcc': N_MFCC,
    'n_fft': N_FFT,
    'hop_length': HOP_LENGTH,
    'classes': label_encoder.classes_.tolist(),
    'input_shape': list(input_shape),
    'test_accuracy': float(test_accuracy)  # cast so json can serialize it
}

with open('../models/language_config.json', 'w') as f:
    json.dump(config, f, indent=2)

print("‚úÖ Configuration saved")

## 🌐 Cell 12: Convert to TensorFlow.js

In [None]:
import tensorflowjs as tfjs

# All TensorFlow.js artifacts for this model go into OUTPUT_DIR/language
language_dir = os.path.join(OUTPUT_DIR, 'language')
os.makedirs(language_dir, exist_ok=True)

# Export the in-memory Keras model in TensorFlow.js layers format
print("üîÑ Converting model to TensorFlow.js...")
tfjs.converters.save_keras_model(model, language_dir)

print(f"‚úÖ Model converted and saved in: {OUTPUT_DIR}/language/")
print("\nüìÅ Generated files:")
# List what is in the directory at this point, with sizes in MB
for entry in os.listdir(language_dir):
    size_mb = os.path.getsize(os.path.join(language_dir, entry)) / (1024 * 1024)
    print(f"   - {entry} ({size_mb:.2f} MB)")

# Ship the config written by the save cell alongside the converted model
import shutil
shutil.copy('../models/language_config.json', os.path.join(language_dir, 'config.json'))

print("\nüéâ Conversion complete!")
print("\nModel is ready to use in the web application.")

## 📋 Cell 13: Summary

In [None]:
# Final recap of the dataset, model, test metrics and exported files
banner = "=" * 60
print(banner)
print("üéâ LANGUAGE DETECTION MODEL - SUMMARY")
print(banner)

print("\nüìä Dataset:")
print(f"   Total audios: {len(X)}")
print(f"   Languages: {', '.join(label_encoder.classes_)}")
print(f"   Train: {len(X_train)} | Val: {len(X_val)} | Test: {len(X_test)}")

print("\nüß† Model:")
print("   Architecture: CNN (3 Conv1D blocks)")
print(f"   Parameters: {model.count_params():,}")
print(f"   Input shape: {input_shape}")
print(f"   Output classes: {num_classes}")

print("\nüéØ Performance:")
print(f"   Test Accuracy: {test_accuracy*100:.2f}%")
print(f"   Test Loss: {test_loss:.4f}")

print("\nüíæ Output Files:")
for artifact in ("model.json", "*.bin", "config.json"):
    print(f"   ‚úÖ {OUTPUT_DIR}/language/{artifact}")

print("\nüöÄ Next Steps:")
next_steps = (
    "1. Run the accent detection notebook (if needed)",
    "2. cd ../frontend",
    "3. npm install",
    "4. npm run dev",
    "5. Open http://localhost:4321",
)
for step in next_steps:
    print(f"   {step}")

print("\n" + banner)