# Turkish Pronunciation Analysis - Model Training
### PhoneticHybrid ML Training Pipeline

This notebook trains a deep learning model for Turkish pronunciation quality assessment.

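**Architecture:** Fully connected feed-forward network (stacked Linear → ReLU → BatchNorm → Dropout blocks) that classifies utterance-level acoustic feature vectors. A CNN/RNN hybrid is not implemented in this notebook.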

**Features:**
- MFCCs (Mel-frequency cepstral coefficients)
- Formants (F1, F2, F3)
- Fundamental frequency (F0)
- RMS energy
- Duration features

**Training Environment:** Google Colab with GPU

## 1. Setup & Dependencies

In [None]:
# Install required packages
!pip install torch torchvision torchaudio
!pip install librosa soundfile
!pip install praat-parselmouth
!pip install phonemizer
!pip install scikit-learn pandas numpy matplotlib seaborn tqdm

In [None]:
# Import libraries
import os
import json
import numpy as np
import pandas as pd
import librosa
import parselmouth
from parselmouth.praat import call
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Pin the NumPy and PyTorch random generators so repeated runs match
np.random.seed(42)
torch.manual_seed(42)

# Use the GPU when one is visible to PyTorch, otherwise stay on the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

## 2. Mount Google Drive & Load Data

In [None]:
# Mount Google Drive
# `google.colab` is a Colab-specific module, so this cell is expected to run
# inside a Colab runtime.
from google.colab import drive
# Expose the Drive under /content/drive; both paths below live beneath it.
drive.mount('/content/drive')

# Set paths
# DATA_DIR is the root scanned by load_audio_files for participant_* folders.
# MODELS_DIR receives the checkpoints, the training plot and the scaler.
DATA_DIR = '/content/drive/MyDrive/phoneizer/data'
MODELS_DIR = '/content/drive/MyDrive/phoneizer/models'
# Create the output folder up front; exist_ok keeps re-runs from failing.
os.makedirs(MODELS_DIR, exist_ok=True)

In [None]:
# Load dataset
def load_audio_files(data_dir):
    """Collect one metadata row per ``.wav`` recording under *data_dir*.

    Layout read by this function::

        data_dir/participant_*/info.json
        data_dir/participant_*/kelimeler/<prefix>_<word>[_...].wav

    Directories whose name does not start with ``participant_``, that have
    no ``kelimeler`` sub-directory, or that lack ``info.json`` are skipped
    (the last case prints a notice). Entries are visited in sorted order so
    the row order, and therefore any seeded split made from it, is
    reproducible across runs and file systems.

    Args:
        data_dir: Root directory that contains the participant folders.

    Returns:
        pandas.DataFrame with the columns ``participant_id``, ``audio_path``,
        ``word``, ``age`` and ``gender``. The columns exist even when no
        recording is found, so column access on an empty result still works.

    Raises:
        KeyError: If ``info.json`` of a participant with recordings lacks
            ``id``, ``age`` or ``gender``.
    """
    columns = ['participant_id', 'audio_path', 'word', 'age', 'gender']
    rows = []

    # os.listdir returns entries in arbitrary order; sort for determinism.
    for participant_dir in sorted(os.listdir(data_dir)):
        if not participant_dir.startswith('participant_'):
            continue

        participant_path = os.path.join(data_dir, participant_dir)
        words_dir = os.path.join(participant_path, 'kelimeler')
        if not os.path.isdir(words_dir):
            continue

        # Load participant info; one incomplete folder must not abort the scan.
        info_path = os.path.join(participant_path, 'info.json')
        if not os.path.isfile(info_path):
            print(f'Skipping {participant_dir}: info.json not found')
            continue
        with open(info_path, 'r', encoding='utf-8') as f:
            participant_info = json.load(f)

        # Load audio files
        for audio_file in sorted(os.listdir(words_dir)):
            if not audio_file.endswith('.wav'):
                continue

            # The word is the second '_'-separated token of the file name.
            # A name without any underscore falls back to the whole name
            # instead of raising IndexError.
            parts = audio_file.split('_')
            token = parts[1] if len(parts) > 1 else parts[0]

            rows.append({
                'participant_id': participant_info['id'],
                'audio_path': os.path.join(words_dir, audio_file),
                'word': token.replace('.wav', ''),
                'age': participant_info['age'],
                'gender': participant_info['gender'],
            })

    return pd.DataFrame(rows, columns=columns)

# Build the sample table and report how much data was found
df = load_audio_files(DATA_DIR)
n_samples = len(df)
n_participants = df.participant_id.nunique()
print(f'Loaded {n_samples} audio samples from {n_participants} participants')
df.head()

## 3. Feature Extraction

In [None]:
def extract_acoustic_features(audio_path, sr=16000):
    """Turn one recording into a fixed-length acoustic feature vector.

    The vector has 37 entries, in this order:

    * 13 MFCC means and 13 MFCC standard deviations (over frames)
    * mean spectral centroid, roll-off and bandwidth
    * mean zero-crossing rate and mean RMS energy
    * mean F1, F2, F3 from Praat's Burg formant tracker
    * mean and standard deviation of F0 from Praat's pitch tracker
    * duration in seconds

    Args:
        audio_path: Path of the audio file to analyse.
        sr: Sampling rate librosa resamples the signal to for the
            librosa-based features. The Praat analysis opens the file itself
            (presumably at the file's own rate - confirm if that matters).

    Returns:
        1-D ``numpy.ndarray`` of 37 finite values, or ``None`` when the file
        cannot be processed (the error is printed).
    """
    try:
        # Load audio
        signal, sr = librosa.load(audio_path, sr=sr)

        # 1. MFCCs (13 coefficients), summarised over time
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=13)
        mfcc_mean = np.mean(mfccs, axis=1)
        mfcc_std = np.std(mfccs, axis=1)

        # 2. Spectral features
        spectral_centroid = np.mean(librosa.feature.spectral_centroid(y=signal, sr=sr))
        spectral_rolloff = np.mean(librosa.feature.spectral_rolloff(y=signal, sr=sr))
        spectral_bandwidth = np.mean(librosa.feature.spectral_bandwidth(y=signal, sr=sr))

        # 3. Zero crossing rate
        zcr = np.mean(librosa.feature.zero_crossing_rate(signal))

        # 4. RMS energy
        rms = np.mean(librosa.feature.rms(y=signal))

        # 5. Formants using Praat (Burg): automatic time step, 5 formants,
        #    5500 Hz ceiling, 25 ms window, pre-emphasis from 50 Hz
        sound = parselmouth.Sound(audio_path)
        formant = call(sound, "To Formant (burg)", 0.0, 5, 5500, 0.025, 50)
        f1 = call(formant, "Get mean", 1, 0, 0, "hertz")
        f2 = call(formant, "Get mean", 2, 0, 0, "hertz")
        f3 = call(formant, "Get mean", 3, 0, 0, "hertz")

        # 6. Fundamental frequency (F0), searched between 75 Hz and 600 Hz
        pitch = call(sound, "To Pitch", 0.0, 75, 600)
        f0_mean = call(pitch, "Get mean", 0, 0, "hertz")
        f0_std = call(pitch, "Get standard deviation", 0, 0, "hertz")

        # 7. Duration
        duration = librosa.get_duration(y=signal, sr=sr)

        # Combine all features
        features = np.concatenate([
            mfcc_mean,
            mfcc_std,
            [spectral_centroid, spectral_rolloff, spectral_bandwidth],
            [zcr, rms],
            [f1, f2, f3],
            [f0_mean, f0_std],
            [duration],
        ])

        # Praat yields "undefined" (NaN) for e.g. the F0 statistics of a
        # recording without voiced frames. Such a vector is not None, so a
        # later dropna keeps it and the NaN would poison scaling and the
        # training loss. Map every non-finite entry to 0.0 instead.
        return np.nan_to_num(features, nan=0.0, posinf=0.0, neginf=0.0)

    except Exception as e:
        # Deliberate best effort: one unreadable file must not stop the batch.
        print(f"Error processing {audio_path}: {e}")
        return None

# Run the extractor over every recording, with a tqdm progress bar
print('Extracting features...')
features_list = [
    extract_acoustic_features(path)
    for path in tqdm(df['audio_path'], total=len(df))
]

# Attach the vectors, then discard the rows whose extraction returned None
df['features'] = features_list
df = df.dropna(subset=['features'])
print(f'Feature extraction complete. {len(df)} samples remaining.')

## 4. Label Generation

For supervised learning, we need labels. In a real scenario, you would have:
- Expert annotations
- Reference recordings
- Phonetic transcriptions

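For now, we'll create synthetic labels by drawing a random quality score for each sample (the extracted features are not used). A model trained on these labels only verifies that the pipeline runs; it says nothing about pronunciation quality until real annotations are plugged in.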

In [None]:
# TODO: Replace with actual labels from expert annotations
# Until then every sample gets a random placeholder score
def generate_synthetic_labels(features):
    """Return a random placeholder quality score.

    ``features`` is accepted so the function can be applied to the feature
    column, but it is not read. The score is drawn uniformly from
    [0.3, 1.0) using NumPy's global random state.
    """
    return np.random.uniform(0.3, 1.0)

# One placeholder score per row, binarised at 0.6 (1 = score above 0.6)
df['quality_score'] = df['features'].apply(generate_synthetic_labels)
df['label'] = df['quality_score'].gt(0.6).astype(int)  # Binary classification

print('Label distribution:')
label_counts = df['label'].value_counts()
print(label_counts)

## 5. Data Preprocessing

In [None]:
# Prepare data: one row per sample, one column per acoustic feature
X = np.vstack(df['features'].values)
y = df['label'].values
groups = df['participant_id'].values

# Split data by participant. A plain per-sample split puts recordings of the
# same speaker in both sets, so the test score would reward recognising the
# speaker rather than the pronunciation. Keeping each participant entirely
# on one side gives a speaker-independent estimate.
from sklearn.model_selection import GroupShuffleSplit

if df['participant_id'].nunique() >= 2:
    # test_size is the share of participants (groups), not of samples.
    splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
    train_idx, test_idx = next(splitter.split(X, y, groups=groups))
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]
else:
    # A single participant cannot be split by group; fall back to the
    # stratified per-sample split.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

# Standardize features; statistics come from the training set only so no
# information from the test set leaks into the scaling
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to float32 PyTorch tensors (labels are float for the BCE loss)
X_train_t = torch.as_tensor(X_train, dtype=torch.float32)
y_train_t = torch.as_tensor(y_train, dtype=torch.float32)
X_test_t = torch.as_tensor(X_test, dtype=torch.float32)
y_test_t = torch.as_tensor(y_test, dtype=torch.float32)

print(f'Training set: {X_train.shape}')
print(f'Test set: {X_test.shape}')

In [None]:
# Create PyTorch Dataset and DataLoader
class AudioDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        features: Indexable collection of feature vectors (the notebook
            passes a 2-D float tensor).
        labels: Indexable collection of labels, one per feature vector.

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length. Without
            this check the mismatch would only surface later, as an
            IndexError in the middle of an epoch.
    """

    def __init__(self, features, labels):
        if len(features) != len(labels):
            raise ValueError(
                f'features and labels must have the same length, '
                f'got {len(features)} and {len(labels)}'
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples."""
        return len(self.features)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position *idx*."""
        return self.features[idx], self.labels[idx]

train_dataset = AudioDataset(X_train_t, y_train_t)
test_dataset = AudioDataset(X_test_t, y_test_t)

batch_size = 32

# The model uses BatchNorm1d, which raises in training mode when a batch
# holds a single sample. That happens for the last batch whenever the
# training set size leaves a remainder of 1, so drop only that one batch.
# (A training set of exactly one sample is left alone: dropping it would
# leave the loader empty.)
drop_trailing_single = (
    len(train_dataset) > 1 and len(train_dataset) % batch_size == 1
)

train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    drop_last=drop_trailing_single,
)
# Evaluation runs in eval mode, where a batch of one sample is fine
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## 6. Model Architecture

In [None]:
class PronunciationQualityNet(nn.Module):
    """Fully connected classifier for pronunciation quality assessment.

    Each hidden stage is ``Linear -> ReLU -> BatchNorm1d -> Dropout``; a
    final ``Linear`` maps to one output per sample. The output is a raw
    logit (no sigmoid is applied here), so pair it with a logits-based loss
    and apply ``torch.sigmoid`` to obtain a probability.

    Args:
        input_size: Number of features in each input vector.
        hidden_sizes: Width of each hidden stage, in order. Any iterable of
            ints is accepted; an empty one yields a single linear layer.
        dropout: Dropout probability used after every hidden stage.
    """

    # The default is a tuple rather than a list so the shared default
    # object cannot be mutated between instantiations.
    def __init__(self, input_size, hidden_sizes=(128, 64, 32), dropout=0.3):
        super().__init__()

        layers = []
        prev_size = input_size

        # Layer order (and therefore the indices inside ``self.network``)
        # is part of the checkpoint format: state-dict keys are
        # ``network.<index>.*``.
        for hidden_size in hidden_sizes:
            layers.extend([
                nn.Linear(prev_size, hidden_size),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_size),
                nn.Dropout(dropout),
            ])
            prev_size = hidden_size

        layers.append(nn.Linear(prev_size, 1))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return one logit per row of *x*, shaped ``(batch, 1)``."""
        return self.network(x)

# Build the network for the feature width of the training matrix and move
# it to the selected device
input_size = X_train.shape[1]
model = PronunciationQualityNet(input_size)
model = model.to(device)

print(model)
n_params = sum(param.numel() for param in model.parameters())
print(f'\nTotal parameters: {n_params}')

## 7. Training

In [None]:
# Training configuration
# BCEWithLogitsLoss applies the sigmoid itself, so it is fed the model's raw
# logits.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Lowers the learning rate once the monitored value (the test loss passed to
# scheduler.step below) has stopped decreasing for `patience` epochs.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=5)

num_epochs = 50
# Lowest test loss seen so far; decides when a checkpoint is written.
best_loss = float('inf')
# Per-epoch loss history, plotted after training.
train_losses = []
test_losses = []

# Training loop
# NOTE(review): the test split doubles as the validation set here - it drives
# both the LR scheduler and the checkpoint selection - so the reported test
# loss/accuracy are optimistic. Consider a separate validation split.
for epoch in range(num_epochs):
    # Training mode: dropout active, batch norm uses batch statistics.
    model.train()
    train_loss = 0.0
    
    for features, labels in train_loader:
        features = features.to(device)
        # Labels arrive as a 1-D batch; add a trailing axis so they match the
        # (batch, 1) output of the model's final Linear layer.
        labels = labels.to(device).unsqueeze(1)
        
        # Forward pass
        outputs = model(features)
        loss = criterion(outputs, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    # Validation
    # Eval mode: dropout disabled, batch norm uses its running statistics.
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    
    # No gradients are needed for evaluation.
    with torch.no_grad():
        for features, labels in test_loader:
            features = features.to(device)
            labels = labels.to(device).unsqueeze(1)
            
            outputs = model(features)
            loss = criterion(outputs, labels)
            test_loss += loss.item()
            
            # Logit -> probability -> hard 0/1 decision at 0.5.
            predicted = (torch.sigmoid(outputs) > 0.5).float()
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    # Both losses are means over batches (each batch's loss is itself a mean),
    # so a smaller final batch weighs as much as a full one.
    train_loss /= len(train_loader)
    test_loss /= len(test_loader)
    accuracy = 100 * correct / total
    
    train_losses.append(train_loss)
    test_losses.append(test_loss)
    
    scheduler.step(test_loss)
    
    print(f'Epoch [{epoch+1}/{num_epochs}] '
          f'Train Loss: {train_loss:.4f} | '
          f'Test Loss: {test_loss:.4f} | '
          f'Accuracy: {accuracy:.2f}%')
    
    # Save best model
    # Only the weights (state_dict) are written, and only when the test loss
    # improves on the best value so far.
    if test_loss < best_loss:
        best_loss = test_loss
        torch.save(model.state_dict(), os.path.join(MODELS_DIR, 'best_model.pt'))
        print('✓ Model saved!')

print('\nTraining complete!')

## 8. Evaluation & Visualization

In [None]:
# Plot training curves
plt.figure(figsize=(12, 4))

# Left panel: per-epoch loss on the training and test sets
plt.subplot(1, 2, 1)
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training & Validation Loss')
plt.legend()
plt.grid(True)

# Right panel: class balance of the two splits. y_train / y_test hold the
# binary labels (0 or 1), not the continuous quality score, so use one bin
# per class and label the axis accordingly.
plt.subplot(1, 2, 2)
label_bins = [-0.5, 0.5, 1.5]
plt.hist(y_train, bins=label_bins, alpha=0.5, label='Train')
plt.hist(y_test, bins=label_bins, alpha=0.5, label='Test')
plt.xticks([0, 1])
plt.xlabel('Label (1 = quality score > 0.6)')
plt.ylabel('Count')
plt.title('Label Distribution')
plt.legend()

plt.tight_layout()
# Save before show(): the figure is written to disk while it still exists
plt.savefig(os.path.join(MODELS_DIR, 'training_curves.png'))
plt.show()

## 9. Save Final Model

In [None]:
# Save complete model
# After the last epoch `model` holds the final weights, which are not
# necessarily the best ones. Restore the best checkpoint written during
# training so the exported model matches the lowest test loss. The
# `best_loss` check makes sure a checkpoint was actually written by this
# run and a stale file from an earlier run is not picked up.
best_checkpoint = os.path.join(MODELS_DIR, 'best_model.pt')
if best_loss < float('inf') and os.path.exists(best_checkpoint):
    model.load_state_dict(torch.load(best_checkpoint, map_location=device))

# NOTE: this pickles the whole module, so the code that loads it must be
# able to import/define the PronunciationQualityNet class.
torch.save(model, os.path.join(MODELS_DIR, 'trained_model.pt'))

# Save scaler: inference has to apply the same standardisation as training
# NOTE: only unpickle this file from a trusted location - loading a pickle
# executes arbitrary code.
import pickle
with open(os.path.join(MODELS_DIR, 'scaler.pkl'), 'wb') as f:
    pickle.dump(scaler, f)

print('✓ Model and scaler saved successfully!')
print(f'Model path: {os.path.join(MODELS_DIR, "trained_model.pt")}')
print('\nNow copy this model to your local backend/models/ directory.')