# 🏋️ Training Loop

Entraîner le modèle : Forward → Loss → Backward → Update

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

# Seed NumPy's global RNG so that every np.random draw in this notebook is
# reproducible from run to run.
np.random.seed(42)

## Loss Function - Cross Entropy

In [None]:
def cross_entropy_loss(logits, targets):
    """
    Compute the mean cross-entropy loss over all token positions.

    The loss is evaluated in log-space (log-sum-exp) rather than as
    ``-log(softmax + eps)``, so it stays exact even when the probability of
    the target token underflows to zero.

    Args:
        logits: array of unnormalized scores, typically
            (batch_size, seq_len, vocab_size). Any leading shape is accepted;
            the last axis must be the vocabulary axis.
        targets: integer array of target token indices whose shape matches
            the leading axes of ``logits``, typically (batch_size, seq_len).

    Returns:
        loss: scalar, the negative log-likelihood of the targets averaged
            over every position.
    """
    logits = np.asarray(logits)
    vocab_size = logits.shape[-1]

    # Flatten every leading axis so that each row is one prediction.
    flat_logits = logits.reshape(-1, vocab_size)
    flat_targets = np.asarray(targets).reshape(-1)

    # Subtracting the row maximum keeps exp() from overflowing and leaves
    # the softmax unchanged.
    shifted = flat_logits - np.max(flat_logits, axis=-1, keepdims=True)

    # -log softmax(z)[t] == logsumexp(z) - z[t]; the shifted row always
    # contains a 0, so the sum is >= 1 and the log is finite.
    log_normalizer = np.log(np.sum(np.exp(shifted), axis=-1))
    target_scores = shifted[np.arange(flat_targets.shape[0]), flat_targets]

    return np.mean(log_normalizer - target_scores)

# Smoke test: evaluate the loss on random logits and random integer targets.
# The draws come from the global NumPy RNG, so their order matters for
# reproducibility.
batch_size, seq_len, vocab_size = 4, 8, 100
logits = np.random.randn(batch_size, seq_len, vocab_size)
targets = np.random.randint(0, vocab_size, (batch_size, seq_len))

loss = cross_entropy_loss(logits, targets)
print(f"Loss: {loss:.4f}")
# Reference value: -log(1/vocab_size) is the loss of a uniform prediction.
print(f"Random baseline: {-np.log(1/vocab_size):.4f}")

## Optimizer - Adam

In [None]:
class AdamOptimizer:
    """
    Adam optimizer working on dicts of parameters and gradients.

    Keeps one running first-moment and one running second-moment estimate
    per parameter key, plus a single step counter shared by all keys.
    """

    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        """
        Store the hyperparameters and start with empty moment buffers.

        Args:
            learning_rate: step size, stored as ``self.lr``
            beta1: decay rate of the first-moment average
            beta2: decay rate of the second-moment average
            epsilon: constant added to the denominator of the update
        """
        self.lr = learning_rate
        self.beta1, self.beta2 = beta1, beta2
        self.epsilon = epsilon
        self.m = {}  # First moment, keyed like params
        self.v = {}  # Second moment, keyed like params
        self.t = 0   # Number of update() calls so far

    def update(self, params, grads):
        """
        Apply one Adam step to every entry of ``params``.

        Args:
            params: dict of parameter arrays, modified through ``-=``
            grads: dict of gradient arrays; must hold every key of ``params``
        """
        self.t += 1

        # The bias-correction factors depend only on the step counter,
        # so they are shared by all parameters of this step.
        first_correction = 1 - self.beta1 ** self.t
        second_correction = 1 - self.beta2 ** self.t

        for name in params:
            # Lazily create zero-filled moment buffers for unseen keys.
            if name not in self.m:
                self.m[name] = np.zeros_like(params[name])
                self.v[name] = np.zeros_like(params[name])

            grad = grads[name]

            # Exponential moving averages of the gradient and its square.
            first = self.beta1 * self.m[name] + (1 - self.beta1) * grad
            second = self.beta2 * self.v[name] + (1 - self.beta2) * (grad ** 2)
            self.m[name] = first
            self.v[name] = second

            # Undo the bias toward zero caused by the zero initialization.
            first_unbiased = first / first_correction
            second_unbiased = second / second_correction

            denominator = np.sqrt(second_unbiased) + self.epsilon
            params[name] -= self.lr * first_unbiased / denominator

## Training Loop

In [None]:
def train_model(model, train_data, val_data, config):
    """
    Train the model for ``config['epochs']`` epochs, validating after each.

    Each step draws a batch with ``get_batch``, runs ``model.forward``,
    computes the cross-entropy loss, obtains gradients from
    ``model.backward`` and applies them to ``model.parameters`` with Adam.
    A summary line is printed at the end of every epoch.

    Args:
        model: object exposing ``forward(x)``, ``backward(y)`` and a
            ``parameters`` mapping that the optimizer updates
        train_data: training dataset, handed to ``get_batch``
        val_data: validation dataset, handed to ``evaluate_model``
        config: dict of training hyperparameters; this function reads
            'lr', 'epochs', 'steps_per_epoch', 'block_size' and 'batch_size'

    Returns:
        (train_losses, val_losses): two lists holding one average loss
            per epoch
    """
    adam = AdamOptimizer(learning_rate=config['lr'])
    train_history, val_history = [], []

    for epoch in range(config['epochs']):
        step_losses = []

        for _ in range(config['steps_per_epoch']):
            inputs, labels = get_batch(train_data, config['block_size'], config['batch_size'])

            # Forward pass; the loss value is only used for reporting.
            predictions = model.forward(inputs)
            step_loss = cross_entropy_loss(predictions, labels)

            # Backward pass (assumes the model implements backward()),
            # followed by the parameter update.
            gradients = model.backward(labels)
            adam.update(model.parameters, gradients)

            step_losses.append(step_loss)

        # Average of the step losses for this epoch.
        avg_train_loss = sum(step_losses) / len(step_losses)
        train_history.append(avg_train_loss)

        val_loss = evaluate_model(model, val_data, config)
        val_history.append(val_loss)

        print(f"Epoch {epoch+1}/{config['epochs']} - "
              f"Train Loss: {avg_train_loss:.4f} - "
              f"Val Loss: {val_loss:.4f}")

    return train_history, val_history

def evaluate_model(model, data, config):
    """
    Estimate the model's loss on ``data``.

    Draws ``config['eval_steps']`` batches with ``get_batch`` (using
    ``config['block_size']`` and ``config['batch_size']``), runs a forward
    pass on each, and returns the mean of the per-batch cross-entropy losses.
    No parameters are updated here.
    """
    batch_losses = []

    for _step in range(config['eval_steps']):
        inputs, labels = get_batch(data, config['block_size'], config['batch_size'])
        predictions = model.forward(inputs)
        batch_losses.append(cross_entropy_loss(predictions, labels))

    return np.mean(batch_losses)

## Configuration et Entraînement

In [None]:
# Hyperparameters for the model and for the training run.
config = {
    # Model settings (presumably consumed by the GPT constructor, which is
    # not defined in this notebook; confirm against its signature).
    'vocab_size': 65,
    'd_model': 256,
    'num_layers': 4,
    'num_heads': 8,
    'd_ff': 1024,
    'max_len': 256,
    # Settings read by train_model / evaluate_model.
    'block_size': 64,
    'batch_size': 32,
    'lr': 0.001,
    'epochs': 10,
    'steps_per_epoch': 100,
    'eval_steps': 20,
}

# The message is user-facing French text; the accent was mis-encoded and is
# restored here.
print("Configuration d'entraînement:")
for key, value in config.items():
    print(f"  {key}: {value}")

# Initialize the model (left commented out: GPT is not defined here)
# model = GPT(**config)

# Train (left commented out: requires the model and the datasets)
# train_losses, val_losses = train_model(model, train_data, val_data, config)

## Visualisation de l'Entraînement

In [None]:
# Simulate losses for the visualization: a linearly decreasing trend plus
# Gaussian noise, 10 points per curve. These are synthetic values, not the
# output of train_model.
train_losses = [4.5 - 0.3*i + 0.1*np.random.randn() for i in range(10)]
val_losses = [4.5 - 0.25*i + 0.15*np.random.randn() for i in range(10)]

# Plot both curves on one figure; the x axis is the list index.
plt.figure(figsize=(10, 5))
plt.plot(train_losses, label='Train Loss', marker='o')
plt.plot(val_losses, label='Val Loss', marker='s')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Progress')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

# Report the last value of each curve.
print(f"\nFinal Train Loss: {train_losses[-1]:.4f}")
print(f"Final Val Loss: {val_losses[-1]:.4f}")