# 🎯 Fine-Tuning

Adapter le modèle à des tâches spécifiques

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global random generator so any random draws are repeatable between runs.
np.random.seed(42)

## Qu'est-ce que le Fine-Tuning ?

### Pré-entraînement vs Fine-tuning

**Pré-entraînement** :
- Large corpus (ex: tout Internet)
- Tâche générale : prédire le prochain token
- Apprentissage non supervisé (auto-supervisé : les cibles viennent du texte lui-même)
- Coûteux en temps et ressources

**Fine-tuning** :
- Dataset spécifique (ex: dialogues, code, médical)
- Tâche ciblée (ex: Q&A, classification, génération)
- Apprentissage supervisé
- Rapide et efficace

## Stratégies de Fine-Tuning

### 1. Full Fine-Tuning
Mettre à jour tous les poids du modèle.

In [None]:
def full_fine_tuning(model, task_data, config):
    """
    Fine-tune every parameter of the model on a task-specific dataset.

    Args:
        model: Pre-trained model. This function calls ``model.forward(x)``
            and ``model.backward(y)`` and hands ``model.parameters`` to the
            optimizer.
        task_data: Iterable of ``(x, y)`` pairs. It is iterated once per
            epoch, so a one-shot iterator would only feed the first epoch.
        config: Mapping providing ``'lr'`` (learning rate given to
            ``AdamOptimizer``) and ``'epochs'`` (number of passes over
            ``task_data``).

    Returns:
        The same ``model`` object that was passed in.
    """
    adam = AdamOptimizer(learning_rate=config['lr'])
    n_epochs = config['epochs']

    for _ in range(n_epochs):
        for inputs, targets in task_data:
            # Forward pass; the loss value is computed but not used
            # anywhere else in this loop.
            logits = model.forward(inputs)
            loss = cross_entropy_loss(logits, targets)

            # Backward pass, then one optimizer step over the full
            # parameter collection (nothing is filtered out).
            grads = model.backward(targets)
            adam.update(model.parameters, grads)

    return model

# Printed summary of the trade-offs of full fine-tuning.
# The literals are stored as proper UTF-8 text (previously mis-decoded bytes).
print("✅ Full Fine-Tuning: Met à jour tous les poids")
print("   Avantages: Meilleure adaptation")
print("   Inconvénients: Coûteux, risque de catastrophic forgetting")

### 2. Feature Extraction (Frozen Embeddings)
Geler les embeddings, entraîner seulement les couches supérieures.

In [None]:
def feature_extraction_fine_tuning(model, task_data, config):
    """
    Fine-tune the model while keeping the embedding entries out of updates.

    Entries of ``model.parameters`` and of the gradients whose key is exactly
    ``'token_embedding'`` or ``'pos_encoding'`` are removed before every
    optimizer step; all remaining entries are passed to the optimizer.

    Args:
        model: Pre-trained model exposing ``forward``, ``backward`` and a
            dict-like ``parameters`` attribute (``.items()`` is called on it).
        task_data: Iterable of ``(x, y)`` pairs, iterated once per epoch.
        config: Mapping providing ``'lr'`` and ``'epochs'``.

    Returns:
        The same ``model`` object that was passed in.
    """
    adam = AdamOptimizer(learning_rate=config['lr'])

    # Keys that must never reach the optimizer
    frozen_keys = ('token_embedding', 'pos_encoding')

    def _without_frozen(named):
        """Return a new dict holding only the entries that are not frozen."""
        return {key: value for key, value in named.items()
                if key not in frozen_keys}

    for _ in range(config['epochs']):
        for inputs, targets in task_data:
            logits = model.forward(inputs)
            # The loss value is computed but not used anywhere else in this loop.
            loss = cross_entropy_loss(logits, targets)
            grads = model.backward(targets)

            # NOTE(review): the optimizer receives freshly built dicts, so the
            # model only changes if AdamOptimizer.update modifies the stored
            # values in place -- confirm against its implementation.
            adam.update(_without_frozen(model.parameters),
                        _without_frozen(grads))

    return model

# Printed summary of the trade-offs of freezing the embeddings.
# The literals are stored as proper UTF-8 text (previously mis-decoded bytes).
print("❄️ Feature Extraction: Gèle les embeddings")
print("   Avantages: Plus rapide, moins de paramètres")
print("   Inconvénients: Adaptation limitée")

### 3. Layer-wise Learning Rate
Learning rates différents par couche.

In [None]:
def layerwise_lr_fine_tuning(model, task_data, config):
    """
    Fine-tune with a separate learning rate for each group of parameters.

    Idea: low layers (embeddings) should move little, high layers (output)
    should move more. One ``AdamOptimizer`` is created per group, using
    ``config['lr']`` scaled by 0.1 (``token_embedding``, ``pos_encoding``),
    0.5 (``blocks``) or 1.0 (``output_proj``).

    A parameter belongs to a group when the group name occurs inside its key
    (``group in key``). As a consequence, a key containing none of the four
    names is never handed to any optimizer, and a key containing several
    names is handed to each matching optimizer.

    Args:
        model: Pre-trained model exposing ``forward``, ``backward`` and a
            dict-like ``parameters`` attribute.
        task_data: Iterable of ``(x, y)`` pairs, iterated once per epoch.
        config: Mapping providing ``'lr'`` (base learning rate) and
            ``'epochs'``.

    Returns:
        The same ``model`` object that was passed in.
    """
    base_lr = config['lr']

    # Multiplier applied to the base learning rate, per parameter group
    lr_scale = {
        'token_embedding': 0.1,
        'pos_encoding': 0.1,
        'blocks': 0.5,
        'output_proj': 1.0,
    }

    optimizers = {
        group: AdamOptimizer(learning_rate=base_lr * scale)
        for group, scale in lr_scale.items()
    }

    def _matching(named, group):
        """Return the entries of ``named`` whose key contains ``group``."""
        return {key: value for key, value in named.items() if group in key}

    for _ in range(config['epochs']):
        for inputs, targets in task_data:
            logits = model.forward(inputs)
            # The loss value is computed but not used anywhere else in this loop.
            loss = cross_entropy_loss(logits, targets)
            grads = model.backward(targets)

            # One step per group, each with its own optimizer and rate
            for group, group_optimizer in optimizers.items():
                group_optimizer.update(_matching(model.parameters, group),
                                       _matching(grads, group))

    return model

# Printed summary of the per-layer learning-rate multipliers.
# The literals are stored as proper UTF-8 text (previously mis-decoded bytes).
print("📊 Layer-wise LR: Learning rates adaptatifs")
print("   Couches basses: LR faible (0.1x)")
print("   Couches intermédiaires: LR moyen (0.5x)")
print("   Couches hautes: LR fort (1.0x)")

## Exemple: Fine-Tuning pour Q&A

In [None]:
# Q&A dataset: one dict per example, with "question" and "answer" keys
qa_dataset = [
    {"question": question, "answer": answer}
    for question, answer in (
        ("What is the capital of France?",
         "The capital of France is Paris."),
        ("Who wrote Romeo and Juliet?",
         "William Shakespeare wrote Romeo and Juliet."),
        ("What is 2 + 2?",
         "2 + 2 equals 4."),
    )
]

# Formatting step for fine-tuning
def prepare_qa_data(qa_dataset):
    """
    Turn Q&A items into (input, target) token sequences for training.

    Each item is rendered as ``Q: <question>``, a newline, then
    ``A: <answer>``; the text is passed to ``encode`` and the resulting
    sequence is split into two views shifted by one position.

    Args:
        qa_dataset: Iterable of mappings with ``'question'`` and
            ``'answer'`` keys.

    Returns:
        A list with one ``(x, y)`` tuple per item, where ``x`` is the encoded
        sequence without its last element and ``y`` is the same sequence
        without its first element.
    """
    def _to_example(entry):
        token_ids = encode(f"Q: {entry['question']}\nA: {entry['answer']}")
        # Input drops the last token, target drops the first one
        return token_ids[:-1], token_ids[1:]

    return [_to_example(entry) for entry in qa_dataset]

# Build the training examples from the Q&A pairs
train_data = prepare_qa_data(qa_dataset)

# Report the dataset size and show the first raw pair.
# The literals are stored as proper UTF-8 text (previously mis-decoded bytes).
print(f"Dataset Q&A préparé: {len(train_data)} exemples")
print("\nExemple:")
print(f"  Question: {qa_dataset[0]['question']}")
print(f"  Réponse: {qa_dataset[0]['answer']}")

## Prévenir le Catastrophic Forgetting

### Techniques :

1. **Lower Learning Rate** : Utiliser un LR plus faible que le pré-entraînement
2. **Fewer Epochs** : Ne pas surentraîner
3. **Regularization** : L2, Dropout
4. **Mix General Data** : Mélanger données générales + spécifiques

In [None]:
# Configuration meant to limit catastrophic forgetting
safe_config = {
    'lr': 0.0001,  # intended to be 10x smaller than the pre-training LR (that value is not shown here)
    'epochs': 3,    # few epochs
    'batch_size': 16,
    'dropout': 0.1,
    'weight_decay': 0.01  # L2 regularization
}

# Print every setting, one per line.
# The header literal is stored as proper UTF-8 text (previously mis-decoded bytes).
print("🛡️ Configuration sûre pour le fine-tuning:")
for key, value in safe_config.items():
    print(f"  {key}: {value}")

## Évaluation du Fine-Tuning

In [None]:
def evaluate_qa_model(model, test_questions):
    """
    Print the answer generated by the model for each question.

    For every question a prompt made of ``Q: <question>``, a newline and
    ``A:`` is passed to ``generate_text``; the question and the generated
    answer are then printed, followed by a blank line.

    Args:
        model: Model forwarded unchanged to ``generate_text``.
        test_questions: Iterable of question strings.

    Returns:
        None. The results are only printed.
    """
    # Generation settings forwarded unchanged to generate_text
    sampling = dict(max_length=20, strategy='top_p', p=0.9, temperature=0.7)

    for question in test_questions:
        prompt = f"Q: {question}\nA:"
        answer = generate_text(model, prompt, **sampling)

        print(f"Q: {question}")
        print(f"A: {answer}\n")

# Evaluation questions; none of them appears in qa_dataset
test_questions = [
    "What is the capital of Italy?",
    "Who painted the Mona Lisa?",
    "What is 10 + 5?"
]

# The literal is stored as proper UTF-8 text (previously mis-decoded bytes).
print("🧪 Test du modèle fine-tuné:")
# Left disabled: no `finetuned_model` is defined in the cells shown here.
# evaluate_qa_model(finetuned_model, test_questions)

## Visualisation: Avant vs Après Fine-Tuning

In [None]:
# Simulated (hard-coded) scores per task: [before fine-tuning, after fine-tuning]
metrics = {
    'General Tasks': [0.75, 0.73],  # slight drop, considered acceptable
    'Q&A Task': [0.45, 0.92],       # strong improvement
}

fig, ax = plt.subplots(figsize=(10, 6))

# One x position per task; the two bars of a task sit on either side of it
x = np.arange(len(metrics))
width = 0.35

before = [v[0] for v in metrics.values()]
after = [v[1] for v in metrics.values()]

# Legend labels are stored as proper UTF-8 text (previously mis-decoded bytes)
ax.bar(x - width/2, before, width, label='Avant Fine-Tuning', alpha=0.8)
ax.bar(x + width/2, after, width, label='Après Fine-Tuning', alpha=0.8)

ax.set_ylabel('Accuracy')
ax.set_title('Impact du Fine-Tuning')
ax.set_xticks(x)
# Pass a real list of labels rather than a dict view
ax.set_xticklabels(list(metrics))
ax.legend()
ax.set_ylim([0, 1])

plt.tight_layout()
plt.show()

# Text recap of the simulated numbers plotted above
print("\n📈 Résultats:")
print("  • Tâches générales: Légère baisse (75% → 73%)")
print("  • Tâche Q&A: Forte amélioration (45% → 92%)")
print("  ✅ Trade-off acceptable !")