In [11]:
"""
Syst√®me de G√©n√©ration de Texte Conversationnel Contr√¥l√© par √âmotions
Bas√© sur DialoGPT avec le dataset EmpatheticDialogues
Optimis√© pour 8GB RAM, processeur i3
AVEC INTERFACE WEB GRADIO POUR TEST LOCAL
"""

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset, Dataset
import pandas as pd
import numpy as np
from torch.utils.data import Dataset as TorchDataset
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import json
import os
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuration
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üñ•Ô∏è  Device: {device}")

üñ•Ô∏è  Device: cuda


In [12]:
# ============================================
# PARTIE 1: CHARGEMENT ET PR√âPARATION DES DONN√âES
# ============================================

class DatasetManager:
    """Gestion compl√®te du dataset EmpatheticDialogues"""

    def __init__(self):
        self.emotions = []
        self.train_data = None
        self.valid_data = None
        self.test_data = None

    def load_empathetic_dialogues(self, train_size=10000, valid_size=1000, test_size=500):
        """
        Charge et pr√©pare le dataset EmpatheticDialogues

        Args:
            train_size: Nombre d'exemples d'entra√Ænement
            valid_size: Nombre d'exemples de validation
            test_size: Nombre d'exemples de test
        """
        print("\n" + "="*60)
        print("üìö CHARGEMENT DU DATASET")
        print("="*60)

        try:
            dataset = load_dataset("empathetic_dialogues", trust_remote_code=True)

            # Pr√©parer les splits
            self.train_data = dataset['train'].select(range(min(train_size, len(dataset['train']))))
            self.valid_data = dataset['validation'].select(range(min(valid_size, len(dataset['validation']))))

            # Cr√©er un set de test √† partir de la validation
            test_indices = list(range(valid_size, valid_size + test_size))
            self.test_data = dataset['validation'].select(test_indices)

            # Extraire les √©motions uniques
            self.emotions = list(set(self.train_data['context']))

            print(f"‚úÖ Dataset charg√© avec succ√®s")
            print(f"   - Entra√Ænement: {len(self.train_data)} exemples")
            print(f"   - Validation: {len(self.valid_data)} exemples")
            print(f"   - Test: {len(self.test_data)} exemples")
            print(f"   - √âmotions: {len(self.emotions)}")

            return True

        except Exception as e:
            print(f"‚ùå Erreur: {e}")
            print("üìù Cr√©ation d'un dataset de d√©monstration...")
            self._create_demo_dataset(train_size, valid_size, test_size)
            return False

    def _create_demo_dataset(self, train_size, valid_size, test_size):
        """Cr√©e un dataset de d√©monstration"""
        emotions = ['joyful', 'sad', 'angry', 'surprised', 'afraid', 'excited',
                   'proud', 'grateful', 'anxious', 'disappointed']

        prompts = {
            'joyful': ["Je viens de r√©ussir!", "C'est une belle journ√©e", "J'ai gagn√©!"],
            'sad': ["Je me sens seul", "J'ai perdu quelque chose", "C'est difficile"],
            'angry': ["On m'a menti", "C'est injuste", "Je suis furieux"],
            'surprised': ["Je ne m'attendais pas √† √ßa", "Incroyable!", "Wow!"],
            'afraid': ["J'ai peur", "C'est inqui√©tant", "Je suis anxieux"]
        }

        responses = {
            'joyful': ["C'est fantastique!", "Je suis heureux pour toi!", "Super!"],
            'sad': ["Je comprends", "C'est difficile", "Je suis l√† pour toi"],
            'angry': ["Je comprends ta frustration", "C'est vraiment √©nervant", "Tu as raison"],
            'surprised': ["C'est incroyable!", "Wow!", "Je suis surpris aussi!"],
            'afraid': ["C'est normal d'avoir peur", "Je comprends", "Tu n'es pas seul"]
        }

        data = []
        for _ in range(train_size + valid_size + test_size):
            emotion = np.random.choice(list(prompts.keys()))
            data.append({
                'context': emotion,
                'prompt': np.random.choice(prompts[emotion]),
                'utterance': np.random.choice(responses[emotion])
            })

        dataset = Dataset.from_list(data)
        self.train_data = Dataset.from_list(data[:train_size])
        self.valid_data = Dataset.from_list(data[train_size:train_size+valid_size])
        self.test_data = Dataset.from_list(data[train_size+valid_size:])
        self.emotions = list(prompts.keys())

    def analyze_dataset(self):
        """Analyse statistique du dataset"""
        print("\n" + "="*60)
        print("üìä ANALYSE DU DATASET")
        print("="*60)

        # Distribution des √©motions
        emotion_counts = {}
        for item in self.train_data:
            emotion = item.get('context', 'unknown')
            emotion_counts[emotion] = emotion_counts.get(emotion, 0) + 1

        print("\nüé≠ Distribution des √©motions (train):")
        for emotion, count in sorted(emotion_counts.items(), key=lambda x: x[1], reverse=True)[:10]:
            print(f"   {emotion:20s}: {count:4d} ({count/len(self.train_data)*100:.1f}%)")

        # Longueur moyenne
        avg_prompt_len = np.mean([len(item.get('prompt', '').split()) for item in self.train_data])
        avg_response_len = np.mean([len(item.get('utterance', '').split()) for item in self.train_data])

        print(f"\nüìè Statistiques de longueur:")
        print(f"   Prompts: {avg_prompt_len:.1f} mots en moyenne")
        print(f"   R√©ponses: {avg_response_len:.1f} mots en moyenne")

        return emotion_counts


In [13]:
# ============================================
# PARTIE 2: DATASET PYTORCH CUSTOM
# ============================================

class EmotionalDialogueDataset(TorchDataset):
    """Dataset PyTorch avec pr√©fixe √©motionnel"""

    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]

        emotion = item.get('context', 'neutral')
        prompt = item.get('prompt', '')
        response = item.get('utterance', '')

        # Format avec pr√©fixe √©motionnel
        text = f"[{emotion.upper()}] User: {prompt} {self.tokenizer.eos_token} Bot: {response} {self.tokenizer.eos_token}"

        encoding = self.tokenizer(
            text,
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': encoding['input_ids'].squeeze()
        }

In [14]:
# ============================================
# PARTIE 3: MOD√àLE ET FINE-TUNING
# ============================================

class EmotionalChatbotTrainer:
    """Gestionnaire d'entra√Ænement avec m√©triques"""

    def __init__(self, model_name="microsoft/DialoGPT-small"):
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.trainer = None
        self.training_history = {
            'train_loss': [],
            'eval_loss': [],
            'perplexity': []
        }

    def initialize_model(self):
        """Initialise le mod√®le et le tokenizer"""
        print("\n" + "="*60)
        print("ü§ñ INITIALISATION DU MOD√àLE")
        print("="*60)

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        self.model = AutoModelForCausalLM.from_pretrained(self.model_name)
        self.model.to(device)

        n_params = sum(p.numel() for p in self.model.parameters())
        n_trainable = sum(p.numel() for p in self.model.parameters() if p.requires_grad)

        print(f"‚úÖ Mod√®le: {self.model_name}")
        print(f"   - Param√®tres totaux: {n_params/1e6:.1f}M")
        print(f"   - Param√®tres entra√Ænables: {n_trainable/1e6:.1f}M")
        print(f"   - Device: {device}")

    def prepare_training(self, train_dataset, eval_dataset, output_dir="./results"):
        """Configure l'entra√Ænement"""
        training_args = TrainingArguments(
            output_dir=output_dir,
            overwrite_output_dir=True,
            num_train_epochs=3,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=4,
            learning_rate=5e-5,
            warmup_steps=500,
            weight_decay=0.01,
            logging_dir='./logs',
            logging_steps=50,
            save_steps=500,
            save_total_limit=2,
            eval_strategy="steps",
            eval_steps=250,
            use_cpu=(device == "cpu"),
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            report_to="none"  # D√©sactiver W&B
        )

        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
        )

    def train(self):
        """Lance l'entra√Ænement"""
        print("\n" + "="*60)
        print("üöÄ D√âBUT DU FINE-TUNING")
        print("="*60)
        print("‚è±Ô∏è  Dur√©e estim√©e: 2-4 heures sur CPU i3")
        print("üí° Conseil: Laissez tourner pendant la nuit\n")

        # Entra√Æner
        self.trainer.train()

        # Extraire l'historique
        log_history = self.trainer.state.log_history
        for log in log_history:
            if 'loss' in log:
                self.training_history['train_loss'].append(log['loss'])
            if 'eval_loss' in log:
                self.training_history['eval_loss'].append(log['eval_loss'])
                # Calculer la perplexit√©
                perplexity = np.exp(log['eval_loss'])
                self.training_history['perplexity'].append(perplexity)

        print("\n‚úÖ Fine-tuning termin√©!")

    def save_model(self, save_path="./emotion_chatbot_final"):
        """Sauvegarde le mod√®le fine-tun√©"""
        self.model.save_pretrained(save_path)
        self.tokenizer.save_pretrained(save_path)
        print(f"üíæ Mod√®le sauvegard√©: {save_path}")



In [15]:
# ============================================
# PARTIE 4: √âVALUATION QUANTITATIVE
# ============================================

class ModelEvaluator:
    """√âvaluation quantitative du mod√®le"""

    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.model.eval()

    def calculate_perplexity(self, dataset):
        """Calcule la perplexit√© sur un dataset"""
        total_loss = 0
        total_tokens = 0

        print("\nüìä Calcul de la perplexit√©...")

        with torch.no_grad():
            for i, item in enumerate(dataset):
                if i % 100 == 0:
                    print(f"   Progression: {i}/{len(dataset)}")

                inputs = self.tokenizer(
                    item.get('utterance', ''),
                    return_tensors='pt',
                    truncation=True,
                    max_length=128
                ).to(device)

                outputs = self.model(**inputs, labels=inputs['input_ids'])
                loss = outputs.loss

                total_loss += loss.item() * inputs['input_ids'].size(1)
                total_tokens += inputs['input_ids'].size(1)

        perplexity = np.exp(total_loss / total_tokens)
        print(f"‚úÖ Perplexit√©: {perplexity:.2f}")

        return perplexity

    def generate_responses(self, test_cases, emotion='neutral', max_length=80):
        """G√©n√®re des r√©ponses pour des cas de test"""
        results = []

        for case in test_cases:
            prompt = f"[{emotion.upper()}] User: {case} {self.tokenizer.eos_token} Bot:"
            inputs = self.tokenizer(prompt, return_tensors='pt').to(device)

            with torch.no_grad():
                outputs = self.model.generate(
                    inputs['input_ids'],
                    max_length=max_length,
                    num_return_sequences=1,
                    temperature=0.8,
                    top_k=50,
                    top_p=0.95,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    eos_token_id=self.tokenizer.eos_token_id
                )

            full_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = full_text.split("Bot:")[-1].strip() if "Bot:" in full_text else full_text

            results.append({
                'prompt': case,
                'emotion': emotion,
                'response': response
            })

        return results


In [16]:
# ============================================
# PARTIE 5: VISUALISATION ET RAPPORT
# ============================================

class ReportGenerator:
    """G√©n√®re un rapport acad√©mique complet"""

    def __init__(self, output_dir="./report"):
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    def plot_training_curves(self, training_history):
        """Visualise les courbes d'entra√Ænement"""
        fig, axes = plt.subplots(1, 2, figsize=(14, 5))

        # Loss
        if training_history['train_loss']:
            axes[0].plot(training_history['train_loss'], label='Train Loss', linewidth=2)
        if training_history['eval_loss']:
            axes[0].plot(training_history['eval_loss'], label='Eval Loss', linewidth=2)
        axes[0].set_xlabel('Steps')
        axes[0].set_ylabel('Loss')
        axes[0].set_title('Training and Evaluation Loss')
        axes[0].legend()
        axes[0].grid(True, alpha=0.3)

        # Perplexity
        if training_history['perplexity']:
            axes[1].plot(training_history['perplexity'], color='green', linewidth=2)
            axes[1].set_xlabel('Evaluation Steps')
            axes[1].set_ylabel('Perplexity')
            axes[1].set_title('Model Perplexity Over Time')
            axes[1].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f"{self.output_dir}/training_curves.png", dpi=300)
        print(f"üìä Graphiques sauvegard√©s: {self.output_dir}/training_curves.png")
        plt.close()

    def generate_markdown_report(self, config):
        """G√©n√®re un rapport Markdown"""
        report = f"""# Rapport de Projet - Deep Learning
## G√©n√©ration de Texte Conversationnel Contr√¥l√©e par √âmotions

**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M')}
**Auteur:** [Votre Nom]

---

## 1. Introduction

Ce projet impl√©mente un syst√®me de g√©n√©ration de texte conversationnel avec contr√¥le √©motionnel,
bas√© sur l'architecture Transformer et fine-tun√© sur le dataset EmpatheticDialogues.

## 2. M√©thodologie

### 2.1 Dataset
- **Nom:** EmpatheticDialogues
- **Taille train:** {config.get('train_size', 'N/A')} exemples
- **Taille validation:** {config.get('valid_size', 'N/A')} exemples
- **Taille test:** {config.get('test_size', 'N/A')} exemples
- **Nombre d'√©motions:** {config.get('n_emotions', 'N/A')}

### 2.2 Mod√®le
- **Architecture:** DialoGPT (Transformer)
- **Param√®tres:** {config.get('n_params', 'N/A')}M
- **Technique:** Fine-tuning complet
- **Device:** {config.get('device', 'N/A')}

### 2.3 Hyperparam√®tres
- **Learning rate:** 5e-5
- **Batch size:** 2 (gradient accumulation: 4)
- **Epochs:** 3
- **Max length:** 128 tokens

## 3. R√©sultats

### 3.1 M√©triques Quantitatives
- **Loss finale:** {config.get('final_loss', 'N/A')}
- **Perplexit√©:** {config.get('perplexity', 'N/A')}

### 3.2 Courbes d'Apprentissage
![Training Curves](training_curves.png)

## 4. Exemples Qualitatifs

{config.get('qualitative_examples', '')}

## 5. Conclusion

Le mod√®le fine-tun√© d√©montre une capacit√© √† g√©n√©rer des r√©ponses contextuellement appropri√©es
selon l'√©motion sp√©cifi√©e, validant l'approche de pr√©fixe √©motionnel pour le contr√¥le conditionnel.

## 6. Perspectives

- Augmenter la taille du dataset
- Tester d'autres architectures (GPT-2, LLaMA)
- Impl√©menter des techniques d'optimisation (LoRA, QLoRA)
- √âvaluation humaine plus approfondie

---

**Code source:** Disponible dans le projet
"""

        with open(f"{self.output_dir}/rapport.md", 'w', encoding='utf-8') as f:
            f.write(report)

        print(f"üìÑ Rapport g√©n√©r√©: {self.output_dir}/rapport.md")


In [17]:
# ============================================
# PROGRAMME PRINCIPAL
# ============================================

def main():
    print("\n" + "="*60)
    print("üéì PROJET MASTER - DEEP LEARNING")
    print("Chatbot √âmotionnel avec Fine-tuning")
    print("="*60 + "\n")

    # 1. Charger le dataset
    dm = DatasetManager()
    dm.load_empathetic_dialogues(train_size=10000, valid_size=1000, test_size=500)
    emotion_dist = dm.analyze_dataset()

    # 2. Initialiser le mod√®le
    trainer_obj = EmotionalChatbotTrainer()
    trainer_obj.initialize_model()

    # 3. Pr√©parer les datasets
    train_dataset = EmotionalDialogueDataset(dm.train_data, trainer_obj.tokenizer)
    eval_dataset = EmotionalDialogueDataset(dm.valid_data, trainer_obj.tokenizer)
    test_dataset = EmotionalDialogueDataset(dm.test_data, trainer_obj.tokenizer)

    # 4. Configurer l'entra√Ænement
    trainer_obj.prepare_training(train_dataset, eval_dataset)

    # 5. Entra√Æner
    print("\n‚ö†Ô∏è  L'entra√Ænement va commencer. Voulez-vous continuer? (o/n)")
    choice = input("> ").lower()

    if choice != 'o':
        print("‚ùå Entra√Ænement annul√©")
        return

    trainer_obj.train()

    # 6. Sauvegarder
    trainer_obj.save_model("./emotion_chatbot_final")

    # 7. √âvaluer
    evaluator = ModelEvaluator(trainer_obj.model, trainer_obj.tokenizer)
    perplexity = evaluator.calculate_perplexity(dm.test_data)

    # 8. G√©n√©rer des exemples
    test_cases = [
        "Je viens de r√©ussir mon examen!",
        "Je me sens triste aujourd'hui",
        "Je suis en col√®re contre lui",
        "Quelle surprise incroyable!"
    ]

    results = []
    for emotion in ['joyful', 'sad', 'angry', 'surprised']:
        results.extend(evaluator.generate_responses([test_cases[0]], emotion=emotion))

    print("\n" + "="*60)
    print("üìù EXEMPLES DE G√âN√âRATION")
    print("="*60)
    for r in results[:4]:
        print(f"\nüé≠ [{r['emotion'].upper()}]")
        print(f"   User: {r['prompt']}")
        print(f"   Bot: {r['response']}")

    # 9. G√©n√©rer le rapport
    report_gen = ReportGenerator()
    report_gen.plot_training_curves(trainer_obj.training_history)

    config = {
        'train_size': len(dm.train_data),
        'valid_size': len(dm.valid_data),
        'test_size': len(dm.test_data),
        'n_emotions': len(dm.emotions),
        'n_params': f"{sum(p.numel() for p in trainer_obj.model.parameters())/1e6:.1f}",
        'device': device,
        'final_loss': trainer_obj.training_history['train_loss'][-1] if trainer_obj.training_history['train_loss'] else 'N/A',
        'perplexity': f"{perplexity:.2f}",
        'qualitative_examples': '\n'.join([f"- [{r['emotion']}] {r['prompt']} ‚Üí {r['response']}" for r in results[:4]])
    }

    report_gen.generate_markdown_report(config)

    print("\n" + "="*60)
    print("‚úÖ PROJET TERMIN√â!")
    print("="*60)
    print(f"üìÅ Fichiers g√©n√©r√©s:")
    print(f"   - Mod√®le: ./emotion_chatbot_final/")
    print(f"   - Rapport: ./report/rapport.md")
    print(f"   - Graphiques: ./report/training_curves.png")

if __name__ == "__main__":
    # pip install torch transformers datasets matplotlib seaborn scikit-learn
    main()

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'empathetic_dialogues' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
ERROR:datasets.load:`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'empathetic_dialogues' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.



üéì PROJET MASTER - DEEP LEARNING
Chatbot √âmotionnel avec Fine-tuning


üìö CHARGEMENT DU DATASET
‚ùå Erreur: Dataset scripts are no longer supported, but found empathetic_dialogues.py
üìù Cr√©ation d'un dataset de d√©monstration...

üìä ANALYSE DU DATASET

üé≠ Distribution des √©motions (train):
   sad                 : 2059 (20.6%)
   afraid              : 2014 (20.1%)
   surprised           : 2003 (20.0%)
   joyful              : 1964 (19.6%)
   angry               : 1960 (19.6%)

üìè Statistiques de longueur:
   Prompts: 2.9 mots en moyenne
   R√©ponses: 2.9 mots en moyenne

ü§ñ INITIALISATION DU MOD√àLE
‚úÖ Mod√®le: microsoft/DialoGPT-small
   - Param√®tres totaux: 124.4M
   - Param√®tres entra√Ænables: 124.4M
   - Device: cuda

‚ö†Ô∏è  L'entra√Ænement va commencer. Voulez-vous continuer? (o/n)
> o

üöÄ D√âBUT DU FINE-TUNING
‚è±Ô∏è  Dur√©e estim√©e: 2-4 heures sur CPU i3
üí° Conseil: Laissez tourner pendant la nuit



Step,Training Loss,Validation Loss
250,0.1196,0.085785
500,0.0399,0.036241
750,0.0347,0.031475
1000,0.0332,0.032427
1250,0.2024,0.031534
1500,0.0325,0.031069
1750,0.0324,0.032243
2000,0.0319,0.031118
2250,0.0316,0.030051
2500,0.0313,0.030747


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].



‚úÖ Fine-tuning termin√©!
üíæ Mod√®le sauvegard√©: ./emotion_chatbot_final

üìä Calcul de la perplexit√©...
   Progression: 0/500
   Progression: 100/500
   Progression: 200/500
   Progression: 300/500
   Progression: 400/500
‚úÖ Perplexit√©: 29.90

üìù EXEMPLES DE G√âN√âRATION

üé≠ [JOYFUL]
   User: Je viens de r√©ussir mon examen!
   Bot: Super!

üé≠ [SAD]
   User: Je viens de r√©ussir mon examen!
   Bot: Je comprends

üé≠ [ANGRY]
   User: Je viens de r√©ussir mon examen!
   Bot: Je comprends ta frustration

üé≠ [SURPRISED]
   User: Je viens de r√©ussir mon examen!
   Bot: C'est incroyable!
üìä Graphiques sauvegard√©s: ./report/training_curves.png
üìÑ Rapport g√©n√©r√©: ./report/rapport.md

‚úÖ PROJET TERMIN√â!
üìÅ Fichiers g√©n√©r√©s:
   - Mod√®le: ./emotion_chatbot_final/
   - Rapport: ./report/rapport.md
   - Graphiques: ./report/training_curves.png
