In [None]:
# Mount Google Drive at /content/drive so the project archive stored there can be copied.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Copy the project archive from the mounted Drive (MyDrive) into /content/.
!cp /content/drive/MyDrive/chatbot_complet.zip /content/


In [None]:
# Extract the archive; the file name is resolved relative to the current working directory.
!unzip chatbot_complet.zip


In [None]:
# Move the extracted model directory up to /content/
!mv /content/content/DL_Text-Synthesizing/emotion_chatbot_final /content/emotion_chatbot_final

# If the dataset was extracted too, move it as well; mv errors are discarded
# and a message is printed instead when the move fails.
!mv /content/content/DL_Text-Synthesizing/empathetic_dialogues_local /content/empathetic_dialogues_local 2>/dev/null || echo "Dataset non pr√©sent"

# Check: list the contents of the model directory
!ls /content/emotion_chatbot_final/

In [None]:
!pip install datasets  # si pas d√©j√† install√©

from datasets import load_dataset

# Chargement une fois (internet requis ici seulement)
dataset = load_dataset("Estwld/empathetic_dialogues_llm")

# Sauvegarde locale (dans /content/ du Colab)
dataset.save_to_disk("/content/empathetic_dialogues_local")

print("‚úÖ Dataset t√©l√©charg√© et sauvegard√© localement !")

In [None]:
# ================================================
# COMPLETE EMOTIONAL CHATBOT - Final Colab version
# Local dataset + fine-tuning + report + Gradio interface
# ================================================

# Force Hugging Face Hub to operate in offline mode BEFORE importing transformers
# NOTE(review): later cells load "microsoft/DialoGPT-small" and an emotion
# classifier by Hub name; presumably those need an already populated local
# cache to succeed while this flag is set - confirm.
import os
os.environ["HF_HUB_OFFLINE"] = "1"

# 1. Install the dependencies
!pip install -q transformers datasets torch gradio huggingface_hub accelerate matplotlib

# 2. Imports
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
from datasets import load_from_disk
import gradio as gr
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime

In [None]:
# 3. Configuration: use the GPU when CUDA is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"üñ•Ô∏è Device d√©tect√© : {device}")

In [None]:
# 4. Load the LOCAL dataset (the copy previously saved to disk)
print("\nüìö Chargement du dataset local...")
dataset = load_from_disk("/content/empathetic_dialogues_local")

# Splits: the test subset is taken from the "valid" split, right after the
# rows used for validation, so the two subsets do not overlap.
train_raw = dataset["train"].select(range(12000))        # first 12k rows for training
valid_raw = dataset["valid"].select(range(1000))         # rows 0-999 for validation
test_raw = dataset["valid"].select(range(1000, 1500))    # rows 1000-1499 (500 rows) for testing

# Distinct emotion labels present in the training subset, in sorted order.
emotions = sorted(set(train_raw["emotion"]))

print("‚úÖ Dataset charg√© :")
print(f"   - Train: {len(train_raw)} exemples")
print(f"   - Valid: {len(valid_raw)} exemples")
print(f"   - Test : {len(test_raw)} exemples")
print(f"   - √âmotions uniques: {len(emotions)}")

In [None]:
# Show the first training example and the field names it contains.
print("Exemple du dataset :")
print(dataset["train"][0])
print("\nCl√©s disponibles :", dataset["train"][0].keys())

In [None]:
# Cell 5: model initialisation (run this FIRST)
model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Very important: the EOS token is reused as the padding token

model = AutoModelForCausalLM.from_pretrained(model_name)
model.to(device)

print("‚úÖ Tokenizer et mod√®le charg√©s avec succ√®s !")

In [None]:
# 6. Pair extraction + tokenisation
print("\nüîÑ Extraction des paires et tokenisation en cours...")

def create_training_example(emotion, user_msg, assistant_msg, max_length=128):
    """Build one tokenised training example with an emotion prefix.

    The text has the form
    ``[EMOTION] User: <user_msg> <eos> Bot: <assistant_msg> <eos>`` and is
    truncated / padded to exactly ``max_length`` tokens.

    Args:
        emotion: Emotion label; upper-cased and placed in brackets at the start.
        user_msg: Content of the user turn.
        assistant_msg: Content of the assistant turn that follows it.
        max_length: Fixed sequence length in tokens (default 128).

    Returns:
        dict of 1-D tensors ``input_ids``, ``attention_mask`` and ``labels``.
        ``labels`` is a copy of ``input_ids`` in which every padded position
        is replaced by -100.
    """
    text = f"[{emotion.upper()}] User: {user_msg} {tokenizer.eos_token} Bot: {assistant_msg} {tokenizer.eos_token}"

    encoding = tokenizer(
        text,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt"
    )

    input_ids = encoding["input_ids"].flatten()
    attention_mask = encoding["attention_mask"].flatten()

    # The pad token is the EOS token, so padding cannot be recognised by token
    # id. Use the attention mask instead and set padded labels to -100 (the
    # index ignored by the language-model loss); otherwise the loss and the
    # perplexity are dominated by trivially predictable padding.
    labels = input_ids.clone()
    labels[attention_mask == 0] = -100

    return {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "labels": labels
    }

# Pair extraction: every user turn immediately followed by an assistant turn
# becomes one tokenised example in the list of its split.
train_encodings = []
valid_encodings = []
test_encodings  = []

splits_to_process = (
    ("train", train_raw, train_encodings),
    ("valid", valid_raw, valid_encodings),
    ("test", test_raw, test_encodings),
)

for split_name, split_data, encodings_list in splits_to_process:
    print(f"   Traitement {split_name}...")
    size_before = len(encodings_list)

    for example in split_data:
        emotion = example["emotion"]
        turns = example["conversations"]

        # Walk over adjacent turns (position, position + 1).
        for position in range(len(turns) - 1):
            current_turn = turns[position]
            next_turn = turns[position + 1]
            if current_turn['role'] != 'user' or next_turn['role'] != 'assistant':
                continue
            encodings_list.append(
                create_training_example(emotion, current_turn['content'], next_turn['content'])
            )

    # Number of pairs added for this split.
    count = len(encodings_list) - size_before
    print(f"      ‚Üí {count} paires extraites pour {split_name}")

print("\n‚úÖ Tokenisation termin√©e !")
print(f"   Train : {len(train_encodings)} exemples")
print(f"   Valid : {len(valid_encodings)} exemples")
print(f"   Test  : {len(test_encodings)} exemples")

# Dataset PyTorch
class EmotionDataset(torch.utils.data.Dataset):
    """Map-style dataset over a list of already tokenised examples.

    Each element of ``encodings`` is a dict; indexing returns a shallow copy
    of the dict stored at that position.
    """

    def __init__(self, encodings):
        # Keep a reference to the given list; nothing is copied here.
        self.encodings = encodings

    def __len__(self):
        return len(self.encodings)

    def __getitem__(self, idx):
        # A new dict on every access; the values themselves are shared.
        return dict(self.encodings[idx].items())

# Wrap the tokenised examples of each split in an EmotionDataset.
train_dataset = EmotionDataset(train_encodings)
valid_dataset = EmotionDataset(valid_encodings)
test_dataset  = EmotionDataset(test_encodings)

In [None]:
# 7. Training parameters (adjusted for recent versions of transformers)
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,        # Effective batch = 16
    learning_rate=5e-5,
    warmup_steps=100,
    weight_decay=0.01,
    logging_steps=50,
    eval_strategy="steps",               # Changed: evaluation_strategy -> eval_strategy
    eval_steps=300,
    save_strategy="steps",               # Optional but recommended for consistency
    save_steps=600,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none",
    fp16=(device == "cuda"),             # GPU acceleration
)

# Train on the training pairs and evaluate on the validation pairs.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)

print("\nüöÄ D√©but du fine-tuning...")
print("‚è±Ô∏è Dur√©e estim√©e : 45-90 minutes sur GPU T4 gratuit")

trainer.train()

In [None]:
# 8. Final save: the model and the tokenizer are written to the same directory
save_path = "./emotion_chatbot_final"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)
print(f"üíæ Mod√®le final sauvegard√© dans {save_path}")

In [None]:
# 9. Perplexity on the test set: exponential of the loss returned by trainer.evaluate
print("\nüìä Calcul de la perplexit√© sur le set de test...")
test_results = trainer.evaluate(test_dataset)
final_perplexity = np.exp(test_results["eval_loss"])
print(f"‚úÖ Perplexit√© finale : {final_perplexity:.2f}")

In [None]:
# 10. Generation of qualitative examples
def generate_response(prompt, emotion="joyful"):
    """Generate a bot reply to ``prompt`` conditioned on an emotion prefix.

    The model input is ``[EMOTION] User: <prompt> <eos> Bot:`` and up to 80
    new tokens are sampled (temperature 0.9).

    Args:
        prompt: User message.
        emotion: Emotion label; upper-cased and placed in brackets at the
            start of the model input.

    Returns:
        The generated reply as a whitespace-stripped string.
    """
    input_text = f"[{emotion.upper()}] User: {prompt} {tokenizer.eos_token} Bot:"
    inputs = tokenizer(input_text, return_tensors="pt").to(device)
    outputs = model.generate(**inputs, max_new_tokens=80, temperature=0.9, do_sample=True, pad_token_id=tokenizer.eos_token_id)

    # The returned sequence starts with the prompt tokens. Decode only what
    # comes after them: splitting the full text on the last "Bot:" would cut
    # off any reply that itself contains "Bot:".
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Sample prompts; each one is paired below with the emotion label used to
# generate its reply.
test_prompts = [
    "Je viens de r√©ussir mon examen !",
    "Je me sens triste aujourd'hui...",
    "Je suis vraiment en col√®re contre mon coll√®gue.",
    "Wow, je n'arrive pas √† y croire !"
]
prompt_emotions = ["joyful", "sad", "angry", "surprised"]

# One bullet line per (prompt, emotion) pair, joined into a single string.
example_lines = []
for prompt, emotion in zip(test_prompts, prompt_emotions):
    response = generate_response(prompt, emotion)
    example_lines.append(f"- [{emotion.upper()}] {prompt} ‚Üí {response}\n")
qualitative_examples = "".join(example_lines)

In [None]:
# 11. ReportGenerator class
class ReportGenerator:
    """Writes the training-curve figure and the markdown report to one directory."""

    def __init__(self, output_dir="./report"):
        """Remember ``output_dir`` and create it if it does not exist yet."""
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)

    @staticmethod
    def _format_metric(value, spec):
        """Format ``value`` with ``spec``; fall back to ``str(value)`` for non-numeric placeholders such as 'N/A'."""
        try:
            return format(value, spec)
        except (TypeError, ValueError):
            return str(value)

    def plot_training_curves(self, train_loss, train_steps, eval_loss, eval_steps, perplexity):
        """Plot loss and perplexity curves, save them as PNG and display them.

        Args:
            train_loss: Training loss values, plotted against ``train_steps``.
            train_steps: Steps of the training loss values.
            eval_loss: Evaluation loss values, plotted against ``eval_steps``.
            eval_steps: Steps of the evaluation values.
            perplexity: Perplexity values, plotted against ``eval_steps``
                (so it must have the same length as ``eval_steps``).

        The figure is written to ``<output_dir>/training_curves.png``.
        """
        fig, axes = plt.subplots(1, 2, figsize=(16, 6))

        # Loss curves
        axes[0].plot(train_steps, train_loss, label='Train Loss', color='blue', linewidth=2, marker='o', markersize=4)
        axes[0].plot(eval_steps, eval_loss, label='Eval Loss', color='orange', linewidth=2, marker='s', markersize=4)
        axes[0].set_title('Training and Evaluation Loss', fontsize=14)
        axes[0].set_xlabel('Training Steps')
        axes[0].set_ylabel('Loss')
        axes[0].legend(fontsize=12)
        axes[0].grid(True, alpha=0.3)

        # Perplexity curve
        axes[1].plot(eval_steps, perplexity, color='green', linewidth=2, marker='o', markersize=5)
        axes[1].set_title('Model Perplexity Over Time', fontsize=14)
        axes[1].set_xlabel('Evaluation Steps')
        axes[1].set_ylabel('Perplexity')
        axes[1].grid(True, alpha=0.3)

        # Automatic zoom on the perplexity values
        if len(perplexity) > 0:
            min_p = min(perplexity)
            max_p = max(perplexity)
            margin = (max_p - min_p) * 0.05
            # A single point or a flat curve gives a zero margin, i.e. identical
            # lower and upper limits; keep the automatic limits in that case
            # (this also skips the zoom when the range is NaN).
            if margin > 0:
                axes[1].set_ylim(min_p - margin, max_p + margin)

        plt.tight_layout()
        # Save before showing so the file always contains the drawn figure.
        plt.savefig(f"{self.output_dir}/training_curves.png", dpi=300, bbox_inches='tight')
        plt.show()
        print(f"üìä Graphique corrig√© sauvegard√© : {self.output_dir}/training_curves.png")
        plt.close(fig)

    def generate_markdown_report(self, config):
        """Write ``<output_dir>/rapport.md`` from the values in ``config``.

        Args:
            config: dict with the keys ``train_size``, ``valid_size``,
                ``test_size``, ``n_emotions``, ``n_params``, ``device``,
                ``final_loss``, ``perplexity`` and ``qualitative_examples``.
                ``final_loss`` and ``perplexity`` are formatted as numbers
                when possible and inserted as plain text otherwise (for
                example the placeholder 'N/A').
        """
        final_loss_text = self._format_metric(config['final_loss'], '.4f')
        perplexity_text = self._format_metric(config['perplexity'], '.2f')

        report = f"""# Rapport de Projet - Deep Learning
## G√©n√©ration de Texte Conversationnel Contr√¥l√©e par √âmotions

**Date:** {datetime.now().strftime('%Y-%m-%d %H:%M')}

---

## 1. Introduction

Syst√®me de g√©n√©ration conversationnelle avec contr√¥le √©motionnel via pr√©fixe, fine-tun√© sur EmpatheticDialogues.

## 2. M√©thodologie

### 2.1 Dataset
- Taille train: {config['train_size']}
- Taille validation: {config['valid_size']}
- Taille test: {config['test_size']}
- Nombre d'√©motions: {config['n_emotions']}

### 2.2 Mod√®le
- Architecture: DialoGPT-small
- Param√®tres: {config['n_params']}M
- Device: {config['device']}

### 2.3 Hyperparam√®tres
- Epochs: 3
- Learning rate: 5e-5
- Batch effectif: 16 (4 x accumulation 4)

## 3. R√©sultats

### 3.1 M√©triques
- Loss finale: {final_loss_text}
- Perplexit√©: {perplexity_text}

### 3.2 Courbes
![Training Curves](training_curves.png)

## 4. Exemples Qualitatifs

{config['qualitative_examples']}

## 5. Conclusion & Perspectives

Le contr√¥le √©motionnel par pr√©fixe fonctionne bien. Perspectives : LoRA, plus de donn√©es, mod√®les plus grands.

---
"""
        with open(f"{self.output_dir}/rapport.md", 'w', encoding='utf-8') as f:
            f.write(report)
        print(f"üìÑ Rapport g√©n√©r√©: {self.output_dir}/rapport.md")

In [None]:
# Full retrieval of the logged data (losses + steps)
log_history = trainer.state.log_history

# Training loss and its steps: entries that carry 'loss' but not 'eval_loss'
train_points = [
    (log['step'], log['loss'])
    for log in log_history
    if 'loss' in log and 'eval_loss' not in log
]
train_steps = [step for step, _ in train_points]
train_loss = [loss for _, loss in train_points]

# Evaluation loss and its steps. NaN losses are dropped together with their
# step so that steps, losses and perplexities always have the same length;
# filtering only the perplexity list leaves x and y with different sizes.
eval_points = [
    (log['step'], log['eval_loss'])
    for log in log_history
    if 'eval_loss' in log and not np.isnan(log['eval_loss'])
]
eval_steps = [step for step, _ in eval_points]
eval_loss = [loss for _, loss in eval_points]

# Perplexity during training
perplexity = [np.exp(loss) for loss in eval_loss]

# Draw the figure against the real step numbers
report_gen = ReportGenerator()
report_gen.plot_training_curves(train_loss, train_steps, eval_loss, eval_steps, perplexity)

print("‚úÖ Graphique final g√©n√©r√© avec axe X correct et zoom parfait !")

In [None]:
# Report generation
report_gen = ReportGenerator()

# Extract losses + steps from the trainer's log history
log_history = trainer.state.log_history

# Training loss and its steps: entries that carry 'loss' but not 'eval_loss'
train_points = [
    (log['step'], log['loss'])
    for log in log_history
    if 'loss' in log and 'eval_loss' not in log
]
train_steps = [step for step, _ in train_points]
train_loss = [loss for _, loss in train_points]

# Evaluation loss and its steps. NaN losses are dropped together with their
# step so that steps, losses and perplexities always have the same length;
# filtering only the perplexity list leaves x and y with different sizes.
eval_points = [
    (log['step'], log['eval_loss'])
    for log in log_history
    if 'eval_loss' in log and not np.isnan(log['eval_loss'])
]
eval_steps = [step for step, _ in eval_points]
eval_loss = [loss for _, loss in eval_points]

# Perplexity during training (for the figure)
perplexity = [np.exp(loss) for loss in eval_loss]

# Perplexity reported in the document: recomputed from test_results when that
# name exists in this scope, otherwise the existing final_perplexity is used.
final_perplexity_text = np.exp(test_results["eval_loss"]) if 'test_results' in locals() else final_perplexity

# Draw the figure (with automatic zoom)
report_gen.plot_training_curves(train_loss, train_steps, eval_loss, eval_steps, perplexity)

config = {
    'train_size': len(train_raw),
    'valid_size': len(valid_raw),
    'test_size': len(test_raw),
    'n_emotions': len(emotions),
    # Parameter count in millions, one decimal
    'n_params': f"{sum(p.numel() for p in model.parameters())/1e6:.1f}",
    'device': device,
    # Last logged training loss, or a placeholder when nothing was logged
    'final_loss': train_loss[-1] if train_loss else 'N/A',
    'perplexity': final_perplexity_text,
    'qualitative_examples': qualitative_examples
}

report_gen.generate_markdown_report(config)

In [None]:
# Force regeneration of the figure (even if the file already exists)
report_gen = ReportGenerator()  # New generator; the report directory may already exist

# Extract the data from the trainer's log history
log_history = trainer.state.log_history

# Training loss and its steps: entries that carry 'loss' but not 'eval_loss'
train_points = [
    (log['step'], log['loss'])
    for log in log_history
    if 'loss' in log and 'eval_loss' not in log
]
train_steps = [step for step, _ in train_points]
train_loss = [loss for _, loss in train_points]

# Evaluation loss and its steps. NaN losses are dropped together with their
# step so that steps, losses and perplexities always have the same length;
# filtering only the perplexity list leaves x and y with different sizes.
eval_points = [
    (log['step'], log['eval_loss'])
    for log in log_history
    if 'eval_loss' in log and not np.isnan(log['eval_loss'])
]
eval_steps = [step for step, _ in eval_points]
eval_loss = [loss for _, loss in eval_points]

perplexity = [np.exp(loss) for loss in eval_loss]

# Draw the figure (with zoom, against the real step numbers)
report_gen.plot_training_curves(train_loss, train_steps, eval_loss, eval_steps, perplexity)

print("‚úÖ Nouveau training_curves.png g√©n√©r√© et sauvegard√© dans le dossier report !")

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os

# os.environ["HF_HUB_OFFLINE"] = "1" is now set in an earlier cell

model_path = "emotion_chatbot_final"  # The directory you have after training (relative path)

# Reload the tokenizer and the model from that directory, local files only.
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_path, local_files_only=True)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # evaluation mode

print("‚úÖ Mod√®le charg√© et pr√™t !")

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
import gradio as gr

# Model path (adjust if needed after unzipping)
model_path = "/content/emotion_chatbot_final"  # or "/content/DL_Text-Synthesizing/emotion_chatbot_final" if you did not move it

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_path)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # evaluation mode
print("‚úÖ Mod√®le charg√© !")

# Automatic emotion detection
# NOTE(review): this classifier is referenced by Hub name while an earlier cell
# sets HF_HUB_OFFLINE=1; presumably it must already be in the local cache for
# this call to succeed - confirm.
emotion_classifier = pipeline("text-classification",
                              model="bhadresh-savani/distilbert-base-uncased-emotion",
                              device=0 if torch.cuda.is_available() else -1)

# Maps the classifier's labels to the emotion names used in the prompt prefix.
emotion_mapping = {
    'joy': 'joyful',
    'sadness': 'sad',
    'anger': 'angry',
    'fear': 'afraid',
    'love': 'grateful',
    'surprise': 'surprised'
}

def detect_emotion(text):
    """Classify ``text`` and translate the top label into a prompt emotion.

    Returns the entry of ``emotion_mapping`` for the lower-cased top label
    when its score is above 0.5; returns 'neutral' when the score is not
    above 0.5 or the label has no mapping.
    """
    top_prediction = emotion_classifier(text)[0]
    label = top_prediction['label'].lower()
    is_confident = top_prediction['score'] > 0.5
    return emotion_mapping.get(label, 'neutral') if is_confident else 'neutral'

# Response generation
# NOTE: this redefines the two-argument generate_response(prompt, emotion)
# from the qualitative-examples cell; here the emotion is detected from the
# message and returned alongside the reply.
def generate_response(message):
    """Detect the emotion of ``message`` and generate a reply conditioned on it.

    The model input is ``[EMOTION] User: <message> <eos> Bot:`` and up to 100
    new tokens are sampled (temperature 0.9, top_p 0.95).

    Args:
        message: User message.

    Returns:
        Tuple ``(response, emotion)``: the whitespace-stripped reply and the
        emotion name returned by ``detect_emotion``.
    """
    emotion = detect_emotion(message)
    prompt = f"[{emotion.upper()}] User: {message} {tokenizer.eos_token} Bot:"
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    outputs = model.generate(
        **inputs,
        max_new_tokens=100,
        temperature=0.9,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # The returned sequence starts with the prompt tokens. Decode only what
    # comes after them: splitting the full text on the last "Bot:" would cut
    # off any reply that itself contains "Bot:".
    prompt_length = inputs["input_ids"].shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True).strip()
    return response, emotion

# Gradio interface with an "Envoyer" button + working Enter key
def chat(message, history=None):
    """Handle one chat turn for the Gradio interface.

    Args:
        message: Text typed by the user.
        history: List of ``(user_message, bot_message)`` pairs shown so far,
            or None / empty for a new conversation.

    Returns:
        The updated history, twice (one value per output wired in the UI).
        The list passed in is not modified; a blank message returns the
        history unchanged.
    """
    # Work on a fresh list: a mutable default argument would be shared by all
    # calls (leaking one conversation into the next), and the history may be
    # None after the conversation has been cleared.
    history = list(history) if history else []
    if not message or not message.strip():
        return history, history
    response, emotion = generate_response(message)
    history.append((message, f"{response}\n\n(√âmotion d√©tect√©e : {emotion})"))
    return history, history

# Build the Gradio UI: a chat window, a message box with a submit button and a
# button that clears the conversation.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ü§ñ Chatbot √âmotionnel Auto-D√©tection")
    gr.Markdown("Le bot d√©tecte automatiquement ton √©motion. Appuie sur **Entr√©e** ou clique sur **Envoyer** !")

    chatbot = gr.Chatbot(height=600)

    with gr.Row():
        msg = gr.Textbox(
            label="Ton message",
            placeholder="√âcris ici et appuie sur Entr√©e ‚èé",
            lines=3,
            scale=6,
            submit_btn="Envoyer"  # Visible button + Enter key works
        )

    clear = gr.Button("üóëÔ∏è Effacer la conversation")

    # On submit, chat receives the message and the current history; both of
    # its return values are sent to the chatbot component.
    msg.submit(chat, [msg, chatbot], [chatbot, chatbot])
    # The clear button sets the chatbot value to None.
    clear.click(lambda: None, None, chatbot)

print("üîó Lancement du chatbot avec bouton Entr√©e corrig√© !")
demo.launch(share=True)

In [None]:
# Zip automatique de TOUT ce qui existe dans /content/ (sans warnings inutiles)
!zip -r chatbot_complet.zip /content/*

# T√©l√©chargement direct sur ton PC
from google.colab import files
files.download('chatbot_complet.zip')