In [None]:
# --- 0. IMPORTS & PARAMETERS ---
import os
import time
import pandas as pd
import numpy as np
import joblib

# Keep only the computation libraries (no plotting)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# --- OPTIMIZED CONFIGURATION ---
# Run-wide switches, grouped here so a run can be retuned in one place.
DEBUG = False          # True: subsample to MAX_ROWS for quick runs; False: full dataset
MAX_ROWS = 5000        # Subsample size (only read when DEBUG is True)
N_ESTIMATORS = 100     # Number of trees handed to the random forest

# Output folders are created up front; folders that already exist are left as-is.
for folder in ("splits", "models"):
    os.makedirs(folder, exist_ok=True)

# Integer id -> emotion name.
# NOTE(review): presumably these ids are the values of the 'label' column — confirm.
label_map = dict(enumerate(
    ('Sadness', 'Joy', 'Love', 'Anger', 'Fear', 'Surprise')
))

# --- 1. LOADING ---
print(" Loading Data...")
# Rows missing either the text or its label are dropped right after reading.
df = pd.read_csv("emotions_cleaned_features.csv")
df = df.dropna(subset=['text', 'label'])

# Subsample only when debugging AND the dataset is actually larger than the cap;
# in every other case the whole (cleaned) dataset is used.
use_subsample = DEBUG and len(df) > MAX_ROWS
if not use_subsample:
    print(f" FULL MODE: Training on {len(df)} rows.")
else:
    # Fixed seed so the debug subsample is the same on every run.
    df = df.sample(n=MAX_ROWS, random_state=42).reset_index(drop=True)
    print(f" DEBUG MODE: Reduced to {MAX_ROWS} rows.")

# --- 2. STRICT SPLIT (70/15/15) ---
X, y = df['text'], df['label']

print("✂️ Splitting Data (70% Train / 15% Val / 15% Test)...")
# Step 1: hold out 30% of the rows, stratified on the label.
X_train_raw, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42,
)
# Step 2: cut the hold-out in half -> validation and test (15% of the total each).
X_val_raw, X_test_raw, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42,
)

# Lightweight CSV copy of each split (text + label only, no index column).
split_files = (
    ("splits/train.csv", X_train_raw, y_train),
    ("splits/val.csv", X_val_raw, y_val),
    ("splits/test.csv", X_test_raw, y_test),
)
for csv_path, texts, labels in split_files:
    pd.DataFrame({'text': texts, 'label': labels}).to_csv(csv_path, index=False)
print("✅ Splits saved.")

# --- 3. HIGH-DIMENSIONAL PREPROCESSING ---
print(" Variable Transformation (TF-IDF HD)...")

# Vocabulary capped at 15k features, built from uni- to tri-grams,
# with the built-in English stop-word list.
tfidf_settings = {
    "max_features": 15000,
    "ngram_range": (1, 3),
    "stop_words": 'english',
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vectorizer is fitted on the training texts only; validation and test
# texts are transformed with that already-fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train_raw)
X_val_vec, X_test_vec = (
    vectorizer.transform(texts) for texts in (X_val_raw, X_test_raw)
)

# Persist the fitted vectorizer next to the models.
joblib.dump(vectorizer, "models/tfidf_vectorizer.pkl")
vocab_size = len(vectorizer.vocabulary_)
print(f"✅ Vectorizer HD saved. Vocab size: {vocab_size}")

# --- 4. MULTI-MODEL TRAINING (the tournament) ---
# Candidate classifiers, all seeded with random_state=42.
logistic_regression = LogisticRegression(
    max_iter=1000, class_weight='balanced', random_state=42,
)
random_forest = RandomForestClassifier(
    n_estimators=N_ESTIMATORS, class_weight='balanced', random_state=42, n_jobs=-1,
)
mlp = MLPClassifier(
    hidden_layer_sizes=(64, 32), max_iter=10, random_state=42, early_stopping=True,
)

# Keyed by display name; insertion order is the order in which they are iterated.
models = {
    "LogisticRegression": logistic_regression,
    "RandomForest": random_forest,
    "MLP Classifier": mlp,
}

# Per-model validation scores and fit duration, keyed by the model's display name.
metrics = {}

print("\n Starting Model Tournament...")

for name, model in models.items():
    print(f"\n==> Training {name}...")

    # Time ONLY the fit call: the value is reported as "Training Time (s)",
    # so it must not include the validation-set prediction below.
    # perf_counter() is the clock intended for measuring elapsed intervals.
    start_time = time.perf_counter()
    model.fit(X_train_vec, y_train)
    duration = time.perf_counter() - start_time

    # Evaluate on the validation set.
    y_pred_val = model.predict(X_val_vec)

    metrics[name] = {
        "Val Accuracy": accuracy_score(y_val, y_pred_val),
        "Val F1-Score": f1_score(y_val, y_pred_val, average='weighted'),
        "Training Time (s)": round(duration, 2)
    }
    # Save every fitted model; spaces in the display name become underscores
    # in the file name.
    joblib.dump(model, f"models/{name.replace(' ', '_')}.pkl")
    print(f"   -> F1-Score (Val): {metrics[name]['Val F1-Score']:.4f}")

# --- 5. RESULTS ---
# One row per model, one column per metric.
metrics_df = pd.DataFrame(metrics).transpose()
print("\n --- TOURNAMENT RESULTS --- ")
leaderboard = metrics_df.sort_values("Val F1-Score", ascending=False)
print(leaderboard)

# --- 6. VERDICT ---
# The champion is the model with the highest validation F1-score.
best_model_name = metrics_df["Val F1-Score"].idxmax()
print(f"\n Champion: {best_model_name}")
print("✅ Phase 1 Completed successfully. Models are ready in 'models/' folder.")