In [None]:
from transformers import AutoTokenizer, AutoModel

model_name = "UBC-NLP/MARBERT"

# Fetch the tokenizer by model id and write a local copy to ./marbert_local
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained("marbert_local")

# Same for the encoder weights, saved into the same directory
model = AutoModel.from_pretrained(model_name)
model.save_pretrained("marbert_local")


In [None]:
import shutil

# Zip the contents of ./marbert_local into marbert_local.zip; the archive path is the cell output
shutil.make_archive(base_name="marbert_local", format='zip', root_dir="marbert_local")


'/content/marbert_local.zip'

In [None]:
pip install xgboost scikit-learn




In [None]:
import pandas as pd
# Load the annotated corpus. Later cells read the 'clean_text' column and one
# column per emotion label from this frame.
df_final = pd.read_csv(filepath_or_buffer='/content/donnees_emotions.csv')


In [None]:
import joblib
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, hamming_loss
from sklearn.decomposition import PCA
from imblearn.over_sampling import ADASYN
from xgboost import XGBClassifier
import torch
from transformers import AutoTokenizer, AutoModel

# === Settings ===
model_name = "UBC-NLP/MARBERT"
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# === Load MARBERT (tokenizer + encoder) and switch the encoder to eval mode ===
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).eval()  # .eval() returns the module itself

# === BERT embeddings: attention-mask-weighted mean over the real tokens ===
def get_bert_embeddings(texts, tokenizer, model, batch_size=16):
    """Encode texts as fixed-size vectors by mean-pooling the last hidden layer.

    Padding positions are excluded from the mean via the tokenizer's attention
    mask. A plain ``mean(dim=1)`` would also average the padded positions, so
    the vector of a given text would change with the length of the longest text
    in its batch (and therefore with ``batch_size`` and input order).

    Args:
        texts: Sequence of strings to encode (sliced in chunks of ``batch_size``).
        tokenizer: Callable tokenizer returning an object that supports
            ``.to(device)``, ``["attention_mask"]`` and ``**`` unpacking.
        model: Encoder whose output exposes ``last_hidden_state``.
        batch_size: Number of texts encoded per forward pass.

    Returns:
        A NumPy array with one row per input text. For an empty ``texts`` an
        array with zero rows is returned instead of failing in ``torch.cat``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    if len(texts) == 0:
        # Width falls back to 0 when the model does not expose config.hidden_size.
        hidden_size = getattr(getattr(model, "config", None), "hidden_size", 0)
        return np.empty((0, hidden_size), dtype=np.float32)

    embeddings = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch = texts[start:start + batch_size]
            encoded = tokenizer(batch, padding=True, truncation=True, return_tensors="pt", max_length=128).to(device)
            output = model(**encoded)
            hidden = output.last_hidden_state

            # 1 for real tokens, 0 for padding; broadcast over the hidden dimension.
            mask = encoded["attention_mask"].unsqueeze(-1).to(hidden.dtype)
            # clamp avoids a division by zero should a row contain no real token.
            token_counts = mask.sum(dim=1).clamp(min=1)
            pooled = (hidden * mask).sum(dim=1) / token_counts
            embeddings.append(pooled.cpu())

    return torch.cat(embeddings).numpy()

# === Data ===
# Missing texts become empty strings so the tokenizer always receives a str.
X = df_final['clean_text'].fillna("")
y = df_final[emotion_labels]

# Per-emotion positive-class weight: total rows / positive rows.
# NOTE(review): the training data is also oversampled with ADASYN below, so the
# minority class is compensated twice — confirm this is intended.
scale_pos_weight = [len(y) / sum(y[emotion]) for emotion in emotion_labels]
weights_dict = dict(zip(emotion_labels, scale_pos_weight))

# Train/test split. The label matrix has several columns, so stratification is
# approximated with each row's first maximal label column (idxmax).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y.idxmax(axis=1)
)

# === MARBERT embeddings ===
X_train_embed = get_bert_embeddings(X_train.tolist(), tokenizer, model)
X_test_embed = get_bert_embeddings(X_test.tolist(), tokenizer, model)

# === PCA (fitted on the training embeddings only, then applied to the test set) ===
pca = PCA(n_components=100, random_state=42)
X_train_pca = pca.fit_transform(X_train_embed)
X_test_pca = pca.transform(X_test_embed)

# === Grid-search space (2^7 = 128 candidates) ===
param_grid = {
    'max_depth': [3, 5],
    'n_estimators': [50, 100],
    'learning_rate': [0.05, 0.1],
    'reg_alpha': [0.5, 1],
    'reg_lambda': [1, 1.5],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

# XGBoost >= 2.0 selects the accelerator with `device`; 'gpu_hist', `predictor`
# and `use_label_encoder` are deprecated/removed there and only emit warnings.
base_clf = XGBClassifier(
    objective='binary:logistic',
    tree_method='hist',
    device='cuda' if torch.cuda.is_available() else 'cpu',
    eval_metric='logloss',
    early_stopping_rounds=10,
    verbosity=0
)

# === Training + prediction + persistence ===
os.makedirs("saved_models", exist_ok=True)

y_pred_test = []
y_pred_train = []
best_params = {}

for emotion in emotion_labels:
    print(f"\n=== Traitement de l'émotion : {emotion} ===")

    # Hold out part of the TRAINING data for early stopping. Using the test
    # split as eval_set (as before) lets the test labels choose the number of
    # boosting rounds, which makes the reported test metrics optimistic.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train_pca, y_train[emotion],
        test_size=0.2, random_state=42, stratify=y_train[emotion]
    )

    # Rebalance only the fitting portion; the validation slice keeps the real
    # class distribution.
    # NOTE(review): oversampling happens before GridSearchCV splits its folds,
    # so synthetic neighbours of validation-fold points can land in the
    # training folds and inflate the CV F1. Consider imblearn's Pipeline to
    # resample inside each fold.
    ada = ADASYN(random_state=42)
    X_res, y_res = ada.fit_resample(X_fit, y_fit)

    # GridSearchCV clones the estimator, so reusing base_clf across emotions is safe.
    clf = base_clf.set_params(scale_pos_weight=weights_dict[emotion])
    grid_search = GridSearchCV(
        estimator=clf,
        param_grid=param_grid,
        cv=StratifiedKFold(n_splits=3, shuffle=True, random_state=42),
        scoring='f1',
        verbose=1,
        n_jobs=-1
    )

    grid_search.fit(X_res, y_res, eval_set=[(X_val, y_val)], verbose=0)
    best_model = grid_search.best_estimator_
    best_params[emotion] = grid_search.best_params_
    print(f"✅ Meilleurs paramètres : {best_params[emotion]}")

    # Predictions. The "train" predictions cover the whole training split,
    # including the rows held out for early stopping.
    y_pred_test.append(best_model.predict(X_test_pca))
    y_pred_train.append(best_model.predict(X_train_pca))

    # Persist the per-emotion classifier
    joblib.dump(best_model, f"saved_models/{emotion}_model.pkl")
    print(f"✅ Modèle sauvegardé : saved_models/{emotion}_model.pkl")

# Persist the fitted PCA so inference applies the same projection
joblib.dump(pca, "saved_models/pca_model.pkl")
print("✅ PCA sauvegardé dans saved_models/pca_model.pkl")

# Results: stack per-emotion predictions into (n_samples, n_emotions)
y_pred_test = np.array(y_pred_test).T
y_pred_train = np.array(y_pred_train).T

print("\n=== Résultats TEST ===")
print(classification_report(y_test, y_pred_test, target_names=emotion_labels, zero_division=0))
print(f"Hamming Loss: {hamming_loss(y_test, y_pred_test):.4f}")

print("\n=== Résultats TRAIN ===")
print(classification_report(y_train, y_pred_train, target_names=emotion_labels, zero_division=0))

print("\n=== Meilleurs paramètres par émotion ===")
for emotion, params in best_params.items():
    print(f"{emotion}: {params}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/376 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/701 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/654M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/654M [00:00<?, ?B/s]


=== Traitement de l'émotion : anger ===
Fitting 3 folds for each of 128 candidates, totalling 384 fits
✅ Meilleurs paramètres : {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 1.5, 'subsample': 0.8}
✅ Modèle sauvegardé : saved_models/anger_model.pkl

=== Traitement de l'émotion : disgust ===
Fitting 3 folds for each of 128 candidates, totalling 384 fits
✅ Meilleurs paramètres : {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 0.8}
✅ Modèle sauvegardé : saved_models/disgust_model.pkl

=== Traitement de l'émotion : fear ===
Fitting 3 folds for each of 128 candidates, totalling 384 fits
✅ Meilleurs paramètres : {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'reg_alpha': 0.5, 'reg_lambda': 1, 'subsample': 1.0}
✅ Modèle sauvegardé : saved_models/fear_model.pkl

=== Traitement de l'émotion : joy ===
Fit

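Meilleurs hyperparamètres retenus par la recherche sur grille pour cette émotion :

profondeur maximale de 5,

100 arbres,

learning rate de 0.1,

régularisation alpha 0.5 et lambda 1.5,

80 % des données utilisées par arbre (subsample 0.8),

et 100 % des caractéristiques utilisées par arbre (colsample_bytree 1.0).

Cette combinaison est celle qui maximise le score F1 en validation croisée pour cette émotion ; elle ne prouve pas à elle seule que le modèle est bien équilibré, ce qui se vérifie sur le rapport de classification du jeu de test.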


In [None]:
# === Sample sentences to classify ===
new_texts = [
    "أنا فرحان بزاف اليوم",
    "حسيت بالخوف كي سمعت الصوت",
    "كرهت حياتي",
    "واش من المفاجأة هادي!",
    "أنا زعفان بزاف من الخدمة ديالي",
    "التنسيق المبكر لـ قيامة  توافق حزبي لتقديم تعديلات مشتركة على القوانين الانتخابية",
    "حزب الاستقلال سيعود مع الوفا مستقبلا",
]
# Alternative sample set (disabled):
# new_texts = [
#     "أنا زعفان بزاف من الخدمة ديالي",  # Darija
#     "كوثر براني خرجات من دارها من بعد 20 يوم ديال الحجر الصحي باش دير الجلبة لبنتها وكتعيش خوف مضاعف",
#     "وش بيك اليوم؟ راك فرحان!",       # Algerian
# ]

# === MARBERT embeddings ===
X_new_embed = get_bert_embeddings(new_texts, tokenizer, model)

# === Project with the PCA saved at training time ===
pca = joblib.load("saved_models/pca_model.pkl")
X_new_pca = pca.transform(X_new_embed)

# === One saved classifier per emotion; rows = texts, columns = emotions after .T ===
results = np.array([
    joblib.load(f"saved_models/{emotion}_model.pkl").predict(X_new_pca)
    for emotion in emotion_labels
]).T

# === Formatted output ===
for text, flags in zip(new_texts, results):
    predicted = [label for label, flag in zip(emotion_labels, flags) if flag == 1]
    print(f"Texte : \"{text}\"")
    print(f"Émotions détectées : {predicted if predicted else 'Aucune'}\n")


Texte : "أنا فرحان بزاف اليوم"
Émotions détectées : ['joy']

Texte : "حسيت بالخوف كي سمعت الصوت"
Émotions détectées : ['fear', 'sadness', 'surprise']

Texte : "كرهت حياتي"
Émotions détectées : ['anger']

Texte : "واش من المفاجأة هادي!"
Émotions détectées : ['surprise']

Texte : "أنا زعفان بزاف من الخدمة ديالي"
Émotions détectées : ['anger', 'sadness']

Texte : "التنسيق المبكر لـ قيامة  توافق حزبي لتقديم تعديلات مشتركة على القوانين الانتخابية"
Émotions détectées : Aucune

Texte : "حزب الاستقلال سيعود مع الوفا مستقبلا"
Émotions détectées : Aucune



In [None]:
pip install gradio

Collecting gradio
  Downloading gradio-5.32.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.2 (from gradio)
  Downloading gradio_client-1.10.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [None]:
import gradio as gr
import joblib
import numpy as np

# === Settings ===
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# === Load the saved PCA ===
pca = joblib.load("saved_models/pca_model.pkl")

# === Load the saved per-emotion models into a dict keyed by label ===
classifiers = {
    emotion: joblib.load(f"saved_models/{emotion}_model.pkl")
    for emotion in emotion_labels
}

# === Prediction function ===
def predict_emotions(text):
    """Return the emotions detected in ``text`` as a comma-separated string.

    The text is embedded with ``get_bert_embeddings``, projected with the
    loaded PCA, and scored by each per-emotion classifier; every label whose
    classifier predicts 1 is reported.

    Args:
        text: Input sentence. ``None``, empty, or whitespace-only input is
            treated as "no text" instead of raising on ``None.strip()``.

    Returns:
        ``"Aucun texte fourni"`` when no text is given, the detected labels
        joined by ``", "`` otherwise, or ``"Aucune émotion détectée"`` when no
        classifier fires.
    """
    if not text or not text.strip():
        return "Aucun texte fourni"

    # The embedding helper expects a list, even for a single text
    X_embed = get_bert_embeddings([text], tokenizer, model)
    X_pca = pca.transform(X_embed)

    # Each classifier returns one 0/1 prediction for the single input row
    detected = [
        emotion
        for emotion in emotion_labels
        if classifiers[emotion].predict(X_pca)[0] == 1
    ]

    return ", ".join(detected) if detected else "Aucune émotion détectée"

# === Gradio interface: one text box in, one text box out ===
iface = gr.Interface(
    title="Détection des émotions MARBERT + XGBoost",
    fn=predict_emotions,
    inputs=gr.Textbox(
        lines=3,
        placeholder="Entrez un texte en arabe ou dialecte ici...",
    ),
    outputs=gr.Textbox(
        label="Émotions détectées",
    ),
)

# Start the app with default launch settings
iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://1583788dcf68a0c4c1.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [None]:
import shutil
from google.colab import files

# 1. Zip the contents of the saved_models directory into saved_models.zip
shutil.make_archive(base_name="saved_models", format='zip', root_dir="saved_models")

# 2. Hand the archive to google.colab's files.download
files.download("saved_models.zip")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>