In [None]:
import pandas as pd
# Read the train / dev / test splits from the Colab working directory.
df_combined, df_combined_dev, df_combined_test = (
    pd.read_csv('/content/train.csv'),
    pd.read_csv('/content/dev.csv'),
    pd.read_csv('/content/test.csv'),
)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier #permet de gérer des problèmes de classification multiclasse avec des classifieurs binaires.
from sklearn.metrics import classification_report #générer un rapport d’évaluation
from sklearn.pipeline import Pipeline # permet de chaîner plusieurs étapes de traitement

# === 1. Input DataFrames ===
# df_combined (train), df_combined_dev and df_combined_test must already be
# loaded by an earlier cell.

# === 2. Label columns ===
# One binary column per emotion; a sample may carry several labels at once.
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# === 3. Split features and labels ===
# Missing texts are replaced by empty strings so the vectorizer never sees NaN.
X_train = df_combined['clean_text'].fillna('')
y_train = df_combined[emotion_labels]

X_dev = df_combined_dev['clean_text'].fillna('')
y_dev = df_combined_dev[emotion_labels]

X_test = df_combined_test['clean_text'].fillna('')
y_test = df_combined_test[emotion_labels]


# === 4. Build the TF-IDF + classifier pipeline ===
def whitespace_tokenizer(text):
    """Split ``text`` on whitespace (the text is assumed to be pre-tokenized).

    Defined as a named module-level function rather than a lambda so that the
    fitted pipeline can be pickled / saved with joblib.
    """
    return text.split()


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        tokenizer=whitespace_tokenizer,
        token_pattern=None,  # unused when a tokenizer is given; None silences sklearn's warning
        ngram_range=(1, 2),  # unigrams and bigrams
        min_df=2,  # drop terms that appear in fewer than 2 documents
        max_df=0.9  # drop terms that appear in more than 90% of documents
    )),
    ('clf', OneVsRestClassifier(
        # One independent binary logistic regression per emotion label.
        LogisticRegression(
            class_weight='balanced',
            max_iter=1000,
            solver='liblinear'
        )
    ))
])

# === 5. Train the model ===
pipeline.fit(X_train, y_train)

# === 6. Evaluate on the dev split ===
print("=== Validation (Dev) ===")
y_pred_dev = pipeline.predict(X_dev)
print(classification_report(y_dev, y_pred_dev, target_names=emotion_labels, zero_division=0))

# === 7. Evaluate on the test split ===
print("\n=== Test Final ===")
y_pred_test = pipeline.predict(X_test)
print(classification_report(y_test, y_pred_test, target_names=emotion_labels, zero_division=0))


=== Validation (Dev) ===
              precision    recall  f1-score   support

       anger       0.50      0.56      0.53        79
     disgust       0.31      0.37      0.34        38
        fear       0.39      0.57      0.46        46
         joy       0.38      0.49      0.43        47
     sadness       0.48      0.58      0.53        79
    surprise       0.38      0.49      0.42        68

   micro avg       0.42      0.52      0.46       357
   macro avg       0.41      0.51      0.45       357
weighted avg       0.42      0.52      0.47       357
 samples avg       0.31      0.38      0.32       357


=== Test Final ===
              precision    recall  f1-score   support

       anger       0.46      0.53      0.49       486
     disgust       0.27      0.48      0.34       241
        fear       0.36      0.59      0.45       279
         joy       0.39      0.33      0.36       312
     sadness       0.54      0.71      0.61       533
    surprise       0.39      0.56



In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import joblib

# Module-level tokenizer function (picklable, unlike a lambda)
def simple_tokenizer(text):
    """Split ``text`` on runs of whitespace and return the list of tokens."""
    tokens = text.split()
    return tokens

# === Input DataFrames ===
# df_combined, df_combined_dev and df_combined_test are expected to be loaded already.

emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# Text column (NaN -> '') and label columns for each split.
X_train, y_train = df_combined['clean_text'].fillna(''), df_combined[emotion_labels]
X_dev, y_dev = df_combined_dev['clean_text'].fillna(''), df_combined_dev[emotion_labels]
X_test, y_test = df_combined_test['clean_text'].fillna(''), df_combined_test[emotion_labels]

# No custom tokenizer is passed here, so the vectorizer uses its default tokenization.
tfidf_step = TfidfVectorizer(ngram_range=(1, 2), min_df=2, max_df=0.9)
base_classifier = LogisticRegression(
    class_weight='balanced',
    max_iter=1000,
    solver='liblinear',
)
pipeline = Pipeline(steps=[
    ('tfidf', tfidf_step),
    ('clf', OneVsRestClassifier(base_classifier)),
])

pipeline.fit(X_train, y_train)

# Persist the fitted pipeline to disk
joblib.dump(pipeline, 'mon_modele_emotions.joblib')
print("Modèle sauvegardé !")

# Evaluation on the dev split
print("=== Validation (Dev) ===")
y_pred_dev = pipeline.predict(X_dev)
dev_report = classification_report(
    y_dev, y_pred_dev, target_names=emotion_labels, zero_division=0
)
print(dev_report)

# Evaluation on the test split
print("\n=== Test Final ===")
y_pred_test = pipeline.predict(X_test)
test_report = classification_report(
    y_test, y_pred_test, target_names=emotion_labels, zero_division=0
)
print(test_report)


Modèle sauvegardé !
=== Validation (Dev) ===
              precision    recall  f1-score   support

       anger       0.50      0.56      0.53        79
     disgust       0.31      0.37      0.34        38
        fear       0.39      0.57      0.46        46
         joy       0.38      0.49      0.43        47
     sadness       0.48      0.58      0.53        79
    surprise       0.38      0.49      0.42        68

   micro avg       0.42      0.52      0.46       357
   macro avg       0.41      0.51      0.45       357
weighted avg       0.42      0.52      0.47       357
 samples avg       0.31      0.38      0.32       357


=== Test Final ===
              precision    recall  f1-score   support

       anger       0.46      0.53      0.49       486
     disgust       0.27      0.48      0.34       241
        fear       0.36      0.59      0.45       279
         joy       0.39      0.33      0.36       312
     sadness       0.54      0.71      0.61       533
    surprise 

In [None]:
import joblib

# Reload the saved model from disk
pipeline = joblib.load('mon_modele_emotions.joblib')

# Example sentences in Moroccan Darija
phrases_darija = [
    "أنا مفرح بزاف اليوم",
    "هادشي عجيب و خايف",
    "كنحس بغضب كبير",
    "مازال كنحس بحزن",
]

# Label names, in the same order as the label columns used for training
label_names = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# Multi-label prediction: one row of per-label flags for each sentence
preds = pipeline.predict(phrases_darija)

# Print, for every sentence, the labels whose flag is truthy
for phrase, pred in zip(phrases_darija, preds):
    emotions = []
    for name, flag in zip(label_names, pred):
        if flag:
            emotions.append(name)
    print(f"Phrase : {phrase}")
    print(f"Emotions prédites : {emotions}")
    print("-" * 30)


In [None]:
pip install gradio

Collecting gradio
  Downloading gradio-5.32.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.6.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.10.2 (from gradio)
  Downloading gradio_client-1.10.2-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.12-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [None]:
import gradio as gr
import joblib

# Load the saved model.
# NOTE(review): joblib files are pickle-based — only load model files from a trusted source.
pipeline = joblib.load('mon_modele_emotions.joblib')

# Emotion label names, one per position of the model's prediction vector.
# The order matches the label columns selected for training earlier in this notebook.
emotion_labels = ['anger', 'disgust', 'fear', 'joy', 'sadness', 'surprise']

# Prediction function that returns only the detected emotions
def predict_emotions(text):
    """Run the loaded pipeline on one text and describe the detected emotions.

    Args:
        text: The sentence to classify.

    Returns:
        The names of every label whose predicted flag equals 1, joined by
        " / ", or a fixed fallback message when no label is predicted.
    """
    # predict() takes a batch; wrap the single text and keep its only row.
    flags = pipeline.predict([text])[0]

    emotions_detected = []
    for name, flag in zip(emotion_labels, flags):
        if flag == 1:
            emotions_detected.append(name)

    if not emotions_detected:
        return "Aucune émotion détectée."
    return " / ".join(emotions_detected)

# Gradio interface: a two-line text input wired to predict_emotions, with a text output
input_box = gr.Textbox(lines=2, placeholder="اكتب جملة باللهجة المغربية...")
output_box = gr.Textbox(label="Émotions détectées")

iface = gr.Interface(
    fn=predict_emotions,
    inputs=input_box,
    outputs=output_box,
    title="Détecteur d'émotions - Darija",
    description="Entrez une phrase en darija marocaine pour détecter les émotions présentes.",
)

# Start the web app
iface.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://cef7725a52fb126c36.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


