In [None]:
!pip install torch

In [None]:
!pip install tensorflow

In [None]:
!pip install pandas

In [26]:
import pandas as pd

# Path of the gzip-compressed CSV; pandas infers the compression from the ".gz" suffix.
DATA_PATH = 'LeMonde2003_9classes.csv.gz'

articles = pd.read_csv(DATA_PATH)

# NOTE(review): the saved output shows an "Unnamed: 0" column, which suggests the
# file carries a saved index — `index_col=0` would absorb it; confirm before changing.
# Leaving the frame as the last expression renders it as the cell output.
articles

Unnamed: 0,text,category
0,a la boutique du fulham fc nichée au dernier é...,SPO
1,pour la plupart de ceux qui n'y vivent pas la ...,ART
2,la perspective d'une enquête judiciaire sur la...,FRA
3,le tribunal administratif de limoges a annulé ...,SOC
4,des avions américains et britanniques ont bomb...,INT
...,...,...
30160,reçu à la mairie de biarritz à l'occasion d'un...,FRA
30161,la française malia metella a remporté le titre...,SPO
30162,1 pourquoi avoir choisi les années 1950 pour k...,ART
30163,la croix a licencié un journaliste alain herto...,ENT


In [27]:
from sklearn.model_selection import train_test_split

# Fraction of the articles held out for evaluation, and the seed that makes
# the shuffle (and therefore the split) reproducible across runs.
TEST_FRACTION = 0.2
SPLIT_SEED = 42

train, test = train_test_split(
    articles,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# Number of inputs the pipeline groups per forward pass (passed as `batch_size=`).
batch_size = 32
# Only the first MAX_TEXT_CHARS characters of each article are classified.
MAX_TEXT_CHARS = 300


classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


# Evaluation frame built from the held-out split: rows whose category is "UNE"
# are dropped, "SOC" is folded into "FRA", and the index is reset to 0..n-1.
new_data = test[test['category'] != "UNE"].copy()
new_data.loc[new_data['category'] == 'SOC', 'category'] = 'FRA'
new_data = new_data.reset_index(drop=True)


# Dataset category code -> candidate label given to the zero-shot classifier.
category_to_label = {
    "SPO": "sport",
    "ART": "art",
    "FRA": "actualité france",
    "INT": "international",
    "ENT": "économie"
}


# Category codes missing from the mapping become NaN in "category_explicit".
new_data["category_explicit"] = new_data["category"].map(category_to_label)
labels = list(category_to_label.values())


def _classify_one(text):
    """Return the top-ranked candidate label for one text.

    Any exception raised by the classifier is reported on stdout and turned
    into ``None`` so that one bad input does not abort the whole run.
    """
    try:
        result = classifier(text, labels, multi_label=False)
        return result["labels"][0]
    except Exception as e:
        print(f"Error on text: {text[:50]}... → {e}")
        return None


def _classify_batch(batch_texts):
    """Return one top-ranked label (or ``None``) per text in ``batch_texts``.

    The whole list is handed to the pipeline in a single call so that
    ``batch_size`` is actually used during inference. If that call fails, or
    returns an unexpected number of results, every text of the batch is
    retried individually so only the offending texts end up as ``None``.
    """
    try:
        results = classifier(
            batch_texts, labels, multi_label=False, batch_size=batch_size
        )
        # Some pipeline versions return a bare dict for a one-element list.
        if isinstance(results, dict):
            results = [results]
        if len(results) != len(batch_texts):
            raise ValueError(
                f"expected {len(batch_texts)} results, got {len(results)}"
            )
        return [result["labels"][0] for result in results]
    except Exception:
        # Fall back to per-text calls; _classify_one reports each failure.
        return [_classify_one(text) for text in batch_texts]


# Clean and truncate every text once, up front, instead of once per batch.
texts = (
    new_data["text"]
    .fillna("")
    .astype(str)
    .str.slice(0, MAX_TEXT_CHARS)
    .tolist()
)

predicted_labels = []

for start in tqdm(range(0, len(texts), batch_size)):
    # Positional list slicing: no dependence on the DataFrame index labels.
    predicted_labels.extend(_classify_batch(texts[start:start + batch_size]))


# One prediction per row, in row order, so the lengths line up.
new_data["predicted_label"] = predicted_labels


Device set to use cpu
  0%|          | 0/177 [00:00<?, ?it/s]

In [None]:
from sklearn.metrics import accuracy_score, classification_report


# Keep only rows that have both a mapped ground-truth label and a prediction;
# missing values on either side cannot be scored.
mask = new_data[["category_explicit", "predicted_label"]].notna().all(axis=1)
y_true = new_data["category_explicit"][mask]
y_pred = new_data["predicted_label"][mask]


# Overall share of exact matches between predicted and true labels.
acc = accuracy_score(y_true, y_pred)
print(f"\n Accuracy : {acc:.2f}\n")

# Per-label precision / recall / F1, restricted to the candidate labels.
report = classification_report(
    y_true,
    y_pred,
    labels=[*category_to_label.values()],
)
print("Classification report :")
print(report)
