In [None]:
import pandas as pd
# Raw article table; later cells read its "identifiant", "titre" and "texte" columns.
df = pd.read_csv(filepath_or_buffer="Data/articles_actualite.csv")

In [None]:
# Manual annotations; joined to the articles on "identifiant" in the next cell.
score = pd.read_csv(filepath_or_buffer="Annotations/scores.csv")

In [None]:
# Left join: every article is kept; annotation columns are NaN where no annotation exists.
df_merged = pd.merge(df, score, how="left", on="identifiant")

In [None]:
# Keep only the rows that actually carry a score (i.e. annotated articles).
df_merged = df_merged.dropna(subset=["score"])

In [None]:
# DataFrame.drop returns a new frame; the result must be assigned back, otherwise
# the columns are only removed from the displayed output and remain in df_merged.
# errors="ignore" keeps the cell re-runnable once the columns are already gone.
df_merged = df_merged.drop(columns=["Unnamed: 0", "theme"], errors="ignore")

In [None]:
from sklearn.model_selection import train_test_split

# The seven annotation columns used as classification targets.
label_cols = [
    "voc_violence",
    "hierar",
    "portrait_victime",
    "portrait_auteur",
    "relation",
    "meanisme_violence",
    "stat",
]

# Model input text: title and body joined by one space, with missing parts
# replaced by the empty string.
df_merged["texte_total"] = (
    df_merged["titre"].fillna("") + " " + df_merged["texte"].fillna("")
)

# Drop every row that lacks the text, the score, or any of the target labels.
df_filtered = df_merged.dropna(subset=["texte_total", "score", *label_cols])

# 80/20 split with a fixed seed; both parts get a fresh 0..n-1 index.
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df_filtered, test_size=0.2, random_state=42)
)


In [None]:
import torch
import torch.nn as nn
from transformers import CamembertModel

class CamembertMultiTaskModel(nn.Module):
    """CamemBERT encoder followed by one linear head shared by several tasks.

    The head emits ``num_tasks * num_classes`` values per example, which
    ``forward`` reshapes into one row of class logits per task.

    Args:
        num_tasks: Number of classification tasks predicted jointly.
        num_classes: Number of classes of each task (the same for all tasks).
        model_name: Name or path passed to ``CamembertModel.from_pretrained``.
        dropout: Dropout probability applied to the first-token embedding.

    The defaults reproduce the original hard-coded configuration
    (7 tasks, 3 classes, "camembert-base", dropout 0.3).
    """

    def __init__(self, num_tasks=7, num_classes=3, model_name="camembert-base", dropout=0.3):
        super().__init__()
        self.num_tasks = num_tasks
        self.num_classes = num_classes
        self.backbone = CamembertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        # Read the width from the loaded config rather than hard-coding it.
        self.hidden_size = self.backbone.config.hidden_size
        self.classifier = nn.Linear(self.hidden_size, num_tasks * num_classes)

    def forward(self, input_ids, attention_mask):
        """Return logits of shape ``(batch, num_tasks, num_classes)``.

        Args:
            input_ids: Token ids forwarded to the backbone.
            attention_mask: Attention mask forwarded to the backbone.
        """
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at sequence position 0 is used as the sentence embedding.
        cls_embedding = self.dropout(outputs.last_hidden_state[:, 0, :])
        logits = self.classifier(cls_embedding)
        return logits.view(-1, self.num_tasks, self.num_classes)



In [None]:
from torch.utils.data import Dataset

from torch.utils.data import Dataset
import torch

from torch.utils.data import Dataset

class MultiTaskDataset(Dataset):
    """Dataset yielding tokenized ``texte_total`` strings and, optionally, labels.

    Args:
        df: DataFrame with a ``texte_total`` column and, unless
            ``inference=True``, every column named in ``label_cols``.
        tokenizer: Callable invoked as ``tokenizer(text, padding=...,
            truncation=..., max_length=..., return_tensors="pt")`` that returns
            a mapping with ``input_ids`` and ``attention_mask``.
        label_cols: Names of the label columns; required unless ``inference``.
        max_length: Length every sequence is padded / truncated to.
        inference: When True, items carry no ``class_labels`` entry.

    Raises:
        ValueError: If labels are requested but ``label_cols`` is missing or empty.
    """

    def __init__(self, df, tokenizer, label_cols=None, max_length=256, inference=False):
        # Fail here with a clear message instead of a cryptic indexing error
        # on the first __getitem__ call.
        if not inference and (label_cols is None or len(label_cols) == 0):
            raise ValueError("label_cols must be provided when inference=False")
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.label_cols = label_cols
        self.max_length = max_length
        self.inference = inference

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        inputs = self.tokenizer(
            str(row["texte_total"]),
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        # The tokenizer returns tensors with a leading batch dimension of 1;
        # squeeze it so the DataLoader can stack items itself.
        item = {
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0)
        }

        # Labels are only attached in training / evaluation mode.
        if not self.inference:
            raw_labels = row[self.label_cols].values.astype(float)
            # Shift by +1 to get non-negative class indices
            # (annotations are assumed to take values in {-1, 0, 1}).
            class_labels = (raw_labels + 1).astype(int)
            item["class_labels"] = torch.tensor(class_labels, dtype=torch.long)

        return item




In [None]:
from transformers import CamembertTokenizer
from torch.utils.data import DataLoader

# Target columns, in the order the model's task axis is interpreted.
label_cols = [
    "voc_violence",
    "hierar",
    "portrait_victime",
    "portrait_auteur",
    "relation",
    "meanisme_violence",
    "stat"
]

# Load the tokenizer once (the original cell loaded it twice in a row).
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

train_dataset = MultiTaskDataset(train_df, tokenizer, label_cols=label_cols)
test_dataset = MultiTaskDataset(test_df, tokenizer, label_cols=label_cols)

# Only the training loader shuffles; the test loader keeps row order so
# predictions stay aligned with test_df.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)

In [None]:
from tqdm import tqdm
import torch.nn as nn

# `device` is not defined by any earlier cell shown in this notebook, so a fresh
# kernel would fail with NameError; pick the GPU when one is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CamembertMultiTaskModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

loss_fn = nn.CrossEntropyLoss()
num_epochs = 45

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels_cls = batch['class_labels'].to(device)  # one class index per task

        pred_cls = model(input_ids, attention_mask)    # one row of class logits per task

        # Average the cross-entropy of every task; the task count is taken from
        # the prediction tensor instead of being hard-coded.
        num_tasks = pred_cls.shape[1]
        loss_cls = sum(
            loss_fn(pred_cls[:, i, :], labels_cls[:, i]) for i in range(num_tasks)
        ) / num_tasks

        optimizer.zero_grad()
        loss_cls.backward()
        optimizer.step()
        total_loss += loss_cls.item()

    # Epoch mean, computed once after the last batch rather than on every step.
    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1}/{num_epochs} — Loss: {avg_loss:.4f}")


In [None]:
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Evaluation mode, no gradient tracking.
model.eval()
all_preds = []
all_true = []

with torch.no_grad():
    for batch in test_loader:
        labels = batch["class_labels"].to(device)           # [batch, 7]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)           # [batch, 7, 3]
        preds = logits.argmax(dim=2)                        # [batch, 7]

        all_preds.extend(preds.cpu().numpy())
        all_true.extend(labels.cpu().numpy())

# Stack the per-example rows into 2-D arrays.
all_preds = np.array(all_preds)
all_true = np.array(all_true)

# Sanity check: one row per test example, one column per label dimension.
assert all_preds.shape == all_true.shape == (len(test_loader.dataset), 7)

# Undo the +1 shift applied by the dataset: class indices 0/1/2 -> -1/0/1.
all_preds_recoded = all_preds - 1
all_true_recoded = all_true - 1

# Per-article score = mean of the seven recoded dimensions.
pred_scores = all_preds_recoded.mean(axis=1)
true_scores = all_true_recoded.mean(axis=1)

# Regression-style metrics on the averaged scores.
mse = mean_squared_error(true_scores, pred_scores)
r2 = r2_score(true_scores, pred_scores)

print(f"[Moyenne des classes] MSE: {mse:.4f} | R²: {r2:.4f}")


In [None]:
from sklearn.metrics import classification_report
import numpy as np


# Copies of the prediction / ground-truth arrays built by the evaluation cell.
y_pred, y_true = np.array(all_preds), np.array(all_true)

# Back to the annotation scale (class indices shifted down by one).
y_pred_recoded = y_pred - 1
y_true_recoded = y_true - 1

# One classification report per label column (column i <-> label_cols[i]).
for i, col in enumerate(label_cols):
    print(f"\n--- {col} ---")
    report = classification_report(
        y_true_recoded[:, i],
        y_pred_recoded[:, i],
        digits=3,
        zero_division=0,
    )
    print(report)



In [None]:
# Rebuild the full article table with the annotations attached (left join).
df_all = pd.read_csv("Data/articles_actualite.csv")
df_annotated = pd.read_csv("Annotations/scores.csv")  # annotation file
df_all = pd.merge(df_all, df_annotated, how="left", on="identifiant")
df_all["texte_total"] = (
    df_all["titre"].fillna("") + " " + df_all["texte"].fillna("")
)

# Rows missing at least one label are the ones the model has to fill in.
df_to_predict = df_all[df_all[label_cols].isna().any(axis=1)].copy()
# Remember each row's position in df_all so predictions can be joined back.
df_to_predict["index"] = df_to_predict.index

predict_dataset = MultiTaskDataset(
    df_to_predict, tokenizer, label_cols=label_cols, inference=True
)
predict_loader = DataLoader(predict_dataset, batch_size=16, shuffle=False)

model.eval()
pred_labels = []
pred_scores = []
original_idx = df_to_predict["index"].values

with torch.no_grad():
    for batch in tqdm(predict_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        logits = model(input_ids, attention_mask)
        # Most likely class per task, shifted back by one to the annotation scale.
        preds = logits.argmax(dim=2).cpu().numpy() - 1

        pred_labels.extend(preds)
        # Per-article score = mean of the predicted labels.
        pred_scores.extend(preds.mean(axis=1))


In [None]:
# Expose the row index as a regular column so it can serve as the join key.
df_all["index"] = df_all.index

# reshape(-1, n_labels) keeps the array 2-D even when nothing had to be
# predicted; a bare np.array([]) has shape (0,) and makes the DataFrame
# constructor fail against the column list.
pred_labels_np = np.array(pred_labels).reshape(-1, len(label_cols))
pred_df = pd.DataFrame(pred_labels_np, columns=[f"{col}_pred" for col in label_cols])

# Attach the predicted scores and the originating row index of each prediction.
pred_df["score_pred"] = pred_scores
pred_df["index"] = df_to_predict["index"].values

# Left join: rows that were not predicted get NaN in every *_pred column.
df_merged = df_all.merge(pred_df, on="index", how="left")

# Existing values win; predictions only fill the gaps.
df_merged["score_final"] = df_merged["score"].fillna(df_merged["score_pred"])

for col in label_cols:
    df_merged[f"{col}_final"] = df_merged[col].fillna(df_merged[f"{col}_pred"])

In [None]:
# Persist the completed table; the DataFrame index is written as the first
# (unnamed) column because `index` is left at its default.
df_merged.to_csv(path_or_buf="Data/scored_articles.csv")