In [None]:
# Execute `training_theme_classification` from this notebook via IPython's %run magic.
# NOTE(review): `device` is used by later cells but never defined in this notebook;
# presumably it is defined by the script run here — confirm.
%run training_theme_classification

In [None]:
import pandas as pd
# Article corpus; merged with the annotation table on "identifiant" further down.
df = pd.read_csv(filepath_or_buffer='Data/articles_actualite.csv')

In [None]:
# Annotation table; joined to the articles on "identifiant" by the merge that follows.
score = pd.read_csv(filepath_or_buffer='Annotations/scores.csv')

In [None]:
# Left join: every article is kept; articles without a matching annotation get NaN
# in the columns coming from `score`.
df_merged = pd.merge(left=df, right=score, how="left", on="identifiant")

In [None]:
# Keep only the articles that have an annotated score.
# The explicit copy makes df_merged an independent frame, so adding columns to it
# afterwards does not raise pandas' SettingWithCopyWarning.
df_merged = df_merged[df_merged["score"].notna()].copy()

In [None]:
# DataFrame.drop returns a new frame, so the result has to be assigned for the
# columns to actually disappear. errors="ignore" keeps the cell safe to re-run
# once the columns are already gone.
df_merged = df_merged.drop(columns=['Unnamed: 0', 'theme'], errors="ignore")
# Show a preview rather than the whole frame.
df_merged.head()

In [None]:
# Model input text: title and body joined by a single space, with missing parts
# replaced by empty strings.
df_merged["texte_total"] = df_merged["titre"].fillna("") + " " + df_merged["texte"].fillna("")

# Label columns used as targets of the model's classification head.
label_cols = [
    "voc_violence",
    "hierar",
    "portrait_victime",
    "portrait_auteur",
    "relation",
    "meanisme_violence",
    "stat",
]

# Discard rows lacking the text, the score or any of the label columns.
required_cols = ["texte_total", "score", *label_cols]
df_filtered = df_merged.dropna(subset=required_cols)

from sklearn.model_selection import train_test_split

# 80/20 split with a fixed seed; each part gets a fresh 0..n-1 index.
train_df, test_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df_filtered, test_size=0.2, random_state=42)
)


In [None]:
import torch
import torch.nn as nn
from transformers import CamembertModel

class CamembertMultiTaskModel(nn.Module):
    """CamemBERT encoder with a regression head and a multi-output head.

    Both heads are single linear layers applied to the (dropout-regularised)
    hidden state of the first token of the sequence.

    Args:
        model_name: checkpoint passed to ``CamembertModel.from_pretrained``.
        num_labels: number of outputs of the classification head
            (defaults to the 7 annotated variables).
        dropout: dropout probability applied to the first-token embedding
            before both heads.
    """

    def __init__(self, model_name="camembert-base", num_labels=7, dropout=0.3):
        super().__init__()
        self.backbone = CamembertModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = self.backbone.config.hidden_size  # 768 for camembert-base
        self.regressor = nn.Linear(self.hidden_size, 1)
        self.classifier = nn.Linear(self.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        """Return ``(score, class_logits)`` for a batch of tokenized texts.

        ``score`` has the trailing singleton axis of the regression head
        squeezed away (one value per example); ``class_logits`` has shape
        ``[batch, num_labels]``.
        """
        outputs = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Position 0 of the last hidden state is used as the sequence summary.
        cls_embedding = self.dropout(outputs.last_hidden_state[:, 0, :])

        score = self.regressor(cls_embedding).squeeze(1)
        class_logits = self.classifier(cls_embedding)

        return score, class_logits


In [None]:
from torch.utils.data import Dataset

class MultiTaskDataset(Dataset):
    """Torch dataset yielding tokenized text plus regression and label targets.

    Each item is a dict with:
      - "input_ids" / "attention_mask": the tokenizer output with its leading
        batch axis squeezed away (text padded/truncated to ``max_length``);
      - "score": the row's "score" value as a float tensor;
      - "class_labels": the row's label columns as a float tensor.

    Args:
        df: frame holding "texte_total", "score" and the label columns.
        tokenizer: callable invoked as
            ``tokenizer(text, padding="max_length", truncation=True,
            max_length=..., return_tensors="pt")``.
        max_length: sequence length every text is padded/truncated to.
        label_columns: names of the label columns. When omitted, the
            notebook-level ``label_cols`` list is used (resolved once, at
            construction time).
    """

    def __init__(self, df, tokenizer, max_length=256, label_columns=None):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length
        if label_columns is None:
            # Backward-compatible fallback on the notebook global.
            label_columns = label_cols
        self.label_columns = list(label_columns)

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        # Look the row up once instead of re-running ``iloc`` for every field.
        row = self.df.iloc[idx]

        inputs = self.tokenizer(
            str(row["texte_total"]),
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        )

        return {
            # return_tensors="pt" adds a leading batch axis of size 1; drop it
            # so the DataLoader can stack items itself.
            "input_ids": inputs["input_ids"].squeeze(0),
            "attention_mask": inputs["attention_mask"].squeeze(0),
            "score": torch.tensor(row["score"], dtype=torch.float),
            "class_labels": torch.tensor(
                row[self.label_columns].to_numpy(dtype=float),
                dtype=torch.float  # or torch.long for a CrossEntropy-style classification
            ),
        }


In [None]:
from transformers import CamembertTokenizer
from torch.utils.data import DataLoader

# One tokenizer shared by every dataset built in this notebook.
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Wrap each split; both rely on MultiTaskDataset's default max_length.
train_dataset, test_dataset = (
    MultiTaskDataset(split, tokenizer) for split in (train_df, test_df)
)

# Only the training batches are shuffled; the test loader keeps row order.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=16, shuffle=False)

In [None]:
from tqdm import tqdm
# NOTE(review): `device` is not defined anywhere in this notebook; presumably it is
# provided by the `%run training_theme_classification` cell — confirm.
model = CamembertMultiTaskModel().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Both heads are trained with MSE: the 7 label outputs are regressed onto the
# label values (predictions are rounded at evaluation time).
loss_reg = nn.MSELoss()
loss_cls = nn.MSELoss()
num_epochs = 30

for epoch in tqdm(range(num_epochs)):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels_score = batch['score'].to(device)
        labels_cls = batch['class_labels'].to(device)

        pred_score, pred_cls = model(input_ids, attention_mask)

        loss1 = loss_reg(pred_score, labels_score)
        loss2 = loss_cls(pred_cls, labels_cls)

        # Unweighted sum of the regression and label losses.
        loss = loss1 + loss2

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Report the accumulated loss so training progress is visible;
    # tqdm.write prints without breaking the progress bar.
    mean_loss = total_loss / max(len(train_loader), 1)
    tqdm.write(f"Epoch {epoch + 1}/{num_epochs} — mean training loss: {mean_loss:.4f}")


In [None]:
from sklearn.metrics import mean_squared_error, r2_score

model.eval()
all_scores = []
all_preds = []
all_true = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        pred_score, pred_class = model(input_ids, attention_mask)

        all_scores.extend(pred_score.cpu().numpy())
        all_preds.extend(pred_class.cpu().numpy())
        # The true labels are only collected for the report, so they are read
        # straight from the batch instead of being moved to the device and back.
        all_true.extend(batch['class_labels'].numpy())

# test_loader is built with shuffle=False, so the predictions are in the same
# order as the rows of test_df.
mse = mean_squared_error(test_df["score"], all_scores)
r2 = r2_score(test_df["score"], all_scores)
print(f"Test — MSE: {mse:.4f} | R²: {r2:.4f}")


In [None]:
from sklearn.metrics import classification_report
import numpy as np

# Round the continuous head outputs to the nearest integer so they can be
# compared with the annotated label values.
y_true = np.asarray(all_true)
y_pred = np.asarray(all_preds).round()

# One report per label column.
for position, label_name in enumerate(label_cols):
    print(f"\n--- {label_name} ---")
    report = classification_report(
        y_true[:, position],
        y_pred[:, position],
        digits=3,
        zero_division=0,
    )
    print(report)


In [None]:
# Rebuild the full merged table (annotated and unannotated articles) from a
# fresh read of the corpus. The merge uses the frame loaded here rather than
# the `df` left over from an earlier cell.
df_all = pd.read_csv("Data/articles_actualite.csv")
df_merged = df_all.merge(score, on="identifiant", how="left")
df_merged["texte_total"] = df_merged["titre"].fillna("") + " " + df_merged["texte"].fillna("")

In [None]:
# Rows that already carry a score versus rows the model still has to fill in.
has_score = df_merged["score"].notna()
has_text = df_merged["texte_total"].notna()

df_annotated = df_merged.loc[has_score].copy()

# reset_index() (without drop) keeps the original row labels in an "index"
# column, which is what lets predictions be written back to df_merged later.
df_to_predict = df_merged.loc[~has_score & has_text].copy().reset_index()

In [None]:
predict_dataset = MultiTaskDataset(df_to_predict, tokenizer)
predict_loader = DataLoader(predict_dataset, batch_size=16, shuffle=False)

model.eval()
pred_scores = []
pred_labels = []

with torch.no_grad():
    for batch in tqdm(predict_loader):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        # Use batch-local names: unpacking into `score` would overwrite the
        # `score` annotation DataFrame and break any re-run of the merge cells.
        batch_scores, batch_logits = model(input_ids, attention_mask)
        pred_scores.extend(batch_scores.cpu().numpy())
        # Label outputs are rounded to the nearest integer value.
        pred_labels.extend(np.round(batch_logits.cpu().numpy()))


In [None]:
# Row labels of df_merged that were sent to the model, in prediction order.
original_idx = df_to_predict["index"]
row_labels = original_idx.to_numpy()

# Index the predictions by those row labels. A length mismatch between the
# predictions and the rows raises here instead of silently misaligning.
score_by_row = pd.Series(pred_scores, index=row_labels, dtype="float64")
labels_by_row = pd.DataFrame(pred_labels, index=row_labels, columns=label_cols, dtype="float64")

# Vectorised write-back: a prediction is stored only where the annotated value
# is missing; every other row gets NaN. The *_pred columns are always created,
# even when there was nothing to predict.
df_merged["score_pred"] = score_by_row.reindex(df_merged.index).where(df_merged["score"].isna())
for col in label_cols:
    df_merged[col + "_pred"] = labels_by_row[col].reindex(df_merged.index).where(df_merged[col].isna())


In [None]:
# For the score and every label: keep the annotated value when present,
# otherwise fall back to the model prediction.
for base in ["score", *label_cols]:
    df_merged[f"{base}_final"] = df_merged[base].fillna(df_merged[f"{base}_pred"])

In [None]:
# Columns exported for each article: metadata, text, then the final targets.
metadata_cols = ["identifiant", "journal_clean", "titre", "annee", "mois", "jour", "texte_total"]
final_cols = metadata_cols + ["score_final"] + [f"{name}_final" for name in label_cols]

# Independent copy, so later additions to df_final do not touch df_merged.
df_final = df_merged.loc[:, final_cols].copy()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Row-wise mean of the final label columns, stored on df_final for plotting.
final_label_cols = [f"{name}_final" for name in label_cols]
df_final["mean_variables"] = df_final[final_label_cols].mean(axis=1)

fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(data=df_final, x="mean_variables", y="score_final", alpha=0.6, ax=ax)
ax.plot([-1, 1], [-1, 1], 'r--')  # dashed y = x reference line over [-1, 1]
ax.set_xlabel("Moyenne des 7 variables finales")
ax.set_ylabel("Score final")
ax.set_title("Score vs Moyenne des variables finales")
ax.grid(True)
fig.savefig("score_vs_moyenne_variables.png", dpi=300, bbox_inches="tight")
plt.show()

In [65]:
# index=False: the row index carries no information, and writing it would come
# back as an "Unnamed: 0" column when the file is read again.
# NOTE(review): confirm no downstream reader relies on that extra index column.
df_final.to_csv('Data/articles_with_scores.csv', index=False)