# Projet final : Apprentissage artificiel 
CIOMNE CATALIN & TUYAY RACHELLE

Ce notebook se focalise sur l’analyse du corpus DEFT 2009, en se limitant spécifiquement à la tâche 3. L’objectif principal est d’examiner la capacité des classifieurs à traiter des textes issus de différentes langues (français, italien et anglais) et à les catégoriser correctement. Pour ce faire, nous avons choisi d’évaluer les performances des modèles sur l’ensemble des textes, toutes langues confondues, afin de tester leur robustesse dans un contexte multilingue.

Plusieurs classifieurs ont été sélectionnés pour cette expérimentation : LinearSVC, Régression Logistique et Multinomial Naive Bayes. 

Version de Python utilisée : 3.12

In [1]:
import os
import xml.etree.ElementTree as ET

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC, LinearSVC

In [2]:
def extract_docs_from_xml(xml_path, with_label=True):
    """Parse one corpus XML file into parallel lists of texts, ids and labels.

    Every ``<doc>`` that is a direct child of the XML root yields one entry.
    The text of a document is the space-joined content of the ``<p>``
    children of its ``<texte>`` element.

    Args:
        xml_path: Path of the XML file to parse.
        with_label: When true, read each document's label from the ``valeur``
            attribute of its first ``<PARTI>`` descendant.

    Returns:
        A tuple ``(texts, doc_ids, labels)`` of lists. ``texts`` and
        ``doc_ids`` always have the same length; ``labels`` matches them when
        ``with_label`` is true and is empty otherwise.

    Raises:
        KeyError: If a ``<doc>`` has no ``id`` attribute.
        ValueError: If ``with_label`` is true and a document carries no
            ``<PARTI valeur="...">`` element.
    """
    root = ET.parse(xml_path).getroot()

    texts = []
    doc_ids = []
    labels = []

    for doc in root.findall("doc"):
        doc_id = doc.attrib["id"]

        if with_label:
            parti = doc.find(".//PARTI")
            if parti is None or "valeur" not in parti.attrib:
                # Fail with the offending document named instead of an opaque
                # AttributeError on None.
                raise ValueError(
                    f"Document {doc_id!r} in {xml_path} has no <PARTI valeur=...> label"
                )
            labels.append(parti.attrib["valeur"])

        # A document without <texte> contributes an empty text so that the
        # three output lists stay aligned.
        body = doc.find("texte")
        paragraphs = body.findall("p") if body is not None else []

        # itertext() also collects text located inside or after inline child
        # elements, which p.text alone would drop.
        chunks = ("".join(p.itertext()).strip() for p in paragraphs)
        texts.append(" ".join(chunk for chunk in chunks if chunk))
        doc_ids.append(doc_id)

    return texts, doc_ids, labels


def load_corpus(folder, with_label=True):
    """Load every ``.xml`` file of a folder into flat lists.

    Files are read in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, and the order of the returned documents feeds the seeded
    train/test split, so an unsorted listing would make the split differ from
    one machine to another.

    Args:
        folder: Directory containing the corpus XML files.
        with_label: Forwarded to ``extract_docs_from_xml``.

    Returns:
        A tuple ``(all_texts, all_doc_ids, all_labels)`` concatenating the
        per-file results of ``extract_docs_from_xml``.
    """
    all_texts = []
    all_doc_ids = []
    all_labels = []

    for file in sorted(os.listdir(folder)):
        if not file.endswith(".xml"):
            continue

        path = os.path.join(folder, file)
        texts, doc_ids, labels = extract_docs_from_xml(path, with_label)
        all_texts.extend(texts)
        all_doc_ids.extend(doc_ids)
        all_labels.extend(labels)

    return all_texts, all_doc_ids, all_labels


In [3]:
# Corpus locations, relative to the notebook's working directory.
train_dir = "data_deft/Corpus d_apprentissage"
test_dir = "data_deft/Corpus de test"

# Only texts and labels are needed for training; the document ids are dropped.
X, _, y = load_corpus(train_dir, with_label=True)

print(f"Nombre de documents : {len(X)}")
print(f"Nombre de labels : {len(set(y))}")

# Hold out 20% of the labelled corpus, stratified on the label so that both
# parts keep the same class proportions; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"Train size : {len(X_train)}")
print(f"Test size : {len(X_test)}")

Nombre de documents : 58110
Nombre de labels : 5
Train size : 46488
Test size : 11622


In [4]:
# TF-IDF vectorisation over lowercased unigrams and bigrams. Terms found in
# more than 90% of the training documents (max_df) or in fewer than 5 of them
# (min_df) are dropped from the vocabulary.
# TfidfVectorizer comes from the notebook's import cell.
vectorizer = TfidfVectorizer(
    lowercase=True,
    max_df=0.9,
    min_df=5,
    ngram_range=(1, 2),
)

# NOTE: X_train / X_test are rebound here from raw texts to TF-IDF matrices.
# Re-running this cell without re-running the split cell first would feed the
# matrices back into the vectorizer, so always run the two together.
# The vocabulary is fitted on the training part only, then reused as-is.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Régression logistique

In [5]:
# Baseline classifier: logistic regression on the TF-IDF features.
# LogisticRegression and classification_report are already provided by the
# notebook's import cell, so they are not re-imported here.
# max_iter=1000 gives the solver a larger iteration budget to converge.
model = LogisticRegression(max_iter=1000)

model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [6]:
# Evaluate on the held-out split. The predictions are computed once and reused
# for the accuracy: model.score() would run predict() a second time and return
# the same value.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

        ELDR       0.69      0.08      0.14      1205
     GUE-NGL       0.68      0.50      0.58      1613
      PPE-DE       0.49      0.78      0.60      4115
         PSE       0.45      0.47      0.46      3264
   Verts-ALE       0.58      0.11      0.18      1425

    accuracy                           0.50     11622
   macro avg       0.58      0.39      0.39     11622
weighted avg       0.54      0.50      0.46     11622



0.49991395628979524

# Naive Bayes multinomial

In [7]:
# Multinomial Naive Bayes on the TF-IDF features, with additive smoothing
# alpha=0.01. MultinomialNB is imported in the notebook's import cell.
model = MultinomialNB(alpha=0.01)
model.fit(X_train, y_train)

0,1,2
,alpha,0.01
,force_alpha,True
,fit_prior,True
,class_prior,


In [8]:
# Evaluate on the held-out split. The predictions are computed once and reused
# for the accuracy: model.score() would run predict() a second time and return
# the same value.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

        ELDR       0.72      0.04      0.07      1205
     GUE-NGL       0.74      0.41      0.53      1613
      PPE-DE       0.47      0.79      0.59      4115
         PSE       0.42      0.45      0.44      3264
   Verts-ALE       0.53      0.09      0.15      1425

    accuracy                           0.48     11622
   macro avg       0.58      0.36      0.36     11622
weighted avg       0.53      0.48      0.43     11622



0.47943555326105664

# LinearSVC

In [9]:
# Linear SVM with default hyper-parameters. LinearSVC is imported in the
# notebook's import cell.
# The underlying solver draws random numbers while fitting, so the seed is
# fixed to make the reported scores repeatable from one run to the next.
model = LinearSVC(random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1.0
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,verbose,0


In [10]:
# Evaluate on the held-out split. The predictions are computed once and reused
# for the accuracy: model.score() would run predict() a second time and return
# the same value.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

        ELDR       0.50      0.23      0.31      1205
     GUE-NGL       0.65      0.58      0.61      1613
      PPE-DE       0.53      0.70      0.61      4115
         PSE       0.46      0.50      0.48      3264
   Verts-ALE       0.49      0.27      0.35      1425

    accuracy                           0.52     11622
   macro avg       0.53      0.45      0.47     11622
weighted avg       0.52      0.52      0.51     11622



0.5237480640165204

In [11]:
# Tuned variant: same linear SVM, with class_weight="balanced" so that errors
# on the less frequent classes weigh more during training.
# The seed is fixed because the underlying solver draws random numbers while
# fitting; without it the scores can vary slightly between runs.
model = LinearSVC(
    C=1,
    class_weight="balanced",
    random_state=42,
)

model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,loss,'squared_hinge'
,dual,'auto'
,tol,0.0001
,C,1
,multi_class,'ovr'
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,verbose,0


In [12]:
# Evaluate on the held-out split. The predictions are computed once and reused
# for the accuracy: model.score() would run predict() a second time and return
# the same value.
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

accuracy_score(y_test, y_pred)

              precision    recall  f1-score   support

        ELDR       0.42      0.34      0.37      1205
     GUE-NGL       0.60      0.64      0.62      1613
      PPE-DE       0.57      0.63      0.60      4115
         PSE       0.49      0.48      0.48      3264
   Verts-ALE       0.43      0.38      0.40      1425

    accuracy                           0.53     11622
   macro avg       0.50      0.49      0.49     11622
weighted avg       0.52      0.53      0.52     11622



0.525124763379797

# Évaluation finale sur le corpus de test

In [13]:
def load_corpus_txt(folder, encoding="utf-8"):
    """Load reference labels from every ``.txt`` file of a folder.

    Each non-empty line is expected to hold ``id<TAB>label``. When an id
    appears more than once, in the same file or across files, the first
    occurrence is kept. Files are read in sorted name order so that "first"
    does not depend on the arbitrary order returned by ``os.listdir``.

    Lines that do not split into exactly two tab-separated fields are
    skipped, as ``load_predictions_txt`` does.

    Args:
        folder: Directory containing the reference ``.txt`` files.
        encoding: Text encoding used to read the files.

    Returns:
        A dictionary ``{id: label}``.
    """
    references = {}

    for filename in sorted(os.listdir(folder)):
        if not filename.endswith(".txt"):
            continue

        with open(os.path.join(folder, filename), "r", encoding=encoding) as f:
            for line in f:
                line = line.strip()
                if not line:
                    continue

                parts = line.split("\t")
                if len(parts) != 2:
                    continue  # skip malformed lines

                sample_id, label = parts
                # setdefault leaves an existing entry untouched, which keeps
                # the first label seen for a duplicated id.
                references.setdefault(sample_id, label)

    return references

def load_predictions_txt(file_path, encoding="utf-8"):
    """Read a tab-separated prediction file into a dictionary.

    Each line is stripped and split on tabs. Only lines yielding exactly two
    fields (``id`` and ``label``) are kept; blank and malformed lines are
    ignored. If an id occurs several times, the last label read wins.

    Args:
        file_path: Path of the prediction file.
        encoding: Text encoding used to read the file.

    Returns:
        A dictionary ``{id: label}``.
    """
    with open(file_path, "r", encoding=encoding) as handle:
        # A blank line splits into a single empty field, so the length check
        # below rejects it together with any other malformed line.
        fields_per_line = (raw.strip().split("\t") for raw in handle)
        return {
            fields[0]: fields[1]
            for fields in fields_per_line
            if len(fields) == 2
        }

In [14]:
# Final run on the official test corpus, using the most recently fitted `model`.
# The raw test texts get their own name: rebinding X_test here would replace
# the held-out TF-IDF matrix and break any re-run of the evaluation cells above.
test_texts, doc_ids, _ = load_corpus(test_dir, with_label=False)
X_test_tfidf = vectorizer.transform(test_texts)
y_pred = model.predict(X_test_tfidf)

# Check the alignment before writing: zip() would silently truncate to the
# shorter sequence and hide a mismatch.
assert len(doc_ids) == len(y_pred)

# The evaluation below indexes predictions and references by document id, so
# repeated ids (for instance the same id reused in several corpus files) would
# overwrite each other. Surface that instead of letting it pass silently.
n_duplicate_ids = len(doc_ids) - len(set(doc_ids))
if n_duplicate_ids:
    print(
        f"Attention : {n_duplicate_ids} identifiants de documents en double dans le "
        "corpus de test ; leurs prédictions s'écraseront lors de l'évaluation par identifiant."
    )

# Save the predictions, one "id<TAB>label" line per document.
with open("predictions.txt", "w", encoding="utf-8") as f:
    for doc_id, label in zip(doc_ids, y_pred):
        f.write(f"{doc_id}\t{label}\n")


# Load the gold labels and read the predictions back from disk.
gold_dir = "data_deft/Données de référence"
references = load_corpus_txt(gold_dir)
predictions = load_predictions_txt("predictions.txt")

# Uncomment to inspect the first entries of each mapping.
#list(references.items())[:50]
#list(predictions.items())[:50]

In [15]:
# Align gold labels and predictions on the document id; references without a
# prediction are counted rather than scored.
y_true = []
y_pred = []
missing = 0
for doc_id, true_label in references.items():
    if doc_id in predictions:
        y_true.append(true_label)
        y_pred.append(predictions[doc_id])
    else:
        missing += 1

# Report once, after the loop. Inside the loop this message would be printed
# again on every iteration following the first missing document.
if missing > 0:
    print(f"Il manque {missing} textes dans les predictions.")

# NOTE(review): the scores recorded for this cell (accuracy ~0.26, per-class
# precision close to each class's share of the support) are far below the
# ~0.52 obtained on the held-out split. That pattern suggests that ids are not
# unique across the corpus / reference files, so that predictions and gold
# labels of different documents get paired -- TODO confirm by checking id
# uniqueness per file before trusting these numbers.

# Evaluation
print(classification_report(y_true, y_pred, digits=3, zero_division=0))

print("Accuracy:", accuracy_score(y_true, y_pred))
# zero_division=0 matches the report above and avoids a warning when a class
# is never predicted; the score itself is unchanged.
print("Macro-F1:", f1_score(y_true, y_pred, average="macro", zero_division=0))

              precision    recall  f1-score   support

        ELDR      0.111     0.101     0.106      1339
     GUE-NGL      0.139     0.145     0.142      1795
      PPE-DE      0.362     0.380     0.371      4571
         PSE      0.286     0.278     0.282      3627
   Verts-ALE      0.125     0.118     0.121      1585

    accuracy                          0.258     12917
   macro avg      0.205     0.204     0.204     12917
weighted avg      0.255     0.258     0.256     12917

Accuracy: 0.25772238135790043
Macro-F1: 0.20437019531176265
