In [33]:
import json
import sys
import os
from collections import defaultdict
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.cluster import KMeans
import numpy as np
import json



from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from matplotlib import pyplot as plt


sys.path.append(os.path.join('../pstal-etu/lib/'))
from conllulib import CoNLLUReader

In [23]:

# Load the label mapping and the stored training embeddings.
# The encoding is passed explicitly so that parsing does not depend on the
# platform's default text encoding, and each handle gets its own name instead
# of leaving a closed handle called `file` in the notebook namespace.
with open("mapping.json", "r", encoding="utf-8") as mapping_file:
    mapping = json.load(mapping_file)

with open("embeddings.json", "r", encoding="utf-8") as embeddings_file:
    data_train = json.load(embeddings_file)

# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained('almanach/camembert-base')
model = AutoModel.from_pretrained('almanach/camembert-base')


In [18]:
def dataloader_with_embeddings(file, mapping):
    """
    Read a CoNLL-U file and build one embedding per word, paired with its class.

    Each sentence is tokenized with the notebook-level ``tokenizer`` and passed
    through the notebook-level ``model``. The vectors of the sub-tokens that
    belong to the same word are averaged into a single vector for that word.

    Args:
        file: Path of the CoNLL-U file to read.
        mapping: Lookup from the value of the ``frsemcor:noun`` field to the
            class stored for the word.

    Returns:
        A ``(data, mapping)`` tuple. ``data`` is a list of dicts with the keys
        ``"word"``, ``"embedding"`` (the averaged vector as a list) and
        ``"class"``. ``mapping`` is the argument, returned as received.

    Raises:
        KeyError: If a label read from the file is not a key of ``mapping``.
    """
    data = []
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Use a separate name for the handle so the `file` argument keeps holding
    # the path, and read the corpus as UTF-8 regardless of the platform locale.
    with open(file, 'r', encoding='utf-8') as conllu_file:
        reader = CoNLLUReader(conllu_file)
        for sent in reader.readConllu():
            mots = [tok["form"] for tok in sent]
            labels = [tok["frsemcor:noun"] for tok in sent]

            token_obj = tokenizer(mots, is_split_into_words=True, return_tensors='pt').to(device)

            # Group sub-token positions by the word they come from in a single
            # pass, instead of rescanning every position for every word.
            # Positions without a word id are ignored.
            positions_by_word = {}
            for position, w_id in enumerate(token_obj.word_ids()):
                if w_id is not None:
                    positions_by_word.setdefault(w_id, []).append(position)

            with torch.no_grad():
                embeddings = model(**token_obj)["last_hidden_state"]

            for word_idx, mot in enumerate(mots):
                token_indices = positions_by_word.get(word_idx)
                # A word that produced no sub-token is skipped.
                if token_indices:
                    avg_embedding = embeddings[:, token_indices, :].mean(dim=1).squeeze(0).cpu().numpy()
                    data.append({
                        "word": mot,
                        "embedding": avg_embedding.tolist(),
                        "class": mapping[labels[word_idx]]
                    })
    return data, mapping

In [36]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
import json
import torch
from transformers import AutoTokenizer, AutoModel
# from conllulib import CoNLLUReader

# Load the tokenizer and the model, then place the model on the selected device
model = AutoModel.from_pretrained('almanach/camembert-base')
tokenizer = AutoTokenizer.from_pretrained('almanach/camembert-base')
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

path = '.'
# Load the training embeddings and the mapping.
# The encoding is explicit so parsing does not depend on the platform locale,
# and the handles get their own names instead of leaving a closed `file` around.
with open("embeddings.json", "r", encoding="utf-8") as embeddings_file:
    train_data = json.load(embeddings_file)

with open("mapping.json", "r", encoding="utf-8") as mapping_file:
    mapping = json.load(mapping_file)

# Collect the training embeddings of each class; class ids are taken to be
# 0 .. len(mapping) - 1
num_classes = len(mapping)
class_embeddings = {cls: [] for cls in range(num_classes)}

for item in train_data:
    class_embeddings[item["class"]].append(np.array(item["embedding"]))

# Mean embedding per class. A class with no training example gets a zero
# vector shaped like the first training embedding.
class_means = {
    cls: np.mean(embeddings, axis=0) if embeddings else np.zeros_like(train_data[0]["embedding"])
    for cls, embeddings in class_embeddings.items()
}

def dataloader_test(file):
    """
    Read a CoNLL-U file and build one embedding per word with its gold label.

    Uses the notebook-level ``tokenizer``, ``model``, ``device`` and
    ``mapping``. The vectors of the sub-tokens that belong to the same word
    are averaged into a single vector for that word.

    Args:
        file: Path of the CoNLL-U file to read.

    Returns:
        A list of dicts with the keys ``"word"``, ``"embedding"`` (the averaged
        vector as a list) and ``"true_label"``. ``"true_label"`` is the value
        ``mapping`` holds for the token's ``frsemcor:noun`` field, or ``-1``
        when that label is not in ``mapping``. A token without the field is
        looked up with the empty string.
    """
    test_data = []
    # Read the corpus as UTF-8 regardless of the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        reader = CoNLLUReader(f)
        for sent in reader.readConllu():
            mots = [tok["form"] for tok in sent]
            labels = [tok.get("frsemcor:noun", "") for tok in sent]

            token_obj = tokenizer(mots, is_split_into_words=True, return_tensors='pt').to(device)

            # Group sub-token positions by the word they come from in a single
            # pass, instead of rescanning every position for every word.
            # Positions without a word id are ignored.
            positions_by_word = {}
            for position, w_id in enumerate(token_obj.word_ids()):
                if w_id is not None:
                    positions_by_word.setdefault(w_id, []).append(position)

            with torch.no_grad():
                embeddings = model(**token_obj)["last_hidden_state"]

            for word_idx, mot in enumerate(mots):
                token_indices = positions_by_word.get(word_idx)
                # A word that produced no sub-token is skipped.
                if token_indices:
                    avg_embedding = embeddings[:, token_indices, :].mean(dim=1).squeeze(0).cpu().numpy()
                    test_data.append({
                        "word": mot,
                        "embedding": avg_embedding.tolist(),
                        "true_label": mapping.get(labels[word_idx], -1)
                    })
    return test_data

# Load the evaluation data
# NOTE(review): this path ends in ".train" although it is used as the test set.
# Confirm this is intended; if embeddings.json was built from the same file,
# the report below is measured on the data KMeans was fitted on.
file_test = os.path.join(path, '../pstal-etu/sequoia/sequoia-ud.parseme.frsemcor.simple.train')
test_data = dataloader_test(file_test)


from collections import defaultdict

# Fit KMeans on the training embeddings, with one cluster per class
kmeans = KMeans(n_clusters=num_classes, random_state=42)
train_embeddings = [item["embedding"] for item in train_data]
train_labels = [item["class"] for item in train_data]
kmeans.fit(train_embeddings)

# Count, for every cluster, how many training points of each class it received
cluster_class_counts = defaultdict(lambda: defaultdict(int))
for cluster, label in zip(kmeans.labels_, train_labels):
    cluster_class_counts[cluster][label] += 1

# Map each cluster to its most frequent training class
cluster_to_class = {
    cluster: max(class_count, key=class_count.get)
    for cluster, class_count in cluster_class_counts.items()
}

# Predict the cluster of every test word in a single call rather than calling
# predict once per word. The guard keeps an empty test set from reaching
# predict with an empty array.
if test_data:
    test_clusters = kmeans.predict(np.array([item["embedding"] for item in test_data]))
else:
    test_clusters = []

true_labels = [item["true_label"] for item in test_data]
# A cluster that received no training point has no class and is reported as -1
predicted_labels = [cluster_to_class.get(cluster, -1) for cluster in test_clusters]

# Show each word with its gold and predicted label
for item, true_label, predicted_label in zip(test_data, true_labels, predicted_labels):
    print(f"Word: {item['word']}, True Label: {true_label}, Predicted Label: {predicted_label}")

# Compute and print the per-class statistics
print("Classification Report:")
print(classification_report(true_labels, predicted_labels, zero_division=0))


Word: Gutenberg, True Label: 0, Predicted Label: 1
Word: Cette, True Label: 1, Predicted Label: 1
Word: exposition, True Label: 2, Predicted Label: 1
Word: nous, True Label: 1, Predicted Label: 1
Word: apprend, True Label: 1, Predicted Label: 1
Word: que, True Label: 1, Predicted Label: 1
Word: dès, True Label: 1, Predicted Label: 1
Word: le, True Label: 1, Predicted Label: 1
Word: XIIe, True Label: 1, Predicted Label: 1
Word: siècle, True Label: 3, Predicted Label: 1
Word: ,, True Label: 1, Predicted Label: 1
Word: à, True Label: 1, Predicted Label: 1
Word: Dammarie-sur-Saulx, True Label: 4, Predicted Label: 1
Word: ,, True Label: 1, Predicted Label: 1
Word: entre, True Label: 1, Predicted Label: 1
Word: autres, True Label: 1, Predicted Label: 1
Word: sites, True Label: 5, Predicted Label: 1
Word: ,, True Label: 1, Predicted Label: 1
Word: une, True Label: 1, Predicted Label: 1
Word: industrie, True Label: 2, Predicted Label: 1
Word: métallurgique, True Label: 1, Predicted Label: 1
Wo