In [1]:
from gensim.models import Word2Vec
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer


import dill as pickle
import gensim.downloader as api
import joblib
import numpy as np
import re
import sys

In [2]:
sys.path.append('..')

from baselines import EmbeddingBaseline
from knowledgebase import KnowledgeBase

In [3]:
# Load a derived dataset from the "TREC CAR 2017" dataset
dataset = {
    "queries": pickle.load(open("queries.pkl", "rb")),
    "documents": pickle.load(open("documents.pkl", "rb")),
    "relevances": pickle.load(open("relevances.pkl", "rb"))
}

In [4]:
def load_entity_qrels(filepath):
    qrels = {}
    with open(filepath, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 4:
                continue
            query_id, _, entity_id, score = parts
            qrels.setdefault(query_id, set()).add(entity_id)
    return qrels


In [18]:
toplevel_qrels = load_entity_qrels("fold-0-train.pages.cbor-toplevel.entity.qrels")
hierarchical_qrels = load_entity_qrels("fold-0-train.pages.cbor-hierarchical.entity.qrels")
article_qrels = load_entity_qrels("fold-0-train.pages.cbor-article.entity.qrels")


In [5]:
galago_stopwords_file = './galago_418_inquery_stopwords.txt'

# Load Galago stopwords into a Python list
with open(galago_stopwords_file, 'r') as f:
    galago_stopwords = [line.strip() for line in f if line.strip()]

In [6]:
# Ressources Installation for NLTK
download("punkt")
download("stopwords")
download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [7]:
#Le nouveau PreProcess 
english_stopwords= galago_stopwords
stemmer = SnowballStemmer("english")

def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    words = word_tokenize(text)
    words_stemmed = [stemmer.stem(w) for w in words if w not in english_stopwords]
    return words_stemmed

In [8]:
# Create the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=english_stopwords)

# Train the TFIDF-Vectorizer on the corpus
def generate_docs_texts():
    for text in dataset["documents"].values():
        yield text

documents_vectors = vectorizer.fit_transform(generate_docs_texts())
documents_vectors = documents_vectors.toarray()



In [9]:
qrels = dataset["relevances"]
queries = dataset["queries"]

zero_relevance_queries = [
    qid for qid in queries
    if not any(qid == rel_qid for (rel_qid, _) in qrels)
]

print(f"{len(zero_relevance_queries)} requêtes n'ont aucun document pertinent.")
print(len(queries))

47 requêtes n'ont aucun document pertinent.
477


In [10]:
pretrained_model = api.load("fasttext-wiki-news-subwords-300")

In [11]:
knowledge_base = KnowledgeBase()
knowledge_base.load("../models/knowledge_base.pkl")
entities = list(knowledge_base.kb)

In [12]:
corpus = list(dataset["documents"].values())

model = Word2Vec(vector_size=300, window=10, sample=1e-3, min_count=0, sg=1)
model.build_vocab(corpus + entities)
total_examples = model.corpus_count
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

model.train(corpus, total_examples=total_examples, epochs=2)
model.save("../models/skipgram-entities.model")

In [14]:
def get_h_names_expansion(model,query_id, article_qrels):
    entities = article_qrels.get(query_id, [])
    vectors = []
    for eid in entities:
        tokens = eid.replace("enwiki:", "").replace("_", " ").split()
        for token in tokens:
            if token in model.wv:
                vectors.append(model.wv[token])
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(model.vector_size)

In [15]:

import metrics as metrics
import numpy as np
from baselines import EmbeddingBaseline


class EmbeddingBaselineHNames(EmbeddingBaseline):
    def __init__(self, dataset, w2v_model, article_qrels):
            super().__init__(dataset, w2v_model)
            self.article_qrels = article_qrels

    def get_query_vector(self, query, expansion=False):
        if expansion:
            query_id = query[0]  # exemple : "enwiki:Allergy/Signs%20and%20symptoms"
            return get_h_names_expansion(self.w2v_model, query_id, self.article_qrels)
        else:
            return self.vectorize(query[1][0])


In [19]:
embeddingHnames_baseline = EmbeddingBaselineHNames(dataset, model, article_qrels)


In [20]:
results_embedding = embeddingHnames_baseline.eval_model()

In [21]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.13279297451249622, 'MAP': 0.007712892408018277, 'RPrec': 0.0009433962264150943}


In [24]:
results_embedding_expanded =embeddingHnames_baseline.eval_model(expansion=True)

In [25]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.1289461152791479, 'MAP': 0.007139238354286149, 'RPrec': 0.000817520471608522}


In [26]:
def get_r_aliases_embed(model, query_id, toplevel_qrels):
    entities = toplevel_qrels.get(query_id, [])
    vectors = []
    for eid in entities:
        tokens = eid.replace("enwiki:", "").replace("_", " ").split()
        for token in tokens:
            if token in model.wv:
                vectors.append(model.wv[token])
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(model.vector_size)

In [27]:
# from expansion2 import get_alias_expansion
import metrics as metrics
import numpy as np
from baselines import EmbeddingBaseline

class EmbeddingBaselineAliases(EmbeddingBaseline):
    def __init__(self, dataset, w2v_model, toplevel_qrels):
        super().__init__(dataset, w2v_model)
        self.toplevel_qrels = toplevel_qrels

    def get_query_vector(self, query, expansion=False):
        if expansion:
            query_id = query[0]
            return get_r_aliases_embed(self.w2v_model, query_id, self.toplevel_qrels)
        else:
            return self.vectorize(query[1][0])


In [30]:
embeddingaliases_baseline = EmbeddingBaselineAliases(dataset, model, toplevel_qrels)


In [36]:
results_embedding = embeddingaliases_baseline.eval_model()

In [37]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.13279297451249622, 'MAP': 0.007712892408018277, 'RPrec': 0.0009433962264150943}


In [38]:
results_embedding_expanded = embeddingaliases_baseline.eval_model(expansion=True)

In [39]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.13052761537064678, 'MAP': 0.008085219497893611, 'RPrec': 0.001632801161103048}


In [51]:
def get_ih_ids_embed(model, query_id, hierarchical_qrels):
    entities = hierarchical_qrels.get(query_id, [])
    vectors = []
    for eid in entities:
        entity_token = eid.replace("enwiki:", "").replace(" ", "_") + ".id"
        if entity_token in model.wv:
            vectors.append(model.wv[entity_token])
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(model.vector_size)

In [81]:
class RIH_IDs_Embedding(object):
    def __init__(self, dataset, model, hierarchical_qrels):
        self.dataset = dataset
        self.model = model
        self.hierarchical_qrels = hierarchical_qrels

        # Pré-calcul des embeddings des documents
        self.document_embeddings = np.array([
            self.vectorize(doc) for doc in dataset["documents"].values()
        ])
        self.documents_ids = list(dataset["documents"].keys())

    def vectorize(self, text):
        embedding = np.zeros(self.model.vector_size)
        count = 0
        for word in text.split():
            if word in self.model.wv:
                embedding += self.model.wv[word]
                count += 1
        return embedding / count if count > 0 else embedding

    def get_query_vector(self, query,expansion=False):
        if expansion:
            query_id = query[0]
            return get_ih_ids_embed(self.model, query_id, self.hierarchical_qrels)
        else:
            # Prendre le premier élément du tuple query[1], qui est le texte
            return self.vectorize(query[1][0])
        
    def cosine_similarities(self, query_vec):
        dot_products = self.document_embeddings @ query_vec.reshape(-1, 1)
        norm_query = np.linalg.norm(query_vec)
        norms_docs = np.linalg.norm(self.document_embeddings, axis=1)
        cosine_scores = dot_products.flatten() / (norm_query * norms_docs + 1e-12)
        return cosine_scores

    def get_top_k(self, query, k=1000, expansion=False):
        q_vec = self.get_query_vector(query, expansion)
        scores = self.cosine_similarities(q_vec)
        top_k_idx = np.argsort(scores)[::-1][:k]
        top_k_docs = [self.documents_ids[i] for i in top_k_idx]
        top_k_scores = scores[top_k_idx]
        return top_k_docs, top_k_scores

    def eval_query(self, query, k=100, expansion=False):
        scores = {
            "NDCG": metrics.NDCG,
            "MAP": metrics.AP,
            "RPrec": metrics.RPrec
        }
        results = {}
        # On passe query complet à get_top_k, pas le vecteur
        docs, _ = self.get_top_k(query, k, expansion=expansion)
        for metric_name, metric_callback in scores.items():
            results[metric_name] = metric_callback(query[0], docs, self.dataset["relevances"])
        return results


    def eval_model(self, k=1000,expansion=False):

        results = {
            "NDCG": [],
            "MAP": [],
            "RPrec": []
        }
        for query in self.dataset["queries"].items():
            query_results = self.eval_query(query, k,expansion)
            for metric_name, metric_value in query_results.items():
                results[metric_name].append(metric_value)
        results = {metric: np.mean(values) for metric, values in results.items()}
        return results


In [82]:
rih_ids_model = RIH_IDs_Embedding(dataset, model, hierarchical_qrels)

In [85]:
results_rih_ids = rih_ids_model.eval_model()
print(results_ih_ids)

{'NDCG': 0.1289461152791479, 'MAP': 0.007139238354286149, 'RPrec': 0.000817520471608522}


In [86]:
results_rih_ids = rih_ids_model.eval_model(expansion=True)
print(results_ih_ids)

{'NDCG': 0.1289461152791479, 'MAP': 0.007139238354286149, 'RPrec': 0.000817520471608522}
