In [110]:
from gensim.models import Word2Vec
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer


import dill as pickle
import gensim.downloader as api
import joblib
import numpy as np
import re
import sys

In [111]:
sys.path.append('..')

from baselines import EmbeddingBaseline
from knowledgebase import KnowledgeBase

In [6]:
# Load a derived dataset from the "TREC CAR 2017" dataset
dataset = {
    "queries": pickle.load(open("queries.pkl", "rb")),
    "documents": pickle.load(open("documents.pkl", "rb")),
    "relevances": pickle.load(open("relevances.pkl", "rb"))
}

In [7]:
def load_entity_qrels(filepath):
    qrels = {}
    with open(filepath, 'r') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) != 4:
                continue
            query_id, _, entity_id, score = parts
            qrels.setdefault(query_id, set()).add(entity_id)
    return qrels


In [8]:
toplevel_qrels = load_entity_qrels("fold-0-train.pages.cbor-toplevel.entity.qrels")
hierarchical_qrels = load_entity_qrels("fold-0-train.pages.cbor-hierarchical.entity.qrels")
article_qrels = load_entity_qrels("fold-0-train.pages.cbor-article.entity.qrels")


In [9]:
galago_stopwords_file = './galago_418_inquery_stopwords.txt'

# Load Galago stopwords into a Python list
with open(galago_stopwords_file, 'r') as f:
    galago_stopwords = [line.strip() for line in f if line.strip()]

In [10]:
# Ressources Installation for NLTK
download("punkt")
download("stopwords")
download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
#Le nouveau PreProcess 
english_stopwords= galago_stopwords
stemmer = SnowballStemmer("english")

def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    words = word_tokenize(text)
    words_stemmed = [stemmer.stem(w) for w in words if w not in english_stopwords]
    return words_stemmed

In [12]:
# Create the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=english_stopwords)

# Train the TFIDF-Vectorizer on the corpus
def generate_docs_texts():
    for text in dataset["documents"].values():
        yield text

documents_vectors = vectorizer.fit_transform(generate_docs_texts())
documents_vectors = documents_vectors.toarray()



In [13]:
qrels = dataset["relevances"]
queries = dataset["queries"]

zero_relevance_queries = [
    qid for qid in queries
    if not any(qid == rel_qid for (rel_qid, _) in qrels)
]

print(f"{len(zero_relevance_queries)} requêtes n'ont aucun document pertinent.")
print(len(queries))

47 requêtes n'ont aucun document pertinent.
477


In [14]:
pretrained_model = api.load("fasttext-wiki-news-subwords-300")

In [15]:
knowledge_base = KnowledgeBase()
knowledge_base.load("../models/knowledge_base.pkl")
entities = list(knowledge_base.kb)

In [16]:
corpus = list(dataset["documents"].values())

model = Word2Vec(vector_size=300, window=10, sample=1e-3, min_count=0, sg=1)
model.build_vocab(corpus + entities)
total_examples = model.corpus_count
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

model.train(corpus, total_examples=total_examples, epochs=2)
model.save("../models/skipgram-entities.model")

In [75]:
print(f"Nombre total de query_ids dans article_qrels : {len(article_qrels)}")
print("Exemples de query_ids disponibles :")
for i, key in enumerate(article_qrels.keys()):
    print(f"- {key}")
    if i == 10:
        break

Nombre total de query_ids dans article_qrels : 24
Exemples de query_ids disponibles :
- enwiki:Allergy
- enwiki:Candy%20making
- enwiki:Chocolate
- enwiki:Deforestation%20of%20the%20Amazon%20rainforest
- enwiki:Egg%20white
- enwiki:Fight-or-flight%20response
- enwiki:Gaffkaemia
- enwiki:Gut%E2%80%93brain%20axis
- enwiki:Heavy%20water
- enwiki:Ice%20bath
- enwiki:Irritable%20bowel%20syndrome


In [112]:
import urllib.parse
import re

def get_h_names_expansion(model, leaf_heading):
    tokens = leaf_heading.split()
    vectors = []
    try:
        similar = model.wv.most_similar(positive=tokens, topn=5)
    except KeyError:
        similar = []

    expanded_terms = set(tokens + [w for w, _ in similar])

    for word in expanded_terms:
        if word in model.wv:
            vectors.append(model.wv[word])

    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(model.vector_size)



In [113]:

import metrics as metrics
import numpy as np
from baselines import EmbeddingBaseline


class EmbeddingBaselineHNames(EmbeddingBaseline):
    def __init__(self, dataset, w2v_model):
            super().__init__(dataset, w2v_model)

    def get_query_vector(self, query, expansion=False):
        if expansion:
            query_id = query[0]  # exemple : "enwiki:Allergy/Signs%20and%20symptoms"
            return get_h_names_expansion(self.w2v_model, query_id)
        else:
            return self.vectorize(query[1][0])
    def print_top_docs(self, query, expansion=False, top_k=5):
        qvec = self.get_query_vector(query, expansion)
        doc_ids, scores = self.get_top_k(qvec, k=top_k)
        
        print(f"\n🔎 Query: {query[0]} ({query[1][0]})")
        print("Top documents:")
        for doc_id in doc_ids:
            text = self.dataset["documents"].get(doc_id, "[Texte non trouvé]")
            print(f"📄 Doc ID: {doc_id}\n{text[:300]}...\n")


In [114]:
embeddingHnames_baseline = EmbeddingBaselineHNames(dataset, model)


In [115]:
results_embedding = embeddingHnames_baseline.eval_model()

In [116]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.13262531316961995, 'MAP': 0.007713285441813863, 'RPrec': 0.0009433962264150943}


In [117]:
results_embedding_expanded =embeddingHnames_baseline.eval_model(expansion=True)

In [118]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.1289461152791479, 'MAP': 0.007139238354286149, 'RPrec': 0.000817520471608522}


In [119]:
for i, query in enumerate(dataset["queries"].items()):
    if i >= 3:  # Limiter à 3 requêtes pour l'affichage
        break
    embeddingHnames_baseline.print_top_docs(query, expansion=True)


🔎 Query: enwiki:Chocolate/Etymology (Chocolate / Etymology)
Top documents:
📄 Doc ID: 000545072276c41f3bfba06c7f115de95c5997a5
Author Michael Lewis wrote that CDS enabled speculators to stack bets on the same mortgage bonds and CDO's. This is analogous to allowing many persons to buy insurance on the same house. Speculators that bought CDS insurance were betting that significant defaults would occur, while the sellers (such...

📄 Doc ID: ffcab78727700e73aa0100325e181418facc6d44
By September 2008, average U.S. housing prices had declined by over 20% from their mid-2006 peak. This major and unexpected decline in house prices means that many borrowers have zero or negative equity in their homes, meaning their homes were worth less than their mortgages. As of March 2008, an es...

📄 Doc ID: fefb0b295631e75f4030fc0b4160cc9f3451f8e5
The International Monetary Fund estimated that large U.S. and European banks lost more than $1 trillion on toxic assets and from bad loans from January 2007 to S

In [37]:
import urllib.parse

def get_r_aliases_embed(model, query_id, toplevel_qrels):
    entities = toplevel_qrels.get(query_id, [])
    print(f"\n🔍 Query ID: {query_id}")
    print(f"📌 Entités associées : {entities}")
    
    vectors = []
    used_tokens = []
    
    for eid in entities:
        decoded = urllib.parse.unquote(eid.replace("enwiki:", ""))
        tokens = decoded.replace("_", " ").split()

        for token in tokens:
            if token in model.wv:
                used_tokens.append(token)
                vectors.append(model.wv[token])
            else:
                print(f"⚠️ Token ignoré (hors vocabulaire) : {token}")
    
    print(f"✅ Tokens utilisés pour la moyenne : {used_tokens}")
    
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        print("❌ Aucune entité utilisable trouvée !")
        return np.zeros(model.vector_size)


In [122]:
# from expansion2 import get_alias_expansion
import metrics as metrics
import numpy as np
from baselines import EmbeddingBaseline

class EmbeddingBaselineAliases(EmbeddingBaseline):
    def __init__(self, dataset, w2v_model, toplevel_qrels):
        super().__init__(dataset, w2v_model)
        self.toplevel_qrels = toplevel_qrels

    def get_query_vector(self, query, expansion=False):
        if expansion:
            query_id = query[0]
            return get_r_aliases_embed(self.w2v_model, query_id, self.toplevel_qrels)
        else:
            return self.vectorize(query[1][0])

    def print_top_docs(self, query, expansion=False, top_k=5):
        qvec = self.get_query_vector(query, expansion)
        doc_ids, scores = self.get_top_k(qvec, k=top_k)
        
        print(f"\n🔎 Query: {query[0]} ({query[1][0]})")
        print("Top documents:")
        for doc_id in doc_ids:
            text = self.dataset["documents"].get(doc_id, "[Texte non trouvé]")
            print(f"📄 Doc ID: {doc_id}\n{text[:300]}...\n")


In [81]:
print(f"Nombre total de query_ids dans article_qrels : {len(toplevel_qrels)}")
print("Exemples de query_ids disponibles :")
for i, key in enumerate(toplevel_qrels.keys()):
    print(f"- {key}")
    if i == 10:
        break

Nombre total de query_ids dans article_qrels : 161
Exemples de query_ids disponibles :
- enwiki:Allergy
- enwiki:Allergy/Cause
- enwiki:Allergy/Diagnosis
- enwiki:Allergy/Epidemiology
- enwiki:Allergy/History
- enwiki:Allergy/Management
- enwiki:Allergy/Medical%20specialty
- enwiki:Allergy/Pathophysiology
- enwiki:Allergy/Research
- enwiki:Allergy/Signs%20and%20symptoms
- enwiki:Candy%20making


In [123]:
embeddingaliases_baseline = EmbeddingBaselineAliases(dataset, model, toplevel_qrels)


In [124]:
results_embedding = embeddingaliases_baseline.eval_model()

In [125]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.13262531316961995, 'MAP': 0.007713285441813863, 'RPrec': 0.0009433962264150943}


In [126]:
results_embedding_expanded = embeddingaliases_baseline.eval_model(expansion=True)


🔍 Query ID: enwiki:Chocolate/Etymology
📌 Entités associées : {'enwiki:Colonization', 'enwiki:Yucatec%20Maya%20language', 'enwiki:Chocolate%20chip', 'enwiki:Chocolatier', 'enwiki:Nahuatl'}
✅ Tokens utilisés pour la moyenne : ['Colonization', 'Yucatec', 'Maya', 'language', 'Chocolate', 'chip', 'Chocolatier', 'Nahuatl']

🔍 Query ID: enwiki:Chocolate/History
📌 Entités associées : {'enwiki:Moctezuma%20II', 'enwiki:Vanilla', 'enwiki:Jos%C3%A9%20de%20Acosta', 'enwiki:Allspice', 'enwiki:Avocado', 'enwiki:Cymbopetalum%20penduliflorum', 'enwiki:Chili%20pepper', 'enwiki:Christopher%20Columbus', 'enwiki:Society%20of%20Jesus', 'enwiki:Central%20America', 'enwiki:Veracruz', 'enwiki:Hern%C3%A1n%20Cort%C3%A9s', 'enwiki:Chocolate%20temper%20meter', 'enwiki:Chiapas', 'enwiki:Quetzalcoatl', 'enwiki:Maya%20script', 'enwiki:Mokaya', 'enwiki:South%20America', 'enwiki:Conquistador', 'enwiki:Honey', 'enwiki:Olmec', 'enwiki:Encyclop%C3%A9die', 'enwiki:Ferdinand%20Columbus'}
⚠️ Token ignoré (hors vocabulaire) 

In [43]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.1308704255398717, 'MAP': 0.00850410182079018, 'RPrec': 0.00205208837284309}


In [133]:
for i, query in enumerate(dataset["queries"].items()):
    if i >= 3:  # Limiter à 3 requêtes pour l'affichage
        break
    embeddingaliases_baseline.print_top_docs(query, expansion=True)


🔍 Query ID: enwiki:Chocolate/Etymology
📌 Entités associées : {'enwiki:Colonization', 'enwiki:Yucatec%20Maya%20language', 'enwiki:Chocolate%20chip', 'enwiki:Chocolatier', 'enwiki:Nahuatl'}
✅ Tokens utilisés pour la moyenne : ['Colonization', 'Yucatec', 'Maya', 'language', 'Chocolate', 'chip', 'Chocolatier', 'Nahuatl']

🔎 Query: enwiki:Chocolate/Etymology (Chocolate / Etymology)
Top documents:
📄 Doc ID: 359600a21f69259fa48edce87794ad2bea737a77
Deuterium is an isotope of hydrogen whose nucleus comprises both a neutron and a proton; the nucleus of a protium (normal hydrogen) atom consists of just a proton. The additional neutron makes a deuterium atom roughly twice as heavy as a protium atom....

📄 Doc ID: 66bec3101b1b06798c8028c895ee53900fd2a1a3
Domperidone, a dopamine receptor blocker and a parasympathomimetic, has been shown to reduce bloating and abdominal pain as a result of an accelerated colon transit time and reduced faecal load, that is, a relief from 'hidden constipation'; defec

In [128]:
import urllib.parse
import re

def get_rih_ids_embed(model, query_id, hierarchical_qrels):
    names = hierarchical_qrels.get(query_id, [])
    print(f"\n🔍 Query ID: {query_id}")
    print(f"📌 Noms hiérarchiques associés : {names}")

    vectors = []
    used_tokens = []

    for name in names:
        # Nettoyage
        decoded = urllib.parse.unquote(name)
        cleaned = re.sub(r"[^\w\s]", " ", decoded)  # supprime ponctuations
        tokens = cleaned.split()

        for token in tokens:
            if token in model.wv:
                used_tokens.append(token)
                vectors.append(model.wv[token])
            else:
                print(f"⚠️ Token ignoré (hors vocabulaire) : {token}")

    print(f"✅ Tokens utilisés pour la moyenne : {used_tokens}")

    if vectors:
        return np.mean(vectors, axis=0)
    else:
        print("❌ Aucune entité utilisable trouvée !")
        return np.zeros(model.vector_size)


In [129]:
class RIH_IDs_Embedding(object):
    def __init__(self, dataset, model, hierarchical_qrels):
        self.dataset = dataset
        self.model = model
        self.hierarchical_qrels = hierarchical_qrels

        # Pré-calcul des embeddings des documents
        self.document_embeddings = np.array([
            self.vectorize(doc) for doc in dataset["documents"].values()
        ])
        self.documents_ids = list(dataset["documents"].keys())

    def vectorize(self, text):
        embedding = np.zeros(self.model.vector_size)
        count = 0
        for word in text.split():
            if word in self.model.wv:
                embedding += self.model.wv[word]
                count += 1
        return embedding / count if count > 0 else embedding

    def get_query_vector(self, query,expansion=False):
        if expansion:
            query_id = query[0]
            return get_rih_ids_embed(self.model, query_id, self.hierarchical_qrels)
        else:
            # Prendre le premier élément du tuple query[1], qui est le texte
            return self.vectorize(query[1][0])
        
    def cosine_similarities(self, query_vec):
        dot_products = self.document_embeddings @ query_vec.reshape(-1, 1)
        norm_query = np.linalg.norm(query_vec)
        norms_docs = np.linalg.norm(self.document_embeddings, axis=1)
        cosine_scores = dot_products.flatten() / (norm_query * norms_docs + 1e-12)
        return cosine_scores

    def get_top_k(self, query, k=1000, expansion=False):
        q_vec = self.get_query_vector(query, expansion)
        scores = self.cosine_similarities(q_vec)
        top_k_idx = np.argsort(scores)[::-1][:k]
        top_k_docs = [self.documents_ids[i] for i in top_k_idx]
        top_k_scores = scores[top_k_idx]
        return top_k_docs, top_k_scores

    def eval_query(self, query, k=100, expansion=False):
        scores = {
            "NDCG": metrics.NDCG,
            "MAP": metrics.AP,
            "RPrec": metrics.RPrec
        }
        results = {}
        # On passe query complet à get_top_k, pas le vecteur
        docs, _ = self.get_top_k(query, k, expansion=expansion)
        for metric_name, metric_callback in scores.items():
            results[metric_name] = metric_callback(query[0], docs, self.dataset["relevances"])
        return results


    def eval_model(self, k=1000,expansion=False):

        results = {
            "NDCG": [],
            "MAP": [],
            "RPrec": []
        }
        for query in self.dataset["queries"].items():
            query_results = self.eval_query(query, k,expansion)
            for metric_name, metric_value in query_results.items():
                results[metric_name].append(metric_value)
        results = {metric: np.mean(values) for metric, values in results.items()}
        return results

    def print_top_docs(self, query, expansion=False, top_k=5):
        qvec = self.get_query_vector(query, expansion)
        doc_ids, scores = self.get_top_k(qvec, k=top_k)
        
        print(f"\n🔎 Query: {query[0]} ({query[1][0]})")
        print("Top documents:")
        for doc_id in doc_ids:
            text = self.dataset["documents"].get(doc_id, "[Texte non trouvé]")
            print(f"📄 Doc ID: {doc_id}\n{text[:300]}...\n")


In [138]:
rih_ids_model = RIH_IDs_Embedding(dataset, model, hierarchical_qrels)

In [139]:
results_rih_ids = rih_ids_model.eval_model()
print(results_rih_ids)

{'NDCG': 0.13145522812475688, 'MAP': 0.007297598776885572, 'RPrec': 0.0017295597484276728}


In [140]:
results_rih_ids = rih_ids_model.eval_model(expansion=True)
print(results_rih_ids)


🔍 Query ID: enwiki:Chocolate/Etymology
📌 Noms hiérarchiques associés : {'enwiki:Colonization', 'enwiki:Yucatec%20Maya%20language', 'enwiki:Chocolate%20chip', 'enwiki:Chocolatier', 'enwiki:Nahuatl'}
✅ Tokens utilisés pour la moyenne : ['enwiki', 'Colonization', 'enwiki', 'Yucatec', 'Maya', 'language', 'enwiki', 'Chocolate', 'chip', 'enwiki', 'Chocolatier', 'enwiki', 'Nahuatl']

🔍 Query ID: enwiki:Chocolate/History
📌 Noms hiérarchiques associés : []
✅ Tokens utilisés pour la moyenne : []
❌ Aucune entité utilisable trouvée !

🔍 Query ID: enwiki:Chocolate/History/Mesoamerican%20usage
📌 Noms hiérarchiques associés : {'enwiki:Maya%20script', 'enwiki:Chili%20pepper', 'enwiki:Chiapas', 'enwiki:Mokaya', 'enwiki:Vanilla', 'enwiki:Veracruz', 'enwiki:Allspice', 'enwiki:Quetzalcoatl', 'enwiki:Honey', 'enwiki:Olmec', 'enwiki:Avocado', 'enwiki:Cymbopetalum%20penduliflorum'}
⚠️ Token ignoré (hors vocabulaire) : Cymbopetalum
⚠️ Token ignoré (hors vocabulaire) : penduliflorum
✅ Tokens utilisés pour la 

In [142]:
for i, (query_id, query_content) in enumerate(dataset["queries"].items()):
    if i >= 3:
        break
    rih_ids_model.print_top_docs((query_id, query_content), expansion=True)


🔍 Query ID: enwiki:Chocolate/Etymology
📌 Noms hiérarchiques associés : {'enwiki:Colonization', 'enwiki:Yucatec%20Maya%20language', 'enwiki:Chocolate%20chip', 'enwiki:Chocolatier', 'enwiki:Nahuatl'}
✅ Tokens utilisés pour la moyenne : ['enwiki', 'Colonization', 'enwiki', 'Yucatec', 'Maya', 'language', 'enwiki', 'Chocolate', 'chip', 'enwiki', 'Chocolatier', 'enwiki', 'Nahuatl']


IndexError: invalid index to scalar variable.