In [1]:
pip install dill

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [2]:
from gensim.models import Word2Vec
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer


import dill as pickle
import gensim.downloader as api
import joblib
import numpy as np
import re
import sys

In [3]:
sys.path.append('..')

from baselines import RIH_Cosine
from knowledgebase import KnowledgeBase

In [4]:
# Load a derived dataset from the "TREC CAR 2017" dataset
dataset = {
    "queries": pickle.load(open("queries.pkl", "rb")),
    "documents": pickle.load(open("documents.pkl", "rb")),
    "relevances": pickle.load(open("relevances.pkl", "rb"))
}

In [5]:
galago_stopwords_file = './galago_418_inquery_stopwords.txt'

# Load Galago stopwords into a Python list
with open(galago_stopwords_file, 'r') as f:
    galago_stopwords = [line.strip() for line in f if line.strip()]

In [6]:
print(f"Nombre total de documents : {len(dataset['documents'])}")

Nombre total de documents : 1103


In [7]:
# Ressources Installation for NLTK
download("punkt")
download("stopwords")
download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [59]:
# english_stopwords = stopwords.words("english")

# def preprocess(text):
#     text = re.sub(r"[^a-zA-Z]", " ", text.lower())
#     words = word_tokenize(text)
#     words_lemmed = [WordNetLemmatizer().lemmatize(w) for w in words if w not in english_stopwords]
#     return words_lemmed

In [8]:
#Le nouveau PreProcess 
english_stopwords= galago_stopwords
stemmer = SnowballStemmer("english")

def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    words = word_tokenize(text)
    words_stemmed = [stemmer.stem(w) for w in words if w not in english_stopwords]
    return words_stemmed

In [9]:
# Create the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=english_stopwords)

# Train the TFIDF-Vectorizer on the corpus
def generate_docs_texts():
    for text in dataset["documents"].values():
        yield text

documents_vectors = vectorizer.fit_transform(generate_docs_texts())
documents_vectors = documents_vectors.toarray()



In [10]:
qrels = dataset["relevances"]
queries = dataset["queries"]

zero_relevance_queries = [
    qid for qid in queries
    if not any(qid == rel_qid for (rel_qid, _) in qrels)
]

print(f"{len(zero_relevance_queries)} requêtes n'ont aucun document pertinent.")
print(len(queries))


47 requêtes n'ont aucun document pertinent.
477


In [29]:
relevances = pickle.load(open("relevances.pkl", "rb"))

covered_queries = set(qid for (qid, _) in relevances)
print(f"{len(covered_queries)} requêtes ont au moins un document pertinent")


462 requêtes ont au moins un document pertinent


In [30]:
queries = dataset["queries"]
relevances = dataset["relevances"]

covered_queries = set(qid for (qid, _) in relevances)
all_queries = set(queries.keys())

print(f"Requêtes totales : {len(all_queries)}")
print(f"Requêtes avec jugements pertinents : {len(covered_queries)}")
print(f"Requêtes sans jugements : {len(all_queries - covered_queries)}")


Requêtes totales : 477
Requêtes avec jugements pertinents : 462
Requêtes sans jugements : 47


In [11]:
pretrained_model = api.load("fasttext-wiki-news-subwords-300")

In [12]:
knowledge_base = KnowledgeBase()
knowledge_base.load("../models/knowledge_base.pkl")
entities = list(knowledge_base.kb)

In [13]:
corpus = list(dataset["documents"].values())

model = Word2Vec(vector_size=300, window=10, sample=1e-3, min_count=0, sg=1)
model.build_vocab(corpus + entities)
total_examples = model.corpus_count
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

model.train(corpus, total_examples=total_examples, epochs=2)
model.save("../models/skipgram-entities.model")

In [None]:
#BASELINE

In [None]:
#RELEVANCE BASELINE

In [14]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine = rih_cosine.eval_model()

In [None]:
#c'est le score de base avec la requete originale

In [15]:
print("RIH Cosine Results: ", results_cosine)

RIH Cosine Results:  {'NDCG': 0.2536553231880979, 'MAP': 0.08415221882224722, 'RPrec': 0.034126454723939006}


In [None]:
#RELEVANCE EXPANSION

In [37]:
# from expansion import relevance_expansion
# # , embedding_expansion

# import metrics as metrics
# import numpy as np

# class RelevanceBaseline(object):
#     def __init__(self, dataset, vectorizer):
#         self.dataset = dataset
#         self.vectorizer = vectorizer
    
#     def vectorize(self, text):
#         return self.vectorizer.transform([text]).toarray()[0]
        
#     def get_query_vector(self, query, expansion=None):
#         q = self.vectorize(query[1][1])
#         if expansion is not None:
#             relevant_ids = [doc_id for (q_id, doc_id) in self.dataset["relevances"].keys() if q_id == query[0]]
#             relevant_texts = [self.dataset["documents"][doc_id] for doc_id in relevant_ids if doc_id in self.dataset["documents"]]
    
        
#             print(f"Query: {query[0]} | Relevant texts count: {len(relevant_texts)}")
#             if len(relevant_texts) == 0:
#                 print("No relevant texts found, returning original query vector.")
#                 return q
    
#             # Même chose pour non-relevant
#             non_relevant_ids = [doc_id for doc_id in self.dataset["documents"].keys() if doc_id not in relevant_ids]
#             non_relevant_texts = [self.dataset["documents"][doc_id] for doc_id in non_relevant_ids if doc_id in self.dataset["documents"]]
            
#             print(f"Query: {query[0]} | Non relevant texts count: {len(non_relevant_texts)}")
#             if len(non_relevant_texts) == 0:
#                 print("No non-relevant texts found, returning original query vector.")
#                 return q
    
#             # Là juste avant transform(), tu sais combien de textes tu passes
#             print(f"Transforming {len(relevant_texts)} relevant texts")
#             relevant_docs = self.vectorizer.transform(relevant_texts).toarray()
#             print(relevant_docs.shape)
    
#             print(f"Transforming {len(non_relevant_texts)} non relevant texts")
#             non_relevant_docs = self.vectorizer.transform(non_relevant_texts).toarray()
#             print(non_relevant_docs.shape)
    
#             return relevance_expansion(q, relevant_docs, non_relevant_docs, *expansion)
            
#         else:
#             return q


#     def get_top_k(self, query, k=1000):
#         return list(self.dataset["documents"].keys())[:k]

#     def eval_query(self, query, k=1000, expansion=None):
#         scores = {
#         "NDCG": metrics.NDCG,
#         "MAP": metrics.AP,
#         "RPrec": metrics.RPrec
#         }
#         results = {}
#         q = self.get_query_vector(query, expansion)
#         docs, _ = self.get_top_k(q, k)
#         for metric_name, metric_callback in scores.items():
#             results[metric_name] = metric_callback(query[0], docs, self.dataset["relevances"])
#         return results
    
#     def eval_model(self, k=1000, expansion=None):
#         results = {
#             "NDCG": [],
#             "MAP": [],
#             "RPrec": []
#         }
#         for query in self.dataset["queries"].items():
#             query_results = self.eval_query(query, k, expansion)
#             for metric_name, metric_value in query_results.items():
#                 results[metric_name].append(metric_value)
#         results = {metric_name: np.mean(metric_values) for metric_name, metric_values in results.items()}
#         return results


# class RIH_Cosine(RelevanceBaseline):
#     def __init__(self, dataset, vectorizer):
#         self.dataset = dataset
#         self.documents_ids = np.array(list(self.dataset["documents"].keys()))
#         self.documents_vectors = vectorizer.transform(list(self.dataset["documents"].values())).toarray()
#         self.vectorizer = vectorizer

#     def cosine_similarities(self, query):
#         dot_product = self.documents_vectors @ query.reshape(-1, 1)
#         denominator = (np.maximum(np.linalg.norm(query), 1e-12) * np.maximum(np.linalg.norm(self.documents_vectors), 1e-12))
#         return dot_product / denominator

#     def get_top_k(self, query, k=1000):
#         scores = self.cosine_similarities(query).reshape(-1)
#         top_k_indexes = np.argsort(scores)[::-1][:k]
#         return self.documents_ids[top_k_indexes], scores[top_k_indexes]
    

In [16]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
# results_cosine_expanded = rih_cosine.eval_model(expansion=[1.0, 0.75, 0.15])

In [17]:
results_cosine_expanded = rih_cosine.eval_model(expansion=[1.0, 0.75, 0.15])

In [18]:
print("RIH Cosine Expanded Results: ", results_cosine_expanded)
# {'NDCG': 0.25072403265687143, 'MAP': 0.08286148903048288, 'RPrec': 0.031986758873551326}

RIH Cosine Expanded Results:  {'NDCG': 0.7270287946225499, 'MAP': 0.6373963640241985, 'RPrec': 0.5858842706326982}


In [23]:
print(rih_cosine.dataset["queries"])

first_key = next(iter(rih_cosine.dataset["queries"]))   # récupère la première clé
first_query = rih_cosine.dataset["queries"][first_key]

{'enwiki:Chocolate/Etymology': ('Chocolate / Etymology', 'Chocolate', ('Etymology',)), 'enwiki:Chocolate/History': ('Chocolate / History', 'Chocolate', ('History',)), 'enwiki:Chocolate/History/Mesoamerican%20usage': ('Chocolate / History / Mesoamerican usage', 'Chocolate', ('History', 'Mesoamerican usage')), 'enwiki:Chocolate/History/European%20adaptation': ('Chocolate / History / European adaptation', 'Chocolate', ('History', 'European adaptation')), 'enwiki:Chocolate/History/Storage': ('Chocolate / History / Storage', 'Chocolate', ('History', 'Storage')), 'enwiki:Chocolate/Nutrition%20and%20research': ('Chocolate / Nutrition and research', 'Chocolate', ('Nutrition and research',)), 'enwiki:Chocolate/Nutrition%20and%20research/Nutrition': ('Chocolate / Nutrition and research / Nutrition', 'Chocolate', ('Nutrition and research', 'Nutrition')), 'enwiki:Chocolate/Nutrition%20and%20research/Research': ('Chocolate / Nutrition and research / Research', 'Chocolate', ('Nutrition and research'

In [25]:
first_key = next(iter(rih_cosine.dataset["queries"]))
first_query_tuple = rih_cosine.dataset["queries"][first_key]
first_query_text = first_query_tuple[0]  # Exemple : 'Chocolate / Etymology'

# Vectoriser la requête texte
first_query_vector = rih_cosine.vectorizer.transform([first_query_text]).toarray()[0]

# Maintenant tu peux passer ce vecteur à get_top_k
rih_cosine.get_top_k(first_query_vector, k=5)

(array(['e0ba53bb4d3c3f8e9364df27649d31073761b342',
        '7c5ec48b074fe00e2ec05cb04b91df774f09add0',
        '9a2e4dbaa97768ecf16cfc51ea9349c6e3f8b1f8',
        'ba0edd8ad705e074e1bf179e1639a6bbec234c82',
        '711bdf6c4115b6215c4d677b48ce687ff09f25fe'], dtype='<U40'),
 array([0.01244454, 0.01087372, 0.0091515 , 0.00855712, 0.00833759]))

In [27]:
doc_ids, scores = rih_cosine.get_top_k(first_query_vector, k=5)

print(f"\n🔎 Query: {query[0]} ({query[1][0]})")
print("Top documents:")
for doc_id in doc_ids:
    text = rih_cosine.dataset["documents"].get(doc_id, "[Texte non trouvé]")
    print(f"📄 Doc ID: {doc_id}\n{text[:300]}...\n")


🔎 Query: enwiki:Chocolate/Etymology (Chocolate / Etymology)
Top documents:
📄 Doc ID: e0ba53bb4d3c3f8e9364df27649d31073761b342
Chocolate is sold in chocolate bars, which come in dark chocolate, milk chocolate and white chocolate varieties. Some bars that are mostly chocolate have other ingredients blended into the chocolate, such as nuts, raisins or crisped rice. Chocolate is used as an ingredient in a huge variety of candy...

📄 Doc ID: 7c5ec48b074fe00e2ec05cb04b91df774f09add0
Chocolate has become one of the most popular food types and flavors in the world, and a vast number of foodstuffs involving chocolate have been created, particularly desserts including cakes, pudding, mousse, chocolate brownies, and chocolate chip cookies. Many candies are filled with or coated with...

📄 Doc ID: 9a2e4dbaa97768ecf16cfc51ea9349c6e3f8b1f8
While Columbus had taken cacao beans with him back to Spain, chocolate made no impact until Spanish friars introduced it to the Spanish court. After the Spanish 

In [41]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine_expanded2 = rih_cosine.eval_model(expansion=[1.0, 0.9, 0.1])

Query: enwiki:Chocolate/Etymology | Relevant texts count: 1
Query: enwiki:Chocolate/Etymology | Non relevant texts count: 1102
Transforming 1 relevant texts
(1, 6659)
Transforming 1102 non relevant texts
(1102, 6659)
Query: enwiki:Chocolate/History | Relevant texts count: 0
No relevant texts found, returning original query vector.
Query: enwiki:Chocolate/History/Mesoamerican%20usage | Relevant texts count: 4
Query: enwiki:Chocolate/History/Mesoamerican%20usage | Non relevant texts count: 1099
Transforming 4 relevant texts
(4, 6659)
Transforming 1099 non relevant texts
(1099, 6659)
Query: enwiki:Chocolate/History/European%20adaptation | Relevant texts count: 4
Query: enwiki:Chocolate/History/European%20adaptation | Non relevant texts count: 1099
Transforming 4 relevant texts
(4, 6659)
Transforming 1099 non relevant texts
(1099, 6659)
Query: enwiki:Chocolate/History/Storage | Relevant texts count: 3
Query: enwiki:Chocolate/History/Storage | Non relevant texts count: 1100
Transforming 3 r

In [42]:
print("RIH Cosine Expanded2 Results: ", results_cosine_expanded2)

RIH Cosine Expanded2 Results:  {'NDCG': 0.7579868381345268, 'MAP': 0.6775982408428256, 'RPrec': 0.6264028889815053}


In [43]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine_expanded3 = rih_cosine.eval_model(expansion=[0.9, 0.6, 0.3])

Query: enwiki:Chocolate/Etymology | Relevant texts count: 1
Query: enwiki:Chocolate/Etymology | Non relevant texts count: 1102
Transforming 1 relevant texts
(1, 6659)
Transforming 1102 non relevant texts
(1102, 6659)
Query: enwiki:Chocolate/History | Relevant texts count: 0
No relevant texts found, returning original query vector.
Query: enwiki:Chocolate/History/Mesoamerican%20usage | Relevant texts count: 4
Query: enwiki:Chocolate/History/Mesoamerican%20usage | Non relevant texts count: 1099
Transforming 4 relevant texts
(4, 6659)
Transforming 1099 non relevant texts
(1099, 6659)
Query: enwiki:Chocolate/History/European%20adaptation | Relevant texts count: 4
Query: enwiki:Chocolate/History/European%20adaptation | Non relevant texts count: 1099
Transforming 4 relevant texts
(4, 6659)
Transforming 1099 non relevant texts
(1099, 6659)
Query: enwiki:Chocolate/History/Storage | Relevant texts count: 3
Query: enwiki:Chocolate/History/Storage | Non relevant texts count: 1100
Transforming 3 r

In [44]:
print("RIH Cosine Expanded3 Results: ", results_cosine_expanded3)

RIH Cosine Expanded3 Results:  {'NDCG': 0.6901855927074163, 'MAP': 0.5890157901952431, 'RPrec': 0.5109732103442796}


In [None]:
#EMBEDDING Baseline

In [45]:
model = Word2Vec.load("../models/skipgram-entities.model")

In [46]:
import numpy as np

def embedding_expansion(model, headings):
    tree_embeddings = []

    for heading in headings:
        heading_words = heading.split()
        try:
            most_similar = [word for word, _ in model.wv.most_similar(positive=heading_words, topn=3)]
        except KeyError:
            most_similar = []

        enhanced_heading = set(heading_words + most_similar)
        heading_vectors = []

        for word in enhanced_heading:
            if word in model.wv:
                heading_vectors.append(model.wv[word])

        if heading_vectors:  # ajouter uniquement si non vide
            heading_embedding = np.mean(heading_vectors, axis=0)
            tree_embeddings.append(heading_embedding)

    # ✅ Si aucune embedding valide n’a été ajoutée
    if len(tree_embeddings) == 0:
        print("⚠️ Aucune entité trouvée pour cette requête. Expansion désactivée.")
        return np.zeros(model.vector_size)

    try:
        embedded_tree = np.mean(np.vstack(tree_embeddings), axis=0)
    except Exception as e:
        print("❌ Erreur lors du calcul de embedded_tree :", e)
        return np.zeros(model.vector_size)

    try:
        cosine_similarities = model.wv.cosine_similarities(embedded_tree, model.wv.vectors)
        most_similar = np.argsort(cosine_similarities)[-3:]
        expanded_tree_embedding = np.mean(
            np.vstack([embedded_tree, model.wv.vectors[most_similar]]), axis=0
        )
        return expanded_tree_embedding
    except Exception as e:
        print("❌ Erreur lors de l’expansion finale :", e)
        return embedded_tree  # fallback : utiliser embedded_tree seul



In [47]:
import metrics as metrics
# from expansion import embedding_expansion

class EmbeddingBaseline(object):
    def __init__(self, dataset, w2v_model):
        self.dataset = dataset
        self.documents_ids = np.array(list(self.dataset["documents"].keys()))
        self.w2v_model = w2v_model
        self.embedded_documents = np.array([self.vectorize(doc) for doc in self.dataset["documents"].values()])
        
    def vectorize(self, text):
        embedding = np.zeros(self.w2v_model.vector_size)
        size = 0
        for word in text.split():
            try:
                embedding += self.w2v_model.wv[word]
                size += 1
            except KeyError:
                pass
        if size == 0:
            return embedding
        return embedding / size
        
    
    def get_query_vector(self, query, expansion=False):
        if expansion:
            return embedding_expansion(self.w2v_model, [query[1][1], *query[1][2]])
        else:
            return self.vectorize(query[1][0])
    
    def cosine_similarities(self, query):
        dot_product = self.embedded_documents @ query.reshape(-1, 1)
        denominator = (np.maximum(np.linalg.norm(query), 1e-12) * np.maximum(np.linalg.norm(self.embedded_documents), 1e-12))
        return dot_product / denominator
    
    def get_top_k(self, query, k=1000):
        scores = self.cosine_similarities(query).reshape(-1)
        top_k_indexes = np.argsort(scores)[::-1][:k]
        return self.documents_ids[top_k_indexes], scores[top_k_indexes]

    def eval_query(self, query, k=1000, expansion=False):
        scores = {
        "NDCG": metrics.NDCG,
        "MAP": metrics.AP,
        "RPrec": metrics.RPrec
        }
        results = {}
        q = self.get_query_vector(query, expansion)
        docs, _ = self.get_top_k(q, k)
        for metric_name, metric_callback in scores.items():
            results[metric_name] = metric_callback(query[0], docs, self.dataset["relevances"])
        return results
    
    def eval_model(self, k=1000, expansion=False):
        results = {
            "NDCG": [],
            "MAP": [],
            "RPrec": []
        }
        for query in self.dataset["queries"].items():
            query_results = self.eval_query(query, k, expansion)
            for metric_name, metric_value in query_results.items():
                results[metric_name].append(metric_value)
        results = {metric_name: np.mean(metric_values) for metric_name, metric_values in results.items()}
        return results

In [48]:
embedding_baseline = EmbeddingBaseline(dataset, model)
results_embedding = embedding_baseline.eval_model()

In [49]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.13247014840580199, 'MAP': 0.007727793860292178, 'RPrec': 0.0009433962264150943}


In [None]:
#embedding expansion

In [50]:
results_embedding_expanded = embedding_baseline.eval_model(expansion=True)

In [51]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.13040966139915688, 'MAP': 0.008121822697821182, 'RPrec': 0.001924228811021264}
