In [1]:
pip install dill

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
from gensim.models import Word2Vec
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import dill as pickle
import gensim.downloader as api
import joblib
import numpy as np
import re
import sys

In [2]:
# Add the parent directory to the module search path before importing the
# project-local modules below (presumably they live one level up — verify).
sys.path.append('..')

from baselines import RIH_Cosine, EmbeddingBaseline
from knowledgebase import KnowledgeBase

In [3]:
# Load a derived dataset from the "TREC CAR 2017" dataset
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at ``path``.

    The file is opened through a context manager so its handle is closed as
    soon as loading finishes, even if deserialization raises.

    NOTE: unpickling can execute arbitrary code; only load files from a
    trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# The three parts of the dataset, each read from its own pickle file in the
# current working directory.
dataset = {
    "queries": _load_pickle("queries.pkl"),
    "documents": _load_pickle("documents.pkl"),
    "relevances": _load_pickle("relevances.pkl"),
}

In [4]:
# Download the NLTK resources (punkt, stopwords, wordnet) that the
# tokenization, stop-word filtering and lemmatization in this notebook rely on.
download("punkt")
download("stopwords")
download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Kept as a list because it is also handed to TfidfVectorizer(stop_words=...).
english_stopwords = stopwords.words("english")

# Built once when the cell runs instead of once per token:
#  - a frozenset gives constant-time stop-word membership tests;
#  - a single lemmatizer instance is reused for every word.
# NOTE: the set is a snapshot; later changes to english_stopwords are not reflected.
_STOPWORD_SET = frozenset(english_stopwords)
_LEMMATIZER = WordNetLemmatizer()


def preprocess(text):
    """Turn a raw text into a list of lemmatized, stop-word-free tokens.

    Steps:
      1. lowercase the text and replace every character that is not an ASCII
         letter (digits, punctuation, accented letters, ...) with a space;
      2. tokenize the result with ``word_tokenize``;
      3. drop tokens that appear in ``english_stopwords``;
      4. lemmatize the remaining tokens with ``WordNetLemmatizer.lemmatize``.

    Args:
        text: the input string.

    Returns:
        The list of lemmatized tokens, in their order of appearance.
    """
    letters_only = re.sub(r"[^a-zA-Z]", " ", text.lower())
    return [
        _LEMMATIZER.lemmatize(token)
        for token in word_tokenize(letters_only)
        if token not in _STOPWORD_SET
    ]

In [6]:
# Build the TF-IDF vectorizer; tokenization is delegated to preprocess() and the
# English stop-word list is passed as well.
vectorizer = TfidfVectorizer(stop_words=english_stopwords, tokenizer=preprocess)


def generate_docs_texts():
    """Yield the text of every document in ``dataset["documents"]``, in the
    iteration order of that mapping's values."""
    yield from dataset["documents"].values()


# Fit the vectorizer on the whole corpus and convert the resulting
# document-term matrix with .toarray().
# NOTE(review): .toarray() materializes the full matrix; this may be large for a
# big vocabulary — confirm that later cells really need the dense form.
documents_vectors = vectorizer.fit_transform(generate_docs_texts()).toarray()



In [7]:
qrels = dataset["relevances"]
queries = dataset["queries"]

# Collect, in a single pass, the ids of all queries that appear in at least one
# relevance entry. Iterating qrels yields pairs whose first element is the query
# id. The set replaces a full rescan of qrels for every query.
qids_with_relevance = {rel_qid for (rel_qid, _) in qrels}

# Queries for which no relevance entry exists at all.
zero_relevance_queries = [
    qid for qid in queries
    if qid not in qids_with_relevance
]

print(f"{len(zero_relevance_queries)} requêtes n'ont aucun document pertinent.")
print(len(queries))


200 requêtes n'ont aucun document pertinent.
1975


In [8]:
# Reload the relevance judgments from disk. The context manager closes the file
# handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open("relevances.pkl", "rb") as relevances_file:
    relevances = pickle.load(relevances_file)

# Distinct query ids that appear in at least one relevance entry (each entry
# unpacks as a pair whose first element is the query id).
covered_queries = {qid for (qid, _) in relevances}
print(f"{len(covered_queries)} requêtes ont au moins un document pertinent")


1937 requêtes ont au moins un document pertinent


In [9]:
# Instantiate the RIH_Cosine baseline (imported from the project's `baselines`
# module) with the loaded dataset and the TF-IDF vectorizer, then run its
# evaluation. The recorded output of this notebook shows the result as a dict
# with 'NDCG', 'MAP' and 'RPrec' entries.
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine = rih_cosine.eval_model()

In [10]:
# Show the evaluation metrics returned by rih_cosine.eval_model().
print("RIH Cosine Results: ", results_cosine)

RIH Cosine Results:  {'NDCG': 0.2514339637467721, 'MAP': 0.08799401892603141, 'RPrec': 0.04304883442858126}
