In [37]:
from gensim.models import Word2Vec
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import SnowballStemmer


import dill as pickle
import gensim.downloader as api
import joblib
import numpy as np
import re
import sys

In [55]:
sys.path.append('..')

from baselines import EmbeddingBaseline
from knowledgebase import KnowledgeBase

In [41]:
# Load a derived dataset from the "TREC CAR 2017" dataset
dataset = {
    "queries": pickle.load(open("queries.pkl", "rb")),
    "documents": pickle.load(open("documents.pkl", "rb")),
    "relevances": pickle.load(open("relevances.pkl", "rb"))
}

In [42]:
galago_stopwords_file = './galago_418_inquery_stopwords.txt'

# Load Galago stopwords into a Python list
with open(galago_stopwords_file, 'r') as f:
    galago_stopwords = [line.strip() for line in f if line.strip()]

In [43]:
# Ressources Installation for NLTK
download("punkt")
download("stopwords")
download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /users/Etu0/21410720/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [44]:
#Le nouveau PreProcess 
english_stopwords= galago_stopwords
stemmer = SnowballStemmer("english")

def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    words = word_tokenize(text)
    words_stemmed = [stemmer.stem(w) for w in words if w not in english_stopwords]
    return words_stemmed

In [45]:
# Create the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=english_stopwords)

# Train the TFIDF-Vectorizer on the corpus
def generate_docs_texts():
    for text in dataset["documents"].values():
        yield text

documents_vectors = vectorizer.fit_transform(generate_docs_texts())
documents_vectors = documents_vectors.toarray()



In [46]:
qrels = dataset["relevances"]
queries = dataset["queries"]

zero_relevance_queries = [
    qid for qid in queries
    if not any(qid == rel_qid for (rel_qid, _) in qrels)
]

print(f"{len(zero_relevance_queries)} requêtes n'ont aucun document pertinent.")
print(len(queries))

47 requêtes n'ont aucun document pertinent.
477


In [47]:
pretrained_model = api.load("fasttext-wiki-news-subwords-300")

In [48]:
knowledge_base = KnowledgeBase()
knowledge_base.load("../models/knowledge_base.pkl")
entities = list(knowledge_base.kb)

In [49]:
corpus = list(dataset["documents"].values())

model = Word2Vec(vector_size=300, window=10, sample=1e-3, min_count=0, sg=1)
model.build_vocab(corpus + entities)
total_examples = model.corpus_count
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

model.train(corpus, total_examples=total_examples, epochs=2)
model.save("../models/skipgram-entities.model")

In [77]:
def get_leafs_name_expansion(model,heading):
    heading_words = heading.split()
    try:
        most_similar = [word for word, _ in model.wv.most_similar(positive=heading_words, topn=5)]
    except KeyError:
        most_similar = []

    expansion_words = set(heading_words + most_similar)
    
    vectors = []
    for word in expansion_words:
        if word in model.wv:
            vectors.append(model.wv[word])

    #on vérifie qu'il y a bien au moins 1 mot 
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

In [78]:

import metrics as metrics
import numpy as np
from baselines import EmbeddingBaseline


class EmbeddingBaselineHNames(EmbeddingBaseline):
    def get_query_vector(self, query, expansion=False):
        if expansion:
            leaf_heading=query[1][2][-1] if query[1][2] else query[1][1] #le premier si y'a RIH le deuxième si y'a R-H
            return get_leafs_name_expansion(self.w2v_model, leaf_heading)
        else:
            return self.vectorize(query[1][0])

In [81]:
embeddingHnames_baseline =EmbeddingBaselineHNames(dataset,model)

In [82]:
results_embedding = embeddingHnames_baseline.eval_model()

In [83]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.13247611291899686, 'MAP': 0.007755621972831094, 'RPrec': 0.0009433962264150943}


In [84]:
results_embedding_expanded =embeddingHnames_baseline.eval_model(expansion=True)

In [85]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.136549742069739, 'MAP': 0.011709342140237286, 'RPrec': 0.0022886093640810618}


In [89]:
from expansion2 import get_alias_expansion
import metrics as metrics
import numpy as np
from baselines import EmbeddingBaseline

class EmbeddingBaselineAliases(EmbeddingBaseline):
    def get_query_vector(self, query, expansion=False):
        if expansion:
            leaf_heading=query[1][0]  #On prends le root
            return get_alias_expansion(self.w2v_model, leaf_heading)
        else:
            return self.vectorize(query[1][0])

In [90]:
embeddingaliases_baseline =EmbeddingBaselineAliases(dataset,model)


In [91]:
results_embedding = embeddingaliases_baseline.eval_model()

In [92]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.13247611291899686, 'MAP': 0.007755621972831094, 'RPrec': 0.0009433962264150943}


In [93]:
results_embedding_expanded =embeddingHnames_baseline.eval_model(expansion=True)

In [94]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.136549742069739, 'MAP': 0.011709342140237286, 'RPrec': 0.0022886093640810618}
