# Models

In [1]:
from gensim.models import Word2Vec
from nltk import download, word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

import dill as pickle
import gensim.downloader as api
import joblib
import numpy as np
import re
import sys

In [2]:
sys.path.append('..')

from src.baselines import RIH_Cosine, EmbeddingBaseline
from src.knowledgebase import KnowledgeBase

In [3]:
# Load a derived dataset from the "TREC CAR 2017" dataset
dataset = {
    "queries": pickle.load(open("../data/queries.pkl", "rb")),
    "documents": pickle.load(open("../data/documents.pkl", "rb")),
    "relevances": pickle.load(open("../data/relevances.pkl", "rb"))
}

## TFIDF Vectorizer

In [4]:
# Ressources Installation for NLTK
download("punkt")
download("stopwords")
download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ethan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
english_stopwords = stopwords.words("english")

def preprocess(text):
    text = re.sub(r"[^a-zA-Z]", " ", text.lower())
    words = word_tokenize(text)
    words_lemmed = [WordNetLemmatizer().lemmatize(w) for w in words if w not in english_stopwords]
    return words_lemmed

In [6]:
# Create the TF-IDF Vectorizer
vectorizer = TfidfVectorizer(tokenizer=preprocess, stop_words=english_stopwords)

# Train the TFIDF-Vectorizer on the corpus
def generate_docs_texts():
    for text in dataset["documents"].values():
        yield text

documents_vectors = vectorizer.fit_transform(generate_docs_texts())
documents_vectors = documents_vectors.toarray()



## Skipgram

In [4]:
pretrained_model = api.load("fasttext-wiki-news-subwords-300")

In [13]:
knowledge_base = KnowledgeBase()
knowledge_base.load("../models/knowledge_base.pkl")
entities = list(knowledge_base.kb)

In [14]:
corpus = list(dataset["documents"].values())

model = Word2Vec(vector_size=300, window=10, sample=1e-3, min_count=0, sg=1)
model.build_vocab(corpus + entities)
total_examples = model.corpus_count
model.build_vocab([list(pretrained_model.key_to_index.keys())], update=True)

model.train(corpus, total_examples=total_examples, epochs=2)
model.save("../models/skipgram-entities.model")

## Baselines

### Relevance Baseline

In [7]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine = rih_cosine.eval_model()

In [8]:
print("RIH Cosine Results: ", results_cosine)

RIH Cosine Results:  {'NDCG': 0.3536916343324855, 'MAP': 0.16825002655410093, 'RPrec': 0.8055093265456168}


### Relevance Expansion

In [9]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine_expanded = rih_cosine.eval_model(expansion=[1.0, 0.75, 0.15])

In [10]:
print("RIH Cosine Expanded Results: ", results_cosine_expanded)

RIH Cosine Expanded Results:  {'NDCG': 0.346435022362756, 'MAP': 0.16316602729310062, 'RPrec': 0.8004908060875802}


In [11]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine_expanded2 = rih_cosine.eval_model(expansion=[1.0, 0.9, 0.1])

In [12]:
print("RIH Cosine Expanded2 Results: ", results_cosine_expanded2)

RIH Cosine Expanded2 Results:  {'NDCG': 0.340159366820859, 'MAP': 0.1581191949314524, 'RPrec': 0.799505143005143}


In [13]:
rih_cosine = RIH_Cosine(dataset, vectorizer)
results_cosine_expanded3 = rih_cosine.eval_model(expansion=[0.9, 0.6, 0.3])

In [14]:
print("RIH Cosine Expanded3 Results: ", results_cosine_expanded3)

RIH Cosine Expanded3 Results:  {'NDCG': 0.3503750202677431, 'MAP': 0.16611019112982603, 'RPrec': 0.8022351310577117}


### Embedding Baseline

In [15]:
model = Word2Vec.load("../models/skipgram-entities.model")

In [40]:
def embedding_expansion(model, headings):
    # Get the embeddings for each node of the tree (headings)
    tree_embeddings = []
    for heading in headings:
        heading_words = heading.split()
        try:
            most_similar = [word for word, _ in model.wv.most_similar(positive=heading_words, topn=3)]
        except KeyError:
            most_similar = []
        enhanced_heading = set(heading_words + most_similar)

        # Get the embeddings for the enhanced heading
        heading_embeddings = []
        for word in enhanced_heading:
            try:
                heading_embeddings.append(model.wv[word])
            except KeyError:
                pass
        if len(heading_embeddings) > 0:
            tree_embeddings.append(np.mean(heading_embeddings, axis=0))
    tree_embeddings = np.array(tree_embeddings)

    # Expand the global tree
    embedded_tree = np.mean(tree_embeddings, axis=0)
    cosine_similarities = model.wv.cosine_similarities(embedded_tree, model.wv.vectors)
    most_similar = np.argsort(cosine_similarities)[-3:]
    expanded_tree_embedding = np.mean(embedded_tree + model.wv.vectors[most_similar], axis=0)

    return expanded_tree_embedding

In [28]:
import src.metrics as metrics
from src.expansion import embedding_expansion

class EmbeddingBaseline(object):
    def __init__(self, dataset, w2v_model):
        self.dataset = dataset
        self.documents_ids = np.array(list(self.dataset["documents"].keys()))
        self.w2v_model = w2v_model
        self.embedded_documents = np.array([self.vectorize(doc) for doc in self.dataset["documents"].values()])
        
    def vectorize(self, text):
        embedding = np.zeros(self.w2v_model.vector_size)
        size = 0
        for word in text.split():
            try:
                embedding += self.w2v_model.wv[word]
                size += 1
            except KeyError:
                pass
        if size == 0:
            return embedding
        return embedding / size
    
    def get_query_vector(self, query, expansion=False):
        if expansion:
            return embedding_expansion(self.w2v_model, [query[1][1], *query[1][2]])
        else:
            return self.vectorize(query[1][0])
    
    def cosine_similarities(self, query):
        dot_product = self.embedded_documents @ query.reshape(-1, 1)
        denominator = (np.maximum(np.linalg.norm(query), 1e-12) * np.maximum(np.linalg.norm(self.embedded_documents), 1e-12))
        return dot_product / denominator
    
    def get_top_k(self, query, k=1000):
        scores = self.cosine_similarities(query).reshape(-1)
        top_k_indexes = np.argsort(scores)[::-1][:k]
        return self.documents_ids[top_k_indexes], scores[top_k_indexes]

    def eval_query(self, query, k=1000, expansion=False):
        scores = {
        "NDCG": metrics.NDCG,
        "MAP": metrics.AP,
        "RPrec": metrics.RPrec
        }
        results = {}
        q = self.get_query_vector(query, expansion)
        docs, _ = self.get_top_k(q, k)
        for metric_name, metric_callback in scores.items():
            results[metric_name] = metric_callback(query[0], docs, self.dataset["relevances"])
        return results
    
    def eval_model(self, k=1000, expansion=False):
        results = {
            "NDCG": [],
            "MAP": [],
            "RPrec": []
        }
        for query in self.dataset["queries"].items():
            query_results = self.eval_query(query, k, expansion)
            for metric_name, metric_value in query_results.items():
                results[metric_name].append(metric_value)
        results = {metric_name: np.mean(metric_values) for metric_name, metric_values in results.items()}
        return results

In [29]:
embedding_baseline = EmbeddingBaseline(dataset, model)
results_embedding = embedding_baseline.eval_model()

In [19]:
print("Embedding Baseline Results: ", results_embedding)

Embedding Baseline Results:  {'NDCG': 0.0668529586728596, 'MAP': 0.00616720644818336, 'RPrec': 0.25395354705032125}


### Embedding Expansion

In [41]:
results_embedding_expanded = embedding_baseline.eval_model(expansion=True)

In [42]:
print("Embedding Baseline Expanded Results: ", results_embedding_expanded)

Embedding Baseline Expanded Results:  {'NDCG': 0.06072493786583757, 'MAP': 0.003761579768421089, 'RPrec': 0.24418412098654035}
