# In [2]:
import re
import string
import pickle
from typing import List
import numpy as np
from nltk import tokenize, pos_tag, download
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import ir_datasets
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from typing import Callable
import gensim
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import logging


class LemmatizerWithPOSTagger(WordNetLemmatizer):
    """WordNet lemmatizer that also understands Penn Treebank POS tags.

    ``nltk.pos_tag`` emits Penn Treebank tags (``'NN'``, ``'VBD'``, ``'JJ'``,
    ...) whereas ``WordNetLemmatizer.lemmatize`` expects WordNet POS constants.
    This subclass translates the former into the latter before delegating to
    the parent implementation.
    """

    def __init__(self):
        # Delegate to the parent so any state it sets up is initialised.
        super().__init__()

    def _get_wordnet_pos(self, tag: str) -> str:
        """Translate a POS tag into a WordNet POS constant.

        Args:
            tag: A Penn Treebank tag (e.g. ``'VBD'``) or a WordNet POS
                constant (e.g. ``wordnet.VERB``).

        Returns:
            The matching WordNet POS constant; ``wordnet.NOUN`` when the tag
            is not recognised.
        """
        # Tags that already are WordNet constants pass through unchanged so
        # that callers using the parent's calling convention
        # (``lemmatize(word, "v")``) are not silently downgraded to NOUN.
        wordnet_tags = (
            wordnet.ADJ,
            wordnet.ADJ_SAT,
            wordnet.ADV,
            wordnet.NOUN,
            wordnet.VERB,
        )
        if tag in wordnet_tags:
            return tag

        penn_prefix_to_wordnet = (
            ('J', wordnet.ADJ),
            ('V', wordnet.VERB),
            ('N', wordnet.NOUN),
            ('R', wordnet.ADV),
        )
        for prefix, wordnet_pos in penn_prefix_to_wordnet:
            if tag.startswith(prefix):
                return wordnet_pos
        # Anything else (determiners, punctuation tags, ...) is treated as a noun.
        return wordnet.NOUN

    def lemmatize(self, word: str, pos: str = "n") -> str:
        """Lemmatize ``word`` given a Penn Treebank or WordNet POS tag.

        Args:
            word: The token to lemmatize.
            pos: POS tag of ``word``; defaults to the WordNet noun tag.

        Returns:
            The lemma produced by ``WordNetLemmatizer.lemmatize``.
        """
        return super().lemmatize(word, self._get_wordnet_pos(pos))

class TextPreprocessor():
    """Tokenize a text and normalize its tokens through a fixed pipeline.

    Every public step takes a list of tokens and returns a new list, so the
    steps can be chained in any order; ``preprocess`` runs the default chain
    and joins the result back into a single string.
    """

    # Translation table that deletes every character in string.punctuation.
    # Built once instead of once per token.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, tokenizer: Callable = None) -> None:
        """Set up the tokenizer, stop-word list, stemmer and lemmatizer.

        Args:
            tokenizer: Callable mapping a string to a list of tokens.
                Defaults to ``nltk.tokenize.word_tokenize``.
        """
        self.tokenizer = tokenize.word_tokenize if tokenizer is None else tokenizer
        self.stopwords_tokens = stopwords.words('english')
        self.stemmer = PorterStemmer()
        self.lemmatizer = LemmatizerWithPOSTagger()

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into tokens using the configured tokenizer."""
        return self.tokenizer(text)

    def to_lower(self, tokens: List[str]) -> List[str]:
        """Return the tokens converted to lower case."""
        return [token.lower() for token in tokens]

    def remove_markers(self, tokens: List[str]) -> List[str]:
        """Strip the registered-trademark sign (U+00AE) from every token."""
        return [token.replace('\u00AE', '') for token in tokens]

    def remove_punctuation(self, tokens: List[str]) -> List[str]:
        """Delete all ``string.punctuation`` characters from every token.

        Tokens consisting only of punctuation become empty strings; they are
        kept here and dropped later by ``remove_stop_words``.
        """
        table = self._PUNCTUATION_TABLE
        return [token.translate(table) for token in tokens]

    def rplace_under_score_with_space(self, tokens: List[str]) -> List[str]:
        """Replace underscores with spaces in every token."""
        return [token.replace('_', ' ') for token in tokens]

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Drop stop words and tokens shorter than two characters."""
        # Set membership is O(1); the list lookup was O(len(stop words))
        # for every token.
        stop_words = set(self.stopwords_tokens)
        return [
            token for token in tokens
            if len(token) > 1 and token not in stop_words
        ]

    def remove_apostrophe(self, tokens: List[str]) -> List[str]:
        """Replace apostrophes with a space in every token."""
        return [token.replace("'", " ") for token in tokens]

    def stemming(self, tokens: List[str]) -> List[str]:
        """Return the Porter stem of every token."""
        stem = self.stemmer.stem
        return [stem(token) for token in tokens]

    def normalize_appreviations(self, tokens: List[str]) -> List[str]:
        """Replace each token by the first lemma name of its first WordNet synset.

        Tokens shorter than two characters, and tokens WordNet does not know,
        are returned unchanged. Every occurrence of a token is replaced and
        the input list is left untouched (the previous implementation
        rewrote the caller's list in place and replaced only the first
        occurrence of each token).

        Args:
            tokens: Tokens to normalize.

        Returns:
            A new list with the same length and order as ``tokens``.
        """
        resolved = {}  # token -> replacement; also caches misses
        normalized = []
        for token in tokens:
            if token not in resolved:
                synsets = wordnet.synsets(token) if len(token) >= 2 else []
                resolved[token] = synsets[0].lemmas()[0].name() if synsets else token
            normalized.append(resolved[token])
        return normalized

    def lemmatizing(self, tokens: List[str]) -> List[str]:
        """POS-tag the tokens and lemmatize each one with its tag."""
        return [
            self.lemmatizer.lemmatize(token, pos)
            for token, pos in pos_tag(tokens)
        ]

    def preprocess(self, text: str) -> str:
        """Run the full normalization pipeline on ``text``.

        Args:
            text: Raw input text.

        Returns:
            The processed tokens joined by single spaces.
        """
        # NOTE: the order is significant and is kept as originally written.
        # Punctuation removal runs before apostrophe replacement, and
        # string.punctuation already contains "'", so the apostrophe step
        # only matters when it is used outside this pipeline.
        operations = [
            self.to_lower,
            self.remove_punctuation,
            self.remove_apostrophe,
            self.remove_stop_words,
            self.remove_markers,
            self.stemming,
            self.lemmatizing,
            self.normalize_appreviations,
            self.to_lower,
            self.rplace_under_score_with_space,
        ]
        text_tokens = self.tokenize(text)
        for operation in operations:
            text_tokens = operation(text_tokens)
        return ' '.join(text_tokens)






# Build the shared TextPreprocessor (no tokenizer given, so it falls back to
# its default tokenizer).
text_preprocessor = TextPreprocessor()

# Load the 'wikir/en1k/training' dataset through ir_datasets.
dataset = ir_datasets.load('wikir/en1k/training')
# Disabled alternative: materialize the documents as a list of
# {'id', 'text'} dicts instead of the id -> text mapping built further below.
# documents = [{'id': doc.doc_id, 'text': doc.text} for doc in dataset.docs_iter()]




# In [3]:
class WordEmbeddingEngine:
    """Retrieve documents by cosine similarity of averaged Word2Vec vectors.

    A document (or query) is represented by the mean of the embedding vectors
    of its in-vocabulary tokens; texts without any known token map to the
    zero vector. Row ``i`` of ``documents_vectors`` corresponds to the
    ``i``-th key of ``document_id_mapping`` (insertion order).
    """

    # Files written by save_model() and read back by load_model().
    _MODEL_FILE = 'word_embedding_model_wiki.pickle'
    _VECTORS_FILE = 'document_vectors_wiki.pickle'
    _MAPPING_FILE = 'document_id_mapping_vector_wiki.pickle'

    def __init__(self, vector_size, sg, workers, epochs, text_processor, text_tokenizer):
        """Store the training hyper-parameters and text helpers.

        Args:
            vector_size: Dimensionality of the embedding vectors.
            sg: Forwarded to ``Word2Vec`` as ``sg``.
            workers: Forwarded to ``Word2Vec`` as ``workers``.
            epochs: Forwarded to ``Word2Vec`` as ``epochs``.
            text_processor: Object exposing ``preprocess(text) -> str``.
            text_tokenizer: Callable splitting a preprocessed string into tokens.
        """
        self.vector_size = vector_size
        self.sg = sg
        self.workers = workers
        self.epochs = epochs
        self.text_processor = text_processor
        self.text_tokenizer = text_tokenizer
        self.word_embedding_model = None
        self.documents_vectors = None
        self.document_id_mapping = {}

    def init_sentences(self, documents):
        """Preprocess and tokenize every document, recording id -> raw text.

        Args:
            documents: Mapping of document id to raw document text.

        Returns:
            A list of token lists, one per document, in iteration order.
        """
        sentences = []
        for doc_id, document in documents.items():
            sentences.append(self.text_tokenizer(self.text_processor.preprocess(document)))
            self.document_id_mapping[doc_id] = document
        return sentences

    def train_model(self, documents):
        """Train Word2Vec on ``documents``, vectorize them and save everything.

        Args:
            documents: Mapping of document id to raw document text.
        """
        # Start from a clean mapping so that ids left over from an earlier
        # training run cannot get out of step with the new document vectors.
        self.document_id_mapping = {}
        sentences = self.init_sentences(documents)
        self.word_embedding_model = Word2Vec(
            sentences,
            vector_size=self.vector_size,
            sg=self.sg,
            workers=self.workers,
            epochs=self.epochs,
        )
        self.documents_vectors = self.vectorize_documents(sentences)
        self.save_model()

    def _average_vector(self, tokens):
        """Return the mean embedding of the known tokens, or zeros if none."""
        keyed_vectors = self.word_embedding_model.wv
        vectors = [keyed_vectors[token] for token in tokens if token in keyed_vectors]
        if not vectors:
            return np.zeros(self.vector_size)
        return np.asarray(vectors).mean(axis=0)

    def vectorize_documents(self, sentences):
        """Compute one averaged embedding vector per tokenized document.

        The previous implementation wrapped the lookup in a ``try`` whose
        ``except KeyError`` branch was unreachable (membership is checked
        first) and would have raised ``TypeError`` because ``np.random`` is a
        module, not a callable. That dead branch is gone.

        Args:
            sentences: Iterable of token lists.

        Returns:
            A list of vectors of length ``vector_size``, one per sentence.
        """
        return [self._average_vector(sentence) for sentence in sentences]

    def save_model(self):
        """Pickle the model, the document vectors and the id mapping to disk."""
        with open(self._MODEL_FILE, 'wb') as f_model:
            pickle.dump(self.word_embedding_model, f_model)
        with open(self._VECTORS_FILE, 'wb') as f_vectors:
            pickle.dump(self.documents_vectors, f_vectors)
        with open(self._MAPPING_FILE, 'wb') as f_mapping:
            pickle.dump(self.document_id_mapping, f_mapping)

    def load_model(self):
        """Restore the model, document vectors and id mapping from disk.

        Raises:
            FileNotFoundError: If any of the pickle files is missing.
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file.
        # Only load files produced by save_model() from a trusted location.
        with open(self._MODEL_FILE, 'rb') as f_model:
            self.word_embedding_model = pickle.load(f_model)
        with open(self._VECTORS_FILE, 'rb') as f_vectors:
            self.documents_vectors = pickle.load(f_vectors)
        with open(self._MAPPING_FILE, 'rb') as f_mapping:
            self.document_id_mapping = pickle.load(f_mapping)

    def get_query_vector(self, query_text):
        """Preprocess ``query_text`` and return its averaged embedding vector."""
        preprocessed_query = self.text_processor.preprocess(query_text)
        return self._average_vector(self.text_tokenizer(preprocessed_query))

    def get_results(self, query_text, top_k=10, min_similarity=0.35):
        """Return the best-matching documents for ``query_text``.

        Args:
            query_text: Raw query string.
            top_k: Maximum number of ranked candidates to consider.
            min_similarity: Candidates whose cosine similarity is below this
                value are discarded.

        Returns:
            A list of ``{'_id': ..., 'text': ...}`` dicts ordered by
            decreasing similarity; empty when no document vectors are loaded.
        """
        if self.documents_vectors is None or len(self.documents_vectors) == 0:
            return []

        query_vector = self.get_query_vector(query_text)
        similarities = cosine_similarity([query_vector], self.documents_vectors).flatten()
        ranked_indices = np.argsort(-similarities)[:top_k]

        # Materialize the id list once rather than once per returned hit.
        doc_ids = list(self.document_id_mapping)
        return [
            {'_id': doc_ids[idx], 'text': self.document_id_mapping[doc_ids[idx]]}
            for idx in ranked_indices
            if similarities[idx] >= min_similarity
        ]
    
def calculate_MAP(query_id, engine, dataset, k=10):
    """Compute the average precision of ``engine`` for a single query.

    Despite the name, this is the per-query average precision over the first
    ``k`` results; the caller averages it over all queries to obtain MAP.
    The sum of precision@rank at every relevant rank is divided by the number
    of relevant documents actually retrieved within the first ``k`` results.

    Args:
        query_id: Id of the query to evaluate.
        engine: Object exposing ``get_results(text)`` that returns a ranked
            list of dicts carrying an ``'_id'`` key.
        dataset: Object exposing ``qrels_iter()`` (items with ``query_id`` and
            ``doc_id``) and ``queries_iter()`` (items with ``query_id`` and
            ``text``).
        k: Number of top-ranked results taken into account.

    Returns:
        The average precision, or ``0`` when the query is unknown or no
        relevant document appears in the first ``k`` results.
    """
    # A set gives O(1) relevance checks.
    relevant_docs = {
        qrel.doc_id for qrel in dataset.qrels_iter() if qrel.query_id == query_id
    }

    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = engine.get_results(query.text)
            break

    pk_sum = 0
    total_relevant = 0
    # A running hit count replaces recounting the prefix at every rank.
    for rank, result in enumerate(ordered_results[:k], start=1):
        if result['_id'] in relevant_docs:
            total_relevant += 1
            pk_sum += total_relevant / rank

    return 0 if total_relevant == 0 else pk_sum / total_relevant

# Load the 'wikir/en1k/training' dataset and build a doc_id -> text mapping.
dataset = ir_datasets.load('wikir/en1k/training')
documents = {doc.doc_id: doc.text for doc in dataset.docs_iter()}

# Initialize WordEmbeddingEngine with the shared preprocessor and NLTK's
# word tokenizer.
word_embedding_engine = WordEmbeddingEngine(
    vector_size=500, sg=1, workers=4, epochs=35,
    text_processor=text_preprocessor,
    text_tokenizer=tokenize.word_tokenize
)

# Train the Word Embedding model; train_model() also saves the model, the
# document vectors and the id mapping to disk. Comment this call out and use
# load_model() below to reuse a previously saved model instead.
word_embedding_engine.train_model(documents)
print('Train model done!')

# Disabled: load the previously trained Word Embedding model.
# word_embedding_engine.load_model()

# Disabled: calculate MAP for the Word Embedding Engine by averaging
# calculate_MAP over every query id.
# map_sum_word_embedding = 0
# queries_ids = {qrel.query_id for qrel in dataset.queries_iter()}
# for query_id in queries_ids:

#     map_sum_word_embedding += calculate_MAP(query_id, word_embedding_engine, dataset)

# mean_average_precision_word_embedding = map_sum_word_embedding / len(queries_ids)
# print(f"Mean Average Precision (MAP) for Word Embedding: {mean_average_precision_word_embedding}")


# Cell output: KeyboardInterrupt (execution of the cell above was interrupted).