# In [2]:
import re
import string
import pickle
from typing import List
import numpy as np
from nltk import tokenize, pos_tag, download
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import ir_datasets
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from typing import Callable
import gensim
from gensim.models import Word2Vec
from gensim.models.callbacks import CallbackAny2Vec
import logging


class LemmatizerWithPOSTagger(WordNetLemmatizer):
    """WordNet lemmatizer whose ``lemmatize`` accepts a POS-tagger tag.

    The ``pos`` argument is a tag such as those returned by ``nltk.pos_tag``
    (e.g. ``"VBD"``, ``"NNS"``); it is converted to the matching WordNet POS
    constant before delegating to ``WordNetLemmatizer.lemmatize``.
    """

    def __init__(self):
        # Run the base-class initializer instead of silently skipping it.
        super().__init__()

    def _get_wordnet_pos(self, tag: str) -> str:
        """Map a POS tag to a WordNet POS constant using its first letter.

        Tags starting with ``J``/``V``/``N``/``R`` map to ADJ/VERB/NOUN/ADV.
        Any other tag (including an empty string) falls back to NOUN.
        """
        pos_by_initial = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        # tag[:1] is '' for an empty tag, which simply misses the table.
        return pos_by_initial.get(tag[:1], wordnet.NOUN)

    def lemmatize(self, word: str, pos: str = "n") -> str:
        """Lemmatize ``word`` given a POS-tagger tag.

        Args:
            word: Token to lemmatize.
            pos: POS tag for ``word``; the default ``"n"`` matches none of
                the upper-case initials and therefore resolves to NOUN.

        Returns:
            The lemma returned by ``WordNetLemmatizer.lemmatize``.
        """
        return super().lemmatize(word, self._get_wordnet_pos(pos))

class TextPreprocessor():
    """Token-level text cleaning pipeline.

    ``preprocess`` tokenizes a string, runs the tokens through a fixed list
    of cleaning steps and joins the surviving tokens with single spaces.
    Every step takes a list of tokens and returns a new list; none of them
    modifies the list it is given.

    Attributes:
        tokenizer: Callable turning a string into a list of tokens.
        stopwords_tokens: English stop words from the NLTK corpus.
        stemmer: ``PorterStemmer`` used by ``stemming``.
        lemmatizer: ``LemmatizerWithPOSTagger`` used by ``lemmatizing``.
    """

    # Deletes every character of string.punctuation; built once rather than
    # once per token.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    # Upper-case two-letter codes and the country names they expand to.
    # Matching is exact and case-sensitive.
    # NOTE(review): codes such as 'IN', 'IT', 'IS', 'NO', 'MY', 'AM', 'BY'
    # are also ordinary English words when written in capitals - confirm
    # that expanding them is intended.
    _COUNTRY_SYMBOLS = {
        'US': 'United States', 'UK': 'United Kingdom', 'IN': 'India', 'CA': 'Canada',
        'AU': 'Australia', 'DE': 'Germany', 'FR': 'France', 'ES': 'Spain', 'IT': 'Italy',
        'JP': 'Japan', 'CN': 'China', 'BR': 'Brazil', 'RU': 'Russia', 'MX': 'Mexico',
        'ZA': 'South Africa', 'KR': 'South Korea', 'AR': 'Argentina', 'SA': 'Saudi Arabia',
        'EG': 'Egypt', 'NG': 'Nigeria', 'TR': 'Turkey', 'NL': 'Netherlands', 'SE': 'Sweden',
        'CH': 'Switzerland', 'BE': 'Belgium', 'AT': 'Austria', 'DK': 'Denmark', 'FI': 'Finland',
        'NO': 'Norway', 'PL': 'Poland', 'IE': 'Ireland', 'NZ': 'New Zealand', 'SG': 'Singapore',
        'MY': 'Malaysia', 'TH': 'Thailand', 'PH': 'Philippines', 'ID': 'Indonesia', 'VN': 'Vietnam',
        'PK': 'Pakistan', 'BD': 'Bangladesh', 'IR': 'Iran', 'IQ': 'Iraq', 'IL': 'Israel', 'GR': 'Greece',
        'PT': 'Portugal', 'CZ': 'Czech Republic', 'HU': 'Hungary', 'RO': 'Romania', 'BG': 'Bulgaria',
        'HR': 'Croatia', 'SI': 'Slovenia', 'SK': 'Slovakia', 'UA': 'Ukraine', 'BY': 'Belarus', 'LT': 'Lithuania',
        'LV': 'Latvia', 'EE': 'Estonia', 'IS': 'Iceland', 'MT': 'Malta', 'CY': 'Cyprus', 'LK': 'Sri Lanka',
        'KE': 'Kenya', 'GH': 'Ghana', 'UG': 'Uganda', 'TZ': 'Tanzania', 'SN': 'Senegal', 'DZ': 'Algeria',
        'MA': 'Morocco', 'TN': 'Tunisia', 'AE': 'United Arab Emirates', 'QA': 'Qatar', 'KW': 'Kuwait',
        'OM': 'Oman', 'BH': 'Bahrain', 'LB': 'Lebanon', 'JO': 'Jordan', 'SY': 'Syria', 'YE': 'Yemen',
        'AF': 'Afghanistan', 'UZ': 'Uzbekistan', 'KZ': 'Kazakhstan', 'KG': 'Kyrgyzstan', 'TJ': 'Tajikistan',
        'TM': 'Turkmenistan', 'MN': 'Mongolia', 'KH': 'Cambodia', 'LA': 'Laos', 'MM': 'Myanmar', 'NP': 'Nepal',
        'BT': 'Bhutan', 'MV': 'Maldives', 'BN': 'Brunei', 'MO': 'Macau', 'HK': 'Hong Kong',
        'TW': 'Taiwan', 'AM': 'Armenia', 'GE': 'Georgia', 'AZ': 'Azerbaijan',
    }

    def __init__(self, tokenizer: Callable = None) -> None:
        """Create the preprocessor.

        Args:
            tokenizer: Callable mapping a string to a list of tokens.
                When ``None``, ``nltk.tokenize.word_tokenize`` is used.
        """
        # `tokenize` here is the module-level nltk import, not the method
        # of the same name defined below.
        self.tokenizer = tokenizer if tokenizer is not None else tokenize.word_tokenize
        self.stopwords_tokens = stopwords.words('english')
        self.stemmer = PorterStemmer()
        self.lemmatizer = LemmatizerWithPOSTagger()

    def tokenize(self, text: str) -> List[str]:
        """Split ``text`` into tokens with the configured tokenizer."""
        return self.tokenizer(text)

    def to_lower(self, tokens: List[str]) -> List[str]:
        """Return the tokens lower-cased."""
        return [token.lower() for token in tokens]

    def remove_markers(self, tokens: List[str]) -> List[str]:
        """Strip the registered-trademark sign (U+00AE) from every token."""
        return [token.replace('\u00AE', '') for token in tokens]

    def remove_punctuation(self, tokens: List[str]) -> List[str]:
        """Delete all ``string.punctuation`` characters from every token.

        Tokens made only of punctuation become empty strings; they are not
        dropped here.
        """
        return [token.translate(self._PUNCTUATION_TABLE) for token in tokens]

    def rplace_under_score_with_space(self, tokens: List[str]) -> List[str]:
        """Replace each underscore inside a token with a space."""
        return [token.replace('_', ' ') for token in tokens]

    def remove_stop_words(self, tokens: List[str]) -> List[str]:
        """Drop stop words and tokens shorter than two characters."""
        # Built per call so later changes to ``stopwords_tokens`` are honoured,
        # while each membership test is a hash lookup instead of a list scan.
        stop_words = set(self.stopwords_tokens)
        return [token for token in tokens if len(token) > 1 and token not in stop_words]

    def remove_apostrophe(self, tokens: List[str]) -> List[str]:
        """Replace each apostrophe inside a token with a space."""
        return [token.replace("'", " ") for token in tokens]

    def stemming(self, tokens: List[str]) -> List[str]:
        """Return the Porter stem of every token."""
        return [self.stemmer.stem(token) for token in tokens]

    def normalize_appreviations(self, tokens: List[str]) -> List[str]:
        """Misspelled legacy name for ``normalize_abbreviations``.

        Kept so existing callers continue to work. The earlier body modified
        ``tokens`` in place and replaced only the first occurrence of each
        token; it now returns a new list with every occurrence replaced.
        """
        return self.normalize_abbreviations(tokens)

    def lemmatizing(self, tokens: List[str]) -> List[str]:
        """POS-tag the tokens and lemmatize each one with its tag."""
        return [self.lemmatizer.lemmatize(token, pos) for token, pos in pos_tag(tokens)]

    def replace_country_symbols(self, tokens: List[str]) -> List[str]:
        """Expand tokens that exactly match a known two-letter country code.

        Tokens without an entry in the table are returned unchanged. An
        expanded token may contain spaces (e.g. ``'United States'``).
        """
        symbols = self._COUNTRY_SYMBOLS
        return [symbols.get(token, token) for token in tokens]

    def process_hashtags_mentions(self, tokens: List[str]) -> List[str]:
        """Drop every token that starts with ``#`` or ``@``."""
        return [token for token in tokens if not token.startswith(('#', '@'))]

    def normalize_abbreviations(self, tokens: List[str]) -> List[str]:
        """Replace tokens by the first lemma name of their first WordNet synset.

        Tokens shorter than two characters, and tokens WordNet has no synset
        for, are kept as they are. Each token is mapped independently in a
        single pass; the earlier loop made one pass over the whole list per
        distinct token and could re-map a token that had already been
        replaced.

        Args:
            tokens: Tokens to normalize; not modified.

        Returns:
            A new list of the same length as ``tokens``.
        """
        replacements = {}
        for token in tokens:
            # One WordNet lookup per distinct token.
            if len(token) < 2 or token in replacements:
                continue
            synsets = wordnet.synsets(token)
            replacements[token] = synsets[0].lemmas()[0].name() if synsets else token
        return [replacements.get(token, token) for token in tokens]

    def preprocess(self, text: str) -> str:
        """Tokenize ``text``, apply the cleaning steps, and re-join the tokens.

        Args:
            text: Raw input text.

        Returns:
            The cleaned tokens joined by single spaces.
        """
        # Order matters: the case-sensitive steps (country codes) run before
        # lower-casing, and stop words are removed after punctuation has
        # been stripped from the tokens.
        operations = [
            self.process_hashtags_mentions,   # drop '#...' / '@...' tokens
            self.replace_country_symbols,     # expand upper-case country codes
            self.normalize_abbreviations,     # WordNet first-lemma substitution
            self.remove_markers,              # strip the U+00AE sign
            self.to_lower,                    # lower-case
            self.remove_punctuation,          # delete string.punctuation chars
            # No-op at this position (the previous step already deleted
            # apostrophes); kept so the pipeline stays as configured.
            self.remove_apostrophe,
            self.remove_stop_words,           # drop stop words and 0/1-char tokens
            self.lemmatizing,                 # POS-aware lemmatization
        ]
        text_tokens = self.tokenize(text)
        for operation in operations:
            text_tokens = operation(text_tokens)
        return ' '.join(text_tokens)




class TfidfEngine:
    """TF-IDF retrieval engine built on scikit-learn's ``TfidfVectorizer``.

    Attributes:
        text_preprocessor: Object providing ``preprocess(text)`` and a
            ``tokenizer`` callable; both are handed to the vectorizer.
        tfidf_model: The fitted ``TfidfVectorizer`` (``None`` until trained
            or loaded).
        tfidf_matrix: Document-term matrix returned by ``fit_transform``.
        document_id_mapping: ``doc_id -> text``. Its insertion order must
            match the row order of ``tfidf_matrix``, because results are
            looked up by position.
    """

    def __init__(self, text_preprocessor):
        self.text_preprocessor = text_preprocessor
        self.tfidf_matrix = None
        self.tfidf_model = None
        self.document_id_mapping = {}

    def train_model(self, documents):
        """Fit the vectorizer on ``documents`` and persist the result.

        Args:
            documents: Re-iterable sequence of dicts with ``'id'`` and
                ``'text'`` keys.
        """
        document_texts = [doc['text'] for doc in documents]
        vectorizer = TfidfVectorizer(
            preprocessor=self.text_preprocessor.preprocess,
            tokenizer=self.text_preprocessor.tokenizer,
        )
        self.tfidf_matrix = vectorizer.fit_transform(document_texts)
        self.tfidf_model = vectorizer
        # Keep the id mapping in memory too, so the engine can be queried
        # right after training without a load_model() round trip.
        self.document_id_mapping = {doc['id']: doc['text'] for doc in documents}
        self.save_model(documents)

    def save_model(self, documents):
        """Pickle the vectorizer, the matrix and the ``id -> text`` mapping.

        Files are written to the current working directory.
        """
        with open('tfidf_model2.pickle', 'wb') as f_model:
            pickle.dump(self.tfidf_model, f_model)
        with open('tfidf_matrix2.pickle', 'wb') as f_matrix:
            pickle.dump(self.tfidf_matrix, f_matrix)
        with open('document_id_mapping2.pickle', 'wb') as f_mapping:
            pickle.dump({doc['id']: doc['text'] for doc in documents}, f_mapping)

    def load_model(self):
        """Load the artifacts written by ``save_model``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        files that this application wrote itself.
        """
        with open('tfidf_model2.pickle', 'rb') as f_model:
            self.tfidf_model = pickle.load(f_model)
        with open('tfidf_matrix2.pickle', 'rb') as f_matrix:
            self.tfidf_matrix = pickle.load(f_matrix)
        with open('document_id_mapping2.pickle', 'rb') as f_mapping:
            self.document_id_mapping = pickle.load(f_mapping)

    def query(self, query_text):
        """Return the TF-IDF vector of ``query_text``.

        The raw text is passed to the vectorizer: it was built with
        ``preprocessor=text_preprocessor.preprocess`` and applies that
        callable itself during ``transform``. Preprocessing the text here as
        well would run the pipeline twice on queries but only once on the
        indexed documents.
        """
        return self.tfidf_model.transform([query_text])

    def rank_documents(self, query_vector):
        """Rank all documents by cosine similarity to ``query_vector``.

        Returns:
            ``(ranked_indices, similarities)`` where ``ranked_indices`` lists
            matrix rows by descending similarity and ``similarities`` is
            indexed by matrix row.
        """
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        ranked_indices = np.argsort(-cosine_similarities)
        return ranked_indices, cosine_similarities

    def get_results(self, query_text, top_k=10, min_similarity=0.35):
        """Return the best-matching documents for ``query_text``.

        Args:
            query_text: Raw query string.
            top_k: Number of top-ranked documents to consider.
            min_similarity: Documents among the top ``top_k`` whose cosine
                similarity is below this value are dropped.

        Returns:
            A list of ``{'_id': doc_id, 'text': text}`` dicts in descending
            similarity order; possibly empty.
        """
        query_vector = self.query(query_text)
        ranked_indices, similarities = self.rank_documents(query_vector)
        # Materialize the id list once; rebuilding it per result is O(corpus).
        doc_ids = list(self.document_id_mapping)
        return [
            {'_id': doc_ids[idx], 'text': self.document_id_mapping[doc_ids[idx]]}
            for idx in ranked_indices[:top_k]
            if similarities[idx] >= min_similarity
        ]

def calculate_MRR(query_id, tfidf_engine, dataset=None):
    """Return the reciprocal rank of the first relevant result for one query.

    Args:
        query_id: Id of the query to evaluate.
        tfidf_engine: Object whose ``get_results(text)`` returns a ranked
            list of dicts carrying an ``'_id'`` key.
        dataset: Object providing ``qrels_iter()`` and ``queries_iter()``.
            Records are read positionally: ``qrel[0]`` / ``query[0]`` is the
            query id, ``qrel[1]`` the document id and ``query[1]`` the query
            text. When omitted, the module-level ``dataset`` is used, which
            is what this function always did before the parameter existed.

    Returns:
        ``1 / rank`` of the first returned document that has a qrel for the
        query, or ``0`` when there is none (or the query id is unknown).
    """
    if dataset is None:
        # Backward-compatible fallback to the notebook's global dataset.
        dataset = globals()['dataset']

    relevant_docs = {qrel[1] for qrel in dataset.qrels_iter() if qrel[0] == query_id}

    ordered_results = []
    for query in dataset.queries_iter():
        if query[0] == query_id:
            ordered_results = tfidf_engine.get_results(query[1])
            break

    for rank, result in enumerate(ordered_results, start=1):
        if result['_id'] in relevant_docs:
            return 1 / rank

    return 0
def calculate_MAP(query_id, tfidf_engine, dataset, k=10):
    """Return the average precision of one query over its top ``k`` results.

    Despite the name this is the per-query value; the caller averages it
    over all queries to obtain MAP.

    Args:
        query_id: Id of the query to evaluate.
        tfidf_engine: Object whose ``get_results(text)`` returns a ranked
            list of dicts carrying an ``'_id'`` key.
        dataset: Object providing ``qrels_iter()`` (records with
            ``query_id`` and ``doc_id``) and ``queries_iter()`` (records
            with ``query_id`` and ``text``).
        k: Rank cutoff; only the first ``k`` results are scored.

    Returns:
        The mean of precision@rank over the ranks that hold a relevant
        document, or ``0`` when no relevant document is in the top ``k``.

    NOTE(review): the sum is divided by the number of relevant documents
    *retrieved within the cutoff*, not by the total number of relevant
    documents for the query - confirm this is the intended definition.
    """
    # A set makes each relevance check O(1).
    relevant_docs = {qrel.doc_id for qrel in dataset.qrels_iter() if qrel.query_id == query_id}

    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = tfidf_engine.get_results(query.text)
            break

    # Single pass with a running hit count instead of recounting the hits
    # from the start of the list at every rank.
    hits = 0
    precision_sum = 0
    for rank, result in enumerate(ordered_results[:k], start=1):
        if result['_id'] in relevant_docs:
            hits += 1
            precision_sum += hits / rank

    return 0 if hits == 0 else precision_sum / hits

# Shared TextPreprocessor instance (default tokenizer).
text_preprocessor = TextPreprocessor()

# Load the 'antique/train' dataset through ir_datasets.
dataset = ir_datasets.load('antique/train')
# documents = [{'id': doc.doc_id, 'text': doc.text} for doc in dataset.docs_iter()]

# --- Disabled: TF-IDF engine evaluation (kept for reference) ---
# tfidf_engine = TfidfEngine(text_preprocessor)
# Initialize the WordEmbeddingTrainer
# embedding_trainer = WordEmbeddingTrainer(text_preprocessor)



# Load the previously trained TF-IDF model from its pickle files
# tfidf_engine.load_model()

# Calculate MAP for all queries in the dataset
# dataset = ir_datasets.load("antique/train")

# queries_ids = {query.query_id for query in dataset.queries_iter()}
# map_sum = 0

# for query_id in queries_ids:
#     map_sum += calculate_MAP(query_id, tfidf_engine, dataset)

# mean_average_precision = map_sum / len(queries_ids)
# print(f"Mean Average Precision (MAP): {mean_average_precision}")

# In [3]:
class WordEmbeddingEngine:
    """Retrieval engine that represents texts by averaged Word2Vec vectors.

    Attributes:
        vector_size, sg, workers, epochs: Passed to gensim's ``Word2Vec``.
        text_processor: Object providing ``preprocess(text) -> str``.
        text_tokenizer: Callable turning a preprocessed string into tokens.
        word_embedding_model: Trained ``Word2Vec`` model (``None`` until
            trained or loaded); only its ``wv`` attribute is read here.
        documents_vectors: One averaged vector per document.
        document_id_mapping: ``doc_id -> text``. Its insertion order must
            match the order of ``documents_vectors``, because results are
            looked up by position.
    """

    def __init__(self, vector_size, sg, workers, epochs, text_processor, text_tokenizer):
        self.vector_size = vector_size
        self.sg = sg
        self.workers = workers
        self.epochs = epochs
        self.text_processor = text_processor
        self.text_tokenizer = text_tokenizer
        self.word_embedding_model = None
        self.documents_vectors = None
        self.document_id_mapping = {}

    def _average_vector(self, tokens):
        """Mean embedding of the in-vocabulary tokens, or zeros if there are none."""
        word_vectors = self.word_embedding_model.wv
        vectors = [word_vectors[token] for token in tokens if token in word_vectors]
        if not vectors:
            return np.zeros(self.vector_size)
        return np.asarray(vectors).mean(axis=0)

    def init_sentences(self, documents):
        """Preprocess and tokenize every document and record the id mapping.

        Args:
            documents: Mapping ``doc_id -> raw text``.

        Returns:
            A list of token lists, in the iteration order of ``documents``.
        """
        sentences = []
        for doc_id, document in documents.items():
            sentences.append(self.text_tokenizer(self.text_processor.preprocess(document)))
            self.document_id_mapping[doc_id] = document
        return sentences

    def train_model(self, documents):
        """Train Word2Vec on ``documents``, vectorize them and persist everything."""
        sentences = self.init_sentences(documents)
        self.word_embedding_model = Word2Vec(
            sentences,
            vector_size=self.vector_size,
            sg=self.sg,
            workers=self.workers,
            epochs=self.epochs,
        )
        self.documents_vectors = self.vectorize_documents(sentences)
        self.save_model()

    def vectorize_documents(self, sentences):
        """Return one averaged vector per token list in ``sentences``.

        Tokens missing from the model vocabulary are skipped; a sentence with
        no known token gets a zero vector. (The former ``except KeyError``
        fallback was unreachable behind the membership check and called
        ``np.random``, which is a module and not callable.)
        """
        return [self._average_vector(sentence) for sentence in sentences]

    def save_model(self):
        """Pickle the model, the document vectors and the id mapping.

        Files are written to the current working directory.
        """
        with open('word_embedding_model.pickle', 'wb') as f_model:
            pickle.dump(self.word_embedding_model, f_model)
        with open('document_vectors.pickle', 'wb') as f_vectors:
            pickle.dump(self.documents_vectors, f_vectors)
        with open('document_id_mapping_vector.pickle', 'wb') as f_mapping:
            pickle.dump(self.document_id_mapping, f_mapping)

    def load_model(self):
        """Load the artifacts written by ``save_model``.

        SECURITY: ``pickle.load`` can execute arbitrary code; only load
        files that this application wrote itself.
        """
        with open('word_embedding_model.pickle', 'rb') as f_model:
            self.word_embedding_model = pickle.load(f_model)
        with open('document_vectors.pickle', 'rb') as f_vectors:
            self.documents_vectors = pickle.load(f_vectors)
        with open('document_id_mapping_vector.pickle', 'rb') as f_mapping:
            self.document_id_mapping = pickle.load(f_mapping)

    def get_query_vector(self, query_text):
        """Preprocess, tokenize and average-embed ``query_text``.

        Returns a zero vector of length ``vector_size`` when no query token
        is in the model vocabulary.
        """
        preprocessed_query = self.text_processor.preprocess(query_text)
        return self._average_vector(self.text_tokenizer(preprocessed_query))

    def get_results(self, query_text, top_k=10, min_similarity=0.35):
        """Return the best-matching documents for ``query_text``.

        Args:
            query_text: Raw query string.
            top_k: Number of top-ranked documents to consider.
            min_similarity: Documents among the top ``top_k`` whose cosine
                similarity is below this value are dropped.

        Returns:
            A list of ``{'_id': doc_id, 'text': text}`` dicts in descending
            similarity order; possibly empty.
        """
        query_vector = self.get_query_vector(query_text)
        similarities = cosine_similarity([query_vector], self.documents_vectors).flatten()
        ranked_indices = np.argsort(-similarities)
        # Materialize the id list once; rebuilding it per result is O(corpus).
        doc_ids = list(self.document_id_mapping)
        return [
            {'_id': doc_ids[idx], 'text': self.document_id_mapping[doc_ids[idx]]}
            for idx in ranked_indices[:top_k]
            if similarities[idx] >= min_similarity
        ]
    
def calculate_MAP(query_id, engine, dataset, k=10):
    """Return the average precision of one query over its top ``k`` results.

    Despite the name this is the per-query value; the caller averages it
    over all queries to obtain MAP.

    Args:
        query_id: Id of the query to evaluate.
        engine: Object whose ``get_results(text)`` returns a ranked list of
            dicts carrying an ``'_id'`` key.
        dataset: Object providing ``qrels_iter()`` (records with
            ``query_id`` and ``doc_id``) and ``queries_iter()`` (records
            with ``query_id`` and ``text``).
        k: Rank cutoff; only the first ``k`` results are scored.

    Returns:
        The mean of precision@rank over the ranks that hold a relevant
        document, or ``0`` when no relevant document is in the top ``k``.

    NOTE(review): the sum is divided by the number of relevant documents
    *retrieved within the cutoff*, not by the total number of relevant
    documents for the query - confirm this is the intended definition.
    """
    # A set makes each relevance check O(1).
    relevant_docs = {qrel.doc_id for qrel in dataset.qrels_iter() if qrel.query_id == query_id}

    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = engine.get_results(query.text)
            break

    # Single pass with a running hit count instead of recounting the hits
    # from the start of the list at every rank.
    hits = 0
    precision_sum = 0
    for rank, result in enumerate(ordered_results[:k], start=1):
        if result['_id'] in relevant_docs:
            hits += 1
            precision_sum += hits / rank

    return 0 if hits == 0 else precision_sum / hits

# Corpus: the 'antique/train' dataset as a doc_id -> text mapping
dataset = ir_datasets.load('antique/train')
documents = {d.doc_id: d.text for d in dataset.docs_iter()}

# Engine configuration (Word2Vec hyper-parameters plus the shared preprocessor)
word_embedding_engine = WordEmbeddingEngine(
    text_processor=text_preprocessor,
    text_tokenizer=tokenize.word_tokenize,
    vector_size=500,
    sg=1,
    workers=4,
    epochs=35,
)

# Training is disabled; the pickled artifacts from an earlier run are loaded instead.
# word_embedding_engine.train_model(documents)
# print('Train model done!')

word_embedding_engine.load_model()

# Average the per-query scores over every distinct query id
queries_ids = {q.query_id for q in dataset.queries_iter()}
map_sum_word_embedding = 0
for query_id in queries_ids:
    map_sum_word_embedding = map_sum_word_embedding + calculate_MAP(query_id, word_embedding_engine, dataset)

mean_average_precision_word_embedding = map_sum_word_embedding / len(queries_ids)
print(f"Mean Average Precision (MAP) for Word Embedding: {mean_average_precision_word_embedding}")


# Output: Mean Average Precision (MAP) for Word Embedding: 0.28236264446772885
