In [37]:
import re
import string
import pickle
from typing import List
import numpy as np
from nltk import tokenize, pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import ir_datasets
from sklearn.metrics.pairwise import cosine_similarity

class TextPreprocessor:
    def __init__(self):
        self.tokenizer = tokenize.word_tokenize
        self.stopwords_tokens = set(stopwords.words('english'))
        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

    def preprocess(self, text: str) -> str:
        text = self._to_lower(text)
        text = self._remove_punctuation(text)
        text = self._remove_apostrophe(text)
        text = self._remove_stop_words(text)
        text = self._remove_markers(text)
        text = self._stemming(text)
        text = self._lemmatizing(text)
        text = self._normalize_abbreviations(text)
        return text

    def _to_lower(self, text: str) -> str:
        return str(np.char.lower(text))

    def _remove_punctuation(self, text: str) -> str:
        return text.translate(str.maketrans('', '', string.punctuation))

    def _remove_apostrophe(self, text: str) -> str:
        return str(np.char.replace(text, "'", " "))

    def _remove_stop_words(self, text: str) -> str:
        return ' '.join([token for token in text.split() if token not in self.stopwords_tokens and len(token) > 1])

    def _remove_markers(self, text: str) -> str:
        return re.sub(r'\u00AE', '', text)

    def _stemming(self, text: str) -> str:
        return ' '.join([self.stemmer.stem(token) for token in text.split()])

    def _get_wordnet_pos(self, tag: str) -> str:
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def _lemmatizing(self, text: str) -> str:
        tagged_tokens = pos_tag(text.split())
        lemmatized_tokens = [self.lemmatizer.lemmatize(token, self._get_wordnet_pos(pos)) for token, pos in tagged_tokens]
        return ' '.join(lemmatized_tokens)

    def _normalize_abbreviations(self, text: str) -> str:
        resolved_terms = {}
        for token in text.split():
            if len(token) >= 2:
                synsets = wordnet.synsets(token)
                if synsets:
                    resolved_term = synsets[0].lemmas()[0].name()
                    resolved_terms[token] = resolved_term

        for token in text.split():
            if token in resolved_terms:
                text = text.replace(token, resolved_terms[token])

        return text


class TfidfEngine:
    def __init__(self, text_preprocessor):
        self.text_preprocessor = text_preprocessor
        self.tfidf_matrix = None
        self.tfidf_model = None
        self.document_id_mapping = {}

    def train_model(self, documents):
        document_texts = [doc['text'] for doc in documents]
        vectorizer = TfidfVectorizer(preprocessor=self.text_preprocessor.preprocess, tokenizer=self.text_preprocessor.tokenizer)
        tfidf_matrix = vectorizer.fit_transform(document_texts)
        self.tfidf_matrix = tfidf_matrix
        self.tfidf_model = vectorizer
        self.save_model(documents)

    def save_model(self, documents):
        with open('tfidf_model.pickle', 'wb') as f_model:
            pickle.dump(self.tfidf_model, f_model)
        with open('tfidf_matrix.pickle', 'wb') as f_matrix:
            pickle.dump(self.tfidf_matrix, f_matrix)
        with open('document_id_mapping.pickle', 'wb') as f_mapping:
            pickle.dump({doc['id']: doc['text'] for doc in documents}, f_mapping)

    def load_model(self):
        with open('tfidf_model.pickle', 'rb') as f_model:
            self.tfidf_model = pickle.load(f_model)
        with open('tfidf_matrix.pickle', 'rb') as f_matrix:
            self.tfidf_matrix = pickle.load(f_matrix)
        with open('document_id_mapping.pickle', 'rb') as f_mapping:
            self.document_id_mapping = pickle.load(f_mapping)

    def query(self, query_text):
        preprocessed_query = self.text_preprocessor.preprocess(query_text)
        query_vector = self.tfidf_model.transform([preprocessed_query])
        return query_vector
    
    def rank_documents(self, query_vector):
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        ranked_indices = np.argsort(-cosine_similarities)
        return ranked_indices, cosine_similarities

# Usage example:
# Initialize TextPreprocessor
text_preprocessor = TextPreprocessor()

# Load documents
dataset = ir_datasets.load('antique/train')
documents = [{'id': doc.doc_id, 'text': doc.text} for doc in dataset.docs_iter()]

# Initialize TfidfEngine with the TextPreprocessor
tfidf_engine = TfidfEngine(text_preprocessor)

# Train the TF-IDF model
# tfidf_engine.train_model(documents)

# Save the trained model (this step is already done inside train_model)
# tfidf_engine.save_model(documents)

# Load the trained model
tfidf_engine.load_model()

# Query processing and ranking
query_text = "what is the difference between a cigarette and a hand rolled joint?"
query_vector = tfidf_engine.query(query_text)
ranked_indices, similarities = tfidf_engine.rank_documents(query_vector)

# Output the ranked documents
for idx in ranked_indices[:20]:  # Show top 20 results
    doc_id = list(tfidf_engine.document_id_mapping.keys())[idx]
    doc_text = tfidf_engine.document_id_mapping[doc_id]
    print(f"Document ID {doc_id}: {doc_text}")
    print(f"Similarity: {similarities[idx]}")


Document ID 851454_22: smoke what cigarettes or reefer?? cigarettes i have never and will never smoke and as for the other well it relaxes you
Similarity: 0.5663244839508543
Document ID 1603419_6: Got a cigarette?
Similarity: 0.5629552401929355
Document ID 20085_8: Cigarettes and alcohol. One for each hand and not illegal.
Similarity: 0.5105578255383663
Document ID 2273650_4: when you light up that cigarette.
Similarity: 0.4987626942223903
Document ID 1910631_1: Always wanting a cigarette
Similarity: 0.4712118920399412
Document ID 3638873_10: wash it down with a joint
Similarity: 0.46114641500013825
Document ID 3124812_0: ball joint
Similarity: 0.4592886693133129
Document ID 2907121_2: A chicken wing has 3 joints split the 2 largest joints apart and throw away the smallest tip joint.
Similarity: 0.45595486493856585
Document ID 717237_2: What you actually pop is your joint. When an air bubble forms in your joint and it pops. You feel a need to pop your joint because an air bubble in you

In [8]:
import re
import string
import pickle
from typing import List
import numpy as np
from nltk import tokenize, pos_tag, download
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import ir_datasets
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from typing import Callable
class LemmatizerWithPOSTagger(WordNetLemmatizer):
    def __init__(self):
        pass

    def _get_wordnet_pos(self, tag: str) -> str:
        if tag.startswith('J'):
            return wordnet.ADJ
        elif tag.startswith('V'):
            return wordnet.VERB
        elif tag.startswith('N'):
            return wordnet.NOUN
        elif tag.startswith('R'):
            return wordnet.ADV
        else:
            return wordnet.NOUN

    def lemmatize(self, word: str, pos: str = "n") -> str:
        return super().lemmatize(word, self._get_wordnet_pos(pos))

class TextPreprocessor():

    def __init__(self, tokenizer: Callable = None) -> None:
        self.tokenizer = tokenizer

        if self.tokenizer is None:
            self.tokenizer = tokenize.word_tokenize

        self.stopwords_tokens = stopwords.words('english')
        self.stemmer = PorterStemmer()
        self.lemmatizer = LemmatizerWithPOSTagger()

    def tokenize(self, text: str)-> List[str]:
        tokens =self.tokenizer(text)
        return tokens
    
    def to_lower(self, tokens: List[str]) -> List[str]:
        lower_tokens = []
        for token in tokens:
            lower_token = str(np.char.lower(token))
            lower_tokens.append(lower_token)
        return lower_tokens

    
    def remove_markers(self, tokens: List[str]) -> List[str]:
        new_tokens = []
        for token in tokens:
            new_tokens.append(re.sub(r'\u00AE', '', token))
        return new_tokens

    def remove_punctuation(self, tokens: List[str]) ->  List[str]:
        new_tokens = []
        for token in tokens:
            new_tokens.append(token.translate(str.maketrans('', '', string.punctuation)))
        return new_tokens




    def rplace_under_score_with_space(self, tokens: List[str]) -> List[str]:
        new_tokens = []
        for token in tokens:
            new_tokens.append(re.sub(r'_', ' ', token))
        return new_tokens

    def remove_stop_words(self,tokens: List[str]) -> List[str]:
        new_tokens = []
        for token in tokens:
            if token not in self.stopwords_tokens and len(token) > 1:
                new_tokens.append(token)
        return new_tokens

    def remove_apostrophe(self, tokens: List[str]) -> List[str]:
        new_tokens = []
        for token in tokens:
            new_tokens.append(str(np.char.replace(token, "'", " ")))
        return new_tokens

    def stemming(self, tokens: List[str]) -> List[str]:
        new_tokens = []
        for token in tokens:
            new_tokens.append(self.stemmer.stem(token))
        return new_tokens
    
    
    def normalize_appreviations(self, tokens: List[str]) -> List[str]:
        new_tokens = []
        resolved_terms = {}
        for token in tokens:

            if len(token) >= 2:
                synsets = wordnet.synsets(token)
                if synsets:
                    resolved_term = synsets[0].lemmas()[0].name()
                    resolved_terms[token] = resolved_term

        for abbreviation, resolved_term in resolved_terms.items():
            for i in range(len(tokens)):
                if tokens[i] == abbreviation:
                    tokens[i] = resolved_term
                    break

        return tokens
    
    def lemmatizing(self, tokens: List[str]) -> List[str]:
        tagged_tokens = pos_tag(tokens)
        lemmatized_tokens = [self.lemmatizer.lemmatize(token, pos) for token, pos in tagged_tokens]
        return lemmatized_tokens


    def preprocess(self, text: str) -> str:
        operations = [
            self.to_lower,
            self.remove_punctuation,
            self.remove_apostrophe,
            self.remove_stop_words,
            self.remove_markers,
            self.stemming,
            self.lemmatizing,
            self.normalize_appreviations, 
            self.to_lower,
            self.rplace_under_score_with_space
        ]
        text_tokens=self.tokenize(text)
        for op in operations:
              text_tokens=op(text_tokens)
    
        new_text=""
        new_text = ' '.join(text_tokens)
            
        return new_text

class TfidfEngine:
    def __init__(self, text_preprocessor):
        self.text_preprocessor = text_preprocessor
        self.tfidf_matrix = None
        self.tfidf_model = None
        self.document_id_mapping = {}

    def train_model(self, documents):
        document_texts = [doc['text'] for doc in documents]
        vectorizer = TfidfVectorizer(preprocessor=self.text_preprocessor.preprocess, tokenizer=self.text_preprocessor.tokenizer)
        tfidf_matrix = vectorizer.fit_transform(document_texts)
        self.tfidf_matrix = tfidf_matrix
        self.tfidf_model = vectorizer
        self.save_model(documents)

    def save_model(self, documents):
        with open('tfidf_model.pickle', 'wb') as f_model:
            pickle.dump(self.tfidf_model, f_model)
        with open('tfidf_matrix.pickle', 'wb') as f_matrix:
            pickle.dump(self.tfidf_matrix, f_matrix)
        with open('document_id_mapping.pickle', 'wb') as f_mapping:
            pickle.dump({doc['id']: doc['text'] for doc in documents}, f_mapping)

    def load_model(self):
        with open('tfidf_model.pickle', 'rb') as f_model:
            self.tfidf_model = pickle.load(f_model)
        with open('tfidf_matrix.pickle', 'rb') as f_matrix:
            self.tfidf_matrix = pickle.load(f_matrix)
        with open('document_id_mapping.pickle', 'rb') as f_mapping:
            self.document_id_mapping = pickle.load(f_mapping)

    def query(self, query_text):
        preprocessed_query = self.text_preprocessor.preprocess(query_text)
        query_vector = self.tfidf_model.transform([preprocessed_query])
        return query_vector
    
    def rank_documents(self, query_vector):
        cosine_similarities = cosine_similarity(query_vector, self.tfidf_matrix).flatten()
        ranked_indices = np.argsort(-cosine_similarities)
        return ranked_indices, cosine_similarities

    def get_results(self, query_text):
        query_vector = self.query(query_text)
        ranked_indices, similarities = self.rank_documents(query_vector)
        result_ids = []
        for idx in ranked_indices[:10]:  # Top 10 results
            if similarities[idx] >= 0.35:
                result_ids.append(list(self.document_id_mapping.keys())[idx])
        unordered_results = [{'_id': doc_id, 'text': self.document_id_mapping[doc_id]} for doc_id in result_ids]
        return unordered_results


def calculate_MAP(query_id, tfidf_engine, dataset):
    relevant_docs = [qrel.doc_id for qrel in dataset.qrels_iter() if qrel.query_id == query_id]
    
    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = tfidf_engine.get_results(query.text)
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = 0
        for j in range(i):
            if j < len(ordered_results) and ordered_results[j]['_id'] in relevant_docs:
                relevant_ret += 1
        p_at_k = (relevant_ret / i) * (1 if i-1 < len(ordered_results) and ordered_results[i-1]['_id'] in relevant_docs else 0)
        pk_sum += p_at_k
        if i-1 < len(ordered_results) and ordered_results[i-1]['_id'] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant

# Initialize TextPreprocessor
# text_preprocessor = TextPreprocessor()

# # Load documents
# dataset = ir_datasets.load('wikir/en1k/training')
# documents = [{'id': doc.doc_id, 'text': doc.text} for doc in dataset.docs_iter()]

# # Initialize TfidfEngine with the TextPreprocessor
# tfidf_engine = TfidfEngine(text_preprocessor)

# # Train the TF-IDF model (uncomment to train and save the model)
# # tfidf_engine.train_model(documents)

# # Load the trained model
# tfidf_engine.load_model()

# # Calculate MAP for all queries in the dataset
# dataset = ir_datasets.load("wikir/en1k/training")

# queries_ids = {qrel.query_id for qrel in dataset.qrels_iter()}
# map_sum = 0

# for query_id in queries_ids:
#     map_sum += calculate_MAP(query_id, tfidf_engine, dataset)

# mean_average_precision = map_sum / len(queries_ids)
# print(f"Mean Average Precision (MAP): {mean_average_precision}")


In [10]:


def calculate_MAP(query_id, tfidf_engine, dataset):
    relevant_docs = [qrel.doc_id for qrel in dataset.qrels_iter() if qrel.query_id == query_id]
    
    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = tfidf_engine.get_results(query.text)
            break

    pk_sum = 0
    total_relevant = 0
    for i in range(1, 11):
        relevant_ret = 0
        for j in range(i):
            if j < len(ordered_results) and ordered_results[j]['_id'] in relevant_docs:
                relevant_ret += 1
        p_at_k = (relevant_ret / i) * (1 if i-1 < len(ordered_results) and ordered_results[i-1]['_id'] in relevant_docs else 0)
        pk_sum += p_at_k
        if i-1 < len(ordered_results) and ordered_results[i-1]['_id'] in relevant_docs:
            total_relevant += 1

    return 0 if total_relevant == 0 else pk_sum / total_relevant
def calculate_precision_at_10(query_id, tfidf_engine, dataset):
    relevant_docs = [qrel.doc_id for qrel in dataset.qrels_iter() if qrel.query_id == query_id]
    
    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = tfidf_engine.get_results(query.text)
            break

    retrieved_docs = [result['_id'] for result in ordered_results[:10]]
    relevant_retrieved_docs = [doc for doc in retrieved_docs if doc in relevant_docs]

    precision_at_10 = len(relevant_retrieved_docs) / 10
    return precision_at_10

def calculate_recall(query_id, tfidf_engine, dataset):
    relevant_docs = [qrel.doc_id for qrel in dataset.qrels_iter() if qrel.query_id == query_id]
    
    ordered_results = []
    for query in dataset.queries_iter():
        if query.query_id == query_id:
            ordered_results = tfidf_engine.get_results(query.text)
            break

    retrieved_docs = [result['_id'] for result in ordered_results]
    relevant_retrieved_docs = [doc for doc in retrieved_docs if doc in relevant_docs]

    recall = len(relevant_retrieved_docs) / len(relevant_docs) if relevant_docs else 0
    return recall

# # Initialize TextPreprocessor
text_preprocessor = TextPreprocessor()

# Load documents
dataset = ir_datasets.load('wikir/en1k/training')
documents = [{'id': doc.doc_id, 'text': doc.text} for doc in dataset.docs_iter()]

# # Initialize TfidfEngine with the TextPreprocessor
tfidf_engine = TfidfEngine(text_preprocessor)

# Train the TF-IDF model (uncomment to train and save the model)
# tfidf_engine.train_model(documents)

# Load the trained model
tfidf_engine.load_model()

# Calculate MAP, precision@10, and recall for all queries in the dataset
dataset = ir_datasets.load("wikir/en1k/training")

queries_ids = {qrel.query_id for qrel in dataset.qrels_iter()}
map_sum = 0
precision_at_10_sum = 0
recall_sum = 0

for query_id in queries_ids:
    map_sum += calculate_MAP(query_id, tfidf_engine, dataset)
    precision_at_10_sum += calculate_precision_at_10(query_id, tfidf_engine, dataset)
    recall_sum += calculate_recall(query_id, tfidf_engine, dataset)

mean_average_precision = map_sum / len(queries_ids)
mean_precision_at_10 = precision_at_10_sum / len(queries_ids)
mean_recall = recall_sum / len(queries_ids)

print(f"Mean Average Precision (MAP): {mean_average_precision}")
print(f"Mean Precision@10: {mean_precision_at_10}")
print(f"Mean Recall: {mean_recall}")


Mean Average Precision (MAP): 0.4649909229876424
Mean Precision@10: 0.13614958448753353
Mean Recall: 0.1197556803665943
