In [1]:
# !pip install nltk

In [2]:
# import nltk
# nltk.download()

In [3]:
# !pip install pyspellchecker

In [26]:
import string
import ir_datasets
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
import pandas as pd
import pickle
from tabulate import tabulate
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [5]:
# Open the collection registered in ir_datasets under the id "trec-tot/2023/dev".
dataset = ir_datasets.load("trec-tot/2023/dev")

In [6]:
# Keep the text of at most the first five documents as a small working sample;
# pairing the document stream with range(5) stops the iteration after five items.
documents = [
    doc.text
    for doc, _slot in zip(dataset.docs_iter(), range(5))
]

print(documents[0])

Actresses (Catalan: Actrius) is a 1997 Catalan language Spanish drama film produced and directed by Ventura Pons and based on the award-winning stage play "E.R." by Josep Maria Benet i Jornet. The film has no male actors, with all roles played by females. The film was produced in 1996. 
Synopsis.
In order to prepare herself to play a role commemorating the life of legendary actress Empar Ribera, young actress (Mercè Pons) interviews three established actresses who had been the Ribera's pupils: the international diva Glòria Marc (Núria Espert), the television star Assumpta Roca (Rosa Maria Sardà), and dubbing director Maria Caminal (Anna Lizaran).
Recognition.
Screenings.
"Actrius" screened in 2001 at the Grauman's Egyptian Theatre in an American Cinematheque retrospective of the works of its director. The film had first screened at the same location in 1998. It was also shown at the 1997 Stockholm International Film Festival.
Reception.
In "Movie - Film - Review", Christopher Tookey wr

In [7]:
def preprocess_text(text):
    """Lowercase the text, tokenize it, then strip punctuation from each token.

    Tokens that consist solely of punctuation become empty after stripping
    and are left out of the result.

    Args:
        text: Raw input string.

    Returns:
        List of lowercase tokens with all ``string.punctuation`` characters removed.
    """
    # Translation table that deletes every ASCII punctuation character.
    strip_punct = str.maketrans('', '', string.punctuation)
    stripped = (piece.translate(strip_punct) for piece in word_tokenize(text.lower()))
    return [piece for piece in stripped if piece != '']

In [8]:
# Tokenize and clean every sampled document.
preprocessed_docs = list(map(preprocess_text, documents))

In [9]:
def remove_stopwords(tokens):
    """Drop English stop words from a token list.

    Args:
        tokens: Iterable of string tokens (already lowercased by the caller,
            since the comparison is an exact match).

    Returns:
        A new list containing the tokens that are not in NLTK's English
        stop-word list, in their original order.
    """
    # Load the stop-word list once per call instead of once per token, and use
    # the lowercase fileid: the corpus file is named 'english', so the
    # capitalised spelling only resolves on case-insensitive filesystems.
    english_stopwords = set(stopwords.words('english'))
    return [token for token in tokens if token not in english_stopwords]

In [10]:
# Remove stop words from each tokenized document.
filtered_docs = [remove_stopwords(token_list) for token_list in preprocessed_docs]

In [11]:
# Attach a part-of-speech tag to every remaining token, document by document.
tagged_docs = list(map(pos_tag, filtered_docs))

In [12]:
def get_wordnet_pos(tag_parameter):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    Only the first character of the tag (uppercased) is inspected:
    ``J`` -> adjective, ``V`` -> verb, ``R`` -> adverb. Every other
    initial, including ``N``, maps to noun.

    Args:
        tag_parameter: Non-empty tag string, e.g. the tags produced by ``pos_tag``.

    Returns:
        One of ``wordnet.ADJ``, ``wordnet.NOUN``, ``wordnet.VERB``, ``wordnet.ADV``.
    """
    initial = tag_parameter[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised initial both resolve to noun.
    return wordnet.NOUN

In [13]:
def lemmatization(tagged_doc):
    """Lemmatize every token of a POS-tagged document.

    Relies on the module-level ``lemmatizer`` instance and on
    ``get_wordnet_pos`` to convert each tag into a WordNet POS.

    Args:
        tagged_doc: Iterable of ``(word, tag)`` pairs.

    Returns:
        List of lemmas, in the same order as the input pairs.
    """
    return [
        lemmatizer.lemmatize(token, pos=get_wordnet_pos(pos_label))
        for token, pos_label in tagged_doc
    ]

In [14]:
# Shared lemmatizer instance; lemmatization() looks it up as a global.
lemmatizer = WordNetLemmatizer()

# Lemmatize each POS-tagged document.
lemmatized_docs = [lemmatization(pairs) for pairs in tagged_docs]

In [15]:
# Re-join each lemma list into one space-separated string for the vectorizer.
documents_as_strings = list(map(' '.join, lemmatized_docs))

In [16]:
# Fit a TF-IDF model on the lemmatized documents; tfidf_matrix holds one row per document.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents_as_strings)

print(vectorizer.vocabulary_)
print(len(vectorizer.vocabulary_))

# Persist the fitted vectorizer. The context manager guarantees the file handle
# is flushed and closed, which the bare open(...) passed to pickle.dump did not.
with open("tfidf_vectorizer.pkl", 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

{'actress': 221, 'catalan': 516, 'actrius': 222, '1997': 70, 'language': 1401, 'spanish': 2289, 'drama': 846, 'film': 1036, 'produce': 1915, 'directed': 796, 'ventura': 2560, 'pons': 1853, 'base': 392, 'awardwinning': 371, 'stage': 2315, 'play': 1840, 'er': 947, 'josep': 1356, 'maria': 1514, 'benet': 414, 'jornet': 1355, 'male': 1499, 'actor': 220, 'roles': 2103, 'played': 1841, 'female': 1022, '1996': 69, 'synopsis': 2404, 'order': 1733, 'prepare': 1887, 'role': 2101, 'commemorate': 603, 'life': 1436, 'legendary': 1428, 'empar': 916, 'ribera': 2086, 'young': 2672, 'mercè': 1555, 'interview': 1317, 'three': 2454, 'establish': 953, 'pupils': 1948, 'international': 1313, 'diva': 828, 'glòria': 1143, 'marc': 1510, 'núria': 1695, 'espert': 951, 'television': 2425, 'star': 2321, 'assumpta': 349, 'roca': 2097, 'rosa': 2110, 'sardà': 2140, 'dub': 858, 'director': 799, 'caminal': 497, 'anna': 298, 'lizaran': 1462, 'recognition': 1996, 'screening': 2160, 'screen': 2158, '2001': 78, 'grauman': 1

In [17]:
# Document-by-term view of the TF-IDF weights: one row per document, one column per term.
# Row labels (doc1, doc2, ...) are derived from the matrix itself so the cell keeps
# working when the number of sampled documents changes.
doc_labels = [f"doc{position}" for position in range(1, tfidf_matrix.shape[0] + 1)]
df = pd.DataFrame(tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out(), index=doc_labels)
df

Unnamed: 0,10,100,104,1080,11,1100,112,115,12,12500,...,wyatt,yankee,year,yellowface,york,yorkbased,young,zero,zone,álvarez
doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.077068,0.0,0.0,0.0
doc2,0.008286,0.0,0.004124,0.004124,0.002762,0.004124,0.004124,0.0,0.002762,0.004124,...,0.0,0.0,0.136427,0.004124,0.022097,0.004124,0.0,0.004124,0.004124,0.0
doc3,0.009597,0.0,0.0,0.0,0.009597,0.0,0.0,0.0,0.019194,0.0,...,0.01433,0.0,0.0,0.0,0.009597,0.0,0.0,0.0,0.0,0.0
doc4,0.051004,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc5,0.0,0.022211,0.0,0.0,0.014875,0.0,0.0,0.011106,0.007438,0.0,...,0.0,0.011106,0.00896,0.0,0.007438,0.0,0.0,0.0,0.0,0.033317


In [18]:
# Term-by-document view of the same weights: one row per vocabulary term,
# one column per document position.
# (The former mid-cell `df.head()` was a discarded expression with no effect and was removed.)
df = pd.DataFrame(tfidf_matrix.T.toarray(), index=vectorizer.get_feature_names_out())
df

Unnamed: 0,0,1,2,3,4
10,0.000000,0.008286,0.009597,0.051004,0.000000
100,0.000000,0.000000,0.000000,0.000000,0.022211
104,0.000000,0.004124,0.000000,0.000000,0.000000
1080,0.000000,0.004124,0.000000,0.000000,0.000000
11,0.000000,0.002762,0.009597,0.000000,0.014875
...,...,...,...,...,...
yorkbased,0.000000,0.004124,0.000000,0.000000,0.000000
young,0.077068,0.000000,0.000000,0.000000,0.000000
zero,0.000000,0.004124,0.000000,0.000000,0.000000
zone,0.000000,0.004124,0.000000,0.000000,0.000000


In [21]:
def search_engine(query, top_k=10, verbose=True):
    """Rank the indexed documents against a query and print the best matches.

    The query is run through the same steps as the corpus (``preprocess_text``,
    ``remove_stopwords``, ``pos_tag``, ``lemmatization``), projected with the
    fitted module-level ``vectorizer`` and compared with every row of
    ``tfidf_matrix`` by cosine similarity.

    Args:
        query: Raw query string.
        top_k: Maximum number of results to report. Defaults to 10, the
            cut-off the ranking slice has always used.
        verbose: When True (default), print the output of every intermediate
            step, as the function has always done.

    Returns:
        List of ``[rank, similarity, document_text]`` rows, best match first.
        The same rows are printed as a grid table.
    """
    trace = print if verbose else (lambda *args, **kwargs: None)

    preprocessed_query = preprocess_text(query)
    trace(preprocessed_query)
    remove_stopwords_query = remove_stopwords(preprocessed_query)
    trace(remove_stopwords_query)
    pos_tag_query = pos_tag(remove_stopwords_query)
    trace(pos_tag_query)
    lemmatization_query = lemmatization(pos_tag_query)
    trace(lemmatization_query)
    text_query = ' '.join(lemmatization_query)
    trace(text_query)
    query_vector = vectorizer.transform([text_query])
    trace(query_vector)
    cosine_similarities = cosine_similarity(tfidf_matrix, query_vector)
    trace(cosine_similarities)

    # cosine_similarity(X, Y) has shape (rows of X, rows of Y), i.e. (n_docs, 1) here.
    # Sorting that column along its last axis and taking row 0 always produced [0],
    # so only document 0 was ever reported. Flatten to one score per document first.
    scores = cosine_similarities.ravel()
    top_indices = scores.argsort()[::-1][:max(top_k, 0)]

    results_data = [
        [rank, float(scores[doc_index]), documents[doc_index]]
        for rank, doc_index in enumerate(top_indices, start=1)
    ]

    table_headers = ["Rank", "Similarity", "Document"]
    # Report the real number of rows instead of a fixed "Top 5" label.
    print(f"Top {len(results_data)} Results:")
    print(tabulate(results_data, headers=table_headers, tablefmt="grid"))  # 'grid' for a nice grid-like format
    return results_data


In [27]:
# Run a one-word sample query through search_engine.
query = "sturdy"
results = search_engine(query)

['sturdy']
['sturdy']
[('sturdy', 'NN')]
['sturdy']
sturdy

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]]
Top 5 Results:
+--------+--------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------