BY ARJAV JAIN AND ARYAN GUPTA

In [None]:
import glob
import math
import os
import re
from collections import defaultdict

import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
# Fetch the NLTK data packages this notebook relies on (tokenizer models,
# stop-word list, WordNet and its multilingual companion package).
for resource in ("punkt", "stopwords", "wordnet", "omw-1.4"):
  nltk.download(resource)

# Mount Google Drive at /content/drive so the corpus folder read below is
# reachable from this runtime (this import only exists on Google Colab).
from google.colab import drive
drive.mount('/content/drive')

# Bare file names (no directory part) of every document in the corpus folder.
# - sorted(): glob returns entries in arbitrary order, and the position in this
#   list becomes the doc_id, so sorting keeps the mapping stable across runs.
# - isfile(): skip sub-directories, which cannot be opened as documents.
file_list = sorted(
  os.path.basename(path)
  for path in glob.glob("/content/drive/MyDrive/corpus/*")
  if os.path.isfile(path)
)

def read_document(file_name, corpus_dir='/content/drive/MyDrive/corpus'):
  """Return the full text of one corpus document.

  Args:
    file_name: Name of the file inside the corpus folder (no directory part).
    corpus_dir: Folder that holds the corpus. Defaults to the Drive location
      used throughout this notebook, so existing one-argument calls behave
      as before.

  Returns:
    The whole file content as a single string.

  Raises:
    OSError: If the file cannot be opened.
    UnicodeDecodeError: If the file is not valid UTF-8.
  """
  # Decode explicitly as UTF-8 so the result does not depend on the locale
  # default of whichever machine runs the notebook.
  with open(os.path.join(corpus_dir, file_name), encoding='utf-8') as file:
    return file.read()

def preprocess_text(text):
  """Turn raw text into the list of normalised terms used for indexing.

  Steps, in order: lowercase; replace every run of characters other than
  letters, digits and '.' with a single space; tokenize; drop English stop
  words and punctuation-only tokens; lemmatize; Porter-stem.

  Args:
    text: Raw document or query text.

  Returns:
    List of stemmed tokens, in their original order (duplicates kept so that
    term frequencies can be counted afterwards).
  """
  text = text.lower()
  text = re.sub('[^A-Za-z0-9.]+', ' ', text)
  tokens = word_tokenize(text)

  stop_words = set(stopwords.words("english"))
  # '.' is deliberately kept by the regex above (it may sit inside a token such
  # as a decimal number), but that also lets stand-alone punctuation tokens
  # like '.' through. Drop any token without a letter or digit so punctuation
  # is neither indexed as a term nor counted in document lengths.
  tokens = [
    token for token in tokens
    if token not in stop_words and any(ch.isalnum() for ch in token)
  ]

  # Lemmatize first, then stem the lemma, exactly one pass of each per token.
  lemmatizer = WordNetLemmatizer()
  stemmer = PorterStemmer()
  return [stemmer.stem(lemmatizer.lemmatize(token)) for token in tokens]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# function to calculate term frequency for both query and doc
def calculate_tf(tokens):
    """Compute log-scaled term-frequency weights for a token sequence.

    Args:
        tokens: Iterable of (already preprocessed) terms.

    Returns:
        A ``defaultdict(int)`` mapping each distinct term to
        ``1 + log10(count)``, in order of first occurrence.
    """
    # First pass: raw occurrence counts.
    raw_counts = defaultdict(int)
    for tok in tokens:
        raw_counts[tok] = raw_counts[tok] + 1

    # Second pass: dampen the counts logarithmically.
    return defaultdict(
        int,
        ((term, 1 + math.log10(count)) for term, count in raw_counts.items()),
    )

In [None]:
# function to calculate inverse document frequency for query
def calculate_idf(N, df):
    """Return the inverse document frequency ``log10(N / df)``.

    Args:
        N: Total number of documents in the collection.
        df: Number of documents that contain the term (must be non-zero).
    """
    ratio = N / df
    return math.log10(ratio)

In [None]:
# function to build index
def build_index(file_list):
    """Build the inverted index and per-document vector lengths.

    Each file is read, preprocessed and converted to log-tf weights; the
    position of the file in ``file_list`` serves as its document id.

    Args:
        file_list: Sequence of corpus file names.

    Returns:
        A tuple ``(inverted_index, doc_lengths, N)`` where
        ``inverted_index`` maps term -> list of ``(doc_id, weight)`` postings,
        ``doc_lengths`` maps doc_id -> Euclidean norm of that document's
        weights, and ``N`` is the number of files.
    """
    N = len(file_list)
    postings_by_term = defaultdict(list)
    norms = defaultdict(float)

    for doc_id, file_name in enumerate(file_list):
        term_weights = calculate_tf(preprocess_text(read_document(file_name)))

        # Record one posting per term while accumulating the squared norm.
        squared_norm = 0.0
        for term in term_weights:
            w = term_weights[term]
            squared_norm = squared_norm + w ** 2
            postings_by_term[term].append((doc_id, w))

        norms[doc_id] = math.sqrt(squared_norm)

    return postings_by_term, norms, N

In [None]:
# function to calculate cosine similarity
def calculate_cosine_similarity(query_tf_idf, doc_weights, doc_length):
    """Score one document against a query.

    Computes the dot product of the query weights with the document weights
    over the terms they share, divided by the document's vector length. The
    query vector is not length-normalised here, so the value is proportional
    to the cosine for a fixed query rather than bounded by 1.

    Args:
        query_tf_idf: Mapping term -> query weight.
        doc_weights: Mapping term -> document weight.
        doc_length: Euclidean length of the document vector.

    Returns:
        The length-normalised dot product, or 0.0 when ``doc_length`` is zero
        (an empty document shares nothing with any query).
    """
    # An empty document has length 0; treat its similarity as 0 instead of
    # raising ZeroDivisionError.
    if not doc_length:
        return 0.0

    dot_product = 0.0
    for term, weight in query_tf_idf.items():
        # Membership test (not indexing) so a defaultdict is never mutated.
        if term in doc_weights:
            dot_product += weight * doc_weights[term]
    return dot_product / doc_length

In [None]:
# function to rank documents based on cosine similarity
def rank_documents(query, inverted_index, doc_lengths, N, top_k=10):
    """Rank indexed documents against a free-text query.

    The query is weighted with log-tf * idf. Document weights are used exactly
    as stored in the index (log-tf), because ``doc_lengths`` holds the norm of
    those same weights; multiplying the document side by idf as well would
    apply idf twice and normalise by a length that does not belong to the
    vector being scored. This matches ``calculate_cosine_similarity``.
    The query vector is not length-normalised, which does not affect the order.

    Args:
        query: Raw query text.
        inverted_index: Mapping term -> list of ``(doc_id, weight)`` postings.
        doc_lengths: Mapping doc_id -> Euclidean length of the document vector.
        N: Total number of indexed documents.
        top_k: Maximum number of results to return (default 10).

    Returns:
        List of ``(doc_id, score)`` tuples, highest score first, at most
        ``top_k`` long. Empty when no query term occurs in the index.
    """
    query_tf = calculate_tf(preprocess_text(query))

    # Query side: weight only the terms that actually occur in the corpus.
    query_tf_idf = {}
    for term, tf in query_tf.items():
        # .get() avoids inserting keys into a defaultdict index; an empty
        # postings list would otherwise lead to a division by zero in idf.
        postings = inverted_index.get(term)
        if not postings:
            continue
        query_tf_idf[term] = tf * calculate_idf(N, len(postings))

    # Term-at-a-time accumulation of the dot product per document.
    scores = defaultdict(float)
    for term, query_weight in query_tf_idf.items():
        for doc_id, doc_weight in inverted_index[term]:
            scores[doc_id] += query_weight * doc_weight

    # Normalise by document length, then sort best-first.
    normalised = [
        (doc_id, score / doc_lengths[doc_id]) for doc_id, score in scores.items()
    ]
    ranked_documents = sorted(normalised, key=lambda x: -x[1])
    return ranked_documents[:top_k]

In [None]:
# Index the whole corpus once: postings per term, per-document vector
# lengths, and the number of documents.
inverted_index, doc_lengths, N = build_index(file_list)

# Read a free-text query interactively from the user.
query = input("Enter your query: ")

# Score the query against the index; rank_documents returns the best matches
# as (doc_id, score) pairs, highest score first.
top_documents = rank_documents(query, inverted_index, doc_lengths, N)

# doc_id is the position of the file in file_list, so map it back to the
# file name and print one "name, score" line per result.
for doc_id, score in top_documents:
    print(f"{file_list[doc_id]}, {score}")


Enter your query:  Warwickshire, came from an ancient family and was the heiress to  some land
shakespeare.txt, 0.5218952557479883
google.txt, 0.06346005353536822
zomato.txt, 0.06317755812891683
levis.txt, 0.05259403722112643
Adobe.txt, 0.05127059683761026
nike.txt, 0.05025820195108572
huawei.txt, 0.036121264905662025
Dell.txt, 0.02818648819317776
skype.txt, 0.015972125229477746
blackberry.txt, 0.015012957067678082
