BY ARJAV JAIN AND ARYAN GUPTA

In [None]:
import nltk
import re
import math
import glob

from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from collections import defaultdict
# Download the NLTK data packages this notebook relies on. The cell output
# shows that packages already present are reported as up-to-date.
nltk.download("punkt")
nltk.download("stopwords")
nltk.download("wordnet")
nltk.download('omw-1.4')

# Mount Google Drive at /content/drive; the corpus is read from a folder
# below that mount point.
from google.colab import drive
drive.mount('/content/drive')

# List every entry in the corpus folder and keep only the last path
# component, so file_list holds bare file names instead of full paths.
file_list = [
    path.split("/")[-1]
    for path in glob.glob("/content/drive/MyDrive/corpus/*")
]

def read_document(file_name, corpus_dir='/content/drive/MyDrive/corpus'):
    """Return the full text of one corpus document.

    Args:
        file_name: Name of the file inside ``corpus_dir``.
        corpus_dir: Directory that holds the corpus files. Defaults to the
            Drive folder used throughout this notebook.

    Returns:
        The whole file content as a single string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # default locale encoding of the machine running the notebook.
    with open(f'{corpus_dir}/{file_name}', encoding='utf-8') as file:
        return file.read()

# Lazily filled cache of the NLTK helpers used by preprocess_text, so they are
# created once instead of on every call.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
    """Return (stop_words, lemmatizer, stemmer), building them on first use."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE["stop_words"] = frozenset(stopwords.words("english"))
        _PREPROCESS_CACHE["lemmatizer"] = WordNetLemmatizer()
        _PREPROCESS_CACHE["stemmer"] = PorterStemmer()
    return (
        _PREPROCESS_CACHE["stop_words"],
        _PREPROCESS_CACHE["lemmatizer"],
        _PREPROCESS_CACHE["stemmer"],
    )


def preprocess_text(text):
    """Turn raw text into a list of normalized index terms.

    The text is lower-cased, every run of characters outside ``A-Za-z0-9`` is
    replaced by a single space, and the result is tokenized. English stop
    words are dropped, and each remaining token is lemmatized and then
    stemmed.

    Args:
        text: The raw document or query string.

    Returns:
        A list of stemmed tokens, in their order of appearance.
    """
    normalized = re.sub('[^A-Za-z0-9]+', ' ', text.lower())
    stop_words, lemmatizer, stemmer = _preprocess_resources()
    # Lemmatize first and stem the lemma, the same order as the original
    # two-pass implementation.
    return [
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in word_tokenize(normalized)
        if token not in stop_words
    ]


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# function to calculate term frequency for both query and doc
def calculate_tf(tokens):
    """Compute logarithmic term-frequency weights for a token sequence.

    Args:
        tokens: Iterable of hashable terms (may be empty).

    Returns:
        A ``defaultdict(int)`` mapping each distinct term to
        ``1 + log10(count)``, in order of first appearance. Looking up a
        term that was never seen yields 0.
    """
    counts = defaultdict(int)
    for token in tokens:
        counts[token] += 1

    # Every counted term occurs at least once, so log10 is always defined and
    # no zero-frequency special case is needed. The weights go into a new
    # mapping rather than overwriting the counts while iterating over them.
    return defaultdict(
        int,
        {term: 1 + math.log10(freq) for term, freq in counts.items()},
    )

In [None]:
# function to calculate inverse document frequency for query
def calculate_idf(N, df):
    """Compute the inverse document frequency ``log10(N / df)``.

    Args:
        N: Total number of documents in the collection.
        df: Number of documents that contain the term.

    Returns:
        ``log10(N / df)``, or 0.0 when ``N`` or ``df`` is not positive.
    """
    # With no documents, or a term that occurs in none, the ratio is undefined
    # (division by zero or log of zero); give such a term zero weight instead
    # of raising.
    if N <= 0 or df <= 0:
        return 0.0
    return math.log10(N / df)

In [None]:
# function to build inverted index, document lengths, and number of documents
def build_index(file_list):
    """Index every document named in ``file_list``.

    Each document is read, preprocessed and converted to log-tf weights. A
    document's id is its position in ``file_list``.

    Args:
        file_list: Sequence of document file names.

    Returns:
        A tuple ``(inverted_index, doc_lengths, N)`` where ``inverted_index``
        maps a term to a list of ``(doc_id, weight)`` postings,
        ``doc_lengths`` maps a doc id to the Euclidean norm of its weight
        vector, and ``N`` is ``len(file_list)``.
    """
    total_docs = len(file_list)
    postings = defaultdict(list)
    norms = defaultdict(float)

    for doc_id, file_name in enumerate(file_list):
        term_weights = calculate_tf(preprocess_text(read_document(file_name)))

        squared_sum = 0.0
        for term in term_weights:
            weight = term_weights[term]
            # accumulate the squared weight for the vector length
            squared_sum += weight ** 2
            # record this document in the term's postings list
            postings[term].append((doc_id, weight))

        norms[doc_id] = math.sqrt(squared_sum)

    return postings, norms, total_docs

In [None]:
# function to rank documents based on cosine similarity
#query ltc
#doc lnc
def rank_documents(query, inverted_index, doc_lengths, N, top_k=10):
    """Rank indexed documents against a free-text query by cosine similarity.

    Query terms are weighted with log-tf times idf; document postings carry
    the weights stored in ``inverted_index``. The dot product of the two is
    divided by the document length and the query vector length.

    Args:
        query: Raw query string.
        inverted_index: Mapping of term to a list of ``(doc_id, weight)``.
        doc_lengths: Mapping of doc id to the document vector length.
        N: Total number of documents in the collection.
        top_k: Maximum number of results to return (default 10).

    Returns:
        A list of at most ``top_k`` ``(doc_id, score)`` tuples, highest score
        first.
    """
    query_tf = calculate_tf(preprocess_text(query))

    # Weight only the query terms that occur in the collection.
    query_tf_idf = {}
    squared_sum = 0
    for term, tf in query_tf.items():
        if term in inverted_index:
            idf = calculate_idf(N, len(inverted_index[term]))
            query_tf_idf[term] = tf * idf
            squared_sum += query_tf_idf[term] ** 2
    query_norm = math.sqrt(squared_sum)

    # Accumulate the dot product term by term over the postings lists.
    scores = defaultdict(float)
    for term, query_weight in query_tf_idf.items():
        for doc_id, doc_weight in inverted_index[term]:
            scores[doc_id] += doc_weight * query_weight

    # Kept from the original notebook: shows the weighted query vector.
    print(query_tf_idf)

    # cosine normalization
    for doc_id in scores:
        denominator = doc_lengths[doc_id] * query_norm
        # The query norm is zero when every matched term has idf 0 (for
        # example a term present in all documents); score such documents
        # 0.0 instead of dividing by zero.
        scores[doc_id] = scores[doc_id] / denominator if denominator else 0.0

    ranked_documents = sorted(scores.items(), key=lambda item: -item[1])
    return ranked_documents[:top_k]

In [None]:
# Build the index once over the whole corpus; the three results are reused by
# the query cells below.
inverted_index, doc_lengths, N = build_index(file_list)

In [None]:
# Prompt the user for a free-text query; it is passed to rank_documents below.
query = input("Enter your query: ")
# Example query:
# Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation

Enter your query: The company was founded in 1982 by John Warnock and Charles Geschke. While employed at Xerox Corporation’s Palo Alto (California) Research Center (PARC), the two computer scientists had developed a programming language specially designed to describe the precise position, shape, and size of objects on a computer-generated page. This page description language, later known as PostScript, described such objects as letters and graphics in mathematical terms, without reference to any specific computer or printer; any device capable of interpreting the language would be able to generate a representation of the page at any resolution the device supported. When Xerox declined to bring the technology to market, Warnock and Geschke formed their own company to do so, naming it after a creek near their homes.  In 1983 Apple Computer, Inc. (now Apple Inc.), acquired 15 percent of Adobe and became the first licensee of PostScript. In 1985 Apple introduced the first Macintosh-compati

In [None]:
# Rank the corpus against the query; rank_documents returns the best matches
top_documents = rank_documents(query, inverted_index, doc_lengths, N)

# Report each hit as "<file name>, <score>", skipping any document whose
# score is not strictly positive
for doc_id, score in top_documents:
    if not score > 0:
        continue
    print(f"{file_list[doc_id]}, {score}")

{'compani': 0.31503831193837806, 'found': 0.2703611758975292, '1982': 1.0107238653917732, 'john': 1.3117538610557542, 'warnock': 2.0982801741150165, 'charl': 1.3117538610557542, 'geschk': 2.0982801741150165, 'employ': 0.6585413472804106, 'xerox': 2.0982801741150165, 'corpor': 0.612523503197519, 'palo': 1.6127838567197355, 'alto': 1.6127838567197355, 'california': 1.0107238653917732, 'research': 0.7433970534591091, 'center': 0.9138138523837167, 'parc': 1.6127838567197355, 'two': 0.3993562393692417, 'comput': 0.6871429148231267, 'scientist': 1.135662602000073, 'develop': 0.4088671759863924, 'program': 0.7986283147676576, 'languag': 1.1369721547816145, 'special': 0.856782046196785, 'design': 0.45933552818691636, 'describ': 1.4775311101559008, 'precis': 1.0107238653917732, 'posit': 0.7676858167054786, 'shape': 0.9138138523837167, 'size': 0.7676858167054786, 'object': 1.6775113676045446, 'gener': 0.5681501684453572, 'page': 1.0160199261702747, 'descript': 1.3117538610557542, 'later': 0.6943