In [1]:
import nltk
import glob
import os
import re
import pickle
import os
import numpy as np
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayush\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def rem_sw(arr):
  global stop_words
  reg_str = r'^'
  for i in range(len(arr)):
    for sw in stop_words:
      reg_str+=sw
      reg_str+='$'
      arr[i] = re.sub(reg_str, '', arr[i])
      reg_str = r'^'
  return arr

def rem_punc(str):
  str = re.sub(r'(\.|\?|\,|!|\:|\;|\&|\-|\(|\)|\{|\}|\'|\"|\/)', ' ', str)
  return str

def preprocessing(s):
    s = s.lower()
    s = re.sub(r'(\.|\?|\,|!|\:|\;|\&|\-|\(|\)|\{|\}|\'|\"|\/)', ' ', s)
    s = word_tokenize(s)
    s = rem_sw(s)
    for j in range(len(s)):
      s[j] = rem_punc(s[j])
    s[:] = [j for j in s if (j!='' and j!=' ')]
    return s

In [4]:
def compute_tf(token_list):
    tf_dict = {}
    for token in token_list:
        tf_dict[token] = tf_dict.get(token, 0) + 1
    return tf_dict

def compute_df(documents):
    df_dict = {}
    for doc in documents:
        for token in set(documents[doc]):
            df_dict[token] = df_dict.get(token, 0) + 1
    return df_dict

def compute_idf(documents):
    idf_dict = {}
    N = len(documents)
    df = compute_df(documents)
    for term in df:
        idf_dict[term] = math.log((N+1)/(df[term]+1))
    return idf_dict

def compute_tf_idf(documents):
    tf_idf_matrix = {}
    idf = compute_idf(documents)
    for doc in documents:
        tf = compute_tf(documents[doc])
        tf_idf_matrix[doc] = {}
        for term in tf:
            tf_idf_matrix[doc][term] = tf[term] * idf[term]
    return tf_idf_matrix

def compute_query_tf_idf(query, idf):
    query_tf = compute_tf(query)
    query_tf_idf = {}
    for term in query_tf:
        if term in idf:
            query_tf_idf[term] = query_tf[term] * idf[term]
    return query_tf_idf

def compute_cosine_similarity(doc_vector, query_vector):
    dot_product = 0
    for term in query_vector:
        if term in doc_vector:
            dot_product += doc_vector[term] * query_vector[term]
    doc_norm = math.sqrt(sum([i**2 for i in doc_vector.values()]))
    query_norm = math.sqrt(sum([i**2 for i in query_vector.values()]))
    if doc_norm == 0 or query_norm == 0:
        return 0
    return dot_product / (doc_norm * query_norm)

def rank_documents(documents, query, tf_weighting):
    tf_idf_matrix = compute_tf_idf(documents)
    idf = compute_idf(documents)
    query_tf_idf = compute_query_tf_idf(query, idf)
    rankings = {}
    for doc in tf_idf_matrix:
        doc_vector = tf_idf_matrix[doc]
        if tf_weighting == 'binary':
            for term in doc_vector:
                if doc_vector[term] > 0:
                    doc_vector[term] = 1
        elif tf_weighting == 'raw_count':
            pass
        elif tf_weighting == 'term_frequency':
            for term in doc_vector:
                doc_vector[term] = doc_vector[term] / compute_tf(documents[doc])[term]
        elif tf_weighting == 'log_normalization':
            for term in doc_vector:
                doc_vector[term] = math.log(1 + doc_vector[term])
        elif tf_weighting == 'double_normalization':
            max_tf = max(doc_vector.values())
            for term in doc_vector:
                doc_vector[term] = 0.5 + 0.5 * (doc_vector[term] / max_tf)
        else:
            raise ValueError('Invalid TF weighting scheme')
        similarity = compute_cosine_similarity(doc_vector, query_tf_idf)
        rankings[doc] = similarity
        
    sorted_rankings = sorted(rankings.items(), key=lambda x: x[1], reverse=True)
    return sorted_rankings


In [37]:
def tfidf_driver(documents, query):
    
    print('\n------------------------------------------------------------------------')
    print('Binary weighting scheme:')
    binary_rankings = rank_documents(documents, query, 'binary')
    for doc, score in binary_rankings[:5]:
        print(f'{doc}: {score}')
    print('\n------------------------------------------------------------------------')
    
    print('\nRaw count weighting scheme:')
    raw_count_rankings = rank_documents(documents, query, 'raw_count')
    for doc, score in raw_count_rankings[:5]:
        print(f'{doc}: {score}')
    print('\n------------------------------------------------------------------------')
    
    print('\nTerm frequency weighting scheme:')
    term_frequency_rankings = rank_documents(documents, query, 'term_frequency')
    for doc, score in term_frequency_rankings[:5]:
        print(f'{doc}: {score}')
    print('\n------------------------------------------------------------------------')
    
    print('\nLog normalization weighting scheme:')
    log_normalization_rankings = rank_documents(documents, query, 'log_normalization')
    for doc, score in log_normalization_rankings[:5]:
        print(f'{doc}: {score}')
    print('\n------------------------------------------------------------------------')
    
    print('\nDouble normalization weighting scheme:')
    double_normalization_rankings = rank_documents(documents, query, 'double_normalization')
    for doc, score in double_normalization_rankings[:5]:
        print(f'{doc}: {score}')
    print('\n------------------------------------------------------------------------')
        


In [38]:
def jaccard(tokenised_docs, query):
    jaccard_scores = {}
    
    for doc, tokens in tokenised_docs.items():
        tokens = set(tokens)
        intersection = tokens.intersection(query)
        union = tokens.union(query)
        jaccard_scores[doc] = len(intersection) / len(union)

    sorted_docs = sorted(jaccard_scores.items(), key=lambda x: x[1], reverse=True)
    
    i = 0
    print("S.No.||   Document Name   ||   Jaccard Coefficient")
    print("----------------------------------------------------")
    for doc_name, score in sorted_docs[:10]:
        print(f"  {i}  ||   {doc_name}   ||   {score}")
        i += 1

In [44]:
def main():
    with open('tokens.pickle', "rb") as f:
        tokenised_docs = pickle.load(f)
    
    raw_query = input("Enter your raw query: ")
    query =  preprocessing(raw_query)

    print('\nTop 10 documents with highest Jaccard Coefficient are: \n')    
    jaccard(tokenised_docs, query)
    tfidf_driver(tokenised_docs, query)

if __name__ == '__main__':
    main()


Enter your raw query: experimental is the average

Top 10 documents with highest Jaccard Coefficient are: 

S.No.||   Document Name   ||   Jaccard Coefficient
----------------------------------------------------
  0  ||   cranfield1045   ||   0.07142857142857142
  1  ||   cranfield0137   ||   0.045454545454545456
  2  ||   cranfield0286   ||   0.045454545454545456
  3  ||   cranfield0074   ||   0.044444444444444446
  4  ||   cranfield0271   ||   0.043478260869565216
  5  ||   cranfield1146   ||   0.043478260869565216
  6  ||   cranfield0670   ||   0.04
  7  ||   cranfield0339   ||   0.03571428571428571
  8  ||   cranfield0932   ||   0.034482758620689655
  9  ||   cranfield0501   ||   0.03225806451612903

------------------------------------------------------------------------
Binary weighting scheme:
cranfield0074: 0.19384860436389673
cranfield0400: 0.16156789118984793
cranfield0959: 0.15542465623585575
cranfield0497: 0.14916335374445833
cranfield0741: 0.14321462866692672

------------