In [13]:
import os
import math as m
import numpy as np
import pandas as pd
import tkinter as tk
import customtkinter as ctk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [14]:
def load_docIDs():
    """Return the sorted integer document IDs found in ./ResearchPapers.

    A document is any directory entry named ``<digits>.txt``; the digits are
    the document ID. Every other entry (sub-directories, checkpoint folders,
    files with another extension) is ignored.

    Returns:
        list[int]: document IDs in ascending order.

    Raises:
        FileNotFoundError: if the 'ResearchPapers' directory does not exist
            under the current working directory.
    """
    # os.path.join picks the right separator; the previous '\ResearchPapers'
    # literal only resolved on Windows and relied on an invalid escape.
    papers_dir = os.path.join(os.getcwd(), 'ResearchPapers')

    doc_ids = []
    for filename in os.listdir(papers_dir):
        # splitext removes the suffix; str.rstrip('.txt') strips a *set* of
        # characters and would mangle names ending in '.', 't' or 'x'.
        stem, ext = os.path.splitext(filename)
        if ext != '.txt' or not (stem.isascii() and stem.isdigit()):
            continue  # not a '<number>.txt' document
        doc_ids.append(int(stem))

    doc_ids.sort()
    return doc_ids

def load_stopwords():
    """Read 'Stopword-List.txt' and return its stopwords as a list.

    Each line of the file is one stopword. Surrounding whitespace is removed
    and blank lines are skipped, so the result never contains empty strings.

    Returns:
        list[str]: the stopwords in file order.

    Raises:
        FileNotFoundError: if 'Stopword-List.txt' is not in the current
            working directory.
    """
    # The context manager closes the file even if reading fails.
    with open('Stopword-List.txt', 'r') as f:
        stripped_lines = (line.strip() for line in f)
        # The old filter compared the integer loop index to '\n', so blank
        # entries were never removed; filter on the stripped text instead.
        return [word for word in stripped_lines if word]


def calc_TF(total_tokens, stem=False):
    """Build the log-scaled term-frequency matrix.

    Args:
        total_tokens: list of token lists; the list at position ``i`` holds
            the tokens of the ``i``-th document ID returned by load_docIDs().
        stem: when True, run each token through the Porter stemmer before
            counting. Defaults to False because preprocessing() already
            returns stemmed tokens and Porter stemming is not idempotent:
            stemming twice can produce index terms that a once-stemmed query
            term no longer matches.

    Returns:
        pandas.DataFrame: rows are terms, columns are document IDs, values
        are ``1 + log10(raw count)`` for terms present in the document and 0
        otherwise.
    """
    doc_ids = load_docIDs()  # position i in total_tokens -> doc_ids[i]
    porter_stemmer = PorterStemmer() if stem else None

    counts = {}  # term -> {docID: raw occurrence count}
    for i, tokens in enumerate(total_tokens):
        doc_id = doc_ids[i]
        for word in tokens:
            if porter_stemmer is not None:
                word = porter_stemmer.stem(word)
            word = word.rstrip("'")  # drop trailing apostrophes
            if not word:
                continue  # an empty token used to raise IndexError on word[-1]
            per_doc = counts.setdefault(word, {})
            per_doc[doc_id] = per_doc.get(doc_id, 0) + 1

    # Log-scale the raw counts; log10 is more accurate than log(x, 10).
    weights = {
        word: {doc_id: 1 + m.log10(count) for doc_id, count in per_doc.items()}
        for word, per_doc in counts.items()
    }

    # DataFrame(dict-of-dicts) puts terms in columns, so transpose to get
    # terms as rows; absent (term, doc) pairs become 0.
    tf = pd.DataFrame(weights).transpose().fillna(0)

    print("Term Frequency Weights created")
    return tf


def calc_IDF(tf):
    """Compute inverse document frequency for every term of ``tf``.

    Args:
        tf: term-frequency DataFrame with terms as rows and documents as
            columns; a non-zero cell means the term occurs in that document.

    Returns:
        pandas.DataFrame: a single row (index 0) with one column per term
        holding ``log10(N / df)``, where N is the number of document IDs
        returned by load_docIDs() and df is the number of documents that
        contain the term. A term with df == 0 gets an IDF of 0.0.
    """
    n_docs = len(load_docIDs())

    # Count non-zero cells per row directly. The previous approach took the
    # most frequent value from value_counts() and assumed it was the count of
    # zeros, which is wrong for terms occurring in most documents and led to
    # a division by zero for terms occurring in all of them.
    doc_freq = (tf != 0).sum(axis=1)

    idf = {}
    for term, df_t in zip(tf.index, doc_freq.to_numpy()):
        df_t = int(df_t)
        idf[term] = m.log10(n_docs / df_t) if df_t > 0 else 0.0

    idf = pd.DataFrame(idf, index=[0])  # one row, one column per term

    print("Inverse Document Frequency Weights calculated")
    return idf


def preprocessing():
    """Tokenize, clean and stem every document in 'ResearchPapers'.

    For each document ID from load_docIDs() the file
    'ResearchPapers/<id>.txt' is tokenized line by line. Each token is then:
      * dropped if it is a stopword or longer than 45 characters,
      * stripped of leading/trailing digits and punctuation and case-folded,
      * split on '.' and then on '-' (the pieces are cleaned the same way),
      * kept only if it is alphabetic, at least 2 characters long and not a
        stopword, and finally Porter-stemmed.

    Returns:
        list[list[str]]: one list of stemmed tokens per document, in the same
        order as load_docIDs().
    """
    # Characters stripped from both ends of a token (digits + punctuation).
    strip_chars = '0123456789!@#$%^&*()-_=+[{]}\\|;:\'",<.>/?`~'
    max_token_len = 45

    doc_ids = load_docIDs()
    stopwords = set(load_stopwords())  # set gives O(1) membership tests
    stemmer = PorterStemmer()

    total_tokens = []
    for doc_id in doc_ids:
        pending = []  # work list: raw tokens first, split pieces appended later
        with open('ResearchPapers/' + str(doc_id) + '.txt', 'r') as f:
            for line in f:
                pending += word_tokenize(line)

        cleaned = []
        position = 0
        # Walk the work list by index without deleting from it. The earlier
        # delete-then-advance loop skipped the token following every split,
        # leaving it un-stripped and not case-folded.
        while position < len(pending):
            token = pending[position]
            position += 1
            if token in stopwords or len(token) > max_token_len:
                continue
            token = token.strip(strip_chars).casefold()
            if '.' in token:
                pending.extend(token.split('.'))  # pieces are cleaned on a later pass
            elif '-' in token:
                pending.extend(token.split('-'))
            else:
                cleaned.append(token)

        # Final filter: alphabetic, non-stopword (after case folding), >= 2 chars.
        total_tokens.append([
            stemmer.stem(token)
            for token in cleaned
            if token.isalpha() and token not in stopwords and len(token) >= 2
        ])

    return total_tokens


def calc_TFIDF(TF, IDF):
    """Scale each term's TF row by that term's IDF weight.

    Args:
        TF: DataFrame with terms as rows and documents as columns.
        IDF: single-row DataFrame (row label 0) with one column per term.

    Returns:
        pandas.DataFrame: same index and columns as ``TF``; each row is the
        TF row multiplied by the term's IDF. The frame keeps terms as rows
        and documents as columns.

    Raises:
        KeyError: if a term of ``TF`` has no column in ``IDF``.
    """
    # A missing (NaN) index label is looked up under the column name 'nan'.
    idf_weights = np.asarray(
        [IDF.loc[0, 'nan'] if pd.isna(term) else IDF.loc[0, term] for term in TF.index],
        dtype=float,
    )

    # One positional, row-wise multiply replaces the per-row .loc assignment
    # and yields float columns. The old trailing `transpose()` call discarded
    # its result, so the returned orientation is unchanged.
    vector_space = TF.mul(idf_weights, axis=0)

    print("TF-IDF Weights calculated")
    return vector_space


def save_weights():
    """Run the indexing pipeline and write its weights to CSV.

    Calls preprocessing(), calc_TF(), calc_IDF() and calc_TFIDF() in turn,
    then writes the TF-IDF frame to 'tf-idf.csv' and the IDF frame to
    'idf.csv' (both including their index).
    """
    term_freq = calc_TF(preprocessing())  # tokens -> TF weights
    inv_doc_freq = calc_IDF(term_freq)  # TF weights -> IDF weights
    weights = calc_TFIDF(term_freq, inv_doc_freq)  # combine into TF-IDF

    weights.to_csv('tf-idf.csv')
    print("TF-IDF Weights saved")

    inv_doc_freq.to_csv('idf.csv')
    print("Inverse Document Frequency Weights saved")

In [15]:
# Reuse the saved weight files when both exist; otherwise rebuild them with save_weights().
if os.path.isfile('tf-idf.csv') and os.path.isfile('idf.csv'):
    print("Weights are already calculated")
else:
    save_weights()

Weights are already calculated


In [16]:
def extract_weights():
    """Load the saved weights from disk.

    Returns:
        tuple: ``(tf_idf, idf)`` DataFrames read from 'tf-idf.csv' and
        'idf.csv', each using the first CSV column as its index.
    """
    return (
        pd.read_csv('tf-idf.csv', index_col=0),
        pd.read_csv('idf.csv', index_col=0),
    )

def calc_queryvector(query, IDF):
    """Build the length-normalized TF-IDF vector of a query.

    Args:
        query: collection of query terms; membership (``in``) and
            ``count()`` are used on it.
        IDF: single-row DataFrame (row label 0) with one column per term.

    Returns:
        dict: maps every column of ``IDF`` to its weight. A term present in
        the query gets ``(1 + log10(count)) * idf`` divided by the vector's
        Euclidean norm; absent terms get 0. If the norm is 0 the weights are
        returned un-normalized.
    """
    weights = {}
    for term in IDF.columns:
        if term not in query:
            weights[term] = 0  # term does not occur in the query
            continue
        log_tf = 1 + m.log(query.count(term), 10)  # log-scaled term frequency
        weights[term] = log_tf * IDF.loc[0, term]

    # Euclidean length of the query vector
    norm = sum([weight ** 2 for weight in weights.values()]) ** 0.5

    if norm == 0:
        return weights  # nothing to normalize
    return {term: weight / norm for term, weight in weights.items()}

def query_processing(query):
    """Load the saved weights and vectorize the query against them.

    Args:
        query: the query terms, passed through to calc_queryvector().

    Returns:
        tuple: ``(vectors, query_vector)`` where ``vectors`` is the TF-IDF
        DataFrame from extract_weights() and ``query_vector`` is a one-row
        DataFrame (row label 0) built from the query weights.
    """
    vectors, IDF = extract_weights()  # saved TF-IDF and IDF frames
    weights = calc_queryvector(query, IDF)
    return vectors, pd.DataFrame(weights, index=[0])

def calc_sim(query, threshold=0.05):
    """Rank documents against a free-text query.

    The query is split on whitespace; each word is case-folded, stripped of
    leading/trailing digits and punctuation (the same character set that
    preprocessing() strips from document tokens), dropped if it is a
    stopword, and Porter-stemmed. Each document's score is the dot product
    of its TF-IDF column with the normalized query vector.

    Args:
        query: the raw query string.
        threshold: minimum score a document needs to be returned
            (default 0.05).

    Returns:
        str: space-separated document IDs with score >= ``threshold``,
        highest score first; '' when nothing qualifies.
    """
    strip_chars = '0123456789!@#$%^&*()-_=+[{]}\\|;:\'",<.>/?`~'
    porter_stemmer = PorterStemmer()
    stopwords = set(load_stopwords())

    terms = []
    for word in query.split():
        # Case-fold and strip BEFORE the stopword test; previously 'The' or
        # 'retrieval,' bypassed the filter or failed to match index terms.
        word = word.casefold().strip(strip_chars)
        if not word or word in stopwords:
            continue
        terms.append(porter_stemmer.stem(word).rstrip("'"))

    if not terms:
        return ''  # no usable query terms -> every score would be 0

    doc = load_docIDs()
    vectors, query_vector = query_processing(terms)
    # The dot product is positional: it relies on the rows of tf-idf.csv and
    # the columns of idf.csv being in the same term order, which holds when
    # both files were written by save_weights().
    query_weights = query_vector.iloc[0].to_numpy(dtype=float)

    score = {}
    for docID in doc:
        column = str(docID)
        if column not in vectors.columns:
            continue  # document has no column in the saved matrix
        # float() turns the result into a plain scalar for sorting/comparison.
        score[docID] = float(np.dot(vectors[column].to_numpy(dtype=float), query_weights))

    ranked = sorted(score.items(), key=lambda item: item[1], reverse=True)
    return ' '.join(str(docID) for docID, value in ranked if value >= threshold)

def process_query():
    """Button callback: run the query from the entry box and show the result.

    Reads the text of the module-level ``entry`` widget, ranks documents with
    calc_sim() and writes the outcome into the module-level ``output_label``
    widget, temporarily lifting its read-only state.
    """
    # calc_sim returns '' when nothing matched; fall back to the message.
    result = calc_sim(entry.get()) or 'No documents found'

    output_label.configure(state='normal')  # unlock the field for writing
    output_label.delete(0, tk.END)  # wipe the previous result
    output_label.insert(0, result)  # show the new result
    output_label.configure(state='readonly')  # lock the field again

In [18]:
# Global look: dark appearance with the 'dark-blue' colour theme.
ctk.set_appearance_mode('Dark')
ctk.set_default_color_theme('dark-blue')

# Main window: 500x400 with the application title.
root = ctk.CTk()
root.geometry('500x400')
root.title('Vector Space Retrieval Model')

# Heading label, horizontally centred at 20% of the window height.
label1 = ctk.CTkLabel(root, text="Vector Space Retrieval Model", font=("Verdana", 20), fg_color="transparent")
label1.place(relx=0.5, rely=0.2, anchor=tk.CENTER)

# "Enter Query" prompt, centred at 30% of the window height.
label2 = ctk.CTkLabel(root, text="Enter Query", fg_color="transparent")
label2.place(relx=0.5, rely=0.3, anchor=tk.CENTER)

# Query input field (width 200), centred at 40% of the window height.
entry = ctk.CTkEntry(root, width=200, bg_color='black')
entry.place(relx=0.5, rely=0.4, anchor=tk.CENTER)

# Keyword arguments shared by both buttons.
_button_style = dict(
    font=("Helvetica", 12),
    bg_color='white',
    fg_color="#B6C8A9",
    hover_color="white",
    text_color="black",
)

# "Process Query" button: runs process_query when clicked.
process_button = ctk.CTkButton(root, text="Process Query", command=process_query, **_button_style)
process_button.place(relx=0.5, rely=0.5, anchor=tk.CENTER)

# "Exit" button: destroys the window when clicked.
exit_button = ctk.CTkButton(root, text="Exit", command=root.destroy, **_button_style)
exit_button.place(relx=0.5, rely=0.6, anchor=tk.CENTER)

# Result field (400x50), centred at 80% of the window height and kept
# read-only; process_query unlocks it only while writing a result.
output_label = ctk.CTkEntry(root, width=400, height=50, bg_color='black')
output_label.place(relx=0.5, rely=0.8, anchor=tk.CENTER)
output_label.configure(state='readonly')

root.mainloop()  # start the event loop