# Vector Space Model

In [1]:
import re
from collections import defaultdict
from nltk.stem.porter import *
from PyQt5 import QtWidgets
from PyQt5.QtWidgets import QApplication,QMainWindow
import sys
import json
import math
from nltk.stem import WordNetLemmatizer
import operator



## Pre-Processing

- Removing stop words from each document
- Lemmatizing the words in each document
- Building a dictionary of all terms in the documents
- Creating the term-frequency matrix
- Creating the tf-idf matrix
- Finding the magnitude of each document vector

### If the lemmatizer is not working
- Steps:
- `import nltk`
- `nltk.download()`
- In the downloader window, switch from the "Collections" tab
- to the "All Packages" tab and download `wordnet`
- Run the cell below to follow these steps

In [2]:
# import nltk
# nltk.download()

In [3]:
lemmatizer = WordNetLemmatizer()
document_matrix = {}

def create_stoplist(): #creating stopword list
    """Load the stop-word list from ``Stopword-list.txt``.

    The file is read from the current working directory, every space
    character is removed from its text, and the remainder is split on
    newlines.

    Returns:
        list[str]: One entry per line of the file. Blank lines (including
        the one produced by a trailing newline) yield empty strings.
    """
    # The context manager closes the handle even if the read fails.
    with open("Stopword-list.txt", "r") as stop_list_f:
        stop_list = stop_list_f.read()
    # str.replace returns a new string; the result has to be kept, otherwise
    # entries carrying stray spaces ("the ") never match a token.
    stop_list = stop_list.replace(" ", "")
    stopword_list = stop_list.split("\n")
    return stopword_list

def create_wordlist(docID): #creating list of all words document
    """Return the cleaned, lemmatised tokens of one document.

    Reads ``<docID>.txt`` (UTF-8) from the current working directory,
    lower-cases it, turns newlines into spaces, deletes the punctuation and
    digit characters listed in the regex below, splits on single spaces,
    drops empty tokens and stop words, and lemmatises what is left.

    Relies on the module-level ``stopword_list`` and ``lemmatizer``.

    Args:
        docID: Identifier used to build the file name ``str(docID) + ".txt"``.

    Returns:
        list[str]: Lemmatised tokens in document order (duplicates kept).
    """
    # Context manager so the handle is released as soon as the text is read.
    with open(str(docID) + ".txt", "r", encoding="utf-8") as f:
        f_read = f.read()
    f_read = f_read.lower() #lower case complete document
    f_read = f_read.replace("\n", " ") #removing new line character
    f_read = re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,“”’———‘*1234567890]", "", f_read) #removing punctuations
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    stopwords = set(stopword_list)
    # Only a single space is a separator here; empty strings come from runs
    # of consecutive spaces and are discarded.
    words = [x for x in f_read.split(" ") if x and x not in stopwords]
    words = [lemmatizer.lemmatize(x) for x in words]
    return words


def create_term_frequency(words,docID): ## creating term frequency matrix {word: [documentFrequency,{docID: termFrequency}]}
    """Fold one document's tokens into the module-level ``term_frequency``.

    Each entry has the shape ``[document_frequency, {docID: count}]``. The
    document frequency is bumped the first time a term is seen in ``docID``;
    the per-document count is bumped on every occurrence.

    Args:
        words: Tokens of the document (duplicates expected).
        docID: Identifier of the document the tokens came from.

    Returns:
        dict: A new dict holding the same entries as ``term_frequency``,
        ordered by term.
    """
    for token in words:
        entry = term_frequency.get(token)
        if entry is None:
            # First sighting of this term anywhere in the corpus.
            term_frequency[token] = [1, {docID: 1}]
            continue

        per_document = entry[1]
        if docID in per_document:
            per_document[docID] += 1
        else:
            # Known term, but new to this document: one more document has it.
            per_document[docID] = 1
            entry[0] += 1

    return {token: term_frequency[token] for token in sorted(term_frequency)}


def create_matrix(words,words_dictionary,docID):
    """Store a binary term-incidence row for ``docID`` in ``document_matrix``.

    The row has one slot per entry of ``words_dictionary``; a slot is 1 when
    the corresponding term occurs in ``words`` and 0 otherwise.

    Args:
        words: Tokens of the document.
        words_dictionary: Vocabulary as a list; a term's position in this
            list is its column in the row.
        docID: Key under which the row is stored.

    Returns:
        dict: The module-level ``document_matrix`` (mutated in place).

    Raises:
        ValueError: If a token is not present in ``words_dictionary``.
    """
    # Start from a fresh zero row on every call. Appending zeros to an
    # existing row made it longer than the vocabulary whenever the same
    # docID was processed twice.
    row = [0] * len(words_dictionary)
    document_matrix[docID] = row

    # One pass over the vocabulary replaces a linear .index() scan per token.
    # setdefault keeps the first position, matching list.index semantics.
    position_of = {}
    for position, term in enumerate(words_dictionary):
        position_of.setdefault(term, position)

    for word in words:
        if word not in position_of:
            # Same exception type list.index raised for an unknown term.
            raise ValueError(repr(word) + " is not in list")
        row[position_of[word]] = 1

    return document_matrix


def create_idf_matrix(words_dictionary,term_frequency,total_documents=50.00): ## calculating tf-idf of all terms in every document
    """Compute the per-term collection weight used as "idf" in this project.

    For every term the weight is ``log10(df) / total_documents`` where ``df``
    is the document frequency stored at ``term_frequency[term][0]``.

    NOTE(review): this is not the textbook ``log10(N / df)``; a term that
    occurs in exactly one document gets weight 0. The formula is kept as-is
    because the stored scores and the UI threshold were produced with it --
    confirm against the assignment specification before changing it.

    Args:
        words_dictionary: Iterable of terms to compute a weight for.
        term_frequency: Mapping ``term -> [document_frequency, postings]``.
        total_documents: Number of documents in the corpus. Defaults to
            50.0, the value that was previously hard-coded.

    Returns:
        dict: ``term -> weight``, in the order of ``words_dictionary``.

    Raises:
        KeyError: If a term is missing from ``term_frequency``.
    """
    idf_values = {}
    for term in words_dictionary:
        df = term_frequency[term][0]
        idf_values[term] = math.log10(df) / total_documents
    return idf_values


def create_tf_idf_matrix(words,words_dictionary,docID,idf_values,term_frequency): ##creating tf-idf matrix of terms in document
    """Store the tf-idf row of ``docID`` in the module-level ``document_matrix``.

    The row has one slot per entry of ``words_dictionary``. The slot of each
    term occurring in ``words`` is set to
    ``idf_values[term] * term_frequency[term][1][docID]``; all other slots
    stay 0.

    Args:
        words: Tokens of the document (duplicates allowed).
        words_dictionary: Vocabulary as a list; a term's position in this
            list is its column in the row.
        docID: Key of the document in ``document_matrix`` and in the
            postings of ``term_frequency``.
        idf_values: Mapping ``term -> weight``.
        term_frequency: Mapping ``term -> [document_frequency, {docID: count}]``.

    Returns:
        dict: The module-level ``document_matrix`` (mutated in place).

    Raises:
        ValueError: If a token is not present in ``words_dictionary``.
        KeyError: If a token is missing from ``idf_values`` or from the
            postings for ``docID``.
    """
    # One pass over the vocabulary replaces a linear .index() scan per token.
    # setdefault keeps the first position, matching list.index semantics.
    position_of = {}
    for position, term in enumerate(words_dictionary):
        position_of.setdefault(term, position)

    # The row is always rebuilt from scratch, so no existence check is needed.
    row = [0] * len(words_dictionary)
    document_matrix[docID] = row

    # Every occurrence of a term writes the same value to the same slot, so
    # visiting each distinct term once is enough.
    for term in set(words):
        if term not in position_of:
            # Same exception type list.index raised for an unknown term.
            raise ValueError(repr(term) + " is not in list")
        row[position_of[term]] = idf_values[term] * term_frequency[term][1][docID]

    return document_matrix


def find_magnitudes_of_documents(tf_idf_matrix): ##finding magnitudes of all documents using tf-idf matrix
    """Return the Euclidean length of every document vector.

    Iterates over whatever documents the matrix contains instead of assuming
    the keys are exactly the integers 1..50, so smaller or larger corpora and
    non-integer keys work too.

    Args:
        tf_idf_matrix: Mapping ``docID -> list of numeric weights``.

    Returns:
        dict: ``docID -> sqrt(sum of squared weights)``, in the iteration
        order of ``tf_idf_matrix``.
    """
    magnitude = {}
    for doc_id, weights in tf_idf_matrix.items():
        magnitude[doc_id] = math.sqrt(sum(weight ** 2 for weight in weights))
    return magnitude





In [4]:
# Module-level state shared with the helper functions defined above.
words_dictionary = []
words = []
stopword_list = create_stoplist()
term_frequency = {}
tf_idf_matrix = {}

document_frequency = {}

# Pass 1: tokenise every document and accumulate term/document frequencies.
# The sorted token lists are kept so pass 2 does not have to read and
# lemmatise each file a second time.
document_words = {}
for docID in range (1,51):
    words = create_wordlist(docID)
    words.sort()
    document_words[docID] = words
    term_frequency = create_term_frequency(words,docID)

words_dictionary = list(term_frequency.keys()) ##creating list of all terms in document

idf_values = create_idf_matrix(words_dictionary,term_frequency)
# Spot check of one weight; raises KeyError if 'due' is not in the corpus.
print(idf_values['due'])


# Pass 2: build one tf-idf row per document. create_tf_idf_matrix returns the
# shared document_matrix, so after the loop tf_idf_matrix holds every row.
for docID in range (1,51):
    words = document_words[docID]
    tf_idf_matrix = create_tf_idf_matrix(words,words_dictionary,docID,idf_values,term_frequency)


# Persist the index; the context managers close the files even if
# serialisation fails part-way.
with open("term_frequency.json", "w") as term_frequency_file:
    json.dump(term_frequency, term_frequency_file, indent = 6)
with open("tf_idf_matrix.json", "w") as tf_idf_matrix_file:
    json.dump(tf_idf_matrix, tf_idf_matrix_file, indent = 6)



0.015563025007672872


In [5]:
# Length of every document vector, written next to the other index files.
magnitudes = find_magnitudes_of_documents(tf_idf_matrix)
# The context manager closes the file even if serialisation fails.
with open("magnitudes.json", "w") as magnitudes_file:
    json.dump(magnitudes, magnitudes_file, indent = 6)

## Query Processing


In [6]:
# Reload the persisted index. JSON object keys are always strings, so after
# this cell the document IDs in all three structures are str, not int.
# Each file is closed as soon as it has been parsed.
with open('term_frequency.json',) as term_frequency_file:
    term_frequency = json.load(term_frequency_file)
with open('tf_idf_matrix.json',) as tf_idf_matrix_file:
    tf_idf_matrix = json.load(tf_idf_matrix_file)
with open('magnitudes.json',) as magnitudes_file:
    magnitudes = json.load(magnitudes_file)



def q_term_frequency(new_query): #calculating term frequency of query
    """Count how often each term occurs in the query.

    Args:
        new_query: Sequence of query tokens.

    Returns:
        dict: ``term -> occurrence count``, ordered by term.
    """
    counts = {}
    for token in new_query:
        counts[token] = counts.get(token, 0) + 1
    return {token: counts[token] for token in sorted(counts)}

def create_q_tf_idf_matrix(query_term_frequency,idf_values): #calculating tf-idf of query
    """Weight each query term by its frequency times its collection weight.

    Args:
        query_term_frequency: Mapping ``term -> count in the query``.
        idf_values: Mapping ``term -> weight``; must contain every query term.

    Returns:
        dict: ``term -> count * weight``, in the order of
        ``query_term_frequency``.
    """
    return {
        term: count * idf_values[term]
        for term, count in query_term_frequency.items()
    }

def q_magnitude(query_tf_idf_matrix): #calculating magnitude of query
    """Return the Euclidean length of the query vector.

    Args:
        query_tf_idf_matrix: Mapping ``term -> weight``.

    Returns:
        float: Square root of the sum of the squared weights.
    """
    sum_of_squares = 0
    # Plain left-to-right accumulation, one weight at a time.
    for weight in query_tf_idf_matrix.values():
        sum_of_squares += weight ** 2
    return math.sqrt(sum_of_squares)


def get_posting_list(new_query): ## get all documents in which query terms are present
    """Collect the documents that contain at least one query term.

    Looks each term up in the module-level ``term_frequency`` and gathers the
    keys of its postings dict.

    Args:
        new_query: Sequence of query terms, all present in ``term_frequency``.

    Returns:
        list: Document IDs without duplicates, in order of first appearance.
    """
    seen = {}
    for token in new_query:
        for doc_id in term_frequency[token][1]:
            # A dict keeps insertion order, so first appearance wins.
            seen.setdefault(doc_id, None)
    return list(seen)


def get_term_index_from_vocabulary(new_query,words_dictionary):
    """Map every query term to its position in the vocabulary list.

    Args:
        new_query: Sequence of query terms.
        words_dictionary: Vocabulary as a list.

    Returns:
        list[int]: One position per query term, in query order.

    Raises:
        ValueError: If a term is not in ``words_dictionary``.
    """
    return [words_dictionary.index(token) for token in new_query]

In [7]:
def query_processing(query):
    """Score every document that shares a term with ``query``.

    The query is lower-cased (documents are lower-cased when indexed), split
    on whitespace, stripped of stop words, lemmatised and restricted to terms
    of the vocabulary. Each document containing at least one remaining term
    gets a cosine score between its tf-idf row and the query's tf-idf vector.

    Relies on the module-level ``words_dictionary``, ``idf_values``,
    ``tf_idf_matrix``, ``magnitudes``, ``term_frequency`` and ``lemmatizer``.

    Args:
        query: Free-text query string.

    Returns:
        dict: ``docID -> cosine score`` for the matching documents, or the
        sentinel ``{0: 0}`` when no query term survives the filtering.
    """
    stopword_list = set(create_stoplist())
    vocabulary = set(words_dictionary)

    # Lower-case first: indexed terms are lower case, so a capitalised query
    # word could otherwise never match.
    new_query = [x for x in query.lower().split() if x not in stopword_list] # remove stop_Words from query
    new_query = [lemmatizer.lemmatize(x) for x in new_query] # lemmatize the query
    new_query = [x for x in new_query if x in vocabulary] # remove words which are not in any document
    if not new_query: # nothing left to search for: return the sentinel answer
        return {0:0}

    query_term_frequency = q_term_frequency(new_query) # term frequency of the query
    query_tf_idf_matrix = create_q_tf_idf_matrix(query_term_frequency,idf_values) # tf-idf weights of the query
    query_magnitude = q_magnitude(query_tf_idf_matrix) # length of the query vector
    documents = get_posting_list(new_query) # all documents which contain query terms

    # The query weight already includes the term's frequency, so the dot
    # product must visit each distinct term once; iterating the raw token
    # list counted repeated terms a second time.
    unique_terms = list(query_tf_idf_matrix)
    term_index = get_term_index_from_vocabulary(unique_terms,words_dictionary) # vocabulary positions of the query terms

    cosine_scores = {}
    for doc in documents: # calculating cosine scores
        doc_vector = tf_idf_matrix[doc]
        total = 0
        for position, term in zip(term_index, unique_terms):
            total = total + doc_vector[position] * query_tf_idf_matrix[term]
        denominator = magnitudes[doc] * query_magnitude
        # A zero-length vector (all weights 0) has no direction; score it 0
        # instead of dividing by zero.
        cosine_scores[doc] = total / denominator if denominator else 0.0
    return cosine_scores
        

In [8]:
# query = input("Enter your Query: ")
# result = []
# result = query_processing(query)
# print(result)

In [9]:
# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'Vector Space Model.ui'
#
# Created by: PyQt5 UI code generator 5.15.2
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


# -*- coding: utf-8 -*-

# Form implementation generated from reading ui file 'VectorSpaceModel.ui'
#
# Created by: PyQt5 UI code generator 5.15.2
#
# WARNING: Any manual changes made to this file will be lost when pyuic5 is
# run again.  Do not edit this file unless you know what you are doing.


from PyQt5 import QtCore, QtGui, QtWidgets


class Ui_Dialog(object):
    """Search dialog: a query box, a Search button and two result lists.

    ``setupUi`` and ``retranslateUi`` are pyuic5-generated; ``print1`` is the
    hand-written slot connected to the Search button.
    """

    # Minimum cosine score a document needs to be listed.
    SCORE_THRESHOLD = 0.005

    def setupUi(self, Dialog):
        """Create the child widgets of ``Dialog`` and wire the Search button."""
        Dialog.setObjectName("Dialog")
        Dialog.resize(586, 424)
        # Left list: filled by print1 with results ordered by score.
        self.listWidget = QtWidgets.QListWidget(Dialog)
        self.listWidget.setGeometry(QtCore.QRect(20, 180, 256, 192))
        self.listWidget.setObjectName("listWidget")
        # Right list: filled by print1 with results ordered by document ID.
        self.listWidget_2 = QtWidgets.QListWidget(Dialog)
        self.listWidget_2.setGeometry(QtCore.QRect(310, 180, 256, 192))
        self.listWidget_2.setObjectName("listWidget_2")
        # Heading above the left list.
        self.label_3 = QtWidgets.QLabel(Dialog)
        self.label_3.setGeometry(QtCore.QRect(20, 130, 231, 31))
        font = QtGui.QFont()
        font.setPointSize(16)
        self.label_3.setFont(font)
        self.label_3.setObjectName("label_3")
        # Heading above the right list.
        self.label_4 = QtWidgets.QLabel(Dialog)
        self.label_4.setGeometry(QtCore.QRect(310, 140, 211, 21))
        font = QtGui.QFont()
        font.setPointSize(16)
        self.label_4.setFont(font)
        self.label_4.setObjectName("label_4")
        # Query input.
        self.lineEdit = QtWidgets.QLineEdit(Dialog)
        self.lineEdit.setGeometry(QtCore.QRect(100, 100, 311, 20))
        self.lineEdit.setObjectName("lineEdit")
        # Caption next to the query input.
        self.label_2 = QtWidgets.QLabel(Dialog)
        self.label_2.setGeometry(QtCore.QRect(20, 90, 81, 31))
        font = QtGui.QFont()
        font.setPointSize(16)
        self.label_2.setFont(font)
        self.label_2.setObjectName("label_2")
        # Search button; clicking it runs print1.
        self.pushButton = QtWidgets.QPushButton(Dialog)
        self.pushButton.setGeometry(QtCore.QRect(450, 100, 75, 23))
        self.pushButton.setObjectName("pushButton")
        self.pushButton.clicked.connect(self.print1)
        # Title banner.
        self.label = QtWidgets.QLabel(Dialog)
        self.label.setGeometry(QtCore.QRect(50, 10, 531, 51))
        font = QtGui.QFont()
        font.setPointSize(36)
        self.label.setFont(font)
        self.label.setObjectName("label")

        self.retranslateUi(Dialog)
        QtCore.QMetaObject.connectSlotsByName(Dialog)

    def retranslateUi(self, Dialog):
        """Set the user-visible texts of the dialog and its widgets."""
        _translate = QtCore.QCoreApplication.translate
        Dialog.setWindowTitle(_translate("Dialog", "Dialog"))
        self.label_3.setText(_translate("Dialog", "Ranked Results"))
        self.label_4.setText(_translate("Dialog", "Un-Ranked Results"))
        self.label_2.setText(_translate("Dialog", "Query"))
        self.pushButton.setText(_translate("Dialog", "Search"))
        self.label.setText(_translate("Dialog", "Vector Space Model"))


    def print1(self):
        """Run the query from the input box and fill both result lists.

        ``listWidget_2`` shows the matching documents ordered by document ID,
        ``listWidget`` shows them ordered by descending score. Only documents
        scoring at least ``SCORE_THRESHOLD`` are listed. When
        ``query_processing`` returns its ``{0: 0}`` sentinel (or nothing at
        all) both lists show "no Document Found".
        """
        self.listWidget_2.clear()
        self.listWidget.clear()
        self.result = query_processing(self.lineEdit.text())

        # These attributes are kept because earlier versions exposed them.
        self.keys = sorted(int(key) for key in self.result)
        self.sorted_result = dict(sorted(self.result.items(), key=operator.itemgetter(1),reverse=True))
        self.sorted_keys = list(self.sorted_result.keys())

        # Key 0 is never a real document; it only appears in the sentinel.
        if not self.keys or self.keys[0] == 0:
            self.listWidget_2.addItem("no Document Found")
            self.listWidget.addItem("no Document Found")
            return

        # Sort the (key, score) pairs by numeric ID rather than converting
        # keys to int and back to str: that round-trip raised KeyError
        # whenever the result was keyed by int instead of str.
        by_document_id = sorted(self.result.items(), key=lambda item: int(item[0]))
        for doc_id, score in by_document_id:
            if score >= self.SCORE_THRESHOLD:
                self.listWidget_2.addItem("Document "+str(int(doc_id)))

        for doc_id, score in self.sorted_result.items():
            if score >= self.SCORE_THRESHOLD:
                self.listWidget.addItem("Document "+str(doc_id))
                
                
                
if __name__ == "__main__":
    import sys

    # Create the application object first, then the dialog it will host.
    app = QtWidgets.QApplication(sys.argv)
    Dialog = QtWidgets.QDialog()

    # Let the UI class populate the dialog, then put it on screen.
    ui = Ui_Dialog()
    ui.setupUi(Dialog)
    Dialog.show()

    # Run the event loop and pass its return value on as the exit status.
    exit_code = app.exec_()
    sys.exit(exit_code)

SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
