In [1]:
# Program to Implement Vector Space Model Document Recommendation

'''
Name : Aman Agarwal
ROLL : 216
PRN  : 0120180254
'''

'\nAuthor : Aman Agarwal\n'

In [2]:
#IMPORTING ALL REQUIRED LIBRARIES

import math
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))

# Preprocessing Documents

## Reading Documents

In [3]:
#READING INPUT FROM SPECIFIED DOCUMENTS

def ReadingFromFile():
    
    FilesvsLines={}
    #List to store the contents of the documents
    
    NumOfFile=int(input("Enter Number Of Files You Will Enter : "))
    
    for x in range(0,NumOfFile):    
        File=open("DOC" + str(x+1) + ".txt",'r')
        Lines = File.read().splitlines()
        FilesvsLines["DOC" + str(x+1)]=Lines
        File.close()
        
    return FilesvsLines,NumOfFile


## Splitting Words

In [4]:
def SplitWords(Words):
    TWords=[]
    for Word in Words:
        TWords.extend(word_tokenize(Word))
    return TWords

## Removing Stop Words using NLTK

In [5]:
def RemoveStopWords(words):
    filtered_sentence = []
    for word in words:
        if word not in stop_words:
            filtered_sentence.append(word.capitalize())
    return filtered_sentence

## Applying SplitWords() and RemoveStopWords()

In [6]:
def PreprocessingDocuments():
    FilesvsLines,NumOfDocuments=ReadingFromFile()
    for key ,value in FilesvsLines.items():
        FilesvsLines[key]=RemoveStopWords(SplitWords(value))
    return FilesvsLines,NumOfDocuments

In [7]:
FilesvsLines,NumOfDocuments=PreprocessingDocuments()

Enter Number Of Files You Will Enter : 10


## Representing all appeared words using Vector Space Model

In [8]:
'''
Here each word appearing in all of the documents is represented using Vector Space Model, 
where a vector is assigned to each word, with value of vector elements
being 1 if the word appears in the corresponding docuement,
and 0 if not. Total number of elements in each vector are equal to the number of documents.
'''

def create2DArray(FilesvsLines):
    array=[]
    WordsVSTF={}
    for key in FilesvsLines:
        for value in FilesvsLines[key]:
            array.append(value)
    vac=sorted(list(set(array)))
    for  x in vac:
        for value in FilesvsLines.values():
            if(x in value):
                if (x in WordsVSTF.keys()):
                    WordsVSTF[x].append(1)
                else:
                    WordsVSTF[x]=[1]
            else:
                if (x in WordsVSTF.keys()):
                    WordsVSTF[x].append(0)
                else:
                    WordsVSTF[x]=[0]
    return WordsVSTF

In [9]:
WordsVSTF=create2DArray(FilesvsLines)

## Caluculating TF-IDF

In [10]:
def CalculateIDF(WordsVSTFIDF,NumOfDocuments):
    DICTOfIDF = {}
    
    for key,value in WordsVSTFIDF.items():
        sumo = 0
        
        for x in value:
            sumo = sumo + x
        # sumo holds the number of docuements a word has occured in
        
        nsumo = NumOfDocuments/sumo
        #nsumo is same as Document Frequency (df)
        
        idf = math.log10(nsumo)
        #Formula for IDF
        
        DICTOfIDF[key] = idf

        value = list(map(lambda x: x*idf,value))
        #This anonymous function gives us the TF-IDF by calculating TF*IDF for each term
        
        WordsVSTFIDF[key]=value
        # WordsVSTFIDF holds the TF-IDF values of each word, with the word & TF-IDF vector as key-value pair.
        
        
    return WordsVSTFIDF,DICTOfIDF

In [11]:
WordsVSTFIDF,DICTOfIDF=CalculateIDF(WordsVSTF,NumOfDocuments)

## Calculating Length of Documents required for Cosine Similarity

In [12]:
# This basically gives us Document vectors for our documents that
# can be used to compare with our Query Docuemnt using Cosine Similarity

def CalculateLengthOfDocuments(WordsVSTFIDF,NumOfDocuments):
    LengthOfDocuments=[]
    for i in range(0,NumOfDocuments):
        sumo=0
        for values in WordsVSTFIDF.values():
            sumo+=math.pow(values[i],2)
        LengthOfDocuments.append(math.sqrt(sumo))
    return LengthOfDocuments

In [13]:
LengthOfDocuments=CalculateLengthOfDocuments(WordsVSTFIDF,NumOfDocuments)
print(LengthOfDocuments)

[7.471611593372719, 7.67623792542539, 10.164436772134765, 9.045104147642544, 10.361882630360144, 9.490338793740293, 6.651745768713027, 6.407305406470593, 11.922735093549168, 6.186827441386043]


# Query Preprocessing

In [14]:
# Processing the Query Document by splitting words, removing Stopwords, applying IDF and TF-IDF,
# and vectorizing the query

def QueryPreprocessing(DICTOfIDF):
    Query=[]
    QueryVsTFIDF={}
    query=input("Enter Your Query : ")
    queryterm=RemoveStopWords(query.split(" "))
    listOfIDF=list(DICTOfIDF.keys())
    for x in queryterm:
        if(x in listOfIDF):
            Query.append(x)
    print(Query)        
    UniqueQuery=sorted(list(set(Query)))
    print(UniqueQuery)
    for x in UniqueQuery:
        idf=DICTOfIDF.get(x)
        QueryVsTFIDF[x]=Query.count(x)*idf
    sumo=0    
    for x in QueryVsTFIDF.values():
        sumo+=math.pow(x,2)
    LengthOfQuery=math.sqrt(sumo) 
    return QueryVsTFIDF,LengthOfQuery

In [15]:
QueryVsTFIDF,LengthOfQuery=QueryPreprocessing(DICTOfIDF)

Enter Your Query : Show recipies containing milk
['Milk']
['Milk']


In [16]:
#Calculating Cosine Similarity between the Query Documents and the given Documents

def CalculateCosineSimilarity(WordsVSTFIDF,LengthOfDocuments,QueryVsTFIDF,LengthOfQuery,NumOfDocuments):
    highVal = 0 
    highDoc = 0
    reccDoc = []
    CosineSimilarity={}
    for i in range(0,NumOfDocuments):
        sumo=0
        for key ,value in QueryVsTFIDF.items():
            sumo+=WordsVSTFIDF[key][i]*QueryVsTFIDF[key]
            
        cos=sumo/(LengthOfDocuments[i]*LengthOfQuery)
        if cos > highVal:
                highVal = cos
                highDoc = "DOC"+str(i+1)
        if cos > 0.05:
                reccDoc.append("DOC"+str(i+1))
        CosineSimilarity["DOC"+str(i+1)]=cos
    sorted_by_value=sorted(CosineSimilarity.items(), key=lambda CosineSimilarity: CosineSimilarity[1],reverse=True)
    
    return CosineSimilarity, highDoc, reccDoc

In [17]:
# Printing the recommended Document
def printOutput():
    print("\n\nThe reccomended documents are:", reccDoc)
    print ("\n\nDisplaying the most relevent document i.e ", highDoc)
    File=open(highDoc + ".txt",'r')
    Lines = File.read()
    print("\n", Lines)

In [18]:
# Exception Handling when no mathcing documents are found 
try:
    CosineSimilarity, highDoc, reccDoc = CalculateCosineSimilarity(WordsVSTFIDF,LengthOfDocuments,QueryVsTFIDF,LengthOfQuery,NumOfDocuments)
    print("Cosine similarities of the Query document with the given documents are: \n\n", CosineSimilarity)
    printOutput()
    
except:
     print("Sorry, An exception occurred! \nPlease recheck the query, NO MATCHING DOCUMENTS FOUND")


Cosine similarities of the Query document with the given documents are: 

 {'DOC1': 0.0935500990115709, 'DOC2': 0.0, 'DOC3': 0.0, 'DOC4': 0.0, 'DOC5': 0.0, 'DOC6': 0.0, 'DOC7': 0.0, 'DOC8': 0.10908954076547449, 'DOC9': 0.0, 'DOC10': 0.0}


The reccomended documents are: ['DOC1', 'DOC8']


Displaying the most relevent document i.e  DOC8

 Espresso Coffee Recipe

Awaken your senses every morning with a refreshing cup of Espresso coffee and kick start your day on a healthy note. Easy to prepare at home, this beverage will surely be a favorite of everyone at your house. You can have this refreshing cup of coffee at anytime of the day with some exotic cookies for better taste. The aroma of this amazing non-alcoholic beverage will surely win hearts of people around you.

Read less
Ingredients of Espresso Coffee
3 cup milk
2 teaspoon coffee powder
chocolate syrup as required
1/2 cup water
sugar as required
1/4 teaspoon drinking chocolate
How to make Espresso Coffee
Step 1
Add milk to a bowl a