In [8]:
import numpy as np
import pandas as pd
import os
import string
from nltk.tokenize import sent_tokenize,word_tokenize
from nltk.corpus import stopwords

<h2>Jaccard Coefficient</h2>

<img src="https://miro.medium.com/max/744/1*XiLRKr_Bo-VdgqVI-SvSQg.png"/>

In [28]:
class Jaccard:
    
    def __init__(self,corpusPath):
        '''
        Constructor  function for the jackard class. 
        Inputs:
        corpusPath => path to the corpus (str)
        returns => None.
        '''
        self.corpusPath=corpusPath
    
    def listDocuments(self):
        '''
        Returns a list containing the documents in the corpus.
        Input=>None
        returns=> list.
        '''
        docNames=[]
        
        #go through each file
        for filename in os.listdir(self.corpusPath):
            #check for txt files
            if filename.endswith(".txt"):
                docNames.append(filename[0:filename.index(".txt")])
                
        return docNames
    
    def coeff(self,query):
        '''
        Calculats the jaccard coefficient score for the corpus.
        Returns a list containing coefficient scores for the corpus.
        Input => query (str).
        Returns => tuple contaning document name and the respective jaccard score
        '''
        coefficients=[]
        docNames=[]
        
        query=query.lower()
        #get normalized query
        norm_query=self.__process_words(query)
        
        for filename in os.listdir(self.corpusPath):
            #add document name to list
            if filename.endswith(".txt"):
                docNames.append(filename[0:filename.index(".txt")])
                
                #open file
                with open(self.corpusPath+filename) as fh:
                    
                    #read file text
                    text=fh.read()
                    text=text.lower()
                    norm_text=self.__process_words(text)
                    
                    #find the total number of elements and the total elements in the corpus
                    norm_text_set=set(norm_text)
                    norm_query_set=set(norm_query)
        
                    common=len(norm_text_set.intersection(norm_query_set))
                    total=len(norm_text_set.union(norm_query_set))
                    
                    jaccard_coeff=common/total
                    coefficients.append(jaccard_coeff)
                    
        #return the info tuple
        return (coefficients,docNames)        
            
        
    def __process_words(self,text):
        '''
        Normalizes document text or query.
        Input => document text | query (str)
        returns => normalized text | query (list).
        '''
        tokens=list(map(lambda s:s.lower(),word_tokenize(text)))
        
        tokens=list(set(tokens))
        
        #remove punctuation
        punctuation=list(string.punctuation)
        punctuation.append("''")
        
        tokens=[token for token in tokens if token not in punctuation]
        
        #remove stopwords
        st_words=stopwords.words("english")
        tokens=[token for token in tokens if token not in st_words]
        return tokens
        

In [32]:
directory="/home/aahan/Documents/Academic/Information Retrieval/corpus/"
jc=Jaccard(directory)
print(jc.coeff("Doom Eternal"))
print(jc.coeff("I want to listen to music"))
print(jc.coeff("I want to play an action game"))

([0.0, 0.017391304347826087, 0.0, 0.0, 0.0, 0.0], ['dbz', 'doomEternal', 'twice', 'maroon5', 'coldplay', 'bioshock'])
([0.0, 0.0, 0.005649717514124294, 0.004807692307692308, 0.0045662100456621, 0.0], ['dbz', 'doomEternal', 'twice', 'maroon5', 'coldplay', 'bioshock'])
([0.0, 0.02586206896551724, 0.0056179775280898875, 0.0, 0.0, 0.00411522633744856], ['dbz', 'doomEternal', 'twice', 'maroon5', 'coldplay', 'bioshock'])


In [19]:
s=set(["A","B","C"])
b=set(["B","C","D"])
len(s.intersection(b))
len(s.union(b))

4

In [18]:
"Sata".lower()

'sata'