**Author:** Boris Kundu

**Problem Statement:** Calculate the TF-IDF score of a given word in a document from a corpus.

In [1]:
# Import packages
import numpy as np
import nltk
import pandas as pd

In [20]:
# TFIDF class
class TFIDF:
    """Compute TF-IDF scores for a single term against a corpus of sentences.

    Sentences are lower-cased and tokenized before counting. The term is
    lower-cased as well, so matching is case-insensitive.
    """

    # Tokenize text
    def tokenize_text(self, text):
        """Split ``text`` into a list of tokens with ``nltk.tokenize.word_tokenize``."""
        return nltk.tokenize.word_tokenize(text)

    # Calculate token frequency
    def get_tokens_frequency(self, tokens):
        """Return a dict mapping each token in ``tokens`` to its occurrence count."""
        tokens_frequency = {}
        for token in tokens:
            tokens_frequency[token] = tokens_frequency.get(token, 0) + 1
        return tokens_frequency

    # Tokenize using pipeline
    def run_tokenizing_pipeline(self, text, pipeline):
        """Feed ``text`` through each callable in ``pipeline`` in order.

        Each step receives the previous step's result; the final result is
        returned. An empty pipeline returns ``text`` unchanged.
        """
        result = text
        for step in pipeline:
            result = step(result)
        return result

    # Calculate term frequency
    def get_term_frequency(self, term, sentence):
        """Return the frequency of ``term`` in ``sentence``.

        The raw count of the term is divided by the count of the most frequent
        token in the sentence. Returns ``0.0`` when the term does not occur.
        """
        # The sentence is lower-cased below, so the term must be too;
        # otherwise a term containing upper-case letters could never match.
        term = term.lower()
        tokens_frequency = self.run_tokenizing_pipeline(
            sentence.lower(), [self.tokenize_text, self.get_tokens_frequency]
        )
        if term not in tokens_frequency:
            return 0.0
        return tokens_frequency[term] / max(tokens_frequency.values())

    # Calculate inverse document frequency
    def get_inverse_document_frequency(self, term, corpus):
        """Return ``log(N / t)`` for ``term`` over ``corpus``.

        ``N`` is the number of sentences in ``corpus`` and ``t`` is the number
        of sentences containing the term. Returns ``0.0`` when no sentence
        contains the term (including an empty corpus) instead of dividing by
        zero.
        """
        term = term.lower()
        documents_with_term = 0
        for sentence in corpus:
            tokens_frequency = self.run_tokenizing_pipeline(
                sentence.lower(), [self.tokenize_text, self.get_tokens_frequency]
            )
            if term in tokens_frequency:
                documents_with_term += 1
        # Guard: N / 0 would raise ZeroDivisionError for a term unseen in the corpus.
        if documents_with_term == 0:
            return 0.0
        return np.log(len(corpus) / documents_with_term)

    # Calculate tfidf for a word in a document from corpus
    def get_tfidf(self, term, sentence, corpus):
        """Return the product of the term's IDF over ``corpus`` and its TF in ``sentence``."""
        return self.get_inverse_document_frequency(term, corpus) * self.get_term_frequency(term, sentence)

In [22]:
# Build the calculator plus the sample corpus, document, and query word
tfidf = TFIDF()
corpus = [
    "The quick brown fox jumps over the laxy dog.",
    "Never jump over the lazy dog quickly.",
]
sentence = corpus[0]
word = 'quick'

# Term frequency of the word within the chosen sentence
term_frequency = tfidf.get_term_frequency(word, sentence)
print(f'Term Frequency of <{word}> in sentence <{sentence}> is <{term_frequency}>')

# Inverse document frequency of the word across the corpus
inverse_document_frequency = tfidf.get_inverse_document_frequency(word, corpus)
print(f'Inverse Document Frequency of <{word}> in corpus is <{inverse_document_frequency}>')

# Combined TF-IDF score
tfidf_score = tfidf.get_tfidf(word, sentence, corpus)
print(f'TFIDF of <{word}> in sentence <{sentence}> is <{tfidf_score}>')

Term Frequency of <quick> in sentence <The quick brown fox jumps over the laxy dog.> is <0.5>
Inverse Document Frequency of <quick> in corpus is <0.6931471805599453>
TFIDF of <quick> in sentence <The quick brown fox jumps over the laxy dog.> is <0.34657359027997264>
