In [1]:
#Ref : https://github.com/pemagrg1/Magic-Of-TFIDF/blob/master/notebooks/TF-IDF%20from%20Scratch.ipynb
# https://stackoverflow.com/questions/53619472/how-to-make-term-frequency-matrix-in-python

#https://towardsdatascience.com/how-sklearns-tf-idf-is-different-from-the-standard-tf-idf-275fa582e73d
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer

import math
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from nltk.corpus import stopwords

In [2]:
def computeTF(wordDict, doc):
    """
        tf(t,d) = count of t in d / number of words in d

    :param wordDict:
    :param doc:
    :return:
    """
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count/float(corpusCount)
    return(tfDict)

#analytics vidhya way
def IDF(corpus, unique_words):
    idf_dict={}
    N=len(corpus)
    for i in unique_words:
        count=0
        for sen in corpus:
            if i in sen :
                count=count+1
            idf_dict[i]=(math.log((1+N)/(count+1)))+1
    return idf_dict 

def computeIDF(docList):
    """
        idf(t) = log(N/(df + 1))
    :param docList:
    :return:
    """
    idfDict = {}
    N = len(docList)

    idfDict = dict.fromkeys(docList[0].keys(), 0)
    for word, val in idfDict.items():
        idfDict[word] = math.log10(N / (float(val) + 1))

    return (idfDict)

def computeTFIDF(tfBow, idfs):
    """
    tf-idf(t, d) = tf(t, d) * log(N/(df + 1))
    :param tfBow:
    :param idfs:
    :return:
    """
    tfidf = {}
    for word, val in tfBow.items():
        tfidf[word] = val*idfs[word]
    return(tfidf)

def create_word_dict(total, sentence):
    wordDict = dict.fromkeys(total, 0)
    for word in sentence:
        wordDict[word] += 1
    return wordDict

In [3]:
sentences = [
    "7-Day Mandatory Home Quarantine For All International Arrivals In India",
    "Mandatory 7-day quarantine for all international passengers coming to India",
    "Govt mandates all international arriving passengers to undergo home quarantine for 7 days"]

### convert sentences lower case

In [4]:
for i in range(0,len(sentences)) :
    sentences[i] = sentences[i].lower()
sentences

['7-day mandatory home quarantine quarantine for all international arrivals in india',
 'mandatory 7-day quarantine for all international passengers coming to india',
 'govt mandates all international arriving passengers to undergo home quarantine for 7 days']

### remove stop words manually

In [5]:
sentence1_list = nltk.word_tokenize(sentences[0])
sentence1_list = [word for word in sentence1_list if word not in stopwords.words('english')]

sentence2_list = nltk.word_tokenize(sentences[1])
sentence2_list = [word for word in sentence2_list if word not in stopwords.words('english')]

sentence3_list = nltk.word_tokenize(sentences[2])
sentence3_list = [word for word in sentence3_list if word not in stopwords.words('english')]

unique_words = set(sentence1_list).union(set(sentence2_list)).union(set(sentence3_list))
filtered_words = [word for word in unique_words if word not in stopwords.words('english')]
print(filtered_words)

['undergo', 'coming', 'passengers', 'india', 'mandatory', 'govt', 'days', 'international', '7-day', 'mandates', 'quarantine', '7', 'home', 'arrivals', 'arriving']


In [6]:
###Solve next ?

#15 features ? --> 1. lemmatimzation -- 2. stemming

## Calculate Bag of Words

In [7]:
wordDict1 = create_word_dict(unique_words,sentence1_list)
wordDict2 = create_word_dict(unique_words,sentence2_list)
wordDict3 = create_word_dict(unique_words,sentence3_list)

print (wordDict1)
print()
print (wordDict2)
print()
print (wordDict3)

{'undergo': 0, 'coming': 0, 'passengers': 0, 'india': 1, 'mandatory': 1, 'govt': 0, 'days': 0, 'international': 1, '7-day': 1, 'mandates': 0, 'quarantine': 2, '7': 0, 'home': 1, 'arrivals': 1, 'arriving': 0}

{'undergo': 0, 'coming': 1, 'passengers': 1, 'india': 1, 'mandatory': 1, 'govt': 0, 'days': 0, 'international': 1, '7-day': 1, 'mandates': 0, 'quarantine': 1, '7': 0, 'home': 0, 'arrivals': 0, 'arriving': 0}

{'undergo': 1, 'coming': 0, 'passengers': 1, 'india': 0, 'mandatory': 0, 'govt': 1, 'days': 1, 'international': 1, '7-day': 0, 'mandates': 1, 'quarantine': 1, '7': 1, 'home': 1, 'arrivals': 0, 'arriving': 1}


## Calculate term frequency

In [8]:
tfFirst = computeTF(wordDict1, sentence1_list)
print(len(sentence1_list))
tfSecond = computeTF(wordDict2, sentence2_list)
tfThird = computeTF(wordDict3, sentence3_list)
tfFirst

8


{'undergo': 0.0,
 'coming': 0.0,
 'passengers': 0.0,
 'india': 0.125,
 'mandatory': 0.125,
 'govt': 0.0,
 'days': 0.0,
 'international': 0.125,
 '7-day': 0.125,
 'mandates': 0.0,
 'quarantine': 0.25,
 '7': 0.0,
 'home': 0.125,
 'arrivals': 0.125,
 'arriving': 0.0}

## Two different ways of IDF

In [9]:
IDF([sentence1_list,sentence2_list, sentence3_list],wordDict1.keys())

{'undergo': 1.6931471805599454,
 'coming': 1.6931471805599454,
 'passengers': 1.2876820724517808,
 'india': 1.2876820724517808,
 'mandatory': 1.2876820724517808,
 'govt': 1.6931471805599454,
 'days': 1.6931471805599454,
 'international': 1.0,
 '7-day': 1.2876820724517808,
 'mandates': 1.6931471805599454,
 'quarantine': 1.0,
 '7': 1.6931471805599454,
 'home': 1.2876820724517808,
 'arrivals': 1.6931471805599454,
 'arriving': 1.6931471805599454}

In [10]:
idfs = computeIDF([wordDict1, wordDict2, wordDict3])
idfs

{'undergo': 0.47712125471966244,
 'coming': 0.47712125471966244,
 'passengers': 0.47712125471966244,
 'india': 0.47712125471966244,
 'mandatory': 0.47712125471966244,
 'govt': 0.47712125471966244,
 'days': 0.47712125471966244,
 'international': 0.47712125471966244,
 '7-day': 0.47712125471966244,
 'mandates': 0.47712125471966244,
 'quarantine': 0.47712125471966244,
 '7': 0.47712125471966244,
 'home': 0.47712125471966244,
 'arrivals': 0.47712125471966244,
 'arriving': 0.47712125471966244}

In [11]:
#running our two sentences through the IDF:
idfFirst = computeTFIDF(tfFirst, idfs)
idfSecond = computeTFIDF(tfSecond, idfs)
idfThird = computeTFIDF(tfThird, idfs)
print (idfFirst)
print()
print (idfSecond)
print()
print (idfThird)

{'undergo': 0.0, 'coming': 0.0, 'passengers': 0.0, 'india': 0.059640156839957804, 'mandatory': 0.059640156839957804, 'govt': 0.0, 'days': 0.0, 'international': 0.059640156839957804, '7-day': 0.059640156839957804, 'mandates': 0.0, 'quarantine': 0.11928031367991561, '7': 0.0, 'home': 0.059640156839957804, 'arrivals': 0.059640156839957804, 'arriving': 0.0}

{'undergo': 0.0, 'coming': 0.06816017924566606, 'passengers': 0.06816017924566606, 'india': 0.06816017924566606, 'mandatory': 0.06816017924566606, 'govt': 0.0, 'days': 0.0, 'international': 0.06816017924566606, '7-day': 0.06816017924566606, 'mandates': 0.0, 'quarantine': 0.06816017924566606, '7': 0.0, 'home': 0.0, 'arrivals': 0.0, 'arriving': 0.0}

{'undergo': 0.047712125471966245, 'coming': 0.0, 'passengers': 0.047712125471966245, 'india': 0.0, 'mandatory': 0.0, 'govt': 0.047712125471966245, 'days': 0.047712125471966245, 'international': 0.047712125471966245, '7-day': 0.0, 'mandates': 0.047712125471966245, 'quarantine': 0.047712125471

## TF-IDF Matrix

In [12]:
#putting it in a dataframe
idf = pd.DataFrame([idfFirst, idfSecond, idfThird])
idf.head()

Unnamed: 0,undergo,coming,passengers,india,mandatory,govt,days,international,7-day,mandates,quarantine,7,home,arrivals,arriving
0,0.0,0.0,0.0,0.05964,0.05964,0.0,0.0,0.05964,0.05964,0.0,0.11928,0.0,0.05964,0.05964,0.0
1,0.0,0.06816,0.06816,0.06816,0.06816,0.0,0.0,0.06816,0.06816,0.0,0.06816,0.0,0.0,0.0,0.0
2,0.047712,0.0,0.047712,0.0,0.0,0.047712,0.047712,0.047712,0.0,0.047712,0.047712,0.047712,0.047712,0.0,0.047712


## Let's Use Libraries

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

corpus = [
    "7-Day Mandatory Home Quarantine For All International Arrivals In India",
    "Mandatory 7-day quarantine for all international passengers coming to India"]

vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))  #https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
X = vectorizer.fit_transform(corpus)
columns = vectorizer.get_feature_names_out()

df = pd.DataFrame(X.toarray(),columns=columns,index=['stmt1','stmt2'])
df

Unnamed: 0,arrivals,coming,day,home,india,international,mandatory,passengers,quarantine
stmt1,0.469778,0.0,0.334251,0.469778,0.334251,0.334251,0.334251,0.0,0.334251
stmt2,0.0,0.469778,0.334251,0.0,0.334251,0.334251,0.334251,0.469778,0.334251


In [14]:
corpus = [
    "7-Day Mandatory Home Quarantine For All International Arrivals In India",
    "Mandatory 7-day quarantine for all international passengers coming to India",
    "Govt mandates all international arriving passengers to undergo home quarantine for 7 days"]

vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), norm=None, smooth_idf=False, use_idf=False)  #https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
X = vectorizer.fit_transform(corpus)
columns = vectorizer.get_feature_names_out()

df = pd.DataFrame(X.toarray(),columns=columns,index=['stmt1','stmt2','stmt3'])
df

Unnamed: 0,arrivals,arriving,coming,day,days,govt,home,india,international,mandates,mandatory,passengers,quarantine,undergo
stmt1,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
stmt2,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
stmt3,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0


# Understand each parameter of TfidfVectorizer

In [15]:
vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'), smooth_idf=False)  #https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer
X = vectorizer.fit_transform(corpus)
columns = vectorizer.get_feature_names_out()

df = pd.DataFrame(X.toarray(),columns=columns,index=['stmt1','stmt2','stmt3'])
df

Unnamed: 0,arrivals,arriving,coming,day,days,govt,home,india,international,mandates,mandatory,passengers,quarantine,undergo
stmt1,0.554856,0.0,0.0,0.371594,0.0,0.0,0.371594,0.371594,0.264392,0.0,0.371594,0.0,0.264392,0.0
stmt2,0.0,0.0,0.554856,0.371594,0.0,0.0,0.0,0.371594,0.264392,0.0,0.371594,0.371594,0.264392,0.0
stmt3,0.0,0.396802,0.0,0.0,0.396802,0.396802,0.265743,0.0,0.189078,0.396802,0.0,0.265743,0.189078,0.396802


In [16]:
my_stop_words = stopwords.words('english')

In [17]:
my_stop_words.append("he")

In [18]:
len(my_stop_words)

180