In [2]:
senA = "The peace from yoga is not merely for individuals but for society"
senB = "The practice of yoga is giving wonderful inspiration for health"

# Bag Of Words

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

In [4]:
corpus = [senA, senB]

In [5]:
corpus

['The peace from yoga is not merely for individuals but for society',
 'The practice of yoga is giving wonderful inspiration for health']

In [6]:
vc= CountVectorizer()
toInt = vc.fit_transform(corpus)

In [7]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. It returns a NumPy array, so .tolist()
# is used to keep the printed form a plain Python list of strings.
print(vc.get_feature_names_out().tolist()) # shows the unique words in the corpus
print(toInt.toarray()) # prints int data for converted strings

['but', 'for', 'from', 'giving', 'health', 'individuals', 'inspiration', 'is', 'merely', 'not', 'of', 'peace', 'practice', 'society', 'the', 'wonderful', 'yoga']
[[1 2 1 0 0 1 0 1 1 1 0 1 0 1 1 0 1]
 [0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1]]


We can also use n-grams of a fixed size or of a range of sizes to create vectors

In [7]:
vc2 = CountVectorizer(analyzer='word', ngram_range=(2,2))
toInt2 = vc2.fit_transform(corpus)

In [8]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. .tolist() keeps the printed form a plain
# Python list of the learned bigrams.
print(vc2.get_feature_names_out().tolist())
print(toInt2.toarray())

['but for', 'for health', 'for individuals', 'for society', 'from yoga', 'giving wonderful', 'individuals but', 'inspiration for', 'is giving', 'is not', 'merely for', 'not merely', 'of yoga', 'peace from', 'practice of', 'the peace', 'the practice', 'wonderful inspiration', 'yoga is']
[[1 0 1 1 1 0 1 0 0 1 1 1 0 1 0 1 0 0 1]
 [0 1 0 0 0 1 0 1 1 0 0 0 1 0 1 0 1 1 1]]


In [9]:
vc3 = CountVectorizer(analyzer='word', ngram_range=(1,2))
toInt3 = vc3.fit_transform(corpus)

In [10]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. .tolist() keeps the printed form a plain
# Python list of the learned unigrams and bigrams.
print(vc3.get_feature_names_out().tolist())
print(toInt3.toarray())

['but', 'but for', 'for', 'for health', 'for individuals', 'for society', 'from', 'from yoga', 'giving', 'giving wonderful', 'health', 'individuals', 'individuals but', 'inspiration', 'inspiration for', 'is', 'is giving', 'is not', 'merely', 'merely for', 'not', 'not merely', 'of', 'of yoga', 'peace', 'peace from', 'practice', 'practice of', 'society', 'the', 'the peace', 'the practice', 'wonderful', 'wonderful inspiration', 'yoga', 'yoga is']
[[1 1 2 0 1 1 1 1 0 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 1 1 0 0 1 1 1 0 0 0 1 1]
 [0 0 1 1 0 0 0 0 1 1 1 0 0 1 1 1 1 0 0 0 0 0 1 1 0 0 1 1 0 1 0 1 1 1 1 1]]


# TF-iDF

In [11]:
print(senA)
print(senB)

The peace from yoga is not merely for individuals but for society
The practice of yoga is giving wonderful inspiration for health


In [12]:
from nltk import word_tokenize

In [13]:
tokenA = word_tokenize(senA)

In [14]:
tokenA[:5]

['The', 'peace', 'from', 'yoga', 'is']

In [15]:
tokenB = word_tokenize(senB)

In [16]:
tokenB[:5]

['The', 'practice', 'of', 'yoga', 'is']

In [17]:
# Vocabulary shared by both sentences: every distinct token from A or B.
# Tokens are compared as-is, so 'The' and 'the' would count as different words.
uniqueWords = set(tokenA) | set(tokenB)

In [18]:
uniqueWords

{'The',
 'but',
 'for',
 'from',
 'giving',
 'health',
 'individuals',
 'inspiration',
 'is',
 'merely',
 'not',
 'of',
 'peace',
 'practice',
 'society',
 'wonderful',
 'yoga'}

In [19]:
# Raw counts for sentence A: every vocabulary word starts at zero, then each
# token occurrence in A bumps its own entry by one.
dictForA = {vocabWord: 0 for vocabWord in uniqueWords}
for word in tokenA:
    dictForA[word] = dictForA[word] + 1

In [20]:
dictForA

{'giving': 0,
 'peace': 1,
 'of': 0,
 'health': 0,
 'not': 1,
 'yoga': 1,
 'but': 1,
 'for': 2,
 'merely': 1,
 'practice': 0,
 'wonderful': 0,
 'society': 1,
 'from': 1,
 'is': 1,
 'individuals': 1,
 'inspiration': 0,
 'The': 1}

In [21]:
# Raw counts for sentence B: every vocabulary word starts at zero, then each
# token occurrence in B bumps its own entry by one.
dictForB = {vocabWord: 0 for vocabWord in uniqueWords}
for word in tokenB:
    dictForB[word] = dictForB[word] + 1

In [22]:
dictForB

{'giving': 1,
 'peace': 0,
 'of': 1,
 'health': 1,
 'not': 0,
 'yoga': 1,
 'but': 0,
 'for': 1,
 'merely': 0,
 'practice': 1,
 'wonderful': 1,
 'society': 0,
 'from': 0,
 'is': 1,
 'individuals': 0,
 'inspiration': 1,
 'The': 1}

TF = term frequency, 
formula for tf = number of occurrences of word "x" in a sentence or corpus / total number of words in that sentence or corpus

In [23]:
def computeTF(wordDict, Tokens):
    """Compute the term frequency of every word in ``wordDict``.

    Parameters
    ----------
    wordDict : dict
        Maps each word to its raw occurrence count in the document.
    Tokens : sequence
        The document's tokens; only its length is used, as the denominator.

    Returns
    -------
    dict
        Maps each word of ``wordDict`` to ``count / len(Tokens)`` as a float.
        When ``Tokens`` is empty every word maps to ``0.0``.
    """
    tokenLen = len(Tokens)
    if tokenLen == 0:
        # An empty document has no occurrences of anything; report 0.0 rather
        # than failing with ZeroDivisionError.
        return {word: 0.0 for word in wordDict}
    return {word: count / float(tokenLen) for word, count in wordDict.items()}

In [24]:
TFA = computeTF(dictForA,tokenA)
TFB = computeTF(dictForB,tokenB)

iDF = inverse document frequency, formula for iDF = log(total number of docs / number of docs that contain word "x")

In [25]:
import math
def computeidf(docs):
    """Compute the inverse document frequency of every word seen in ``docs``.

    Parameters
    ----------
    docs : list of dict
        One dict per document, mapping word -> raw occurrence count. A word
        counts as present in a document when its count is greater than zero.

    Returns
    -------
    dict
        Maps each word to ``log(N / df)``, where ``N`` is the number of
        documents and ``df`` the number of documents containing the word.
        Words that occur in no document map to ``0.0``. An empty ``docs``
        yields an empty dict.
    """
    N = len(docs)

    # Collect the vocabulary from every document, not only the first one, so
    # a word that first appears in a later document does not raise KeyError.
    docFreq = {}
    for doc in docs:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfdict = {}
    for word, freq in docFreq.items():
        # A word absent from every document would divide by zero; its tf is 0
        # everywhere, so an idf of 0.0 keeps the tf-idf product at 0.
        idfdict[word] = math.log(N / float(freq)) if freq else 0.0
    return idfdict

In [26]:
idfs = computeidf([dictForA,dictForB])

tfidf = tf * idf

In [27]:
def computetfidf(tf, idfs):
    """Combine term frequencies with idf weights.

    For every word in ``tf`` the result holds ``tf[word] * idfs[word]``.
    ``idfs`` must contain every word of ``tf``; a missing word raises
    ``KeyError``.
    """
    return {term: weight * idfs[term] for term, weight in tf.items()}

In [28]:
tfidfA = computetfidf(TFA, idfs)
tfidfB = computetfidf(TFB, idfs)

In [29]:
import pandas as pd

In [30]:
df = pd.DataFrame([tfidfA,tfidfB])
df

Unnamed: 0,giving,peace,of,health,not,yoga,but,for,merely,practice,wonderful,society,from,is,individuals,inspiration,The
0,0.0,0.057762,0.0,0.0,0.057762,0.0,0.057762,0.0,0.057762,0.0,0.0,0.057762,0.057762,0.0,0.057762,0.0,0.0
1,0.069315,0.0,0.069315,0.069315,0.0,0.0,0.0,0.0,0.0,0.069315,0.069315,0.0,0.0,0.0,0.0,0.069315,0.0


We can simply use sklearn to implement the tf-idf method

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [32]:
tfidf = TfidfVectorizer()

In [33]:
data = pd.DataFrame({"txt":corpus})

In [34]:
data

Unnamed: 0,txt
0,The peace from yoga is not merely for individu...
1,The practice of yoga is giving wonderful inspi...


In [35]:
data['txt']

0    The peace from yoga is not merely for individu...
1    The practice of yoga is giving wonderful inspi...
Name: txt, dtype: object

In [36]:
vectors = tfidf.fit_transform(data['txt'])

In [37]:
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. .tolist() keeps the displayed value a
# plain Python list of strings.
tfidf.get_feature_names_out().tolist()

['but',
 'for',
 'from',
 'giving',
 'health',
 'individuals',
 'inspiration',
 'is',
 'merely',
 'not',
 'of',
 'peace',
 'practice',
 'society',
 'the',
 'wonderful',
 'yoga']