In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# The two toy documents that make up the corpus.
# Note that the capitalisation of "Planet" / "planet" differs between them.
documentA = "Jupiter is the largest Planet"
documentB = "Mars is the fourth planet from the Sun"

In [3]:
# Tokenise each document by splitting on single spaces (case is preserved).
bagOfWordsA, bagOfWordsB = (text.split(' ') for text in (documentA, documentB))

In [4]:
# Vocabulary: every distinct token that occurs in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

In [5]:
# Display the combined vocabulary of both documents.
uniqueWords

{'Jupiter',
 'Mars',
 'Planet',
 'Sun',
 'fourth',
 'from',
 'is',
 'largest',
 'planet',
 'the'}

In [7]:
# Count how often each vocabulary word occurs in each document.
# Both count dicts start with every word of uniqueWords at 0, so they share
# exactly the same keys.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
# Created once, at top level: it must exist even if document A has no tokens,
# and must not be rebuilt on every iteration of the loop over document A.
numOfWordsB = dict.fromkeys(uniqueWords, 0)

for word in bagOfWordsA:
    numOfWordsA[word] += 1
for word in bagOfWordsB:
    numOfWordsB[word] += 1

In [8]:
# Display the per-word occurrence counts for document A.
numOfWordsA

{'Sun': 0,
 'Jupiter': 1,
 'the': 1,
 'Mars': 0,
 'largest': 1,
 'fourth': 0,
 'from': 0,
 'Planet': 1,
 'planet': 0,
 'is': 1}

In [11]:
# Calculating TF (term frequency)
def computeTF(wordDict, bagOfWords):
    """Compute the term frequency of every word in ``wordDict``.

    Args:
        wordDict: Mapping of word -> number of occurrences in one document.
        bagOfWords: The tokens of that same document; only its length is
            used, as the denominator.

    Returns:
        A new dict mapping each key of ``wordDict`` to
        ``count / len(bagOfWords)`` as a float. When ``bagOfWords`` is empty
        every frequency is ``0.0`` instead of raising ``ZeroDivisionError``.
    """
    bagOfWordsCount = float(len(bagOfWords))
    if bagOfWordsCount == 0:
        # An empty document has no terms, so no word can have a non-zero share.
        return {word: 0.0 for word in wordDict}
    return {word: count / bagOfWordsCount for word, count in wordDict.items()}
# Term-frequency vectors for document A and document B.
TfA = computeTF(wordDict=numOfWordsA, bagOfWords=bagOfWordsA)
TfB = computeTF(wordDict=numOfWordsB, bagOfWords=bagOfWordsB)

In [12]:
# Display the term frequencies computed for document A.
TfA

{'Sun': 0.0,
 'Jupiter': 0.2,
 'the': 0.2,
 'Mars': 0.0,
 'largest': 0.2,
 'fourth': 0.0,
 'from': 0.0,
 'Planet': 0.2,
 'planet': 0.0,
 'is': 0.2}

In [13]:
# Display the term frequencies computed for document B.
TfB

{'Sun': 0.125,
 'Jupiter': 0.0,
 'the': 0.25,
 'Mars': 0.125,
 'largest': 0.0,
 'fourth': 0.125,
 'from': 0.125,
 'Planet': 0.0,
 'planet': 0.125,
 'is': 0.125}

In [16]:
import math
def computeIDF(documents):
    """Compute the inverse document frequency of every vocabulary word.

    Args:
        documents: List of dicts, one per document, each mapping
            word -> occurrence count in that document.

    Returns:
        A dict mapping word -> ``math.log(N / df)``, where ``N`` is the number
        of documents and ``df`` is the number of documents in which the word's
        count is greater than zero. Keys appear in first-seen order, starting
        with the keys of ``documents[0]``. A word that occurs in no document
        gets ``0.0`` (instead of raising ``ZeroDivisionError``), and an empty
        ``documents`` list yields an empty dict.
    """
    N = len(documents)
    # Document frequency: in how many documents does each word occur?
    docFreq = {}
    for document in documents:
        for word, val in document.items():
            # Register every word, even with a zero count, so the vocabulary is
            # the union over all documents rather than only the first one.
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1
    return {
        word: math.log(N / float(freq)) if freq else 0.0
        for word, freq in docFreq.items()
    }


In [17]:
# IDF weights over the two-document corpus, then display them.
idfs = computeIDF(documents=[numOfWordsA, numOfWordsB])
idfs

{'Sun': 0.6931471805599453,
 'Jupiter': 0.6931471805599453,
 'the': 0.0,
 'Mars': 0.6931471805599453,
 'largest': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'from': 0.6931471805599453,
 'Planet': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'is': 0.0}

In [18]:
def computeTFIDF(tfBagOfWords, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tfBagOfWords: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency; it must contain
            an entry for every key of ``tfBagOfWords``.

    Returns:
        A new dict mapping each word of ``tfBagOfWords`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

In [20]:
# TF-IDF scores per document, collected into a table with one row per
# document (row 0 = document A, row 1 = document B) and one column per word.
tfidfA = computeTFIDF(tfBagOfWords=TfA, idfs=idfs)
tfidfB = computeTFIDF(tfBagOfWords=TfB, idfs=idfs)
df = pd.DataFrame(data=[tfidfA, tfidfB])
df

Unnamed: 0,Sun,Jupiter,the,Mars,largest,fourth,from,Planet,planet,is
0,0.0,0.138629,0.0,0.0,0.138629,0.0,0.0,0.138629,0.0,0.0
1,0.086643,0.0,0.0,0.086643,0.0,0.086643,0.086643,0.0,0.086643,0.0
