# TF-IDF Example

In [45]:
import pandas as pd
import math

In [91]:
docA = "Because Mary and Samantha arrived at the bus station before noon I did not see them at the bus station"
docB = "Mary and Samantha After they left on the bus Mary and Samantha realized that Joe was waiting at the train station"

In [92]:
split_A = docA.split(' ')
split_B = docB.split(' ')

In [93]:
words = set(split_A).union(set(split_B))
words

{'After',
 'Because',
 'I',
 'Joe',
 'Mary',
 'Samantha',
 'and',
 'arrived',
 'at',
 'before',
 'bus',
 'did',
 'left',
 'noon',
 'not',
 'on',
 'realized',
 'see',
 'station',
 'that',
 'the',
 'them',
 'they',
 'train',
 'waiting',
 'was'}

In [94]:
word_dict_A = dict.fromkeys(words,0)
word_dict_B = dict.fromkeys(words,0)

In [95]:
for word in split_A:
    word_dict_A[word] = 1
    
for word in split_B:
    word_dict_B[word] = 1

In [96]:
df = pd.DataFrame([word_dict_A, word_dict_B])
df

Unnamed: 0,After,Because,I,Joe,Mary,Samantha,and,arrived,at,before,...,realized,see,station,that,the,them,they,train,waiting,was
0,0,1,1,0,1,1,1,1,1,1,...,0,1,1,0,1,1,0,0,0,0
1,1,0,0,1,1,1,1,0,1,0,...,1,0,1,1,1,0,1,1,1,1


In [97]:
def calcDF(word_dict, split):
    tfDict = {}
    doc_length = len(split)
    for word,count in word_dict.items():
        tfDict[word] = count/doc_length
    return tfDict

In [98]:
tf_A = calcDF(word_dict_A, split_A)
tf_B = calcDF(word_dict_B, split_B)

In [99]:
def calcIDF(docList):
    idfDict = dict.fromkeys(docList[0].keys(), 0)
    N = len(docList)
    for doc in docList:
        for word,val in doc.items():
            if(val > 0):
                idfDict[word] += 1 
    for word,val in idfDict.items():
        idfDict[word] = math.log10(N/val)
        
    return idfDict 

In [100]:
idf = calcIDF([word_dict_A,word_dict_B])
idf

{'I': 0.3010299956639812,
 'Mary': 0.0,
 'station': 0.0,
 'realized': 0.3010299956639812,
 'they': 0.3010299956639812,
 'Joe': 0.3010299956639812,
 'that': 0.3010299956639812,
 'at': 0.0,
 'did': 0.3010299956639812,
 'see': 0.3010299956639812,
 'noon': 0.3010299956639812,
 'on': 0.3010299956639812,
 'was': 0.3010299956639812,
 'not': 0.3010299956639812,
 'the': 0.0,
 'bus': 0.0,
 'before': 0.3010299956639812,
 'and': 0.0,
 'train': 0.3010299956639812,
 'left': 0.3010299956639812,
 'arrived': 0.3010299956639812,
 'Samantha': 0.0,
 'waiting': 0.3010299956639812,
 'Because': 0.3010299956639812,
 'After': 0.3010299956639812,
 'them': 0.3010299956639812}

In [101]:
def calcTF_IDF(tf, idf):
    tf_idfDict = {}
    for word, val in tf.items():
        tf_idfDict[word] = val*idf[word]
    return tf_idfDict

In [102]:
tf_idf_A = calcTF_IDF(tf_A,idf)
tf_idf_B = calcTF_IDF(tf_B,idf)
pd.DataFrame([tf_idf_A, tf_idf_B])

Unnamed: 0,After,Because,I,Joe,Mary,Samantha,and,arrived,at,before,...,realized,see,station,that,the,them,they,train,waiting,was
0,0.0,0.015051,0.015051,0.0,0.0,0.0,0.0,0.015051,0.0,0.015051,...,0.0,0.015051,0.0,0.0,0.0,0.015051,0.0,0.0,0.0,0.0
1,0.014335,0.0,0.0,0.014335,0.0,0.0,0.0,0.0,0.0,0.0,...,0.014335,0.0,0.0,0.014335,0.0,0.0,0.014335,0.014335,0.014335,0.014335
