# TF-IDF algorithm

In [2]:
import numpy as np

In [3]:
texts = ['i hate you','i love you','i really hate you','i like you']

In [4]:
vocabulary = list(set(' '.join(texts).split()))
vocabulary

['i', 'love', 'hate', 'you', 'really', 'like']

TF of term v in the sentence t is equal to the number of times v occurred in t devided by the total number of terms in t. Hence:

$ TF(v,t) = BOW(v,t)\ /\ len(t) $

where BOW is the bag of word algorithm result which is basically same as counting the number of times v occurred in t.

Example: 

$ TF(i, 0) = {\frac{1}{3}} $, on the first sentence, 'i' occurred 1 time and there are 3 words in that sentence

In [5]:
bow = np.zeros((len(texts),len(vocabulary)))
for i, t in enumerate(texts):
    for v in t.split():
        bow[i, vocabulary.index(v)] += 1

bow

array([[1., 0., 1., 1., 0., 0.],
       [1., 1., 0., 1., 0., 0.],
       [1., 0., 1., 1., 1., 0.],
       [1., 0., 0., 1., 0., 1.]])

In [9]:
tf = np.zeros((len(texts),len(vocabulary)))
tf

array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.]])

In [10]:

tf = np.zeros((len(texts),len(vocabulary)))
for i, t in enumerate(texts):
    tf[i] = bow[i] / len(t.split())
#     print (t.split())
        
tf

['i', 'hate', 'you']
['i', 'love', 'you']
['i', 'really', 'hate', 'you']
['i', 'like', 'you']


array([[0.33333333, 0.        , 0.33333333, 0.33333333, 0.        ,
        0.        ],
       [0.33333333, 0.33333333, 0.        , 0.33333333, 0.        ,
        0.        ],
       [0.25      , 0.        , 0.25      , 0.25      , 0.25      ,
        0.        ],
       [0.33333333, 0.        , 0.        , 0.33333333, 0.        ,
        0.33333333]])

IDF of term v is equal to:

$\log(\frac{N}{n})$

where N is the total number of documents and n is the number document that contain term v.

Example: 

$ IDF(i) = {\log(\frac{4}{4})} $, total of 4 sentences which all 4 include 'i'

In [11]:
idf = {}
for v in vocabulary:
    idf[v] = 0
    for t in texts:
        if v in t.split():
            idf[v] += 1
    idf[v] = np.log(len(texts)/idf[v])

idf

{'hate': 0.6931471805599453,
 'i': 0.0,
 'like': 1.3862943611198906,
 'love': 1.3862943611198906,
 'really': 1.3862943611198906,
 'you': 0.0}

In [7]:
X = np.zeros((len(texts),len(vocabulary)))
for i, t in enumerate(texts):
    for v in t.split():
        X[i, vocabulary.index(v)] = tf[i, vocabulary.index(v)] * idf[v]

In [8]:
X

array([[0.        , 0.        , 0.        , 0.23104906, 0.        ,
        0.        ],
       [0.46209812, 0.        , 0.        , 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , 0.1732868 , 0.        ,
        0.34657359],
       [0.        , 0.        , 0.46209812, 0.        , 0.        ,
        0.        ]])

Example:
$ TF(i, 0) \times IDF(i) = {\frac{1}{3} \times \log(\frac{4}{4})}  = {0} $