# TF-IDF
## TF-IDF stands for term-frequency document inverse frequency. It is a very common algorithm to convert text into a meaningful sequence of numbers. It becomes very important to evaluate the most meaningful words in a document

In [1]:
import numpy as np
from collections import Counter

In [2]:
#Sample document list
mydocList = ["this car got the excellence award",\
         "good car gives good mileage",\
         "this car is very expensive",\
         "the company is growing with very high production",\
         "this company is financially good"]

### Tf-idf is made up of two parts, term frequency and Inverse data frequency. Term frequency gives us the frequency of each word in each document of the corpus

#### tf ->

$$
tf_i,j =  n_i,j/(\Sigma_k n_i,j)
$$

In [3]:

tokens = list(map(lambda x:set(x.split(" ")),mydocList))

In [4]:
tokens

[{'award', 'car', 'excellence', 'got', 'the', 'this'},
 {'car', 'gives', 'good', 'mileage'},
 {'car', 'expensive', 'is', 'this', 'very'},
 {'company', 'growing', 'high', 'is', 'production', 'the', 'very', 'with'},
 {'company', 'financially', 'good', 'is', 'this'}]

In [9]:
for doc in mydocList:
    tf = Counter()
    for word in doc.split():
        print(word)
        tf[word] +=1
    print (tf.items())

this
car
got
the
excellence
award
dict_items([('award', 1), ('this', 1), ('the', 1), ('car', 1), ('excellence', 1), ('got', 1)])
good
car
gives
good
mileage
dict_items([('mileage', 1), ('gives', 1), ('good', 2), ('car', 1)])
this
car
is
very
expensive
dict_items([('this', 1), ('expensive', 1), ('is', 1), ('very', 1), ('car', 1)])
the
company
is
growing
with
very
high
production
dict_items([('is', 1), ('production', 1), ('with', 1), ('the', 1), ('company', 1), ('growing', 1), ('very', 1), ('high', 1)])
this
company
is
financially
good
dict_items([('this', 1), ('company', 1), ('is', 1), ('good', 1), ('financially', 1)])


In [10]:
import string

In [11]:
def build_lexicon(corpus):
    lexicon = set()
    for doc in corpus:
        lexicon.update([word for word in doc.split()])
    return lexicon

In [12]:
def tf(term, document):
    return freq(term, document)

def freq(term, document):
    return document.split().count(term)

In [14]:
vocabulary = build_lexicon(mydocList)

In [15]:
vocabulary

{'award',
 'car',
 'company',
 'excellence',
 'expensive',
 'financially',
 'gives',
 'good',
 'got',
 'growing',
 'high',
 'is',
 'mileage',
 'production',
 'the',
 'this',
 'very',
 'with'}

In [18]:
doc_term_matrix = []

In [24]:
mydocList.index("this car got the excellence award")+2

2

In [28]:
print ('Our vocabulary vector is [' + ', '.join(list(vocabulary)) + ']')
for doc in mydocList:
    print ('The doc is "' + doc + '"')
    tf_vector = [tf(word, doc) for word in vocabulary]
    tf_vector_string = ', '.join(format(freq, 'd') for freq in tf_vector)
    print ('The tf vector for Document %d is [%s]' % ((mydocList.index(doc)+1), tf_vector_string))
    doc_term_matrix.append(tf_vector)
    
    # here's a test: why did I wrap mydoclist.index(doc)+1 in parens?  it returns an int...
    # try it!  type(mydoclist.index(doc) + 1)

print ('All combined, here is our master document term matrix: ')
print (doc_term_matrix)

Our vocabulary vector is [very, expensive, company, high, financially, gives, got, car, is, award, production, good, growing, this, the, with, mileage, excellence]
The doc is "this car got the excellence award"
The tf vector for Document 1 is [0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1]
The doc is "good car gives good mileage"
The tf vector for Document 2 is [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0]
The doc is "this car is very expensive"
The tf vector for Document 3 is [1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0]
The doc is "the company is growing with very high production"
The tf vector for Document 4 is [1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0]
The doc is "this company is financially good"
The tf vector for Document 5 is [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0]
All combined, here is our master document term matrix: 
[[0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1], [0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 1, 

In [29]:
import math

def l2_normalizer(vec):
    denom = np.sum([el**2 for el in vec])
    return [(el / math.sqrt(denom)) for el in vec]

In [31]:
doc_term_matrix_l2 = []
for vec in doc_term_matrix:
    doc_term_matrix_l2.append(l2_normalizer(vec))

print ('A regular old document term matrix: ' )
print (np.matrix(doc_term_matrix))
print ('\nA document term matrix with row-wise L2 norms of 1:')
print (np.matrix(doc_term_matrix_l2))

A regular old document term matrix: 
[[0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0 0 1]
 [0 0 0 0 0 1 0 1 0 0 0 2 0 0 0 0 1 0]
 [1 1 0 0 0 0 0 1 1 0 0 0 0 1 0 0 0 0]
 [1 0 1 1 0 0 0 0 1 0 1 0 1 0 1 1 0 0]
 [0 0 1 0 1 0 0 0 1 0 0 1 0 1 0 0 0 0]]

A document term matrix with row-wise L2 norms of 1:
[[ 0.          0.          0.          0.          0.          0.
   0.40824829  0.40824829  0.          0.40824829  0.          0.          0.
   0.40824829  0.40824829  0.          0.          0.40824829]
 [ 0.          0.          0.          0.          0.          0.37796447
   0.          0.37796447  0.          0.          0.          0.75592895
   0.          0.          0.          0.          0.37796447  0.        ]
 [ 0.4472136   0.4472136   0.          0.          0.          0.          0.
   0.4472136   0.4472136   0.          0.          0.          0.
   0.4472136   0.          0.          0.          0.        ]
 [ 0.35355339  0.          0.35355339  0.35355339  0.          0.          0.

In [32]:
## Calculate idf
def numDocsContaining(word, doclist):
    doccount = 0
    for doc in doclist:
        if freq(word, doc) > 0:
            doccount +=1
    return doccount

In [33]:
def idf(word, doclist):
    n_samples = len(doclist)
    df = numDocsContaining(word, doclist)
    return np.log(n_samples / 1+df)

In [35]:
my_idf_vector = [idf(word, mydocList) for word in vocabulary]

In [36]:
my_idf_vector

[1.9459101490553132,
 1.791759469228055,
 1.9459101490553132,
 1.791759469228055,
 1.791759469228055,
 1.791759469228055,
 1.791759469228055,
 2.0794415416798357,
 2.0794415416798357,
 1.791759469228055,
 1.791759469228055,
 1.9459101490553132,
 1.791759469228055,
 2.0794415416798357,
 1.9459101490553132,
 1.791759469228055,
 1.791759469228055,
 1.791759469228055]

In [37]:
np.zeros((len(my_idf_vector),len(my_idf_vector)))

array([[ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
         0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,

In [38]:
def build_idf_matrix(idf_vector):
    idf_mat = np.zeros((len(idf_vector), len(idf_vector)))
    np.fill_diagonal(idf_mat, idf_vector)
    return idf_mat

In [39]:
my_idf_matrix = build_idf_matrix(my_idf_vector)

In [40]:
doc_term_matrix_tfidf = []

#performing tf-idf matrix multiplication
for tf_vector in doc_term_matrix:
    doc_term_matrix_tfidf.append(np.dot(tf_vector, my_idf_matrix))

In [43]:
doc_term_matrix_tfidf

[array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.79175947,  2.07944154,  0.        ,  1.79175947,
         0.        ,  0.        ,  0.        ,  2.07944154,  1.94591015,
         0.        ,  0.        ,  1.79175947]),
 array([ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         1.79175947,  0.        ,  2.07944154,  0.        ,  0.        ,
         0.        ,  3.8918203 ,  0.        ,  0.        ,  0.        ,
         0.        ,  1.79175947,  0.        ]),
 array([ 1.94591015,  1.79175947,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  2.07944154,  2.07944154,  0.        ,
         0.        ,  0.        ,  0.        ,  2.07944154,  0.        ,
         0.        ,  0.        ,  0.        ]),
 array([ 1.94591015,  0.        ,  1.94591015,  1.79175947,  0.        ,
         0.        ,  0.        ,  0.        ,  2.07944154,  0.        ,
         1.79175947,  0.        ,  1.79175947,  0.

In [44]:
doc_term_matrix_tfidf_l2 = []
for tf_vector in doc_term_matrix_tfidf:
    doc_term_matrix_tfidf_l2.append(l2_normalizer(tf_vector))

In [45]:
doc_term_matrix_tfidf_l2

[[0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.38143331669205544,
  0.44267564800531067,
  0.0,
  0.38143331669205544,
  0.0,
  0.0,
  0.0,
  0.44267564800531067,
  0.41424921976758283,
  0.0,
  0.0,
  0.38143331669205544],
 [0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.35213084473745043,
  0.0,
  0.4086684174016596,
  0.0,
  0.0,
  0.0,
  0.76485152872136042,
  0.0,
  0.0,
  0.0,
  0.0,
  0.35213084473745043,
  0.0],
 [0.43545416953101851,
  0.40095845743485192,
  0.0,
  0.0,
  0.0,
  0.0,
  0.0,
  0.46533571452931172,
  0.46533571452931172,
  0.0,
  0.0,
  0.0,
  0.0,
  0.46533571452931172,
  0.0,
  0.0,
  0.0,
  0.0],
 [0.36434015033694023,
  0.0,
  0.36434015033694023,
  0.33547793288559974,
  0.0,
  0.0,
  0.0,
  0.0,
  0.38934174030610619,
  0.0,
  0.33547793288559974,
  0.0,
  0.33547793288559974,
  0.0,
  0.36434015033694023,
  0.33547793288559974,
  0.0,
  0.0],
 [0.0,
  0.0,
  0.44143575738829566,
  0.0,
  0.40646619718814886,
  0.0,
  0.0,
  0.0,
  0.47172776828455232,
  0.0,
  0.0,


In [46]:
ct = np.asarray(doc_term_matrix_tfidf_l2)

In [47]:
ct

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.38143332,  0.44267565,  0.        ,  0.38143332,
         0.        ,  0.        ,  0.        ,  0.44267565,  0.41424922,
         0.        ,  0.        ,  0.38143332],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.35213084,  0.        ,  0.40866842,  0.        ,  0.        ,
         0.        ,  0.76485153,  0.        ,  0.        ,  0.        ,
         0.        ,  0.35213084,  0.        ],
       [ 0.43545417,  0.40095846,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.46533571,  0.46533571,  0.        ,
         0.        ,  0.        ,  0.        ,  0.46533571,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.36434015,  0.        ,  0.36434015,  0.33547793,  0.        ,
         0.        ,  0.        ,  0.        ,  0.38934174,  0.        ,
         0.33547793,  0.        ,  0.33547793,  0.   

In [48]:
class tfidf:
    def __init__(self,docList):
        self.docList = docList
    def build_lexicon(corpus):
        lexicon = set()
        for doc in corpus:
            lexicon.update([word for word in doc.split()])
    return lexicon