# tfidf

## 1. Problem description

There are 5 documents. Calculate the TF-IDF vector for each document.

## 2. Input

In [67]:
# Corpus: document id -> sentence. Every sentence is lower-cased up front so
# that all later word comparisons are case-insensitive.
docs = {
    'doc1': 'Trời mưa khiến lòng mình buồn theo'.lower(),
    'doc2': 'Dịch bệnh hoành hành là nguyên nhân khiến cửa hàng phải đóng cửa'.lower(),
    'doc3': 'Vì đau chân nên mình phải từ bỏ sân cỏ'.lower(),
    'doc4': 'Thật là một bộ phim xuất sắc'.lower(),
    'doc5': 'Vô cùng vinh hạnh khi được tham dự lớp học này'.lower(),
}

## 3. Manual calculation

First, import the packages used in the rest of this section.

In [68]:
import math
import numpy as np
import pandas as pd

### 3.1. Get all vocabularies

In [69]:
def get_all_vocabularies(docs):
    """Collect the distinct words that occur across all documents.

    Args:
        docs: mapping of document id to document text.

    Returns:
        A list of unique words, in the order each word is first met while
        walking the documents. Each text is split on single spaces.
    """
    # A dict keeps insertion order and drops duplicates in one pass.
    seen = dict.fromkeys(
        token
        for text in docs.values()
        for token in text.split(' ')
    )
    return list(seen)

# Sorted vocabulary; its order defines the position of every term in the
# tf / df / idf vectors built below.
vocabularies = sorted(get_all_vocabularies(docs))

In [70]:
# Show the sorted vocabulary list.
print(vocabularies)

['buồn', 'bệnh', 'bỏ', 'bộ', 'chân', 'cùng', 'cỏ', 'cửa', 'dịch', 'dự', 'hoành', 'hàng', 'hành', 'hạnh', 'học', 'khi', 'khiến', 'là', 'lòng', 'lớp', 'mình', 'mưa', 'một', 'nguyên', 'nhân', 'này', 'nên', 'phim', 'phải', 'sân', 'sắc', 'tham', 'theo', 'thật', 'trời', 'từ', 'vinh', 'vì', 'vô', 'xuất', 'đau', 'đóng', 'được']


### 3.2. Calculate tf

In [71]:
def count_word(doc, word):
    """Return how many space-separated tokens of `doc` are exactly `word`.

    Args:
        doc: document text; it is split on single spaces.
        word: the token to count.

    Returns:
        The number of exact token matches (0 when the word is absent).
    """
    return doc.split(' ').count(word)

In [72]:
# Raw term frequencies: for each document, a list holding one count per
# vocabulary word, in the same order as `vocabularies`.
tf = {
    name: [count_word(text, term) for term in vocabularies]
    for name, text in docs.items()
}

In [73]:
def convert_tf_to_dataframe(vocabularies, tf):
    """Turn a dict of per-document vectors into a terms-by-documents table.

    Args:
        vocabularies: list of words; one entry per position in each vector.
        tf: mapping of document id to a list of values aligned with
            `vocabularies`.

    Returns:
        A pandas DataFrame with one row per word and one column per document.
    """
    doc_names = list(tf.keys())
    per_doc_rows = [tf[name] for name in doc_names]
    # Built as documents x words first, then flipped so that words are rows.
    frame = pd.DataFrame(per_doc_rows, columns=vocabularies, index=doc_names)
    return frame.T

# Raw counts as a table: one row per vocabulary word, one column per document.
tf_dataframe = convert_tf_to_dataframe(vocabularies, tf)
# Bare expression as the last line of the cell so the table is displayed.
tf_dataframe

Unnamed: 0,doc1,doc2,doc3,doc4,doc5
buồn,1,0,0,0,0
bệnh,0,1,0,0,0
bỏ,0,0,1,0,0
bộ,0,0,0,1,0
chân,0,0,1,0,0
cùng,0,0,0,0,1
cỏ,0,0,1,0,0
cửa,0,2,0,0,0
dịch,0,1,0,0,0
dự,0,0,0,0,1


### 3.2.1. Normalize tf

In [74]:
def normalize(tf, norm=None):
    """Scale every document vector in `tf` by its norm, in place.

    Args:
        tf: mapping of document id to a mutable list of numbers. The lists
            are overwritten with the scaled values.
        norm: 'l1' divides by the sum of the entries, 'l2' divides by the
            Euclidean length; any other value (including the default None)
            divides by the largest entry.

    Returns:
        The same `tf` mapping that was passed in, after scaling.

    A vector whose norm is zero (all zeros, or empty) is left unchanged
    instead of raising ZeroDivisionError or producing NaN values.
    """
    for counts in tf.values():
        if norm == 'l1':
            m = sum(counts)
        elif norm == 'l2':
            m = np.power(np.dot(counts, counts), 0.5)
        else:
            # default=0 keeps an empty vector from raising inside max().
            m = max(counts, default=0)
        if m == 0:
            # Nothing to scale; dividing would fail or yield NaN.
            continue
        # Slice assignment keeps the caller's list object, as before.
        counts[:] = [item / m for item in counts]
    return tf

# L2-normalise each document's term-frequency vector. normalize() writes the
# scaled values back into the lists held by `tf` and returns that same dict.
tf = normalize(tf, norm='l2')

In [75]:
# Rebuild the table from the now-normalised tf values.
tf_dataframe = convert_tf_to_dataframe(vocabularies, tf)
# Bare expression as the last line of the cell so the table is displayed.
tf_dataframe

Unnamed: 0,doc1,doc2,doc3,doc4,doc5
buồn,0.377964,0.0,0.0,0.0,0.0
bệnh,0.0,0.258199,0.0,0.0,0.0
bỏ,0.0,0.0,0.316228,0.0,0.0
bộ,0.0,0.0,0.0,0.377964,0.0
chân,0.0,0.0,0.316228,0.0,0.0
cùng,0.0,0.0,0.0,0.0,0.301511
cỏ,0.0,0.0,0.316228,0.0,0.0
cửa,0.0,0.516398,0.0,0.0,0.0
dịch,0.0,0.258199,0.0,0.0,0.0
dự,0.0,0.0,0.0,0.0,0.301511


### 3.3. Calculate df

In [76]:
# Document frequency: for each vocabulary word, the number of documents that
# contain it as a whole token.
#
# Matching must be done on tokens, not substrings: a substring search counts
# 'khi' inside 'khiến' and would report df=3 for 'khi' instead of 1. Splitting
# on ' ' mirrors how count_word() tokenises the documents.
doc_token_sets = [set(text.split(' ')) for text in docs.values()]
df = [
    sum(1 for tokens in doc_token_sets if word in tokens)
    for word in vocabularies
]

In [77]:
# print(df)
df_dataframe = pd.DataFrame([df], columns=vocabularies, index=['df'])
df_dataframe

Unnamed: 0,buồn,bệnh,bỏ,bộ,chân,cùng,cỏ,cửa,dịch,dự,...,thật,trời,từ,vinh,vì,vô,xuất,đau,đóng,được
df,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


### 3.4. Calculate idf

In [78]:
# Inverse document frequency per vocabulary word: log base 2 of
# (number of documents / document frequency), aligned with `vocabularies`.
num_of_docs = len(docs)
idf = [math.log(num_of_docs / doc_count, 2) for doc_count in df]

In [79]:
# idf values as a one-row table labelled 'idf', one column per word.
idf_dataframe = pd.DataFrame([idf], columns=vocabularies, index=['idf'])
idf_dataframe

Unnamed: 0,buồn,bệnh,bỏ,bộ,chân,cùng,cỏ,cửa,dịch,dự,...,thật,trời,từ,vinh,vì,vô,xuất,đau,đóng,được
idf,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,...,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928,2.321928


### 3.5. Calculate tfidf

In [82]:
def calculate_tfidf(tf, idf):
    """Weight every term-frequency value by the idf of its term.

    Args:
        tf: mapping of document id to a list of term frequencies.
        idf: list of idf weights; position i applies to position i of every
            document vector.

    Returns:
        A new dict mapping each document id to a new list of tf * idf
        products. The inputs are not modified.
    """
    return {
        name: [idf[pos] * weight for pos, weight in enumerate(vector)]
        for name, vector in tf.items()
    }

# Combine the (already L2-normalised) tf vectors with the idf weights.
tfidf = calculate_tfidf(tf, idf)

In [83]:
# convert_tf_to_dataframe only needs a dict of per-document lists, so it is
# reused here for the tf-idf values.
tfidf_dataframe = convert_tf_to_dataframe(vocabularies, tfidf)
# Display the first 10 vocabulary rows only.
tfidf_dataframe.head(10)

Unnamed: 0,doc1,doc2,doc3,doc4,doc5
buồn,0.877606,0.0,0.0,0.0,0.0
bệnh,0.0,0.599519,0.0,0.0,0.0
bỏ,0.0,0.0,0.734258,0.0,0.0
bộ,0.0,0.0,0.0,0.877606,0.0
chân,0.0,0.0,0.734258,0.0,0.0
cùng,0.0,0.0,0.0,0.0,0.700088
cỏ,0.0,0.0,0.734258,0.0,0.0
cửa,0.0,1.199039,0.0,0.0,0.0
dịch,0.0,0.599519,0.0,0.0,0.0
dự,0.0,0.0,0.0,0.0,0.700088


## 4. Use sklearn package

In [84]:
# import tfidf package
from sklearn.feature_extraction.text import TfidfVectorizer

In [85]:
# Fit scikit-learn's TF-IDF on the same corpus for comparison.
# NOTE: with its documented defaults TfidfVectorizer uses a smoothed,
# natural-log idf and applies the L2 norm after idf weighting, so its values
# are not expected to equal the manual results above.
vectorizer = TfidfVectorizer(norm='l2')
X = vectorizer.fit_transform(list(docs.values())).toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(vectorizer, 'get_feature_names_out'):
    # The new API returns an ndarray; keep `vocab` a plain list as before.
    vocab = list(vectorizer.get_feature_names_out())
else:
    vocab = vectorizer.get_feature_names()
params = vectorizer.get_params()

In [86]:
# print(params)
print(vocab)

['buồn', 'bệnh', 'bỏ', 'bộ', 'chân', 'cùng', 'cỏ', 'cửa', 'dịch', 'dự', 'hoành', 'hàng', 'hành', 'hạnh', 'học', 'khi', 'khiến', 'là', 'lòng', 'lớp', 'mình', 'mưa', 'một', 'nguyên', 'nhân', 'này', 'nên', 'phim', 'phải', 'sân', 'sắc', 'tham', 'theo', 'thật', 'trời', 'từ', 'vinh', 'vì', 'vô', 'xuất', 'đau', 'đóng', 'được']


In [88]:
# X
# vocab
tfidf_result = pd.DataFrame(X, columns=vocab, index=list(docs.keys())).transpose()
tfidf_result.head(10)

Unnamed: 0,doc1,doc2,doc3,doc4,doc5
buồn,0.398352,0.0,0.0,0.0,0.0
bệnh,0.0,0.267713,0.0,0.0,0.0
bỏ,0.0,0.0,0.327881,0.0,0.0
bộ,0.0,0.0,0.0,0.387757,0.0
chân,0.0,0.0,0.327881,0.0,0.0
cùng,0.0,0.0,0.0,0.0,0.301511
cỏ,0.0,0.0,0.327881,0.0,0.0
cửa,0.0,0.535427,0.0,0.0,0.0
dịch,0.0,0.267713,0.0,0.0,0.0
dự,0.0,0.0,0.0,0.0,0.301511
