# tfidf

## 1. Problem description

There are 10 documents. Calculate tfidf for each document.

## 2. Input

In [88]:
docs = dict()

docs['doc1'] = 'A A B A A A B B B A A C D'
docs['doc2'] = 'A C D A A A B B B C C C D D D'
docs['doc3'] = 'C C D D D C D D D D D C C C'
docs['doc4'] = 'A B C D E A B C D E A B C D E'
docs['doc5'] = 'A B C C C C C C C C C C B C'
docs['doc6'] = 'D E A D E A D D D D D E'
docs['doc7'] = 'B E B B B B E E E C B B B C'
docs['doc8'] = 'A B C A A A A B B E'
docs['doc9'] = 'A E E E E A B C E E A'
docs['doc10'] = 'A B C B C B C B C D D D'

## 3. Manual calculation

First of all, we need to import important packages

In [89]:
import math
import numpy as np
import pandas as pd

### 3.1. Get all vocabularies

In [90]:
def get_all_vocabularies(docs):
    vocabularies = dict()
    for k in docs:
        doc = docs[k]
        words = doc.split(' ')
        for w in words:
            vocabularies[w] = True
    return list(vocabularies.keys())

vocabularies = get_all_vocabularies(docs)

In [91]:
print(vocabularies)

['A', 'B', 'C', 'D', 'E']


### 3.2. Calculate tf

In [104]:
def count_word(doc, word):
  count = 0
  for w in doc:
    if w == word:
      count += 1
  return count

In [105]:
tf = dict()
for doc in docs:
    tf[doc] = []
    for word in vocabularies:
        tf[doc].append(count_word(docs[doc], word))

In [106]:
def convert_tf_to_dataframe(vocabularies, tf):
    rows = []
    for doc in tf:
        rows.append(tf[doc])
    return pd.DataFrame(rows, columns = vocabularies, index=list(tf.keys()))

tf_dataframe = convert_tf_to_dataframe(vocabularies, tf)
tf_dataframe

Unnamed: 0,A,B,C,D,E
doc1,7,4,1,1,0
doc2,4,3,4,4,0
doc3,0,0,6,8,0
doc4,3,3,3,3,3
doc5,1,2,11,0,0
doc6,2,0,0,7,3
doc7,0,8,2,0,4
doc8,5,3,1,0,1
doc9,3,1,1,0,6
doc10,1,4,4,3,0


### 3.3. Normalize tf

In [107]:
def normalize(tf, norm=None):
    for doc in tf:
        if norm == 'l1':
            m = sum(tf[doc])
        elif norm == 'l2':
            m = np.power(np.dot(tf[doc], tf[doc]), 0.5)
        else:
            m = max(tf[doc])
        for idx, item in enumerate(tf[doc]):
            tf[doc][idx] = item / m
    return tf

tf = normalize(tf, norm='')

In [108]:
tf_dataframe = convert_tf_to_dataframe(vocabularies, tf)
tf_dataframe

Unnamed: 0,A,B,C,D,E
doc1,1.0,0.571429,0.142857,0.142857,0.0
doc2,1.0,0.75,1.0,1.0,0.0
doc3,0.0,0.0,0.75,1.0,0.0
doc4,1.0,1.0,1.0,1.0,1.0
doc5,0.090909,0.181818,1.0,0.0,0.0
doc6,0.285714,0.0,0.0,1.0,0.428571
doc7,0.0,1.0,0.25,0.0,0.5
doc8,1.0,0.6,0.2,0.0,0.2
doc9,0.5,0.166667,0.166667,0.0,1.0
doc10,0.25,1.0,1.0,0.75,0.0


### 3.3. Calculate df

In [109]:
df = []
idx = 0
for word in vocabularies:
    df.append(0)
    for doc in docs:
        if docs[doc].find(word) >= 0:
            df[idx] += 1
    idx += 1

In [110]:
# print(df)
df_dataframe = pd.DataFrame([df], columns=vocabularies, index=['df'])
df_dataframe

Unnamed: 0,A,B,C,D,E
df,8,8,9,6,5


### 3.4. Calculate idf

In [111]:
idf = []
num_of_docs = len(docs.keys())
for i, item in enumerate(df):
  idf.append(math.log(num_of_docs / item, 2))

In [112]:
idf_dataframe = pd.DataFrame([idf], columns=vocabularies, index=['idf'])
idf_dataframe

Unnamed: 0,A,B,C,D,E
idf,0.321928,0.321928,0.152003,0.736966,1.0


### 3.5. Calculate tfidf

In [113]:
def calculate_tfidf(tf, idf):
  tfidf = dict()
  for doc in tf:
    tfidf[doc] = []
    for idx, item in enumerate(tf[doc]):
        tfidf[doc].append(idf[idx] * item)
  return tfidf

tfidf = calculate_tfidf(tf, idf)

In [114]:
tfidf_dataframe = convert_tf_to_dataframe(vocabularies, tfidf)
tfidf_dataframe

Unnamed: 0,A,B,C,D,E
doc1,0.321928,0.183959,0.021715,0.105281,0.0
doc2,0.321928,0.241446,0.152003,0.736966,0.0
doc3,0.0,0.0,0.114002,0.736966,0.0
doc4,0.321928,0.321928,0.152003,0.736966,1.0
doc5,0.029266,0.058532,0.152003,0.0,0.0
doc6,0.091979,0.0,0.0,0.736966,0.428571
doc7,0.0,0.321928,0.038001,0.0,0.5
doc8,0.321928,0.193157,0.030401,0.0,0.2
doc9,0.160964,0.053655,0.025334,0.0,1.0
doc10,0.080482,0.321928,0.152003,0.552724,0.0


## 4. Use sklearn package

In [115]:
# import tfidf package
from sklearn.feature_extraction.text import TfidfVectorizer

In [116]:
vectorizer = TfidfVectorizer(analyzer='char', binary=True, lowercase=False, norm='l1', use_idf=True, smooth_idf=False)
# print(list(docs.values())
X = vectorizer.fit_transform(list(docs.values())).toarray()
vocab = vectorizer.get_feature_names()

In [117]:
params = vectorizer.get_params()
print(params)

{'analyzer': 'char', 'binary': True, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': False, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 1), 'norm': 'l1', 'preprocessor': None, 'smooth_idf': False, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}


In [54]:
# X
# vocab
tfidf_result = pd.DataFrame(X, columns=vocab, index=list(docs.keys()))
tfidf_result

Unnamed: 0,Unnamed: 1,A,B,C,D,E
doc1,0.365232,0.446731,0.446731,0.403713,0.551801,0.0
doc2,0.365232,0.446731,0.446731,0.403713,0.551801,0.0
doc3,0.471173,0.0,0.0,0.520816,0.711861,0.0
doc4,0.310635,0.379951,0.379951,0.343364,0.469315,0.525951
doc5,0.437941,0.535664,0.535664,0.484082,0.0,0.0
doc6,0.361659,0.442361,0.0,0.0,0.546404,0.612342
doc7,0.389703,0.0,0.476663,0.430762,0.0,0.659824
doc8,0.351783,0.430281,0.430281,0.388847,0.0,0.59562
doc9,0.351783,0.430281,0.430281,0.388847,0.0,0.59562
doc10,0.365232,0.446731,0.446731,0.403713,0.551801,0.0
