# 문서 표현 (Document Representation)

# 3 TF-IDF (Term Frequency-Inverse Document Frequency)

<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/10109d0e60cc9d50a1ea2f189bac0ac29a030a00" />



*  TF(단어 빈도, Term Frequency) : 단어가 문서 내에 등장하는 빈도
*  IDF(역문서 빈도, Inverse Document Frequency) : 단어가 여러 문서에 공통적으로 등장하는 정도(문서 빈도, DF)의 역수로, 많은 문서에 등장하는 단어일수록 값이 작아짐
*  한 문서 내에 자주 등장하고 다른 문서에 자주 등장하지 않는 단어를 주요 단어로 판별할 수 있음


https://en.wikipedia.org/wiki/Tf%E2%80%93idf

## 3.1 직접 계산하기

weighting schema|weight|설명
--|--|--
term frequency|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4" />|=문서 내 해당 토큰의 빈도/문서 내 전체 토큰 수
inverse document frequency|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />|=log(총 문서 개수/토큰이 등장한 문서 수)

In [1]:
# Toy corpus: two short documents that share most of their words.
d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog" 
# Raw (untokenized) documents, one string per document.
doc_ls = [d1, d2]

In [2]:
import numpy as np
from collections import defaultdict

def tf(t, d):
  """Return the relative frequency of token ``t`` in document ``d``.

  Args:
    t: The token to count.
    d: The document as a sequence of tokens. If a raw string is passed
      instead, ``count`` matches substrings and ``len`` counts characters,
      so the result is not a token-level frequency.

  Returns:
    Occurrences of ``t`` in ``d`` divided by the number of tokens in ``d``,
    or ``0.0`` for an empty document.
  """
  # An empty document contains no tokens, so every term frequency is zero;
  # guarding here avoids a ZeroDivisionError.
  if len(d) == 0:
    return 0.0
  return d.count(t) / len(d)

def idf(t, D):
  """Return the inverse document frequency of token ``t`` over corpus ``D``.

  Computed as ``log(N / n)``, where ``N`` is the number of documents and
  ``n`` is the number of documents that contain ``t``.

  Args:
    t: The token to look up.
    D: The corpus as a sequence of tokenized documents. Membership is tested
      with ``in``, so raw strings would be matched by substring.

  Returns:
    The natural-log IDF, or ``0.0`` when no document contains ``t``
    (including an empty corpus), where the ratio is undefined.
  """
  N = len(D)
  # Count matching documents without building a throwaway list.
  n = sum(1 for d in D if t in d)
  if n == 0:
    # log(N / 0) is undefined; an unseen term carries no weight here.
    return 0.0
  return np.log(N / n)

def tfidf(t, d, D):
  """Return the TF-IDF weight of token ``t`` in document ``d`` of corpus ``D``.

  The weight is the product ``tf(t, d) * idf(t, D)``.
  """
  term_freq = tf(t, d)
  inv_doc_freq = idf(t, D)
  return term_freq * inv_doc_freq

def tokenizer(d):
  """Split the document string ``d`` into tokens on runs of whitespace."""
  tokens = d.split()
  return tokens

def tfidfScorer(D):
  """Build a document-by-term TF-IDF matrix for the corpus ``D``.

  Args:
    D: Iterable of raw document strings; each is split with ``tokenizer``.

  Returns:
    A tuple ``(tfidf_mat, vocab)``. ``tfidf_mat`` is a NumPy array of shape
    ``(number of documents, vocabulary size)``. ``vocab`` is the key view of
    the token-to-column mapping, ordered by first appearance, so the j-th
    token labels column j.
  """
  doc_ls = [tokenizer(d) for d in D]

  # Assign column indices in order of first appearance.
  word2id = {}
  for d in doc_ls:
    for t in d:
      word2id.setdefault(t, len(word2id))

  tfidf_mat = np.zeros((len(doc_ls), len(word2id)))
  for i, d in enumerate(doc_ls):
    # Score each distinct token once; repeats would yield the same value.
    for t in set(d):
      # Pass the tokenized corpus, not the raw strings in D: idf() tests
      # membership with `in`, which on a raw string matches substrings
      # (e.g. "a" inside "cat") and inflates the document frequency.
      tfidf_mat[i, word2id[t]] = tfidf(t, d, doc_ls)

  return tfidf_mat, word2id.keys()

In [3]:
# Score the toy corpus: `mat` is the TF-IDF matrix, `vocab` its column labels.
mat, vocab = tfidfScorer(doc_ls)

In [4]:
import pandas as pd
# Show the matrix as a table: one row per document, one column per token.
pd.DataFrame(mat, columns=vocab)

Unnamed: 0,The,cat,sat,on,my,face,I,hate,a,dog,bed,love
0,0.0,0.138629,0.0,0.0,0.0,0.069315,0.0,0.069315,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.138629,0.069315,0.069315


## 3.2 sklearn 활용

In [5]:
# Same toy corpus as section 3.1, redefined so this section runs on its own.
d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog" 
# Raw document strings, one per document.
docs = [d1, d2]

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer with default settings. The output below has lowercase column
# names and no columns for the single-character words "I" and "a".
tfidf_vect = TfidfVectorizer()
# NOTE: this rebinds the name `tfidf`, which section 3.1 defined as a
# function. Calling tfidfScorer() after this cell would fail, because it
# looks up `tfidf` and would find this matrix instead.
tfidf = tfidf_vect.fit_transform(docs)
# Densify for display (printed as `matrix([...])` below).
tfidf.todense()

matrix([[0.        , 0.70600557, 0.        , 0.35300279, 0.35300279,
         0.        , 0.25116439, 0.25116439, 0.25116439, 0.25116439],
        [0.35300279, 0.        , 0.70600557, 0.        , 0.        ,
         0.35300279, 0.25116439, 0.25116439, 0.25116439, 0.25116439]])

In [7]:
import pandas as pd
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. toarray() yields a plain
# ndarray instead of the numpy.matrix returned by todense().
pd.DataFrame(tfidf.toarray(), columns=tfidf_vect.get_feature_names_out())



Unnamed: 0,bed,cat,dog,face,hate,love,my,on,sat,the
0,0.0,0.706006,0.0,0.353003,0.353003,0.0,0.251164,0.251164,0.251164,0.251164
1,0.353003,0.0,0.706006,0.0,0.0,0.353003,0.251164,0.251164,0.251164,0.251164




---



## 3.3 gensim 활용

In [8]:
# Same toy corpus again, redefined so this section runs on its own.
d1 = "The cat sat on my face I hate a cat"
d2 = "The dog sat on my bed I love a dog" 
# Raw document strings, one per document.
docs = [d1, d2]

In [9]:
import gensim
from gensim import corpora
from gensim.models import TfidfModel

# Tokenize each document on whitespace. This rebinds `doc_ls`, which held
# the raw document strings in section 3.1.
doc_ls = [d.split() for d in docs]
# Dictionary built from the tokenized corpus; maps integer ids to tokens.
id2word = corpora.Dictionary(doc_ls)
# Bag-of-words form of each document, as produced by doc2bow.
TDM = [id2word.doc2bow(d) for d in doc_ls]
# TF-IDF model fitted on the bag-of-words corpus.
model = TfidfModel(TDM)


In [10]:
# TF-IDF representation of the first document, shown below as
# (token id, weight) pairs.
model[TDM][0]

[(3, 0.8164965809277261), (4, 0.4082482904638631), (5, 0.4082482904638631)]

In [11]:
from gensim.matutils import sparse2full

# Expand every sparse TF-IDF vector into a dense row with one slot per
# dictionary entry, giving a list-of-lists document-term table.
TDM_matrix = [
  sparse2full(weighted_doc, len(id2word)).tolist()
  for weighted_doc in model[TDM]
]

In [12]:
import pandas as pd
# Show the dense matrix as a table, labelling columns with the dictionary's
# tokens.
pd.DataFrame(TDM_matrix, columns=id2word.values())

Unnamed: 0,I,The,a,cat,face,hate,my,on,sat,bed,dog,love
0,0.0,0.0,0.0,0.816497,0.408248,0.408248,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.408248,0.816497,0.408248


---