# 3. TF-IDF (Term Frequency-Inverse Document Frequency)

# 3-1 직접계산하기

In [1]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

#### 1) 띄어쓰기 단위로 토큰화

In [2]:
doc_ls = [i.split() for i in docs]
doc_ls

[['오늘', '동물원에서', '원숭이를', '봤어'],
 ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'],
 ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]

#### 2) 각 고유 토큰에 인텍스(Index)를 지정

In [3]:
from collections import defaultdict

word2id = defaultdict(lambda : len(word2id))
[word2id[t] for doc in doc_ls for t in doc]

word2id

defaultdict(<function __main__.<lambda>()>,
            {'오늘': 0,
             '동물원에서': 1,
             '원숭이를': 2,
             '봤어': 3,
             '코끼리를': 4,
             '원숭이에게': 5,
             '바나나를': 6,
             '줬어': 7})

#### 3) DTM 생성

In [5]:
import numpy as np

DTM = np.zeros((len(doc_ls), len(word2id)), dtype = int)

for i, doc in enumerate(doc_ls):
    for word in doc:
        DTM[i, word2id[word]] += 1
DTM

array([[1, 1, 1, 1, 0, 0, 0, 0],
       [1, 1, 0, 2, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 1, 2, 1]])

In [6]:
# 행만 가져올때
DTM[:, 3]

array([1, 2, 0])

In [7]:
np.count_nonzero(DTM[:, 3]) # 열에서 0 값이 아닌것의 갯수(4열의 0값이 아닌거)

2

#### 4) TF 계산
#### TF = 문서내 토크빈도 / 문서 내 전체 토큰 갯수

문서내 토큰 수 (2 or 1 or 0) / 문서내 전체 토큰 빈도수 (1행 : 4, 2행 : 5, 3행 : 5) 

In [8]:
def computeTF(DTM):
    doc_len = len(doc_ls)
    word_len = len(word2id)
    tf = np.zeros((doc_len, word_len))
    for doc_idx in range(doc_len):
        for word_idx in range(word_len):
            tf[doc_idx, word_idx] = DTM[doc_idx, word_idx]/DTM[doc_idx].sum()

    return tf

computeTF(DTM)

array([[0.25, 0.25, 0.25, 0.25, 0.  , 0.  , 0.  , 0.  ],
       [0.2 , 0.2 , 0.  , 0.4 , 0.2 , 0.  , 0.  , 0.  ],
       [0.  , 0.2 , 0.  , 0.  , 0.  , 0.2 , 0.4 , 0.2 ]])

#### 5) IDF 계산
#### IDF = log(총 문서 수 / 토큰이 등장한 문서 수)

In [9]:
import math

def computeIDF(DTM):
    doc_len = len(DTM)
    word_len = len(DTM[0])
    idf = np.zeros(word_len)
    for i in range(word_len):
        idf[i] = -math.log10(np.count_nonzero(DTM[:, i])/doc_len)

    return idf
      
computeIDF(DTM)    

array([ 0.17609126, -0.        ,  0.47712125,  0.17609126,  0.47712125,
        0.47712125,  0.47712125,  0.47712125])

#### 6) TF-IDF 계산

In [10]:
print(DTM.shape)

(3, 8)


In [12]:
def computeTFIDF(DTM):
    tf = computeTF(DTM)
    idf = computeIDF(DTM)
    
#     tfidf = tf*idf # broadcasting
    
    tfidf = np.zeros(tf.shape)
    for doc_idx in range(tf.shape[0]):
        for word_idx in range(tf.shape[1]):
            tfidf[doc_idx, word_idx] = tf[doc_idx, word_idx] * idf[word_idx]
    return tfidf

computeTFIDF(DTM)

array([[ 0.04402281, -0.        ,  0.11928031,  0.04402281,  0.        ,
         0.        ,  0.        ,  0.        ],
       [ 0.03521825, -0.        ,  0.        ,  0.0704365 ,  0.09542425,
         0.        ,  0.        ,  0.        ],
       [ 0.        , -0.        ,  0.        ,  0.        ,  0.        ,
         0.09542425,  0.1908485 ,  0.09542425]])

In [13]:
import pandas as pd

sorted_vocab = sorted((value, key) for key, value in word2id.items())
vocab = [v[1] for v in sorted_vocab]
tfidf = computeTFIDF(DTM)
pd.DataFrame(tfidf, columns=vocab)

Unnamed: 0,오늘,동물원에서,원숭이를,봤어,코끼리를,원숭이에게,바나나를,줬어
0,0.044023,-0.0,0.11928,0.044023,0.0,0.0,0.0,0.0
1,0.035218,-0.0,0.0,0.070437,0.095424,0.0,0.0,0.0
2,0.0,-0.0,0.0,0.0,0.0,0.095424,0.190849,0.095424


# 3-2 직접 계산하기 2

In [14]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [15]:
D = [doc.split() for doc in docs]
print(D)
print(D[1])
D[1].count('봤어')

[['오늘', '동물원에서', '원숭이를', '봤어'], ['오늘', '동물원에서', '코끼리를', '봤어', '봤어'], ['동물원에서', '원숭이에게', '바나나를', '줬어', '바나나를']]
['오늘', '동물원에서', '코끼리를', '봤어', '봤어']


2

In [16]:
t = "봤어"
[True for d in D if t in d]

[True, True]

In [17]:
# t, d, D, 값이 무엇을 넣어야 하는건지
# f 함수는 무엇을 말하고 출력값이 의미하는 것은?
from math import log10
import numpy as np
from collections import defaultdict

# document 내 토큰이 등장한 빈도수 계산
def f(t, d):
    return d.count(t)

# tf 계산
def tf(t, d):
    return 0.5 + 0.5*f(t,d) / max([f(w, d) for w in d])

# idf 계산
def idf(t, D):
    N = len(D)
    nt = len([True for d in D if t in d])
    return 1+log10(N/(1+nt))

# tf-idf 계산
def tfidf_score(t, d, D):
    return tf(t, d) * idf(t, D)


# 공백을 기준으로 토큰과
def tokenizer(d):
    return d.split()


# tfidf 계산  
def tfidfScorer(D):
    D_ls = [tokenizer(d) for d in D] # 문자열을 리스트로 변환함
    
    vocab = list(set().union(*D_ls)) # 사전을 만들기 위해 중복값 처리 (set의 합집합으로)
    print(vocab)
    print(len(vocab))
    
    word2id = defaultdict(lambda : len(word2id)) # 사전 만듬!
#     [word2id[t] for d in D_ls for t in d]
    [word2id[v] for v in vocab]
    print(word2id)
    
    tfidf = np.zeros((len(D_ls), len(word2id))) # 빈 리스트 만들고 각 문자열 값 입력 
    for i in range(len(D_ls)):
        for t in D_ls[i]:
            tfidf[i, word2id[t]] = tfidf_score(t, D_ls[i], D_ls)
    
    
    return tfidf, vocab


tfidfScorer(docs)

['코끼리를', '원숭이에게', '봤어', '동물원에서', '바나나를', '줬어', '오늘', '원숭이를']
8
defaultdict(<function tfidfScorer.<locals>.<lambda> at 0x00000138722E8F78>, {'코끼리를': 0, '원숭이에게': 1, '봤어': 2, '동물원에서': 3, '바나나를': 4, '줬어': 5, '오늘': 6, '원숭이를': 7})


(array([[0.        , 0.        , 1.        , 0.87506126, 0.        ,
         0.        , 1.        , 1.17609126],
        [0.88206844, 0.        , 1.        , 0.65629595, 0.        ,
         0.        , 0.75      , 0.        ],
        [0.        , 0.88206844, 0.        , 0.65629595, 1.17609126,
         0.88206844, 0.        , 0.        ]]),
 ['코끼리를', '원숭이에게', '봤어', '동물원에서', '바나나를', '줬어', '오늘', '원숭이를'])

In [18]:
import pandas as pd
tfidf,vocab = tfidfScorer(docs)

pd.DataFrame(tfidf, columns=vocab)

['코끼리를', '원숭이에게', '봤어', '동물원에서', '바나나를', '줬어', '오늘', '원숭이를']
8
defaultdict(<function tfidfScorer.<locals>.<lambda> at 0x00000138702F1DC8>, {'코끼리를': 0, '원숭이에게': 1, '봤어': 2, '동물원에서': 3, '바나나를': 4, '줬어': 5, '오늘': 6, '원숭이를': 7})


Unnamed: 0,코끼리를,원숭이에게,봤어,동물원에서,바나나를,줬어,오늘,원숭이를
0,0.0,0.0,1.0,0.875061,0.0,0.0,1.0,1.176091
1,0.882068,0.0,1.0,0.656296,0.0,0.0,0.75,0.0
2,0.0,0.882068,0.0,0.656296,1.176091,0.882068,0.0,0.0


# 3-3 Sklearn 활용

In [19]:
docs = ['오늘 동물원에서 원숭이를 봤어',
        '오늘 동물원에서 코끼리를 봤어 봤어',
        '동물원에서 원숭이에게 바나나를 줬어 바나나를']

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()
tfidf = tfidf_vect.fit_transform(docs)
tfidf
# tfidf.toarray()
tfidf.todense() 
# toarray(array)와 todense(matrix)는 크게 차이는 없지만 서로 성질이 다르다


matrix([[0.37311881, 0.        , 0.4804584 , 0.4804584 , 0.63174505,
         0.        , 0.        , 0.        ],
        [0.28680065, 0.        , 0.73861611, 0.36930805, 0.        ,
         0.        , 0.        , 0.48559571],
        [0.2344005 , 0.79374908, 0.        , 0.        , 0.        ,
         0.39687454, 0.39687454, 0.        ]])

In [25]:
tfidf_vect.get_feature_names() #get_feature_names는 속성값 불러오는 것!

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']

In [26]:
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer 

count_vect = CountVectorizer() # CountVectorizer는 DTM을 만들어준다.
# count_vect = CountVectorizer(max_df=0.85)
DTM = count_vect.fit_transform(docs)
tfidf_trans = TfidfTransformer() # DTM을 tfidf로 계산해준다.
tfidf = tfidf_trans.fit_transform(DTM)
tfidf.todense()

matrix([[0.37311881, 0.        , 0.4804584 , 0.4804584 , 0.63174505,
         0.        , 0.        , 0.        ],
        [0.28680065, 0.        , 0.73861611, 0.36930805, 0.        ,
         0.        , 0.        , 0.48559571],
        [0.2344005 , 0.79374908, 0.        , 0.        , 0.        ,
         0.39687454, 0.39687454, 0.        ]])

In [27]:
count_vect.get_feature_names()

['동물원에서', '바나나를', '봤어', '오늘', '원숭이를', '원숭이에게', '줬어', '코끼리를']

In [28]:
import pandas as pd
vocab = count_vect.get_feature_names()
pd.DataFrame(tfidf.todense(), columns=vocab)

Unnamed: 0,동물원에서,바나나를,봤어,오늘,원숭이를,원숭이에게,줬어,코끼리를
0,0.373119,0.0,0.480458,0.480458,0.631745,0.0,0.0,0.0
1,0.286801,0.0,0.738616,0.369308,0.0,0.0,0.0,0.485596
2,0.2344,0.793749,0.0,0.0,0.0,0.396875,0.396875,0.0
