# 단어의 표현 (Word Representation)


기계는 문자를 그대로 인식할 수 없기 때문에, 단어를 숫자(벡터)로 변환해야 한다.



# 1 TF-IDF를 활용한 단어 벡터

## 1-1 직접 구현하기

weighting schema|weight|설명
--|--|--
term frequency|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/91699003abf4fe8bdf861bbce08e73e71acf5fd4"/>|= 문서 내 해당 토큰의 등장 횟수 / 문서의 전체 토큰 수
inverse document frequency|<img src="https://wikimedia.org/api/rest_v1/media/math/render/svg/864fcfdc0c16344c11509f724f1aa7081cf9f657" />|= log(전체 문서 개수 / 해당 토큰이 등장한 문서 수)

In [1]:
# Two toy documents that differ only in the animal, the place and the verb.
d1, d2 = (
    "The cat sat on my face I hate a cat",
    "The dog sat on my bed I love a dog",
)

In [2]:
import numpy as np
from collections import defaultdict

def tf(t, d):
    """Return the term frequency of token ``t`` in document ``d``.

    The frequency is the number of occurrences of ``t`` in ``d`` divided by
    ``len(d)``.  ``d`` is normally a tokenised document (a list of tokens).

    Args:
        t: The token to count.
        d: The document, as a sequence supporting ``count`` and ``len``.

    Returns:
        The relative frequency of ``t`` in ``d``; ``0.0`` for an empty
        document (previously this raised ``ZeroDivisionError``).
    """
    total = len(d)
    if total == 0:
        # Nothing can occur in an empty document; avoid dividing by zero.
        return 0.0
    return d.count(t) / total

def idf(t, D):
    """Return the inverse document frequency of token ``t`` over corpus ``D``.

    Computed as ``log(N / n)`` where ``N`` is the number of documents and
    ``n`` is the number of documents that contain ``t``.

    Args:
        t: The token to look up.
        D: The corpus, an iterable-with-length of documents that support
            the ``in`` operator (e.g. lists of tokens).

    Returns:
        ``log(N / n)``.  When no document contains ``t`` (including the
        empty-corpus case) the result is ``0.0`` rather than a
        ``ZeroDivisionError``, so the resulting TF-IDF weight is simply zero.
    """
    N = len(D)
    # Count matching documents without building a throwaway list.
    n = sum(1 for d in D if t in d)
    if n == 0:
        return 0.0
    return np.log(N / n)

def tfidf(t, d, D):
    """Return the TF-IDF weight of token ``t`` in document ``d`` of corpus ``D``.

    The weight is the product of ``tf(t, d)`` and ``idf(t, D)``.
    """
    term_frequency = tf(t, d)
    inverse_doc_frequency = idf(t, D)
    return term_frequency * inverse_doc_frequency

def tokenizer(d):
    """Split document ``d`` into a list of tokens on runs of whitespace."""
    tokens = d.split()
    return tokens

def tdidf_score(D):
    """Score every token of every document in ``D`` with its TF-IDF weight.

    Each raw document is tokenised first; the tokenised corpus is then used
    both as the source of tokens and as the corpus for the IDF term.

    Returns:
        One list per document, each holding ``(token, weight)`` tuples in
        the order the tokens appear (repeated tokens are repeated).
    """
    tokenized = [tokenizer(raw) for raw in D]
    return [
        [(token, tfidf(token, doc, tokenized)) for token in doc]
        for doc in tokenized
    ]

In [3]:
# Score both toy documents; left as a bare expression so the notebook echoes
# the nested list of (token, weight) pairs shown below.
tdidf_score([d1,d2])

[[('The', 0.0),
  ('cat', 0.13862943611198905),
  ('sat', 0.0),
  ('on', 0.0),
  ('my', 0.0),
  ('face', 0.06931471805599453),
  ('I', 0.0),
  ('hate', 0.06931471805599453),
  ('a', 0.0),
  ('cat', 0.13862943611198905)],
 [('The', 0.0),
  ('dog', 0.13862943611198905),
  ('sat', 0.0),
  ('on', 0.0),
  ('my', 0.0),
  ('bed', 0.06931471805599453),
  ('I', 0.0),
  ('love', 0.06931471805599453),
  ('a', 0.0),
  ('dog', 0.13862943611198905)]]

## 1-2 sklearn 활용

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Corpus reused by the sklearn and gensim examples that follow.
docs = [d1, d2]
# Learn the vocabulary from the corpus and build the document-term count matrix.
count_vect = CountVectorizer()
countv = count_vect.fit_transform(docs)

In [5]:
# Dense view of the counts: one row per document, one column per vocabulary id.
print(countv.toarray())
# Token -> column index mapping. In the output below the tokens are
# lower-cased and the one-letter tokens "I" and "a" do not appear.
print(count_vect.vocabulary_)

[[0 2 0 1 1 0 1 1 1 1]
 [1 0 2 0 0 1 1 1 1 1]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectorizer on the same corpus; each cell of the resulting
# matrix holds a TF-IDF weight rather than a raw count.
tfidf_vect = TfidfVectorizer()
tfidfv = tfidf_vect.fit_transform(docs)
# In the printed matrix, tokens shared by both documents keep a non-zero
# weight (0.2511...), whereas the hand-written tdidf_score gives them 0.0.
# NOTE(review): presumably due to sklearn's default smoothed IDF and row
# normalisation — confirm against the TfidfVectorizer documentation.
print(tfidfv.toarray())
print(tfidf_vect.vocabulary_)

[[0.         0.70600557 0.         0.35300279 0.35300279 0.
  0.25116439 0.25116439 0.25116439 0.25116439]
 [0.35300279 0.         0.70600557 0.         0.         0.35300279
  0.25116439 0.25116439 0.25116439 0.25116439]]
{'the': 9, 'cat': 1, 'sat': 8, 'on': 7, 'my': 6, 'face': 3, 'hate': 4, 'dog': 2, 'bed': 0, 'love': 5}


## 1-3 gensim 활용

In [7]:
from gensim.models import TfidfModel
from gensim import corpora

# Whitespace-tokenise each document (case and one-letter tokens are kept).
doc_ls = [d.split() for d in docs]
# Build the token <-> integer id dictionary from the tokenised corpus.
id2word = corpora.Dictionary(doc_ls)
# Bag-of-words form of each document; presumably a list of
# (token_id, count) pairs — confirm against the gensim doc2bow documentation.
bow = [id2word.doc2bow(d) for d in doc_ls]

# NOTE(review): this rebinds the name `tfidf`, which the hand-written
# tfidf(t, d, D) function above also uses; re-running tdidf_score after this
# cell would call the gensim model object instead of that function.
tfidf = TfidfModel(bow)
# Weight the first document. The output below lists only three ids, i.e. the
# tokens that also occur in the second document are absent from the result.
tfidf[bow[0]]

[(3, 0.8164965809277261), (4, 0.4082482904638631), (5, 0.4082482904638631)]

In [8]:
# Look up the token behind id 3, the highest-weighted entry in the output above.
id2word[3]

'cat'



---

