<a href="https://colab.research.google.com/github/bwowby/DS/blob/master/nlp-basic/tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## BOW : Bag of Words

*  가장 간단하지만, 자연어 처리에서 널리 쓰이는 개념적 방법.
*  각 단어가 말뭉치(corpus)에 얼마나 많이 나타나는지만 헤아림.
*  구조와 상관없이 단어의 출현 횟수만 카운팅.   
*  횟수가 중요하기 때문에 단어의 순서가 달라도 같은 반환값을 가짐.
*  이를 보완하기 위해 n-gram (n개의 토큰) 사용








   




### 예제

In [33]:
#사이킷런의 CountVectorizer를 통해 벡터화
from sklearn.feature_extraction.text import CountVectorizer

In [34]:
vectorizer = CountVectorizer(analyzer = 'word', #캐릭터 단위로도 벡터화 가능
                             tokenizer = None,
                             preprocessor = None, #전처리 도구
                             stop_words = None, #불용어 명시 가능
                             min_df = 1, #토큰이 나타날 최소 문서 갯수
                             ngram_range = (1,1))
vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [35]:

corpus = ['The proposed rising was a dismal failure, but the Habeas Corpus Act was suspended and Thistlewood and Watson were seized, although upon being tried they were acquitted.', 
         'Before the prorogation, however, he saw the invaluable Act of Habeas Corpus, which he had carried through parliament, receive the royal assent.', 
         'These Personal Liberty Laws forbade justices and judges to take cognizance of claims, extended the habeas corpus act and the privilege of jury trial to fugitives, and punished false testimony severely.', 
         'The procession of the Host on Corpus Christi day became, as it were, a public demonstration of Catholic orthodoxy against Protestantism and later against religious Liberalism.']
#출처: https://hong-yp-ml-records.tistory.com/34 [HONG YP's Data Science BLOG]
#https://sentence.yourdictionary.com/corpus

In [36]:
#단어를 기준으로 한 featrue 만들기 -> sparse한 array 형태로 만들어짐
features = vectorizer.fit_transform(corpus)
print(features)

  (0, 61)	2
  (0, 46)	1
  (0, 53)	1
  (0, 70)	2
  (0, 19)	1
  (0, 21)	1
  (0, 10)	1
  (0, 25)	1
  (0, 16)	1
  (0, 1)	1
  (0, 58)	1
  (0, 4)	2
  (0, 64)	1
  (0, 71)	1
  (0, 72)	2
  (0, 56)	1
  (0, 3)	1
  (0, 69)	1
  (0, 9)	1
  (0, 68)	1
  (0, 63)	1
  (0, 0)	1
  (1, 61)	3
  (1, 25)	1
  (1, 16)	1
  :	:
  (2, 22)	1
  (2, 60)	1
  (2, 57)	1
  (3, 61)	2
  (3, 16)	1
  (3, 4)	1
  (3, 72)	1
  (3, 39)	2
  (3, 45)	1
  (3, 28)	1
  (3, 40)	1
  (3, 13)	1
  (3, 17)	1
  (3, 7)	1
  (3, 5)	1
  (3, 31)	1
  (3, 49)	1
  (3, 18)	1
  (3, 12)	1
  (3, 41)	1
  (3, 2)	2
  (3, 48)	1
  (3, 35)	1
  (3, 52)	1
  (3, 37)	1


In [37]:
#4개의 관측치, 74개의 단어로 이루어짐
features.shape

(4, 74)

In [38]:
vocab = vectorizer.get_feature_names()
print(len(vocab))
vocab[:10]

74


['acquitted',
 'act',
 'against',
 'although',
 'and',
 'as',
 'assent',
 'became',
 'before',
 'being']

In [39]:
import pandas as pd

In [40]:
pd.DataFrame(features.toarray(), columns = vocab).head()

Unnamed: 0,acquitted,act,against,although,and,as,assent,became,before,being,but,carried,catholic,christi,claims,cognizance,corpus,day,demonstration,dismal,extended,failure,false,forbade,fugitives,habeas,had,he,host,however,invaluable,it,judges,jury,justices,later,laws,liberalism,liberty,of,on,orthodoxy,parliament,personal,privilege,procession,proposed,prorogation,protestantism,public,punished,receive,religious,rising,royal,saw,seized,severely,suspended,take,testimony,the,these,they,thistlewood,through,to,trial,tried,upon,was,watson,were,which
0,1,1,0,1,2,0,0,0,0,1,1,0,0,0,0,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0,0,2,0,1,1,0,0,0,1,1,2,1,2,0
1,0,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,2,0,1,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,1,0,0,0,0,0,3,0,0,0,1,0,0,0,0,0,0,0,1
2,0,1,0,0,3,0,0,0,0,0,0,0,0,0,1,1,1,0,0,0,1,0,1,1,1,1,0,0,0,0,0,0,1,1,1,0,1,0,1,2,0,0,0,1,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,1,2,1,0,0,0,2,1,0,0,0,0,0,0
3,0,0,2,0,1,1,0,1,0,0,0,0,1,1,0,0,1,1,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,1,0,2,1,1,0,0,0,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0,0,0,1,0


## TF-IDF : Term Frequency - Inverse Documnet Frequency
* 카운트 기반 벡터화는 카운트 값이 높을 수록 중요 단어로 인식 -> 모든 문서에서 자주 쓰일 수 밖에 없는 불용어(the,a..) 들이 중요하다 인식 될 수 있음
* 어떤 단어가 개별 문서 내에서 얼마나 중요한 것인지를 나타내 주는 통계적 수치로 '특정 단어가 해당 문서에는 많이 출현하지만. 다른 문서에서는 나오지 않는다면 중요한 단어일 것이다'라는 가정.
* 이 값이 크면 중요한 단어이므로 모든 문서에서 흔하게 나타나는 불용어들을 걸러낼 수 있음



### 직접 구현해보기
https://wikidocs.net/31698

In [44]:
# idf 계산을 위한 log
from math import log

In [49]:
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 딸기',
  '빨갛고 맛있는 과일 사과 사과',
  '저는 과일이 좋아요'
] 

vocab = list(set(w for doc in docs for w in doc.split()))
vocab.sort()
vocab

['과일', '과일이', '딸기', '맛있는', '먹고', '빨갛고', '사과', '싶은', '저는', '좋아요']

In [50]:
# 총 문서의 수
N = len(docs)

# 한 문서 내에서 단어의 갯수 세기
def tf(t,d) : 
  return d.count(t)

def idf(t):
  df = 0
  #t가 나타난 문서수
  for doc in docs:
    df += t in doc
  #t가 나타난 문서수/전체 문서수 의 역수의 로그
  return log(N/(df+1))

def tfidf(t,d) : 
  return tf(t,d) * idf(t)


In [51]:
result = []

for i in range(N) : 
    result.append([])
    d = docs[i]
    for j in range(len(vocab)) : 
      t = vocab[j]
      result[-1].append(tf(t,d))

tf_ = pd.DataFrame(result, columns = vocab)
tf_


Unnamed: 0,과일,과일이,딸기,맛있는,먹고,빨갛고,사과,싶은,저는,좋아요
0,0,0,0,0,1,0,1,1,0,0
1,0,0,1,0,1,0,0,1,0,0
2,1,0,0,1,0,1,2,0,0,0
3,1,1,0,0,0,0,0,0,1,1


In [52]:
result = []

for j in range(len(vocab)) : 
  t = vocab[j]
  result.append(idf(t))

idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])
idf_

Unnamed: 0,IDF
과일,0.287682
과일이,0.693147
딸기,0.693147
맛있는,0.693147
먹고,0.287682
빨갛고,0.693147
사과,0.287682
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [53]:
result = []

for i in range(N):
  result.append([])
  d = docs[i]
  for j in range(len(vocab)) : 
    t = vocab[j]
    result[-1].append(tfidf(t,d))

tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,과일,과일이,딸기,맛있는,먹고,빨갛고,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.0,0.287682,0.0,0.287682,0.287682,0.0,0.0
1,0.0,0.0,0.693147,0.0,0.287682,0.0,0.0,0.287682,0.0,0.0
2,0.287682,0.0,0.0,0.693147,0.0,0.693147,0.575364,0.0,0.0,0.0
3,0.287682,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [41]:
#단순히 빈도수만 표현해주는 CountVectorizer와 달리 다른 문장에서의 단어 빈도를 고려해 중요도 표현
#Countvectorizer보다 많이 쓰임
from sklearn.feature_extraction.text import TfidfVectorizer

In [55]:
tfidfv_ = TfidfVectorizer().fit(docs)
print(tfidfv_.transform(docs).toarray())
print(tfidfv_.vocabulary_)

[[0.         0.         0.         0.         0.57735027 0.
  0.57735027 0.57735027 0.         0.        ]
 [0.         0.         0.66767854 0.         0.52640543 0.
  0.         0.52640543 0.         0.        ]
 [0.42693074 0.         0.         0.42693074 0.         0.42693074
  0.6731942  0.         0.         0.        ]
 [0.         0.57735027 0.         0.         0.         0.
  0.         0.         0.57735027 0.57735027]]
{'먹고': 4, '싶은': 7, '사과': 6, '딸기': 2, '빨갛고': 5, '맛있는': 3, '과일': 0, '저는': 8, '과일이': 1, '좋아요': 9}


In [42]:
tfidfv = TfidfVectorizer().fit(corpus)
print(tfidfv.transform(corpus).toarray())
print(tfidfv.vocabulary_)

[[0.19884046 0.12691729 0.         0.19884046 0.25383458 0.
  0.         0.         0.         0.19884046 0.19884046 0.
  0.         0.         0.         0.         0.1037632  0.
  0.         0.19884046 0.         0.19884046 0.         0.
  0.         0.12691729 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.19884046 0.
  0.         0.         0.         0.         0.         0.19884046
  0.         0.         0.19884046 0.         0.19884046 0.
  0.         0.2075264  0.         0.19884046 0.19884046 0.
  0.         0.         0.19884046 0.19884046 0.39768092 0.19884046
  0.31353621 0.        ]
 [0.         0.13946702 0.         0.         0.         0.
  0.21850204 0.         0.21850204 0.         0.         0.21850204
  0.         0.         0.         0.         0.11402343 0.
  0.         0.         0.         0.         0.   