---
### 4. 단어 가방 모형과 TF-IDF
---

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this blanket filter also hides library warnings about ignored
# or deprecated parameters, so misconfigured calls can go unnoticed.
warnings.filterwarnings("ignore")

In [2]:
## Korean text setup for matplotlib
#!pip install koreanize_matplotlib
# Imported only for its import-time side effect (the name is not referenced
# again in the visible cells); presumably it registers a Korean-capable font
# so Hangul labels render in plots -- TODO confirm against the package docs.
import koreanize_matplotlib




[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


---
#### 1. CountVectorizer()
- analyzer : 단어, 문자 단위 벡터화 방법 
- ngram_range : 토큰을 묶는 범위 ex) 1~3 --> "한", "한글화" 
- max_df : 너무 자주 등장하는 단어(말뭉치 특화 불용어) 제외 기준(default = 1.0) ex) max_df = 10 ~ 10개를 초과하는 문서에 등장하는 단어는 제외 / max_df = 0.9 ~ 전체 문서의 90%를 초과하는 문서에 등장하는 단어는 제외
- min_df : 컷오프 기준(default = 1) ex) min_df = 10 ~ 10개 이상의 문서에 등장하는 단어만 사용 / min_df = 0.01 ~ 전체 문서의 1% 이상에 등장하는 단어만 사용
---

In [3]:
# Toy corpus: four short Korean inquiry sentences, one document per entry.
corpus = [
    "코로나 거리두기와 코로나 상생지원금 문의입니다.",
    "지하철 운행시간과 지하철 요금 문의입니다.",
    "지하철 승강장 문의입니다.",
    "택시 승강장 문의입니다.",
]
corpus

['코로나 거리두기와 코로나 상생지원금 문의입니다.',
 '지하철 운행시간과 지하철 요금 문의입니다.',
 '지하철 승강장 문의입니다.',
 '택시 승강장 문의입니다.']

In [4]:
## Build the bag-of-words (count-encoded) document-term matrix
from sklearn.feature_extraction.text import CountVectorizer

cvect = CountVectorizer()        # default settings: word-level tokens
cvect.fit(corpus)                # learn the vocabulary from the corpus
dtm = cvect.transform(corpus)    # encode each document as per-term counts
dtm

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [5]:
## get_feature_names_out() returns the learned vocabulary; use it as column labels
feature_names = cvect.get_feature_names_out()
df_dtm = pd.DataFrame(data=dtm.toarray(), columns=feature_names)
print(df_dtm.shape)
df_dtm

(4, 9)


Unnamed: 0,거리두기와,문의입니다,상생지원금,승강장,요금,운행시간과,지하철,코로나,택시
0,1,1,1,0,0,0,0,2,0
1,0,1,0,0,1,1,2,0,0
2,0,1,0,1,0,0,1,0,0
3,0,1,0,1,0,0,0,0,1


In [6]:
## Total count of each feature across all documents, shown as a one-column frame
df_dtm.sum(axis=0).to_frame()

Unnamed: 0,0
거리두기와,1
문의입니다,4
상생지원금,1
승강장,2
요금,1
운행시간과,1
지하철,3
코로나,2
택시,1


In [7]:
# Same per-feature totals, transposed into a single wide row for easier reading
df_dtm.sum(axis=0).to_frame().transpose()

Unnamed: 0,거리두기와,문의입니다,상생지원금,승강장,요금,운행시간과,지하철,코로나,택시
0,1,4,1,2,1,1,3,2,1


In [8]:
## Character-level bag-of-words with explicit parameters
# NOTE: stop_words is deliberately not passed here. CountVectorizer applies
# stop words only when analyzer='word'; with analyzer='char' the list is
# ignored (scikit-learn only emits a warning, which a blanket warnings
# filter hides), so passing it would wrongly suggest filtering took place.
cvect = CountVectorizer(
    analyzer='char',       # build n-grams from characters; the default is 'word'
    ngram_range=(1, 5),    # character n-grams of length 1 through 5
    min_df=2,              # keep only n-grams that occur in at least 2 documents
    max_df=1.0,            # no upper cut-off on document frequency (the default)
    max_features=30,       # keep the 30 most frequent n-grams across the corpus
)

dtm = cvect.fit_transform(corpus)
vocab = cvect.get_feature_names_out()
df_dtm = pd.DataFrame(dtm.toarray(), columns=vocab)
df_dtm

Unnamed: 0,Unnamed: 1,문,문의,문의입,문의입니,.,니,니다,니다.,다,...,의입니다.,입,입니,입니다,입니다.,지,철,철.1,하,하철
0,4,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,0,0,0,0
1,4,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,2,2,2,2,2
2,2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
3,2,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,0,0,0,0,0


In [9]:
# Word-level n-grams with custom stop words.
# Stop words are dropped before n-grams are formed, so an n-gram may join
# words that were separated by a removed token.
stop_words = ['코로나', '문의입니다']

cvect = CountVectorizer(
    stop_words=stop_words,
    ngram_range=(1, 4),    # word n-grams made of 1 to 4 consecutive tokens
    min_df=1,              # keep terms that occur in at least 1 document (no lower cut-off)
    max_df=1.0,            # no upper cut-off on document frequency
    max_features=30,       # cap the vocabulary at the 30 most frequent terms
)

dtm = cvect.fit_transform(corpus)
vocab = cvect.get_feature_names_out()
df_dtm = pd.DataFrame(data=dtm.toarray(), columns=vocab)
df_dtm

Unnamed: 0,거리두기와,거리두기와 상생지원금,상생지원금,승강장,요금,운행시간과,운행시간과 지하철,운행시간과 지하철 요금,지하철,지하철 승강장,지하철 요금,지하철 운행시간과,지하철 운행시간과 지하철,지하철 운행시간과 지하철 요금,택시,택시 승강장
0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,1,1,1,1,2,0,1,1,1,1,0,0
2,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1


--- 
#### 2. TfidfVectorizer() 
- norm = 'l2' : 정규화 방법 ~ L1 : 절대값 합 / L2 : 제곱합 기준
- smooth_idf = True : idf 계산 시 문서 빈도(df)에 1을 더해 0으로 나누는 것을 방지함(기본값 True). False이면 보정 없이 계산함
- sublinear_tf = False : outlier 대응위해 로그변환이 필요하다면 True로 지정 
- use_idf = True : idf 가중치를 적용하여 feature 생성. False이면 그냥 단어 빈도만 사용 
---

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Default TF-IDF vectorizer: learn vocabulary and IDF weights, then encode the corpus.
tfidf_vect = TfidfVectorizer()
dtm = tfidf_vect.fit(corpus).transform(corpus)
dtm

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 14 stored elements in Compressed Sparse Row format>

In [13]:
# Label the TF-IDF matrix columns with the learned vocabulary.
vocab = tfidf_vect.get_feature_names_out()
df_dtm = pd.DataFrame(data=dtm.toarray(), columns=vocab)

# Render the table with a colour gradient so larger weights stand out.
df_dtm.style.background_gradient()

Unnamed: 0,거리두기와,문의입니다,상생지원금,승강장,요금,운행시간과,지하철,코로나,택시
0,0.399288,0.208365,0.399288,0.0,0.0,0.0,0.0,0.798575,0.0
1,0.0,0.239219,0.0,0.0,0.458412,0.458412,0.722835,0.0,0.0
2,0.0,0.423897,0.0,0.640434,0.0,0.0,0.640434,0.0,0.0
3,0.0,0.379192,0.0,0.572892,0.0,0.0,0.0,0.0,0.726641


In [15]:
## For reference: use_idf=False skips the IDF weighting, so the values are
## term frequencies only (still normalised per document)
tfidf_vect = TfidfVectorizer(use_idf=False)
dtm = tfidf_vect.fit_transform(corpus)

vocab = tfidf_vect.get_feature_names_out()
df_dtm = pd.DataFrame(data=dtm.toarray(), columns=vocab)
df_dtm

Unnamed: 0,거리두기와,문의입니다,상생지원금,승강장,요금,운행시간과,지하철,코로나,택시
0,0.377964,0.377964,0.377964,0.0,0.0,0.0,0.0,0.755929,0.0
1,0.0,0.377964,0.0,0.0,0.377964,0.377964,0.755929,0.0,0.0
2,0.0,0.57735,0.0,0.57735,0.0,0.0,0.57735,0.0,0.0
3,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.57735
