- Tokenizer
- TfidfVectorizer와 CountVectorizer의 차이?

In [1]:
!pip install Konlpy > /dev/null

In [2]:
import numpy as np
import pandas as pd
import warnings
# Suppress every Python warning raised from this point on in the session.
warnings.filterwarnings('ignore')

In [4]:
# Load the tab-separated training and test splits into DataFrames.
train_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in ('naver_movie_train.tsv', 'naver_movie_test.tsv')
)

In [5]:
# Show (rows, columns) for the training and test frames, in that order.
tuple(frame.shape for frame in (train_df, test_df))

((145393, 3), (48852, 3))

### 토크나이저 함수 정의

In [6]:
from konlpy.tag import Okt
# Create a single Okt instance that the tokenizer function reuses on every call.
okt = Okt()

In [7]:
# Tokens that the tokenizer removes from every review after morphological analysis.
stopwords = [
    '이', '가', '의', '은', '들', '는', '좀',
    '잘', '걍', '과', '도', '를', '으로', '자',
    '에', '와', '한', '하다', '을', 'ㅋㅋ', 'ㅠㅠ',
]

In [17]:
# Tokenizer: takes a str and turns it into a list of tokens.
def otk_tokenizer(text):
    """Split ``text`` into morphemes and drop the stopwords.

    Args:
        text: The raw string to tokenize.

    Returns:
        A list with the morphemes returned by ``okt.morphs(text, stem=True)``,
        in their original order, excluding every entry found in ``stopwords``.
    """
    kept = []
    for morph in okt.morphs(text, stem=True):
        if morph in stopwords:
            continue  # filtered out: listed as a stopword
        kept.append(morph)
    return kept

### Pipeline으로 Feature 변환과 분류 동시 진행

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

In [18]:
# Two-step pipeline: TF-IDF features built with the custom tokenizer,
# followed by a logistic-regression classifier with a fixed seed.
pipeline = Pipeline(
    steps=[
        ('TFIDF', TfidfVectorizer(tokenizer=otk_tokenizer)),
        ('LR', LogisticRegression(random_state=2022)),
    ]
)

In [19]:
%time pipeline.fit(train_df.document, train_df.label)

CPU times: user 5min 48s, sys: 5.16 s, total: 5min 53s
Wall time: 5min 52s


Pipeline(steps=[('TFIDF',
                 TfidfVectorizer(tokenizer=<function otk_tokenizer at 0x7f59d2caf680>)),
                ('LR', LogisticRegression(random_state=2022))])

In [20]:
# Score the fitted pipeline on the held-out test split.
pipeline.score(test_df['document'], test_df['label'])

0.84283140915418

- map(function, iterable)
    - function : 적용할 함수
    - iterable : 반복 가능한 자료형(리스트, 튜플 등)
    - map의 반환값은 한 번만 순회할 수 있는 map 객체(이터레이터)이므로, 여러 번 사용하려면 list()로 감싸 리스트로 변환해야 함
- 참고 : https://blockdmask.tistory.com/531

In [24]:
import re

In [25]:
# Sample review text to classify.
reviews = ['방금 보고나왔는데 요 최근 한국영화중 최고.. 배우들 연기는 물론이고 스토리도 최고였음 얼른 이 문제 해결되길 바람... 대한민국 국민이면 이 영화 한번 쯤 보는게 좋을 듯']
# Replace every character outside the Hangul syllable range with a space.
# map() returns a one-shot iterator, so materialize it with list(); otherwise
# `reviews` is empty after its first use and later cells cannot be re-run.
reviews = list(map(lambda x : re.sub('[^가-힣]', ' ', x), reviews))

In [26]:
# Run the fitted pipeline on `reviews`; returns an array with one predicted label per review.
pipeline.predict(reviews)

array([1])

### 최적 파라미터 찾기

In [30]:
from sklearn.model_selection import GridSearchCV
# Search grid for the pipeline; each key is `<step name>__<parameter name>`.
params = {
    # n-gram ranges tried by the TF-IDF step
    'TFIDF__ngram_range': [(1, 1), (1, 2)],
    # max_df values tried by the TF-IDF step
    'TFIDF__max_df': [0.95, 0.98],
    # C values tried by the logistic-regression step
    'LR__C': [1, 5],
}

In [None]:
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv = 3)
%time grid_pipe.fit(train_df.document, train_df.label)

- 최적 파라미터 결과 적용

In [34]:
# Rebuild the pipeline with the chosen TF-IDF settings (max_df=0.95, uni- and bi-grams).
pipeline = Pipeline(
    steps=[
        (
            'TFIDF',
            TfidfVectorizer(
                tokenizer=otk_tokenizer,
                max_df=0.95,
                ngram_range=(1, 2),
            ),
        ),
        ('LR', LogisticRegression(random_state=2022)),
    ]
)

In [35]:
%time pipeline.fit(train_df.document, train_df.label)

CPU times: user 6min 8s, sys: 13.6 s, total: 6min 21s
Wall time: 6min 3s


Pipeline(steps=[('TFIDF',
                 TfidfVectorizer(max_df=0.95, ngram_range=(1, 2),
                                 tokenizer=<function otk_tokenizer at 0x7f59d2caf680>)),
                ('LR', LogisticRegression(random_state=2022))])

In [36]:
# Score the reconfigured pipeline on the held-out test split.
pipeline.score(test_df['document'], test_df['label'])

0.8582453123720626