# 네이버 영화평 감성 분석 - TfidfVectorizer

In [1]:
import numpy as np 
import pandas as pd 

In [2]:
# Read the train and test splits from their tab-separated files.
train_df = pd.read_csv(
    '../00_data/NaverMovie/train.tsv',
    sep='\t',
)
test_df = pd.read_csv(
    '../00_data/NaverMovie/test.tsv',
    sep='\t',
)

### Tokenizer 함수 정의

In [3]:
from konlpy.tag import Okt

# Single Okt instance, created once and shared by every tokenizer call.
okt = Okt()


def tw_tokenizer(text):
    """Tokenize ``text`` with the shared ``okt`` instance.

    Returns whatever ``okt.morphs(text)`` produces (presumably a list of
    morpheme strings -- confirm against the KoNLPy documentation).
    """
    return okt.morphs(text)

### TfidfVectorizer로 학습/변환

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF settings: tokens come from tw_tokenizer, unigrams and bigrams are
# both used as features, terms seen in fewer than 3 documents or in more than
# 90% of documents are dropped.
tvector = TfidfVectorizer(
    tokenizer=tw_tokenizer,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
)

In [5]:
%time tvector.fit(train_df.document)

Wall time: 4min 26s


TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<function tw_tokenizer at 0x00000198A657F1F0>)

In [6]:
%time X_train_tvect = tvector.transform(train_df['document'])

Wall time: 5min


In [7]:
%time X_test_tvect = tvector.transform(test_df['document'])

Wall time: 1min 41s


In [8]:
# Extract the `label` column of each split as the target array.
y_train = train_df['label'].values
y_test = test_df['label'].values

### LogisticRegression으로 학습/예측/평가

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [10]:
# Train logistic regression (C=3.5) on the TF-IDF training features;
# fit() returns the estimator itself, so it can be bound in one statement.
lr_clf = LogisticRegression(C=3.5).fit(X_train_tvect, y_train)

# Predict the test set and show its accuracy as the cell output.
pred = lr_clf.predict(X_test_tvect)
accuracy_score(y_true=y_test, y_pred=pred)

0.8584753546280233

### 실제 테스트

In [11]:
# Two hand-written sample reviews used to try out the trained model.
review1, review2 = (
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
)

In [12]:
# Korean stopword list.
# NOTE(review): no visible cell references this list -- confirm whether it is
# still needed or was meant to be applied inside the tokenizer.
stopwords = [
    '의', '가', '이', '은', '들', '는',
    '좀', '잘', '걍', '과', '도', '를',
    '으로', '자', '에', '와', '한', '하다',
]

In [13]:
import re

# Drop every character that is not a Hangul jamo, a Hangul syllable, or a space.
review1 = re.sub(pattern="[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", repl="", string=review1)

# Vectorize the single cleaned review and show its predicted label.
review_tvect = tvector.transform([review1])
pred = lr_clf.predict(review_tvect)
pred[0]

0

In [14]:
# Drop every character that is not a Hangul jamo, a Hangul syllable, or a space.
review2 = re.sub(pattern="[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", repl="", string=review2)

# Vectorize the single cleaned review and show its predicted label.
review_tvect = tvector.transform([review2])
pred = lr_clf.predict(review_tvect)
pred[0]

1

In [15]:
# The same two sample reviews, collected in a list for batch prediction.
reviews = [
    '진짜 개노잼이다.. 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ',
    '이런 사랑영화가 다시 나올 수 있을까?',
]

In [16]:
# Keep only Hangul jamo, Hangul syllables and spaces in each review, then print the list.
reviews = [re.sub("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]", "", review) for review in reviews]
print(reviews)

['진짜 개노잼이다 편이랑 같은 감독맞나러닝타임도 길어서 개지루함 ㄹㅇ', '이런 사랑영화가 다시 나올 수 있을까']


In [17]:
# Vectorize the cleaned reviews together and predict a label for each one.
review_tvect = tvector.transform(reviews)
pred = lr_clf.predict(review_tvect)

# Show the labels of the first and second review as a pair.
(pred[0], pred[1])

(0, 1)

### 최적 파라미터 도출하기

- Tfidf Vectorizer + Logistic Regression

In [21]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [32]:
# Grid-search C and max_iter for logistic regression with 3-fold
# cross-validation on the already vectorized training matrix, scored by accuracy.
#
# The unfitted base estimator gets its own name instead of rebinding `lr_clf`:
# the earlier prediction cells call `lr_clf.predict(...)`, and overwriting that
# name with an unfitted model would make them fail if re-run after this cell.
# GridSearchCV clones the estimator it is given, so `base_lr_clf` itself is
# never fitted.
base_lr_clf = LogisticRegression()

params = {
    'C': [2, 3, 4],
    'max_iter': [100, 200, 300]
}
grid_cv = GridSearchCV(base_lr_clf, param_grid=params, cv=3, scoring='accuracy', verbose=1)
grid_cv.fit(X_train_tvect, y_train)

# Best parameter combination and its mean cross-validated accuracy.
print(grid_cv.best_params_, grid_cv.best_score_)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  27 out of  27 | elapsed:  2.6min finished
{'C': 4, 'max_iter': 100} 0.8551419497774212


In [33]:
# Predict the test set through the grid-search object and report the accuracy
# to four decimal places.
pred = grid_cv.predict(X_test_tvect)
acc = accuracy_score(y_true=y_test, y_pred=pred)
print(f'Tfidf Vectorizer + Logistic Regression 정확도: {acc:.4f}')

Tfidf Vectorizer + Logistic Regression 정확도: 0.8593
