# 네이버 영화 감성 분석 - TfidfVectorizer

In [17]:
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm
import re

In [2]:
# Read the tab-separated train and test review files into DataFrames.
train_df = pd.read_csv('./data/movie_train.tsv', delimiter='\t')
test_df = pd.read_csv('./data/movie_test.tsv', delimiter='\t')

In [3]:
from konlpy.tag import Okt

# Single shared Okt analyzer instance; tw_tokenizer reads it as a global.
okt = Okt()


def tw_tokenizer(text):
    """Split ``text`` into morphemes using the shared ``okt`` analyzer.

    Args:
        text: The document string to tokenize.

    Returns:
        The result of ``okt.morphs(text)``.
    """
    return okt.morphs(text)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# TF-IDF vectorizer over unigrams and bigrams of the tokens from tw_tokenizer.
tvector = TfidfVectorizer(
    tokenizer=tw_tokenizer,
    ngram_range=(1, 2),
    min_df=3,    # int: ignore terms that appear in fewer than 3 documents
    max_df=0.9,  # float: ignore terms that appear in more than 90% of documents
)

In [5]:
%time
tvector.fit(train_df.document)

Wall time: 0 ns


TfidfVectorizer(max_df=0.9, min_df=3, ngram_range=(1, 2),
                tokenizer=<function tw_tokenizer at 0x000002161FA47DC0>)

In [6]:
%time
X_train_tvect = tvector.transform(train_df['document'])

Wall time: 0 ns


In [7]:
%time
X_test_tvect = tvector.transform(test_df['document'])

Wall time: 0 ns


In [10]:
# Extract the `label` column of each split as an array of targets.
y_train = train_df['label'].values
y_test = test_df['label'].values

## LogisticRegression으로 학습/예측/평가

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [12]:
# Fit logistic regression (C=3.5) on the TF-IDF training matrix, predict the
# test split, and display the resulting accuracy as the cell output.
lr_clf = LogisticRegression(C=3.5)
lr_clf.fit(X_train_tvect, y_train)
pred = lr_clf.predict(X_test_tvect)
accuracy_score(y_true=y_test, y_pred=pred)

0.8590060210225533

In [13]:
# Sample review strings; the following cells clean them and feed them to the
# fitted vectorizer and classifier.
review1 = "진짜 개노잼이다... 1편이랑 같은 감독맞나?러닝타임도 길어서 개지루함 ㄹㅇ"
review2 = "이런 사랑영화가 다시 나올 수 있을까?"

In [32]:
# Remove every character that is not Hangul (jamo or syllable) or a space,
# then vectorize the cleaned review and display its predicted label.
review1 = re.sub(pattern='[^ㄱ-ㅎㅏ-ㅣ가-힣 ]', repl='', string=review1)
review_tvect = tvector.transform([review1])
pred = lr_clf.predict(review_tvect)
pred[0]

0

In [33]:
# Keep only Hangul (jamo or syllable) characters and spaces in the review,
# then vectorize the cleaned text and display its predicted label.
review2 = re.compile('[^ㄱ-ㅎㅏ-ㅣ가-힣 ]').sub('', review2)
review_tvect = tvector.transform([review2])
pred = lr_clf.predict(review_tvect)
pred[0]

1

## GridSearchCV로 최적 파라미터 찾기

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [41]:
# Search over the regularization strength C of lr_clf using 3-fold
# cross-validation on the precomputed TF-IDF matrix, scored by accuracy.
# NOTE(review): the saved output under this cell (18 candidates,
# "AttributeError: lower not found") does not match this 5-value grid and
# presumably comes from an earlier pipeline experiment; re-run to refresh it.
params = {
    'C': [1, 3, 3.5, 5, 10]
}
grid_cv = GridSearchCV(lr_clf, param_grid=params, cv=3, verbose=1, scoring='accuracy')
grid_cv.fit(X_train_tvect, y_train)
print(grid_cv.best_params_, grid_cv.best_score_)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  54 out of  54 | elapsed:    0.9s finished


AttributeError: lower not found