In [1]:
import pandas as pd

In [2]:
# Read the tab-separated training file. keep_default_na=False stops pandas
# from turning strings such as "NA" or empty fields into NaN, so every
# document stays a plain string.
df_train = pd.read_csv(
    'data/ratings_train.txt',
    sep='\t',
    keep_default_na=False,
)
df_train.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


In [3]:
# Pull the document text and its label out of the frame as separate arrays.
text_train = df_train['document'].values
y_train = df_train['label'].values

In [4]:
# Load the held-out test file with the same parsing options as the training
# file (tab-separated, no automatic NaN conversion).
df_test = pd.read_csv(
    'data/ratings_test.txt',
    sep='\t',
    keep_default_na=False,
)

# Document text and label arrays for the test split.
text_test = df_test['document'].values
y_test = df_test['label'].values

In [6]:
import numpy as np
# Number of training documents and how many carry each integer label
# (np.bincount counts the occurrences of every non-negative integer value).
len(text_train), np.bincount(y_train)

(150000, array([75173, 74827]))

In [7]:
# Same size / per-label count check for the test split.
len(text_test), np.bincount(y_test)

(50000, array([24827, 25173]))

In [8]:
from konlpy.tag import Okt
# Build the KoNLPy Okt tagger once at module level so that okt_tokenizer can
# reuse the same instance on every call.
okt_tag = Okt()

In [9]:
def okt_tokenizer(text):
    """Tokenize ``text`` into morphemes with the shared Okt tagger.

    Intended to be passed as the ``tokenizer`` callable of a
    ``TfidfVectorizer``.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    The morpheme sequence produced by ``okt_tag.morphs(text)``.
    """
    morphemes = okt_tag.morphs(text)
    return morphemes

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

# Search space for the Okt pipeline. The key prefixes are the step names that
# make_pipeline assigns automatically (lower-cased estimator class names).
okt_param_grid = {
    'tfidfvectorizer__min_df': [3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [.1, 1, 10],
}

# TF-IDF over Okt morphemes feeding a liblinear logistic regression.
okt_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=okt_tokenizer),
    LogisticRegression(solver='liblinear'),
)

# 3-fold cross-validated grid search.
# NOTE(review): only the first 1000 training rows are used here, whereas the
# Mecab search later in this notebook fits on the full training set, so the
# two reported scores are not directly comparable. Presumably this is to keep
# Okt's tokenization time manageable -- confirm before drawing conclusions.
okt_grid = GridSearchCV(okt_pipe, okt_param_grid, cv=3)
okt_grid.fit(text_train[:1000], y_train[:1000])

# Best mean cross-validation score, then the parameters that achieved it.
print(okt_grid.best_score_, okt_grid.best_params_, sep='\n')

0.7039914165662667
{'logisticregression__C': 1, 'tfidfvectorizer__min_df': 3, 'tfidfvectorizer__ngram_range': (1, 1)}


In [17]:
# Evaluate the refit best Okt pipeline on the test split, one step at a time:
# first vectorize the test documents with the fitted TF-IDF step, then score
# the fitted classifier on the resulting matrix.
okt_best_pipe = okt_grid.best_estimator_
X_test_okt = okt_best_pipe.named_steps['tfidfvectorizer'].transform(text_test)
score = okt_best_pipe.named_steps['logisticregression'].score(X_test_okt, y_test)

print(score)

0.70516


In [13]:
from konlpy.tag import Mecab
# One shared Mecab tagger instance, reused by mecab_tokenizer on every call.
mecab = Mecab()


def mecab_tokenizer(text):
    """Tokenize ``text`` into morphemes with the shared Mecab tagger.

    Intended to be passed as the ``tokenizer`` callable of a
    ``TfidfVectorizer``.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    The morpheme sequence produced by ``mecab.morphs(text)``.
    """
    morphemes = mecab.morphs(text)
    return morphemes

In [14]:
# Search space for the Mecab pipeline. The key prefixes are the step names
# that make_pipeline assigns automatically (lower-cased estimator class names).
mecab_param_grid = {
    'tfidfvectorizer__min_df': [3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [.1, 1, 10],
}

# TF-IDF over Mecab morphemes feeding a liblinear logistic regression.
mecab_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=mecab_tokenizer),
    LogisticRegression(solver='liblinear'),
)

# 3-fold cross-validated grid search, run in a single process (n_jobs=1),
# over 27 parameter combinations on the complete training arrays.
mecab_grid = GridSearchCV(mecab_pipe, mecab_param_grid, n_jobs=1, cv=3)
mecab_grid.fit(text_train, y_train)

# Best mean cross-validation score, then the parameters that achieved it.
print(mecab_grid.best_score_, mecab_grid.best_params_, sep='\n')

0.8699133333333334
{'logisticregression__C': 10, 'tfidfvectorizer__min_df': 3, 'tfidfvectorizer__ngram_range': (1, 3)}


In [18]:
# Evaluate the refit best Mecab pipeline on the test split, one step at a
# time: vectorize the test documents with the fitted TF-IDF step, then score
# the fitted classifier on the resulting matrix.
mecab_best_pipe = mecab_grid.best_estimator_
X_test_mecab = mecab_best_pipe.named_steps['tfidfvectorizer'].transform(text_test)
score = mecab_best_pipe.named_steps['logisticregression'].score(X_test_mecab, y_test)

print(score)

0.87538
