In [10]:
import sklearn
import pandas as pd
import numpy as np
from konlpy.tag import Twitter
# Build the KoNLPy tagger once at module level; `twitter_tokenizer` below
# reuses this single instance for every document it tokenizes.
# NOTE(review): newer KoNLPy releases rename `Twitter` to `Okt` and warn on
# the old name — confirm against the installed KoNLPy version.
twitter_tag = Twitter()

In [11]:
# Load the tab-separated training reviews.  `keep_default_na=False` stops
# pandas from converting empty fields or strings such as "NA" into NaN, so
# the `document` column stays plain text.
df_train = pd.read_csv(
    'data/ratings_train.txt',
    delimiter='\t',
    keep_default_na=False,
)

In [12]:
# Peek at the first three training rows to check the parsed columns.
df_train.head(3)

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0


In [13]:
# Extract the review text and the labels as NumPy arrays for scikit-learn.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0 (calling it there raises AttributeError).
text_train = df_train['document'].values
y_train = df_train['label'].values

In [14]:
# Load the tab-separated test reviews with the same options as the training
# file: `keep_default_na=False` keeps empty / "NA"-like fields as text
# instead of turning them into NaN.
df_test = pd.read_csv(
    'data/ratings_test.txt',
    delimiter='\t',
    keep_default_na=False,
)
# Show the first three rows as the cell output.
df_test.head(3)

Unnamed: 0,id,document,label
0,6270596,굳 ㅋ,1
1,9274899,GDNTOPCLASSINTHECLUB,0
2,8544678,뭐야 이 평점들은.... 나쁘진 않지만 10점 짜리는 더더욱 아니잖아,0


In [6]:
# Extract the test review text and labels as NumPy arrays.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and
# removed in pandas 1.0 (calling it there raises AttributeError).
text_test = df_test['document'].values
y_test = df_test['label'].values

In [7]:
# Number of training documents and the count of each label value.
(len(text_train), np.bincount(y_train))

(150000, array([75173, 74827]))

In [8]:
# Number of test documents and the count of each label value.
(len(text_test), np.bincount(y_test))

(50000, array([24827, 25173]))

In [9]:
def twitter_tokenizer(text):
    """Tokenize ``text`` with the module-level ``twitter_tag`` tagger.

    Intended to be passed as the ``tokenizer`` callable of a
    ``TfidfVectorizer``.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    The result of ``twitter_tag.morphs(text)`` (the morphemes of ``text``).
    """
    morphemes = twitter_tag.morphs(text)
    return morphemes

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV

In [11]:
# TF-IDF features built from `twitter_tokenizer` tokens, fed into a
# logistic-regression classifier.
twit_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=twitter_tokenizer),
    LogisticRegression(),
)

# Hyper-parameters to search.  Keys use the step names that `make_pipeline`
# derives from the estimator classes; 3 x 3 x 3 = 27 combinations in total.
twit_param_grid = {
    'tfidfvectorizer__min_df': [3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [0.1, 1, 10],
}

# Cross-validated grid search over the pipeline (default CV and scoring).
twit_grid = GridSearchCV(twit_pipe, twit_param_grid)

In [None]:
# Fit the grid search on only the first 1000 training examples.
twit_grid.fit(text_train[:1000], y_train[:1000])

In [1]:
from konlpy.tag import Mecab

In [2]:
# Build the Mecab tagger once at module level; `mecab_tokenizer` reuses this
# single instance for every document it tokenizes.
mecab = Mecab()

In [3]:
def mecab_tokenizer(text):
    """Tokenize ``text`` with the module-level ``mecab`` tagger.

    Intended to be passed as the ``tokenizer`` callable of a
    ``TfidfVectorizer``.

    Parameters
    ----------
    text : str
        A single document.

    Returns
    -------
    The result of ``mecab.morphs(text)`` (the morphemes of ``text``).
    """
    morphemes = mecab.morphs(text)
    return morphemes

In [4]:
# Hyper-parameters to search for the Mecab pipeline.  Keys are
# `<pipeline step name>__<parameter>`; 3 x 3 x 3 = 27 combinations in total.
mecab_param_grid = {
    'tfidfvectorizer__min_df': [3, 5, 7],
    'tfidfvectorizer__ngram_range': [(1, 1), (1, 2), (1, 3)],
    'logisticregression__C': [0.1, 1, 10],
}

In [6]:
# TF-IDF features built from `mecab_tokenizer` tokens, fed into a
# logistic-regression classifier.
mecab_pipe = make_pipeline(
    TfidfVectorizer(tokenizer=mecab_tokenizer),
    LogisticRegression(),
)

In [7]:
# Cross-validated grid search over the Mecab pipeline; `n_jobs=-1` asks
# scikit-learn to use all available CPU cores.
mecab_grid = GridSearchCV(
    estimator=mecab_pipe,
    param_grid=mecab_param_grid,
    n_jobs=-1,
)

In [None]:
# Run the grid search on the full training set.
mecab_grid.fit(X=text_train, y=y_train)