In [24]:
import pandas as pd
import numpy as np
import re
import urllib.request
from konlpy.tag import Okt
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


In [3]:
test_df = pd.read_table('../static/data/naver/ratings_test.txt')
train_df = pd.read_table('../static/data/naver/ratings_train.txt')
train_df.head()

Unnamed: 0,id,document,label
0,9976970,아 더빙.. 진짜 짜증나네요 목소리,0
1,3819312,흠...포스터보고 초딩영화줄....오버연기조차 가볍지 않구나,1
2,10265843,너무재밓었다그래서보는것을추천한다,0
3,9045019,교도소 이야기구먼 ..솔직히 재미는 없다..평점 조정,0
4,6483659,사이몬페그의 익살스런 연기가 돋보였던 영화!스파이더맨에서 늙어보이기만 했던 커스틴 ...,1


### 데이터 전처리

In [5]:
# 중복 샘플 제거
train_df.drop_duplicates(subset=['document'], inplace=True) 
# Null 값이 존재하는 행 제거
train_df = train_df.dropna(how = 'any') 
train_df.isnull().sum()

id          0
document    0
label       0
dtype: int64

In [6]:
# Null 값이 존재하는 행 제거
test_df = test_df.dropna(how = 'any') 
test_df.drop_duplicates(subset=['document'], inplace=True) 
test_df.isnull().sum()

id          0
document    0
label       0
dtype: int64

### 텍스트 전처리

In [9]:
# 한글과 공백을 제외하고 모두 제거
train_df['document'] = train_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
# 공백 제거
train_df['document'].replace('', np.nan, inplace=True)
# NA 제거
train_df = train_df.dropna(how = 'any')

In [10]:
test_df['document'] = test_df['document'].str.replace("[^ㄱ-ㅎㅏ-ㅣ가-힣 ]","")
test_df['document'].replace('', np.nan, inplace=True)
test_df = test_df.dropna(how = 'any')

In [11]:
train_df.to_csv('../static/data/naver/train_df.tsv',sep='\t')
test_df.to_csv('../static/data/naver/test_df.tsv',sep='\t')

In [12]:
stopwords = ['의','가','이','은','들','는','좀','잘','걍','과','도','를','으로','자','에','와','한','하다','을']
okt = Okt()
okt.morphs('와 이런 것도 영화라고 차라리 뮤직비디오를 만드는 게 나을 뻔', stem = True)

['오다', '이렇다', '것', '도', '영화', '라고', '차라리', '뮤직비디오', '를', '만들다', '게', '나다', '뻔']

In [13]:
from tqdm.notebook import tqdm
X_train = []
for sentence in tqdm(train_df['document']):
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # 토큰화 stem => 어근으로 추정
    temp_X = ' '.join([word for word in temp_X if not word in stopwords]) # 불용어 제거
    X_train.append(temp_X)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=145791.0), HTML(value='')))




In [14]:
X_test = []
for sentence in tqdm(test_df['document']):
    temp_X = []
    temp_X = okt.morphs(sentence, stem=True) # 토큰화 stem => 어근으로 추정
    temp_X = ' '.join([word for word in temp_X if not word in stopwords]) # 불용어 제거
    X_test.append(temp_X)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=48995.0), HTML(value='')))




In [15]:
y_train = train_df.label.values
y_test = test_df.label.values

In [17]:
cvector = CountVectorizer()
tfidf_vect = TfidfVectorizer()
nb = MultinomialNB()
lr_clf = LogisticRegression()


CountVectorizer + LogisticRegression

In [33]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer()),
    ('lr_clf', LogisticRegression())
])
params = {
    'count_vect__ngram_range':[(1,1),(1,2)],
    'count_vect__max_df':[0.8,0.9],
    'count_vect__min_df':[1,2],
}
grid_pipe = GridSearchCV(pipeline, param_grid=params,cv=3,
                            scoring='accuracy',verbose=1,n_jobs=-1)
grid_pipe.fit(X_train,y_train)
pred = grid_pipe.predict(X_test)
acc = accuracy_score(y_test,pred)
print(acc)
best_count_lr = grid_pipe.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  2.1min finished
0.8466374119808143


CountVectorizer + naive bayes

In [35]:
pipeline = Pipeline([
    ('count_vect', CountVectorizer()),
    ('nb', MultinomialNB())
])
params = {
    'count_vect__ngram_range':[(1,1),(1,2)],
    'count_vect__max_df':[0.8,0.9],
    'count_vect__min_df':[1,2],
}
grid_pipe = GridSearchCV(pipeline, param_grid=params,cv=3,
                            scoring='accuracy',verbose=1,n_jobs=-1)
grid_pipe.fit(X_train,y_train)
pred = grid_pipe.predict(X_test)
acc = accuracy_score(y_test,pred)
print(acc)
best_count_nb = grid_pipe.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   33.1s finished
0.8447392591080722


TfidfVectorizer + LogisticRegression

In [36]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer()),
    ('lr_clf', LogisticRegression())
])
params = {
    'tfidf_vect__ngram_range':[(1,1),(1,2)],
    'tfidf_vect__max_df':[0.8,0.9],
    'tfidf_vect__min_df':[1,2],
}
grid_pipe = GridSearchCV(pipeline, param_grid=params,cv=3,
                            scoring='accuracy',verbose=1,n_jobs=-1)
grid_pipe.fit(X_train,y_train)
pred = grid_pipe.predict(X_test)
acc = accuracy_score(y_test,pred)
print(acc)
best_tfidf_lr = grid_pipe.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  1.9min finished
0.8448209000918461


TfidfVectorizer + naive bayes

In [38]:
pipeline = Pipeline([
    ('tfidf_vect', TfidfVectorizer()),
    ('nb', MultinomialNB())
])
params = {
    'tfidf_vect__ngram_range':[(1,1),(1,2)],
    'tfidf_vect__max_df':[0.8,0.9],
    'tfidf_vect__min_df':[1,2],
}
grid_pipe = GridSearchCV(pipeline, param_grid=params,cv=3,
                            scoring='accuracy',verbose=1,n_jobs=-1)
grid_pipe.fit(X_train,y_train)
pred = grid_pipe.predict(X_test)
acc = accuracy_score(y_test,pred)
print(acc)
best_tfidf_nb = grid_pipe.best_estimator_

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:   33.6s finished
0.8459434636187366


In [39]:
import joblib
joblib.dump(best_count_lr, '../static/model/naver_count_lr.pkl')
joblib.dump(best_count_nb, '../static/model/naver_count_nb.pkl')
joblib.dump(best_tfidf_lr, '../static/model/naver_tfidf_lr.pkl')
joblib.dump(best_tfidf_nb, '../static/model/naver_tfidf_nb.pkl')

['../static/model/naver_tfidf_nb.pkl']

In [57]:
naver_count_lr = joblib.load('../static/model/naver_count_lr.pkl')

In [59]:
naver_count_lr.predict(X_test)[0]

1

In [71]:
test_data = []
for sentence in test_df['document'][0]:
            temp_X = []
            temp_X = okt.morphs(sentence, stem=True)
            temp_X = ' '.join([word for word in temp_X if not word in stopwords]) 
            test_data.append(temp_X)
test_data

['굳다', '', 'ㅋ']

In [70]:
naver_count_lr.predict(test_data)[0]

1