In [189]:
import pickle
import re

import konlpy
import pandas as pd

from konlpy.tag import Hannanum, Kkma, Komoran, Mecab, Twitter
# Instantiate the KoNLPy taggers once so the exploration cells below can reuse them.
# NOTE(review): only `hannanum` and `twitter` are referenced in the cells visible
# here; `kkma` looks unused — confirm before removing.
hannanum = Hannanum()
kkma = Kkma()
twitter = Twitter()

import xgboost
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from scipy.sparse import csr_matrix
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score



---

# Train (역삼동 - 중국집)

In [62]:
# Neighbourhood and restaurant category that select the training pickle.
location, category = '역삼동', '중국집'

In [63]:
# Load the pickled review table for the selected location/category.
# A context manager closes the file handle deterministically (the previous
# bare open() left it to the garbage collector).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles produced by this project.
with open('./data/{}_{}_df.pkl'.format(location, category), 'rb') as train_file:
    train_df = pickle.load(train_file)
# Shift every index label up by one; the cells below address the first review
# as label 1 (presumably the stored index starts at 0 — confirm).
train_df.index += 1

## hannanum

## twitter

In [64]:
# Nouns Hannanum extracts from the training review stored under index label 1.
hannanum.nouns(train_df['Review'].loc[1])

['잘먹었습니닿ㅎㅎㅎㅎ', 'ㅎ']

In [65]:
# Nouns the Twitter tagger extracts from the training review stored under index label 1.
twitter.nouns(train_df['Review'].loc[1])

[]

In [66]:
# Hannanum morpheme segmentation of the training review stored under index label 1.
hannanum.morphs(train_df['Review'].loc[1])

['잘먹었습니닿ㅎㅎㅎㅎ', 'ㅎ']

In [67]:
# Twitter-tagger morpheme segmentation of the training review stored under index label 1.
twitter.morphs(train_df['Review'].loc[1])

['잘', '먹었', '습니다', 'ㅎㅎㅎㅎㅎㅎ']

In [68]:
# Label distribution of the training set before any classes are merged.
train_df.loc[:, 'Total'].value_counts()

5    6268
4    2421
3    1100
1     823
2     381
Name: Total, dtype: int64

In [69]:
# Fold label '2' into label '1' (labels are compared as strings).
train_df.loc[train_df['Total'].eq('2'), 'Total'] = '1'

In [70]:
# Fold label '4' into label '5' (labels are compared as strings).
train_df.loc[train_df['Total'].eq('4'), 'Total'] = '5'

In [71]:
# Verify the merge: only labels 1, 3 and 5 should remain.
# (The statement was duplicated; only the last expression of a cell is shown,
# so the first evaluation was wasted work.)
train_df['Total'].value_counts()

5    8689
1    1204
3    1100
Name: Total, dtype: int64

---

# Test (방이2동 - 중국집)

In [74]:
# Neighbourhood and restaurant category that select the test pickle.
location, category = '방이2동', '중국집'

In [75]:
# Load the pickled review table for the selected location/category.
# A context manager closes the file handle deterministically (the previous
# bare open() left it to the garbage collector).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load pickles produced by this project.
with open('./data/{}_{}_df.pkl'.format(location, category), 'rb') as test_file:
    test_df = pickle.load(test_file)
# Shift every index label up by one; the cells below address the first review
# as label 1 (presumably the stored index starts at 0 — confirm).
test_df.index += 1

## hannanum

## twitter

In [76]:
# Nouns Hannanum extracts from the test review stored under index label 1.
hannanum.nouns(test_df['Review'].loc[1])

['존맛탱']

In [77]:
# Nouns the Twitter tagger extracts from the test review stored under index label 1.
twitter.nouns(test_df['Review'].loc[1])

['처음', '봄', '존맛', '임']

In [78]:
# Hannanum morpheme segmentation of the test review stored under index label 1.
hannanum.morphs(test_df['Review'].loc[1])

['처음', '먹', '어', '보', 'ㅁ', '존맛탱', '이', 'ㅁ']

In [79]:
# Twitter-tagger morpheme segmentation of the test review stored under index label 1.
twitter.morphs(test_df['Review'].loc[1])

['처음', '먹어', '봄', '존맛', '탱', '임']

In [82]:
# Label distribution of the test set before any classes are merged.
test_df.loc[:, 'Total'].value_counts()

5    1901
4     966
3     433
1     382
2     166
Name: Total, dtype: int64

In [83]:
# Fold label '2' into label '1', mirroring the training-set preprocessing.
test_df.loc[test_df['Total'].eq('2'), 'Total'] = '1'

In [85]:
# Fold label '4' into label '5', mirroring the training-set preprocessing.
test_df.loc[test_df['Total'].eq('4'), 'Total'] = '5'

In [86]:
# Verify the merge: only labels 1, 3 and 5 should remain.
# (The statement was duplicated; only the last expression of a cell is shown,
# so the first evaluation was wasted work.)
test_df['Total'].value_counts()

5    2867
1     548
3     433
Name: Total, dtype: int64

---

## Naive Bayes Model

In [169]:
# Training features: the raw review text, one document per row.
X = train_df.loc[:, 'Review']
# Show the number of training documents.
len(X)

10993

In [170]:
# Training targets: the merged rating labels.
y = train_df.loc[:, 'Total']
# Show the number of training labels (should match the document count).
len(y)

10993

In [171]:
# Bag-of-words counts fed into a multinomial Naive Bayes classifier.
# CountVectorizer is left at its defaults, so it tokenizes with its built-in
# regex rather than with the KoNLPy taggers explored above.
MNB_model = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('mb', MultinomialNB()),
    ]
)

In [172]:
# Fit the Naive Bayes pipeline on the training reviews; %time reports the cost of the fit.
%time MNB_model.fit(X, y)

CPU times: user 182 ms, sys: 8.21 ms, total: 191 ms
Wall time: 189 ms


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [173]:
# Held-out features: raw review text from the test table.
X_test = test_df.loc[:, 'Review']
# Show the number of test documents.
len(X_test)

3848

In [174]:
# Held-out targets: the merged rating labels of the test table.
y_test = test_df.loc[:, 'Total']
# Show the number of test labels (should match the document count).
len(y_test)

3848

In [175]:
# Overall accuracy of the Naive Bayes pipeline on the held-out reviews.
print(accuracy_score(y_true=y_test, y_pred=MNB_model.predict(X_test)))

0.7905405405405406


In [176]:
# Confusion matrix for Naive Bayes: rows are true labels, columns are predictions.
print(confusion_matrix(y_true=y_test, y_pred=MNB_model.predict(X_test)))

[[ 190    3  355]
 [  20    9  404]
 [  16    8 2843]]


In [177]:
# Per-class precision / recall / F1 for the Naive Bayes pipeline.
print(classification_report(y_true=y_test, y_pred=MNB_model.predict(X_test)))

             precision    recall  f1-score   support

          1       0.84      0.35      0.49       548
          3       0.45      0.02      0.04       433
          5       0.79      0.99      0.88      2867

avg / total       0.76      0.79      0.73      3848



In [178]:
# Pair every Naive Bayes prediction with its ground-truth label for error inspection.
# The previous form, pd.DataFrame(pred, y_test).reset_index(), passed y_test as
# the *index*; after reset_index() the true labels were the first column and the
# predictions the second, so the ['Pred', 'Real'] header was applied the wrong
# way round. Building the frame from named columns removes the ambiguity.
result = pd.DataFrame(
    {
        'Pred': MNB_model.predict(X_test),
        'Real': y_test.values,
    },
    columns=['Pred', 'Real'],
)
# To list only the misclassified reviews:
# result[result['Pred'] != result['Real']]

---

## XGBoost Model

In [181]:
# Same bag-of-words front end as the Naive Bayes pipeline, with a gradient-boosted
# tree classifier on top.
# NOTE(review): max_depth=30 / n_estimators=300 look hand-picked — confirm they
# were tuned rather than guessed.
XGB_model = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('xgb', xgboost.XGBClassifier(max_depth=30, n_estimators=300, n_jobs=6)),
    ]
)

In [182]:
# Fit the XGBoost pipeline on the training reviews; %time reports the cost of the fit.
%time XGB_model.fit(X, y)

CPU times: user 1min 13s, sys: 199 ms, total: 1min 13s
Wall time: 1min 14s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [183]:
# Overall accuracy of the XGBoost pipeline on the held-out reviews.
print(accuracy_score(y_true=y_test, y_pred=XGB_model.predict(X_test)))

0.7954781704781705


  if diff:


In [184]:
# Confusion matrix for XGBoost: rows are true labels, columns are predictions.
print(confusion_matrix(y_true=y_test, y_pred=XGB_model.predict(X_test)))

[[ 233   37  278]
 [  48   55  330]
 [  30   64 2773]]


  if diff:


In [185]:
# Per-class precision / recall / F1 for the XGBoost pipeline.
print(classification_report(y_true=y_test, y_pred=XGB_model.predict(X_test)))

             precision    recall  f1-score   support

          1       0.75      0.43      0.54       548
          3       0.35      0.13      0.19       433
          5       0.82      0.97      0.89      2867

avg / total       0.76      0.80      0.76      3848



  if diff:
