# Sentiment Analysis
> * Binary classification: each text is labeled positive or negative

In [1]:
import os
from glob import glob
import pandas as pd
import html
import itertools
import pickle
import re
from konlpy.tag import Mecab
from konlpy.utils import pprint
import numpy as np
from ckonlpy.tag import Twitter as ctwitter
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.naive_bayes import MultinomialNB,GaussianNB
# Morphological analyzers shared by the custom tokenizers defined later in
# this notebook (tokenize_pos1 uses `ct`, tokenize_pos2 uses `mecab`).
mecab = Mecab()
ct = ctwitter()


In [2]:
# Load the raw corpus. header=None makes pandas read the first row as data,
# so the columns are addressed by position (0 and 1) in the cells below.
rawdata = pd.read_csv(
    './data/sentiment_data/raw_data_for_sentiment.txt',
    encoding='utf-8',
    header=None,
)
# Preview the first rows as the cell output.
rawdata.head()

Unnamed: 0,0,1
0,이젠 민주개혁의 길로 사설 이 90년대가 우리 모두에게 성취의 시대이기를 기원하고...,1
1,민주당과 공화당은 새해부터 정계개편을 적극 추진한다는 방침이며 양당의 김영삼ㆍ김종필...,1
2,"4당,지자제 연합공천 대비 파트너 탐색전 이같은 구상을 실현시키기 위해 민정당은...",1
3,90년대는 국내적으로 정치민주화와 선진국에로의 도약여부가 판가름날 것이고 한반도를 ...,1
4,"특히 노태우대통령이 3일 대국민 특별담화를 발표, 지난시대 의 완전청산을 선언한 후...",1


In [3]:
# Column 0 becomes the list of input documents, column 1 the integer labels.
X = rawdata[0].tolist()
y = np.array(rawdata[1].tolist(), dtype=int)

In [4]:
# Hold out 20% of the samples for evaluation; the fixed seed makes the
# shuffle (and therefore the split) reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [5]:
# Sanity check: sizes of the four arrays produced by the split.
print(*map(len, (train_X, test_X, train_y, test_y)))

383316 95830 383316 95830


In [6]:
# Baseline: raw term counts (default word tokenization) fed into
# multinomial naive Bayes.
model_mb1 = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('mb', MultinomialNB()),
    ]
)

In [7]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_mb1.fit(train_X, train_y)

CPU times: user 53.1 s, sys: 3.91 s, total: 57.1 s
Wall time: 58 s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [8]:
# Per-class precision / recall / F1 of model_mb1 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_mb1.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.76      0.80      0.78     49155
          1       0.77      0.74      0.76     46675

avg / total       0.77      0.77      0.77     95830



In [9]:
# Same classifier as model_mb1, but with TF-IDF weighted features instead
# of raw counts.
model_mb2 = Pipeline(
    steps=[
        ('vect', TfidfVectorizer()),
        ('mb', MultinomialNB()),
    ]
)

In [10]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_mb2.fit(train_X, train_y)

CPU times: user 56.8 s, sys: 2.07 s, total: 58.8 s
Wall time: 58.9 s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [11]:
# Per-class precision / recall / F1 of model_mb2 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_mb2.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.73      0.89      0.80     49155
          1       0.85      0.66      0.74     46675

avg / total       0.79      0.78      0.77     95830



In [12]:
# Raw term counts (default word tokenization) fed into logistic regression
# with its default settings.
model_log1 = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('log', LogisticRegression()),
    ]
)

In [13]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_log1.fit(train_X, train_y)

CPU times: user 5min 39s, sys: 4.18 s, total: 5min 43s
Wall time: 5min 46s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [14]:
# Per-class precision / recall / F1 of model_log1 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_log1.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.91      0.88      0.89     49155
          1       0.87      0.91      0.89     46675

avg / total       0.89      0.89      0.89     95830



### Twitter tokenizer (ckonlpy)

In [15]:
def tokenize_pos1(doc):
    """Tokenize *doc* with the module-level ckonlpy Twitter tagger ``ct``.

    Every item produced by ``ct.pos(doc)`` is joined with ``'/'`` into a
    single string (per the KoNLPy API these items are (morpheme, tag)
    pairs), and the resulting strings are returned as a list.
    """
    tagged = ct.pos(doc)
    return list(map('/'.join, tagged))

In [16]:
# Counts of the tokens produced by tokenize_pos1, fed into multinomial
# naive Bayes.
model_ct_mb1 = Pipeline(
    steps=[
        ('vect', CountVectorizer(tokenizer=tokenize_pos1)),
        ('mb', MultinomialNB()),
    ]
)

In [17]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_ct_mb1.fit(train_X, train_y)

CPU times: user 1h 26min 56s, sys: 18.3 s, total: 1h 27min 14s
Wall time: 1h 27min 7s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_pos1 at 0x11544c0d0>, vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [18]:
# Per-class precision / recall / F1 of model_ct_mb1 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_ct_mb1.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.77      0.73      0.75     49155
          1       0.73      0.77      0.75     46675

avg / total       0.75      0.75      0.75     95830



In [19]:
# TF-IDF over unigrams and bigrams of the tokenize_pos1 tokens, fed into
# multinomial naive Bayes.
model_ct_mb2 = Pipeline(
    steps=[
        (
            'vect',
            TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_pos1),
        ),
        ('mb', MultinomialNB()),
    ]
)

In [20]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_ct_mb2.fit(train_X, train_y)

CPU times: user 1h 29min 20s, sys: 25.9 s, total: 1h 29min 46s
Wall time: 1h 29min 46s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [21]:
# Per-class precision / recall / F1 of model_ct_mb2 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_ct_mb2.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.72      0.92      0.81     49155
          1       0.89      0.61      0.73     46675

avg / total       0.80      0.77      0.77     95830



In [22]:
# TF-IDF over unigrams and bigrams of the tokenize_pos1 tokens, fed into
# logistic regression with its default settings.
model_ct_log1 = Pipeline(
    steps=[
        (
            'vect',
            TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_pos1),
        ),
        ('log', LogisticRegression()),
    ]
)

In [23]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_ct_log1.fit(train_X, train_y)

CPU times: user 1h 32min 30s, sys: 29.3 s, total: 1h 32min 59s
Wall time: 1h 33min 1s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [24]:
# Per-class precision / recall / F1 of model_ct_log1 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_ct_log1.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.92      0.91      0.92     49155
          1       0.91      0.92      0.91     46675

avg / total       0.91      0.91      0.91     95830



### Mecab tokenizer (KoNLPy)

In [25]:
def tokenize_pos2(doc):
    """Tokenize *doc* with the module-level KoNLPy tagger ``mecab``.

    Every item produced by ``mecab.pos(doc)`` is joined with ``'/'`` into a
    single string (per the KoNLPy API these items are (morpheme, tag)
    pairs), and the resulting strings are returned as a list.
    """
    tagged = mecab.pos(doc)
    return list(map('/'.join, tagged))

In [26]:
# Counts of the tokens produced by tokenize_pos2, fed into multinomial
# naive Bayes.
model_mecab_mb1 = Pipeline(
    steps=[
        ('vect', CountVectorizer(tokenizer=tokenize_pos2)),
        ('mb', MultinomialNB()),
    ]
)

In [27]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_mecab_mb1.fit(train_X, train_y)

CPU times: user 4min 58s, sys: 7.73 s, total: 5min 6s
Wall time: 5min 8s


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...400>,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [28]:
# Per-class precision / recall / F1 of model_mecab_mb1 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_mecab_mb1.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.77      0.74      0.76     49155
          1       0.74      0.76      0.75     46675

avg / total       0.75      0.75      0.75     95830



In [29]:
# TF-IDF over unigrams and bigrams of the tokenize_pos2 tokens, fed into
# multinomial naive Bayes.
model_mecab_mb2 = Pipeline(
    steps=[
        (
            'vect',
            TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_pos2),
        ),
        ('mb', MultinomialNB()),
    ]
)

In [30]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_mecab_mb2.fit(train_X, train_y)

CPU times: user 7min 52s, sys: 18.7 s, total: 8min 11s
Wall time: 8min 12s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
  ...True,
        vocabulary=None)), ('mb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [31]:
# Per-class precision / recall / F1 of model_mecab_mb2 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_mecab_mb2.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.71      0.92      0.81     49155
          1       0.88      0.61      0.72     46675

avg / total       0.80      0.77      0.77     95830



In [32]:
# TF-IDF over unigrams and bigrams of the tokenize_pos2 tokens, fed into
# logistic regression with its default settings.
model_mecab_log1 = Pipeline(
    steps=[
        (
            'vect',
            TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize_pos2),
        ),
        ('log', LogisticRegression()),
    ]
)

In [33]:
%%time 
# Fit every pipeline step on the training split; as the cell's last
# expression, the fitted pipeline is echoed as the cell result.
model_mecab_log1.fit(train_X, train_y)

CPU times: user 10min 45s, sys: 18.2 s, total: 11min 3s
Wall time: 11min 5s


Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
  ...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [34]:
# Per-class precision / recall / F1 of model_mecab_log1 on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=model_mecab_log1.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.92      0.92      0.92     49155
          1       0.91      0.92      0.91     46675

avg / total       0.92      0.92      0.92     95830

