# Sentiment Analysis
> * Positive or Negative

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_curve
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [2]:
# Load the labelled corpus. The file is read without a header row, so the
# columns get the integer labels 0 and 1 that later cells index by.
rawdata = pd.read_csv(
    './data/sentiment_data/raw_data_for_sentiment.txt',
    header=None,
    encoding='utf-8',
)
# Preview the first rows.
rawdata.head()

Unnamed: 0,0,1
0,주택조합 승인 자체가 몹시 어려웠던 시절 한국기자협회 주택조합 형식으로 추진된 이 ...,1
1,"서 의원은 여의도 회견을 열어 “압도적으로 승리할 것이고 국민, 당원이 주문하신 ...",1
2,박근혜 대통령이 14일 열린 새누리당 전당대회에 참석한 것은 그만큼 아직 여권에서 ...,1
3,"“살기 좋은 전북 만들기, 중앙정부와 연결고리 최선” “월드컵에 출전한 우리 국가대...",1
4,"""여러분 너무나 고맙고 감사합니다. 도와주신 여러분께 저의 마음속의 큰 절을 받아주...",1


In [3]:
from numba import jit
def Tagging2(sentence, tagger, stopwords):
    """Tokenize ``sentence`` with ``tagger`` and drop stopword tokens.

    Args:
        sentence: Text passed unchanged to ``tagger.pos``.
        tagger: Object exposing ``pos(sentence)``. Each returned item is
            indexed with ``[0]`` to obtain the token (presumably
            ``(token, tag)`` pairs -- confirm against the tagger in use).
        stopwords: Iterable of tokens to exclude from the result.

    Returns:
        list: The kept tokens, in the order the tagger produced them.
        An empty tagger result yields an empty list.
    """
    # No numba ``@jit`` here: the body only drives Python objects (the tagger
    # and its tokens), which numba cannot compile to native code, so the
    # decorator could at best fall back to object mode and otherwise fails.
    # A set gives constant-time membership checks for the filter below.
    if isinstance(stopwords, (set, frozenset)):
        blocked = stopwords
    else:
        blocked = set(stopwords)
    return [item[0] for item in tagger.pos(sentence) if item[0] not in blocked]

## Stopwords

In [4]:
# Read the stopword list: one entry per line, surrounding whitespace removed.
# The context manager closes the file handle, which a bare open() call
# would leave open for the lifetime of the kernel.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

## Twitter 

In [5]:
# Twitter tagger from ckonlpy; `ct` is the `tagger` argument handed to
# Tagging2 in the tokenization loop of this section.
from ckonlpy.tag import Twitter
ct = Twitter()

In [6]:
%%time
# Tokenize every document with the Twitter tagger and collect its label.
# Column 0 is the text handed to the tagger and column 1 the value stored in
# `y`. Iterating the two columns directly avoids building a mixed-dtype row
# Series twice per document via `rawdata.loc[idx][col]`.
corpus = []
y = []
for text, label in tqdm(zip(rawdata[0], rawdata[1]), total=len(rawdata)):
    # Each document becomes one space-joined string of kept tokens, the
    # input format TfidfVectorizer consumes downstream.
    corpus.append(' '.join(Tagging2(text, ct, stopwords)))
    y.append(label)

100%|██████████| 2500/2500 [02:50<00:00, 14.63it/s]

CPU times: user 2min 13s, sys: 7.03 s, total: 2min 20s
Wall time: 2min 50s





In [7]:
# Learn the TF-IDF vocabulary and weights from the tokenized corpus and
# vectorize it in one pass; the sparse result is densified into an array.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the IDF statistics also see the held-out documents.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()
y = np.array(y)

In [8]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the
# partition repeatable across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
# Echo the four shapes as a sanity check on the partition sizes.
tuple(part.shape for part in (train_X, test_X, train_y, test_y))

((2125, 25745), (375, 25745), (2125,), (375,))

### Naive Bayesian 모형

In [9]:
# Fit a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so the cell still displays it.
nb_model = MultinomialNB().fit(train_X, train_y)
nb_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [10]:
# Precision / recall / F1 of the Naive Bayes model on the training split.
print(
    classification_report(
        y_true=train_y,
        y_pred=nb_model.predict(train_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1057
          1       1.00      1.00      1.00      1068

avg / total       1.00      1.00      1.00      2125



In [11]:
# Precision / recall / F1 of the Naive Bayes model on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=nb_model.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       194
          1       1.00      1.00      1.00       181

avg / total       1.00      1.00      1.00       375



### Random Forest

In [12]:
%%time
# Random forest on the training split. The forest is randomized, so it is
# seeded (same seed as the train/test split) to make the fitted model and
# the reports that follow reproducible across kernel restarts.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(train_X, train_y)

CPU times: user 1.44 s, sys: 185 ms, total: 1.63 s
Wall time: 1.94 s


In [13]:
# Precision / recall / F1 of the random forest on the training split.
print(
    classification_report(
        y_true=train_y,
        y_pred=rf_model.predict(train_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1057
          1       1.00      1.00      1.00      1068

avg / total       1.00      1.00      1.00      2125



In [14]:
# Precision / recall / F1 of the random forest on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=rf_model.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.99      0.99      0.99       194
          1       0.99      0.99      0.99       181

avg / total       0.99      0.99      0.99       375



### Logistic Regression

In [15]:
# Fit a logistic-regression classifier with default settings on the
# training split. fit() returns the estimator, so the cell still shows it.
log_model = LogisticRegression().fit(train_X, train_y)
log_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [16]:
# Precision / recall / F1 of logistic regression on the training split.
print(
    classification_report(
        y_true=train_y,
        y_pred=log_model.predict(train_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1057
          1       1.00      1.00      1.00      1068

avg / total       1.00      1.00      1.00      2125



In [17]:
# Precision / recall / F1 of logistic regression on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=log_model.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      0.99      1.00       194
          1       0.99      1.00      1.00       181

avg / total       1.00      1.00      1.00       375



## mecab

In [18]:
# Mecab tagger from konlpy; `mecab` is the `tagger` argument handed to
# Tagging2 in the tokenization loop of this section.
from konlpy.tag import Mecab
mecab = Mecab()

In [19]:
%%time
# Tokenize every document with the Mecab tagger and collect its label.
# Column 0 is the text handed to the tagger and column 1 the value stored in
# `y`. Iterating the two columns directly avoids building a mixed-dtype row
# Series twice per document via `rawdata.loc[idx][col]`.
corpus = []
y = []
for text, label in tqdm(zip(rawdata[0], rawdata[1]), total=len(rawdata)):
    # Each document becomes one space-joined string of kept tokens, the
    # input format TfidfVectorizer consumes downstream.
    corpus.append(' '.join(Tagging2(text, mecab, stopwords)))
    y.append(label)

100%|██████████| 2500/2500 [00:22<00:00, 112.45it/s]

CPU times: user 14.7 s, sys: 2.55 s, total: 17.3 s
Wall time: 22.2 s





In [20]:
# Learn the TF-IDF vocabulary and weights from the tokenized corpus and
# vectorize it in one pass; the sparse result is densified into an array.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split, so the IDF statistics also see the held-out documents.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus).toarray()
y = np.array(y)

In [21]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the
# partition repeatable across runs.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    test_size=0.15,
    random_state=42,
)
# Echo the four shapes as a sanity check on the partition sizes.
tuple(part.shape for part in (train_X, test_X, train_y, test_y))

((2125, 22654), (375, 22654), (2125,), (375,))

### Naive Bayesian 모형

In [22]:
# Fit a multinomial Naive Bayes classifier on the training split.
# fit() returns the estimator itself, so the cell still displays it.
nb_model = MultinomialNB().fit(train_X, train_y)
nb_model

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Precision / recall / F1 of the Naive Bayes model on the training split.
print(
    classification_report(
        y_true=train_y,
        y_pred=nb_model.predict(train_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1057
          1       1.00      1.00      1.00      1068

avg / total       1.00      1.00      1.00      2125



In [24]:
# Precision / recall / F1 of the Naive Bayes model on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=nb_model.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00       194
          1       1.00      1.00      1.00       181

avg / total       1.00      1.00      1.00       375



### Random Forest

In [25]:
%%time
# Random forest on the training split. The forest is randomized, so it is
# seeded (same seed as the train/test split) to make the fitted model and
# the reports that follow reproducible across kernel restarts.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(train_X, train_y)

CPU times: user 1.16 s, sys: 150 ms, total: 1.31 s
Wall time: 1.51 s


In [26]:
# Precision / recall / F1 of the random forest on the training split.
print(
    classification_report(
        y_true=train_y,
        y_pred=rf_model.predict(train_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1057
          1       1.00      1.00      1.00      1068

avg / total       1.00      1.00      1.00      2125



In [27]:
# Precision / recall / F1 of the random forest on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=rf_model.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       0.99      0.98      0.99       194
          1       0.98      0.99      0.99       181

avg / total       0.99      0.99      0.99       375



### Logistic Regression

In [28]:
# Fit a logistic-regression classifier with default settings on the
# training split. fit() returns the estimator, so the cell still shows it.
log_model = LogisticRegression().fit(train_X, train_y)
log_model

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [29]:
# Precision / recall / F1 of logistic regression on the training split.
print(
    classification_report(
        y_true=train_y,
        y_pred=log_model.predict(train_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      1.00      1.00      1057
          1       1.00      1.00      1.00      1068

avg / total       1.00      1.00      1.00      2125



In [30]:
# Precision / recall / F1 of logistic regression on the held-out split.
print(
    classification_report(
        y_true=test_y,
        y_pred=log_model.predict(test_X),
    )
)

             precision    recall  f1-score   support

          0       1.00      0.99      1.00       194
          1       0.99      1.00      1.00       181

avg / total       1.00      1.00      1.00       375

