# Sentiment Analysis
> * Positive or Negative

In [None]:
import os
from glob import glob
import pandas as pd
import html
import itertools
import pickle
import re

from collections import namedtuple
from gensim.models import doc2vec
import multiprocessing
from konlpy.tag import Mecab
from konlpy.utils import pprint
import numpy as np
from ckonlpy.tag import Twitter as ctwitter
mecab = Mecab()
ct = ctwitter()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
newsSentence = pd.read_csv('./data/sentiment_data/merged_sentiment_data.txt',encoding='utf-8', header=None)
newsSentence.head()

## Doc2Vec

In [161]:
def read_data(filename):
    with open(filename, 'r',encoding='utf-8') as f:
        data = [line.split('\t') for line in f.read().splitlines()]
        data = data[1:] # header
    return data

In [163]:
rating = read_data('./data/sentiment_data/ratings.txt')

In [162]:
def tokenize(doc):
    return ['/'.join(t) for t in ct.pos(doc)]

In [165]:
rating_doc = [(tokenize(row[1]), row[2]) for row in rating]
news_doc = [(tokenize(newsSentence.loc[idx][0]), newsSentence.loc[idx][1]) for idx in newsSentence.index]

In [166]:
pickle.dump(rating_doc+news_doc, open('./data/pre_data/pre_data_for_sentiment_analysis.pickled','wb'))

In [169]:
from gensim.models.doc2vec import TaggedDocument

In [167]:
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

In [168]:
tagged_run_docs = [TaggedDocument(d, [c]) for d, c in rating_doc+news_doc]
pickle.dump(tagged_run_docs, open('./data/pre_data/pre_data_tagged_run_docs.pickled','wb'))

In [170]:
tagged_run_docs = pickle.load(open('./data/pre_data/pre_data_tagged_run_docs.pickled','rb'))

In [171]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(tagged_run_docs, test_size=0.25, random_state=42)

In [172]:
doc_vectorizer = doc2vec.Doc2Vec(size=500, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer.build_vocab(train)

In [173]:
for epoch in range(25):
    doc_vectorizer.train(train, total_examples=doc_vectorizer.corpus_count, epochs=doc_vectorizer.iter)
    doc_vectorizer.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer.min_alpha = doc_vectorizer.alpha  # fix the learning rate, no decay

In [174]:
#To save
doc_vectorizer.save('./model/doc2vec_size500_epoch25.model')
pprint(doc_vectorizer.most_similar('문재인/Noun'))
pprint(doc_vectorizer.most_similar('노무현/Noun'))
pprint(doc_vectorizer.most_similar('박근혜/Noun'))

  This is separate from the ipykernel package so we can avoid doing imports until


[('안철수/Noun', 0.5092235803604126),
 ('손학규/Noun', 0.4781286120414734),
 ('문/Noun', 0.47694167494773865),
 ('김종인/Noun', 0.41631853580474854),
 ('박근혜/Noun', 0.40870070457458496),
 ('정세균/Noun', 0.39632225036621094),
 ('김무성/Noun', 0.3882656991481781),
 ('김한길/Noun', 0.36604562401771545),
 ('이태근/Noun', 0.3626297116279602),
 ('박/Noun', 0.3578115403652191)]
[('이명박/Noun', 0.6439641118049622),
 ('박근혜/Noun', 0.6232602000236511),
 ('노/Noun', 0.6187499761581421),
 ('김영삼/Noun', 0.5640239715576172),
 ('박/Noun', 0.48870864510536194),
 ('김대중/Noun', 0.4885497987270355),
 ('박정희/Noun', 0.47176212072372437),
 ('노태우/Noun', 0.46269893646240234),
 ('盧/Foreign', 0.4458937346935272),
 ('이승만/Noun', 0.4335157871246338)]
[('이명박/Noun', 0.7078977823257446),
 ('박/Noun', 0.670047402381897),
 ('노무현/Noun', 0.6232602000236511),
 ('김영삼/Noun', 0.5593474507331848),
 ('노/Noun', 0.534331202507019),
 ('김대중/Noun', 0.4972655177116394),
 ('김/Noun', 0.46960729360580444),
 ('노태우/Noun', 0.45434486865997314),
 ('朴/Foreign', 0.44911116

  after removing the cwd from sys.path.
  """


In [177]:
doc_vectorizer2 = doc2vec.Doc2Vec(size=1000, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer2.build_vocab(train)

for epoch in range(25):
    doc_vectorizer2.train(train, total_examples=doc_vectorizer2.corpus_count, epochs=doc_vectorizer2.iter)
    doc_vectorizer2.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer2.min_alpha = doc_vectorizer2.alpha  # fix the learning rate, no decay

#To save
doc_vectorizer2.save('./model/doc2vec_size1000_epoch25.model')
pprint(doc_vectorizer2.most_similar('문재인/Noun'))
pprint(doc_vectorizer2.most_similar('노무현/Noun'))
pprint(doc_vectorizer2.most_similar('박근혜/Noun'))

  This is separate from the ipykernel package so we can avoid doing imports until


[('문/Noun', 0.3691737651824951),
 ('안철수/Noun', 0.32957664132118225),
 ('손학규/Noun', 0.32620835304260254),
 ('박근혜/Noun', 0.3075103461742401),
 ('이태근/Noun', 0.29577916860580444),
 ('정세균/Noun', 0.29367774724960327),
 ('추미애/Noun', 0.2902194559574127),
 ('박/Noun', 0.2826129198074341),
 ('전혜숙/Noun', 0.27465057373046875),
 ('조수용/Noun', 0.27053505182266235)]
[('노/Noun', 0.5221443176269531),
 ('이명박/Noun', 0.5167340636253357),
 ('박근혜/Noun', 0.4996199607849121),
 ('김영삼/Noun', 0.4124770760536194),
 ('박/Noun', 0.3791698217391968),
 ('노태우/Noun', 0.352672815322876),
 ('김대중/Noun', 0.3470269441604614),
 ('박정희/Noun', 0.3383614420890808),
 ('盧/Foreign', 0.32480141520500183),
 ('全斗換/Foreign', 0.32205379009246826)]
[('박/Noun', 0.6092792749404907),
 ('이명박/Noun', 0.5681461095809937),
 ('노무현/Noun', 0.4996200203895569),
 ('노/Noun', 0.4388248026371002),
 ('김영삼/Noun', 0.42257383465766907),
 ('김대중/Noun', 0.4170014262199402),
 ('김/Noun', 0.4114646911621094),
 ('朴/Foreign', 0.36898529529571533),
 ('노태우/Noun', 0.3511

  after removing the cwd from sys.path.
  """


In [None]:
doc_vectorizer3 = doc2vec.Doc2Vec(size=2000, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizer3.build_vocab(train)

for epoch in range(35):
    doc_vectorizer3.train(train, total_examples=doc_vectorizer3.corpus_count, epochs=doc_vectorizer3.iter)
    doc_vectorizer3.alpha -= 0.002  # decrease the learning rate
    doc_vectorizer3.min_alpha = doc_vectorizer2.alpha  # fix the learning rate, no decay

#To save
doc_vectorizer3.save('./model/doc2vec_size2000_epoch35.model')
pprint(doc_vectorizer3.most_similar('문재인/Noun'))
pprint(doc_vectorizer3.most_similar('노무현/Noun'))
pprint(doc_vectorizer3.most_similar('박근혜/Noun'))

In [180]:
def tokenize2(doc):
    return ['/'.join(t) for t in mecab.pos(doc)]

In [181]:
rating_doc2 = [(tokenize2(row[1]), row[2]) for row in rating]
news_doc2 = [(tokenize2(newsSentence.loc[idx][0]), newsSentence.loc[idx][1]) for idx in newsSentence.index]

In [182]:
pickle.dump(rating_doc2+news_doc2, open('./data/pre_data/pre_data_by_mecab_for_sentiment_analysis.pickled','wb'))

In [None]:
tagged_run_docs2 = [TaggedDocument(d, [c]) for d, c in rating_doc2+news_doc2]
pickle.dump(tagged_run_docs2, open('./data/pre_data/pre_data_by_mecab_tagged_run_docs.pickled','wb'))

In [None]:
tagged_run_docs2 = pickle.load(open('./data/pre_data/pre_data_by_mecab_tagged_run_docs.pickled','rb'))

In [None]:
from sklearn.model_selection import train_test_split
train2, test2 = train_test_split(tagged_run_docs2, test_size=0.25, random_state=42)

In [None]:
doc_vectorizerx = doc2vec.Doc2Vec(size=1500, alpha=0.025, min_alpha=0.025, seed=1234)
doc_vectorizerx.build_vocab(train2)

In [None]:
for epoch in range(25):
    doc_vectorizerx.train(train, total_examples=doc_vectorizerx.corpus_count, epochs=doc_vectorizerx.iter)
    doc_vectorizerx.alpha -= 0.002  # decrease the learning rate
    doc_vectorizerx.min_alpha = doc_vectorizerx.alpha  # fix the learning rate, no decay

In [None]:
#To save
doc_vectorizerx.save('./model/doc2vec_by_mecab_size1500_epoch25.model')
pprint(doc_vectorizerx.most_similar('문재인/Noun'))
pprint(doc_vectorizerx.most_similar('노무현/Noun'))
pprint(doc_vectorizerx.most_similar('박근혜/Noun'))