# 수집된 뉴스 기사 및 댓글에 대한 감정 분석
## * FastText
* 데이터 
> 2017년 12월 1일부터 2018년 2월 1일까지 63일간 [네이버](http://www.naver.com)와 [다음](http://www.daum.net)의 랭킹뉴스와 뉴스의 댓글을 크롤링함.

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob
import warnings

os.environ['KERAS_BACKEND']='tensorflow'

from numba import jit
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import FastText, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import Database_Handler as dh
import Basic_Module as bm

In [3]:
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
# Instantiate the two taggers used in this notebook: the customized-KoNLPy
# Twitter tagger (`ct`) and KoNLPy's Mecab tagger (`mecab`). Both are later
# passed to bm.MakeTaggedData_For_Comments.
ct = Twitter()
mecab = Mecab()

## Stopwords

In [4]:
# Read the stop-word list, one entry per line, stripping surrounding
# whitespace. The context manager guarantees the file handle is closed.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

## TaggedDocument For FastText

In [5]:
# Lightweight record for one tagged document: its tokens, its tags and its
# sentiment label. NOTE: this rebinds the name `TaggedDocument`, shadowing the
# class imported from gensim.models.doc2vec earlier in the notebook.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags', 'sentiment'])

## Load Data

### Path

In [6]:
# Platform-specific locations of models, classifiers, raw comments and outputs.
# NOTE: only the win32 branch defines `newsPath`, and on any other platform
# (e.g. linux) none of these names are defined at all.
if sys.platform == 'win32':
    # Models and classifiers
    loadModelPath = 'd:/model/'
    classifierPath = 'd:/data/pre_data/classifier/'
    # News data and news sentiment results
    newsPath = './data/pre_data/news_sentiment/'
    news_senti_outcome = './outcome_for_News_sentiment_analysis/'
    # Crawled comments
    daumCommentsPath = 'd:/data/daum_Comments/'
    naverCommentsPath = 'd:/data/naver_Comments/'
    # Comment sentiment outputs
    outcomeDaumCommentsPath = 'd:/outcome_comments_for_daum/'
    outcomeNaverCommentsPath = 'd:/outcome_comments_for_naver/'
    # Intermediate data: pre-processed, tagged, vectorized
    outcome_predata = 'd:/pre_data_for_comments/'
    outcome_tagged_data = 'd:/pre_data_for_comments2/'
    outcome_vectorized_data = 'd:/pre_data_for_comments3/'
elif sys.platform == 'darwin':
    # Models and classifiers
    loadModelPath = '/Volumes/disk1/model/'
    classifierPath = '/Volumes/disk1/data/pre_data/classifier/'
    # News sentiment results
    news_senti_outcome = '/Volumes/disk1/outcome_for_News_sentiment_analysis/'
    # Crawled comments
    daumCommentsPath = '/Volumes/disk1/data/daum_Comments/'
    naverCommentsPath = '/Volumes/disk1/data/naver_Comments/'
    # Comment sentiment outputs
    outcomeDaumCommentsPath = '/Volumes/disk1/outcome_comments_for_daum/'
    outcomeNaverCommentsPath = '/Volumes/disk1/outcome_comments_for_naver/'
    # Intermediate data: pre-processed, tagged, vectorized
    outcome_predata = '/Volumes/disk1/pre_data_for_comments/'
    outcome_tagged_data = '/Volumes/disk1/pre_data_for_comments2/'
    outcome_vectorized_data = '/Volumes/disk1/pre_data_for_comments3/'

### News

In [7]:
# Show which files exist in the news sentiment outcome directory.
os.listdir(news_senti_outcome)

['naver_news_sentiment_analysis.csv', 'daum_news_sentiment_analysis.csv']

In [8]:
# Naver: load the per-article sentiment results, label every row with its
# source site, and keep the articles whose number_of_crawled_comment is non-zero.
naverData = pd.read_csv(
    os.path.join(news_senti_outcome, 'naver_news_sentiment_analysis.csv'),
    encoding='utf-8', header=0, index_col=0)
naverData['site'] = 'Naver'
reNaverData = naverData.loc[naverData['number_of_crawled_comment'] != 0]
print(reNaverData.shape)
reNaverData.head()

(15015, 14)


Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a29c445588c132954d1973a,정치,2017.12.07,연합뉴스,1713,1465,1,"北외무성 ""전쟁 바라지 않지만 결코 피하지 않을 것""","美고위인사 대북언급 비난하며 ""전쟁 기정사실화"" 위협 며칠 새 이어지는 북한 군민...","['외무성', '핵전쟁', '대변인']","{'미국', '조선반도', '핵전쟁', '북한', '중앙', '도화선', '고위',...",86.0,22.0,negative,Naver
5a29c445588c132954d1973b,정치,2017.12.07,한국일보,2551,2062,2,"예산전쟁, 예결위 간사ㆍ호남이 웃었다",예결위 간사들이 최대 수혜자..당 지도부 내 몫 챙기기도 여전 황주홍ㆍ김도읍 등...,"['예산', '예결위', 'soc']","{'호남', '예산안', '의원', '정부안', '증액', '지역구', '국민의당'}",46.0,62.0,positive,Naver
5a29c445588c132954d1973c,정치,2017.12.07,뉴시스,610,536,3,"혐의 부인에 20시간 조사…檢, 최경환 구속 카드 꺼내나",【서울=뉴시스】 최진석 기자 = 박근혜 정부 시절 국가정보원 특수활동비 수수 의혹 ...,"['최경환', '구속영장', '국가정보원']","{'구속영장 청구', '조사', '의원', '정기국회', '국정원장', '검찰', ...",77.0,31.0,negative,Naver
5a29c445588c132954d1973d,정치,2017.12.07,연합뉴스,145,133,4,"최재형 감사원장 후보자 ""독립성 강화는 임명권자의 뜻""",감사원장에 내정된 최재형 사법연수원장(고양=연합뉴스) 이희열 기자 = 7일 감사원장...,"['이슈 · 최재형 감사원장 내정', '감사원장', '최재형', '감사원']","{'공직 사회', '법관', '생활', '감사원장', '지명', '후보자'}",39.0,69.0,positive,Naver
5a29c445588c132954d1973e,정치,2017.12.07,동아일보,1074,932,5,"B-1B 한반도에 뜨자, 평양 비운 김정은",[동아일보] 북중 접경지 양강도 삼지연 시찰… 방북 유엔 사무차장 면담 안할듯 B-...,"['김정은', 'b-1b', '한반도']","{'공장', '양강도', '사무차장', '삼지연', '시찰', '훈련', '접경',...",72.0,36.0,negative,Naver


In [9]:
# Daum: load the per-article sentiment results, label every row with its
# source site, and keep the articles whose number_of_crawled_comment is non-zero.
daumData = pd.read_csv(
    os.path.join(news_senti_outcome, 'daum_news_sentiment_analysis.csv'),
    encoding='utf-8', header=0, index_col=0)
daumData['site'] = 'daum'
reDaumData = daumData.loc[daumData['number_of_crawled_comment'] != 0]
print(reDaumData.shape)
reDaumData.head()

(9369, 14)


Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a2a61bf588c13481c229d1e,뉴스,2017.12.07,세계일보,1093,903,1,"""밤이 무섭다""..비아그라 공장 연기에 남성들 부작용 호소","주민들은 공장에서 배출된 연기가 '남성이 매우 건강해지는 부작용'을 일으킨다며, ...","['부작용', '비아그라', '아일랜드']","{'건강', '부작용', '공장', '남성들', '지역', '세보 효과', '연기'}",50.0,58.0,positive,daum
5a2a61bf588c13481c229d1f,뉴스,2017.12.07,헬스조선,603,386,2,식후 커피·늦은 양치질..점심식사 후 하면 안 좋은 습관 3가지,점심식사를 마친 후 후식으로 커피를 마시는 사람들이 많다. 실제로 직장이 밀집돼 ...,"['커피', '낮잠', '음식물']","{'건강', '커피', '자세', '치아', '철분', '디스크', '점심 식사',...",81.0,27.0,negative,daum
5a2a61bf588c13481c229d20,뉴스,2017.12.07,연합뉴스,1067,807,3,"'십년지기 생매장' 진짜 이유는..""'청부 통정' 알려질까 봐""",(성남=연합뉴스) 최해민 기자 = 십년지기 지인을 산 채로 묻어 살해한 50대 여...,"['살인혐의', '철원', '검찰송치']","{'앙심', '남편', '성관계', '주변', '진술', '지인', '범행', '아...",99.0,9.0,negative,daum
5a2a61bf588c13481c229d21,뉴스,2017.12.07,헤럴드경제,418,368,4,"신영자, 억 소리나는 갑질","신영자, 적용안된 혐의→검찰 상고에서 인정\n신영자, 얼마를 어떻게 받았나 [헤럴...","['신영자', '갑질', '롯데백화점']","{'네이처리퍼블릭', '롯데', '징역', '유통업체', '신영자 이사장', '검찰...",77.0,31.0,negative,daum
5a2a61bf588c13481c229d22,뉴스,2017.12.07,연합뉴스,434,367,5,"""배신하지마"" 20대女 살인 피의자 유치장서 공범 남친에 쪽지",(청주=연합뉴스) 이승민 기자 = 지난 9월 청주의 한 하천에서 20대 여성을 둔기...,"['공범', '살인', '과자']","{'폭행', '혐의', '과자', '범행', '쪽지', '남자친구', '경찰', '...",102.0,6.0,negative,daum


### 댓글

In [10]:
# Show which pre-processed comment files exist.
os.listdir(outcome_predata)

['predata_for_daum_news_comment.csv',
 'predata_for_naver_news_comment.csv',
 'filtered_predata_for_naver_news_comment.csv',
 'filtered_predata_for_daum_news_comment.csv']

In [11]:
%%time
predata_naver = outcome_predata +  'filtered_predata_for_naver_news_comment.csv'
dfNaver = pd.read_csv(predata_naver, header = 0, index_col = 0, encoding = 'utf-8')
print (dfNaver.shape)

  mask |= (ar1 == a)


(11888645, 8)
CPU times: user 51.1 s, sys: 9 s, total: 1min
Wall time: 1min 7s


In [12]:
# Column subset of the Naver comments that is later merged with the predictions.
extDfNaver = dfNaver[['_id', 'category', 'date', 'rank', 'site', '공감', '비공감']]

In [13]:
%%time
predata_daum = outcome_predata +  'filtered_predata_for_daum_news_comment.csv'
dfDaum = pd.read_csv(predata_daum, header = 0, index_col = 0, encoding = 'utf-8')
print (dfDaum.shape)

  mask |= (ar1 == a)


(4924599, 8)
CPU times: user 20.2 s, sys: 2.99 s, total: 23.1 s
Wall time: 32.8 s


In [14]:
# Column subset of the Daum comments that is later merged with the predictions.
extDfDaum = dfDaum[['_id', 'category', 'date', 'rank', 'site', '공감', '비공감']]

In [15]:
# Restrict each comment frame, and its column subset, to the comments of one
# article per site. The ids are those of the first article shown in the Naver
# and Daum news previews above.
dfNaver = dfNaver.loc[dfNaver['_id'] == '5a29c445588c132954d1973a']
extDfNaver = extDfNaver.loc[extDfNaver['_id'] == '5a29c445588c132954d1973a']
dfDaum = dfDaum.loc[dfDaum['_id'] == '5a2a61bf588c13481c229d1e']
extDfDaum = extDfDaum.loc[extDfDaum['_id'] == '5a2a61bf588c13481c229d1e']

## FastText Model

### Twitter

#### Comments to tagged Document

In [20]:
%%time
tagged_by_ct_daum_file = outcome_tagged_data+'fastText_tagged_data_by_ct_for_daum_news_comment.pickled'
if os.path.isfile(tagged_by_ct_daum_file):
    tagged_daum_ct = pickle.load(open(tagged_by_ct_daum_file, 'rb'))
else:
    tagged_daum_ct = bm.MakeTaggedData_For_Comments(dfDaum, TaggedDocument, ct, stopwords)
    pickle.dump(tagged_daum_ct, open(tagged_by_ct_daum_file, 'wb'))
    #del tagged_daum_ct

tagged_by_ct_naver_file = outcome_tagged_data+'fastText_tagged_data_by_ct_for_naver_news_comment.pickled'
if os.path.isfile(tagged_by_ct_naver_file):
    tagged_naver_ct = pickle.load(open(tagged_by_ct_naver_file, 'rb'))
else:
    tagged_naver_ct = bm.MakeTaggedData_For_Comments(dfNaver, TaggedDocument, ct, stopwords)
    pickle.dump(tagged_naver_ct, open(tagged_by_ct_naver_file, 'wb'))
    #del tagged_naver_ct


CPU times: user 29 ms, sys: 47.6 ms, total: 76.6 ms
Wall time: 489 ms


#### Train data set으로부터 TF-IDF Vectorizer를 만듦

In [21]:
# Build the TF-IDF vectorizer from the pickled `ct` training set, then drop the
# training data. The pickle is read through a context manager so the file
# handle is closed right away.
trainName_ct = './data/pre_data/train_test_Data/pre_data_train_for_fastText_sentiment_by_ct.pickled'
with open(trainName_ct, 'rb') as train_file:
    train_ct = pickle.load(train_file)
tfidf_ct = bm.Build_tfidf(train_ct)
del train_ct

100%|██████████| 442359/442359 [00:00<00:00, 721017.89it/s]


(442359, 159010)
vocab size : 159010


#### model 1

##### Load Model

In [22]:
# Load the FastText model stored in the `..._cbow_mean-0_..._by-ct.model` file
# (sg-0) and ask bm.Return_ModelName for the name that the following cells
# append to cache, classifier and output file names.
taggerName_ct = 'ct'
print(taggerName_ct)
model1_ct = FastText.load(os.path.join(
    loadModelPath,
    'fastText_size-1000_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-ct.model'))
model1_ct_Name = bm.Return_ModelName('fastText', model1_ct, taggerName_ct)

ct


##### Vectorization

In [23]:
# Vectorize the Daum comments with model 1 (ct), reusing the pickle cache when
# present. Context managers close the cache file handles deterministically.
# 1000 is presumably the embedding size (the model file name says size-1000).
daum_vecs_file = os.path.join(outcome_vectorized_data, model1_ct_Name + '-daum')
if os.path.isfile(daum_vecs_file):
    with open(daum_vecs_file, 'rb') as cache_file:
        daum_vecs_by_model1 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, daum_vecs_by_model1 = bm.Make_Pre_Data_For_DAUM(model1_ct, tfidf_ct, 1000, tagged_daum_ct)
    with open(daum_vecs_file, 'wb') as cache_file:
        pickle.dump(daum_vecs_by_model1, cache_file)
    del wv1

  1%|          | 1691/162564 [00:00<00:09, 16885.98it/s]

FastText(vocab=162564, size=1000, alpha=0.025)


100%|██████████| 162564/162564 [00:02<00:00, 74750.59it/s]
3it [00:00, 29.79it/s]

running time : 0:00:02.228029
Vectorizing Data


903it [00:02, 423.67it/s]


scaling Data
total running time : 0:00:04.435935


In [24]:
# Vectorize the Naver comments with model 1 (ct), reusing the pickle cache when
# present. Context managers close the cache file handles deterministically.
# NOTE(review): the helper is named ..._For_DAUM but is applied to Naver data
# here; presumably it is source-agnostic — confirm.
naver_vecs_file = os.path.join(outcome_vectorized_data, model1_ct_Name + '-naver')
if os.path.isfile(naver_vecs_file):
    with open(naver_vecs_file, 'rb') as cache_file:
        naver_vecs_by_model1 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, naver_vecs_by_model1 = bm.Make_Pre_Data_For_DAUM(model1_ct, tfidf_ct, 1000, tagged_naver_ct)
    with open(naver_vecs_file, 'wb') as cache_file:
        pickle.dump(naver_vecs_by_model1, cache_file)
    del wv1

  5%|▌         | 8857/162564 [00:00<00:01, 88537.15it/s]

FastText(vocab=162564, size=1000, alpha=0.025)


100%|██████████| 162564/162564 [00:01<00:00, 98343.74it/s]
20it [00:00, 197.86it/s]

running time : 0:00:01.657101
Vectorizing Data


1465it [00:02, 488.61it/s]


scaling Data
total running time : 0:00:04.731622


##### Load Classifier

In [25]:
# Collect every classifier file whose name ends with this model's name and load
# each one. bm.LoadClassifier must return a (key, value) pair per path, since
# the results are fed straight into dict().
classifier_by_model1 = glob(os.path.join(classifierPath, '*' + model1_ct_Name))
load_Classifier_by_model1_Dict = dict(
    bm.LoadClassifier(clf_path) for clf_path in classifier_by_model1
)

In [26]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_daum = dict(map(lambda x: bm.PredictSentiment(daum_vecs_by_model1, x, load_Classifier_by_model1_Dict[x]), load_Classifier_by_model1_Dict))
predict_Outcome_daum = pd.DataFrame.from_dict(predict_Outcome_daum)
predict_Outcome_daum = extDfDaum.merge(predict_Outcome_daum,
                                   left_index = True, right_index = True)
predict_Outcome_daum.to_csv(outcomeDaumCommentsPath+'outcome_comments_sentiment_daum_'+model1_ct_Name,index=None, encoding='utf-8')

903it [00:00, 525962.58it/s]
903it [00:00, 420128.29it/s]
903it [00:00, 200490.00it/s]

CPU times: user 2.43 s, sys: 241 ms, total: 2.67 s
Wall time: 2.67 s





In [27]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_naver = dict(map(lambda x: bm.PredictSentiment(naver_vecs_by_model1, x, load_Classifier_by_model1_Dict[x]), load_Classifier_by_model1_Dict))
predict_Outcome_naver = pd.DataFrame.from_dict(predict_Outcome_naver)
predict_Outcome_naver = extDfNaver.merge(predict_Outcome_naver,
                                   left_index = True, right_index = True)
predict_Outcome_naver.to_csv(outcomeNaverCommentsPath+'outcome_comments_sentiment_naver_'+model1_ct_Name,index=None, encoding='utf-8')

1465it [00:00, 408771.64it/s]
1465it [00:00, 426979.04it/s]
1465it [00:00, 645379.20it/s]

CPU times: user 3.83 s, sys: 148 ms, total: 3.98 s
Wall time: 3.9 s





In [28]:
# Drop every model-1 (ct) object so the names can be reused by the next model.
del model1_ct, model1_ct_Name
del classifier_by_model1, load_Classifier_by_model1_Dict
del daum_vecs_by_model1, naver_vecs_by_model1
del predict_Outcome_naver, predict_Outcome_daum

#### model 2

##### Load Model

In [37]:
# Load the FastText model stored in the `..._cbow_mean-1_..._by-ct.model` file
# (sg-0) and ask bm.Return_ModelName for the name that the following cells
# append to cache, classifier and output file names.
taggerName_ct = 'ct'
print(taggerName_ct)
model2_ct = FastText.load(os.path.join(
    loadModelPath,
    'fastText_size-1000_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-ct.model'))
model2_ct_Name = bm.Return_ModelName('fastText', model2_ct, taggerName_ct)

ct


##### Vectorization

In [38]:
# Vectorize the Daum comments with model 2 (ct), reusing the pickle cache when
# present. Context managers close the cache file handles deterministically.
# 1000 is presumably the embedding size (the model file name says size-1000).
daum_vecs_file = os.path.join(outcome_vectorized_data, model2_ct_Name + '-daum')
if os.path.isfile(daum_vecs_file):
    with open(daum_vecs_file, 'rb') as cache_file:
        daum_vecs_by_model2 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, daum_vecs_by_model2 = bm.Make_Pre_Data_For_DAUM(model2_ct, tfidf_ct, 1000, tagged_daum_ct)
    with open(daum_vecs_file, 'wb') as cache_file:
        pickle.dump(daum_vecs_by_model2, cache_file)
    del wv1

In [39]:
# Vectorize the Naver comments with model 2 (ct), reusing the pickle cache when
# present. Context managers close the cache file handles deterministically.
# NOTE(review): the helper is named ..._For_DAUM but is applied to Naver data
# here; presumably it is source-agnostic — confirm.
naver_vecs_file = os.path.join(outcome_vectorized_data, model2_ct_Name + '-naver')
if os.path.isfile(naver_vecs_file):
    with open(naver_vecs_file, 'rb') as cache_file:
        naver_vecs_by_model2 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, naver_vecs_by_model2 = bm.Make_Pre_Data_For_DAUM(model2_ct, tfidf_ct, 1000, tagged_naver_ct)
    with open(naver_vecs_file, 'wb') as cache_file:
        pickle.dump(naver_vecs_by_model2, cache_file)
    del wv1

##### Load Classifier

In [40]:
# Collect every classifier file whose name ends with this model's name and load
# each one. bm.LoadClassifier must return a (key, value) pair per path, since
# the results are fed straight into dict().
classifier_by_model2 = glob(os.path.join(classifierPath, '*' + model2_ct_Name))
load_Classifier_by_model2_Dict = dict(
    bm.LoadClassifier(clf_path) for clf_path in classifier_by_model2
)

In [41]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_daum = dict(map(lambda x: bm.PredictSentiment(daum_vecs_by_model2, x, load_Classifier_by_model2_Dict[x]), load_Classifier_by_model2_Dict))
predict_Outcome_daum = pd.DataFrame.from_dict(predict_Outcome_daum)
predict_Outcome_daum = extDfDaum.merge(predict_Outcome_daum,
                                   left_index = True, right_index = True)
predict_Outcome_daum.to_csv(outcomeDaumCommentsPath+'outcome_comments_sentiment_daum_'+model2_ct_Name,index=None, encoding='utf-8')

903it [00:00, 227229.21it/s]
903it [00:00, 607743.34it/s]
903it [00:00, 249651.08it/s]

CPU times: user 3.19 s, sys: 415 ms, total: 3.6 s
Wall time: 4 s





In [42]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_naver = dict(map(lambda x: bm.PredictSentiment(naver_vecs_by_model2, x, load_Classifier_by_model2_Dict[x]), load_Classifier_by_model2_Dict))
predict_Outcome_naver = pd.DataFrame.from_dict(predict_Outcome_naver)
predict_Outcome_naver = extDfNaver.merge(predict_Outcome_naver,
                                   left_index = True, right_index = True)
predict_Outcome_naver.to_csv(outcomeNaverCommentsPath+'outcome_comments_sentiment_naver_'+model2_ct_Name,index=None, encoding='utf-8')

1465it [00:00, 792245.40it/s]
1465it [00:00, 521706.18it/s]
1465it [00:00, 545464.30it/s]

CPU times: user 4.42 s, sys: 181 ms, total: 4.6 s
Wall time: 4.56 s





In [43]:
# Drop every model-2 (ct) object so the names can be reused by the next model.
del model2_ct, model2_ct_Name
del classifier_by_model2, load_Classifier_by_model2_Dict
del daum_vecs_by_model2, naver_vecs_by_model2
del predict_Outcome_naver, predict_Outcome_daum

#### model 3

##### Load Model

In [44]:
# Load the FastText model stored in the `..._sg-1_cbow_mean-0_..._by-ct.model`
# file and ask bm.Return_ModelName for the name that the following cells append
# to cache, classifier and output file names.
taggerName_ct = 'ct'
print(taggerName_ct)
model3_ct = FastText.load(os.path.join(
    loadModelPath,
    'fastText_size-1000_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-ct.model'))
model3_ct_Name = bm.Return_ModelName('fastText', model3_ct, taggerName_ct)
print('Load Model')

ct
Load Model


##### Vectorization

In [45]:
# Vectorize the Daum comments with model 3 (ct), reusing the pickle cache when
# present. Context managers close the cache file handles deterministically.
# 1000 is presumably the embedding size (the model file name says size-1000).
daum_vecs_file = os.path.join(outcome_vectorized_data, model3_ct_Name + '-daum')
if os.path.isfile(daum_vecs_file):
    with open(daum_vecs_file, 'rb') as cache_file:
        daum_vecs_by_model3 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, daum_vecs_by_model3 = bm.Make_Pre_Data_For_DAUM(model3_ct, tfidf_ct, 1000, tagged_daum_ct)
    with open(daum_vecs_file, 'wb') as cache_file:
        pickle.dump(daum_vecs_by_model3, cache_file)
    del wv1

  1%|          | 823/162564 [00:00<00:20, 7982.72it/s]

FastText(vocab=162564, size=1000, alpha=0.025)


100%|██████████| 162564/162564 [00:01<00:00, 83295.29it/s]
27it [00:00, 253.40it/s]

running time : 0:00:01.957660
Vectorizing Data


903it [00:01, 716.86it/s]


scaling Data
total running time : 0:00:03.300235


In [46]:
# Vectorize the Naver comments with model 3 (ct), reusing the pickle cache when
# present. Context managers close the cache file handles deterministically.
# NOTE(review): the helper is named ..._For_DAUM but is applied to Naver data
# here; presumably it is source-agnostic — confirm.
naver_vecs_file = os.path.join(outcome_vectorized_data, model3_ct_Name + '-naver')
if os.path.isfile(naver_vecs_file):
    with open(naver_vecs_file, 'rb') as cache_file:
        naver_vecs_by_model3 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, naver_vecs_by_model3 = bm.Make_Pre_Data_For_DAUM(model3_ct, tfidf_ct, 1000, tagged_naver_ct)
    with open(naver_vecs_file, 'wb') as cache_file:
        pickle.dump(naver_vecs_by_model3, cache_file)
    del wv1

  9%|▊         | 14019/162564 [00:00<00:02, 70078.86it/s]

FastText(vocab=162564, size=1000, alpha=0.025)


100%|██████████| 162564/162564 [00:01<00:00, 91225.07it/s]
50it [00:00, 495.43it/s]

running time : 0:00:01.786829
Vectorizing Data


1465it [00:02, 677.85it/s]


scaling Data
total running time : 0:00:04.006457


##### Load Classifier

In [47]:
# Collect every classifier file whose name ends with this model's name and load
# each one. bm.LoadClassifier must return a (key, value) pair per path, since
# the results are fed straight into dict().
classifier_by_model3 = glob(os.path.join(classifierPath, '*' + model3_ct_Name))
load_Classifier_by_model3_Dict = dict(
    bm.LoadClassifier(clf_path) for clf_path in classifier_by_model3
)

In [48]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_daum = dict(map(lambda x: bm.PredictSentiment(daum_vecs_by_model3, x, load_Classifier_by_model3_Dict[x]), load_Classifier_by_model3_Dict))
predict_Outcome_daum = pd.DataFrame.from_dict(predict_Outcome_daum)
predict_Outcome_daum = extDfDaum.merge(predict_Outcome_daum,
                                   left_index = True, right_index = True)
predict_Outcome_daum.to_csv(outcomeDaumCommentsPath+'outcome_comments_sentiment_daum_'+model3_ct_Name,index=None, encoding='utf-8')

903it [00:00, 224521.70it/s]
903it [00:00, 302899.59it/s]
903it [00:00, 494187.96it/s]

CPU times: user 3.33 s, sys: 395 ms, total: 3.73 s
Wall time: 3.82 s





In [49]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_naver = dict(map(lambda x: bm.PredictSentiment(naver_vecs_by_model3, x, load_Classifier_by_model3_Dict[x]), load_Classifier_by_model3_Dict))
predict_Outcome_naver = pd.DataFrame.from_dict(predict_Outcome_naver)
predict_Outcome_naver = extDfNaver.merge(predict_Outcome_naver,
                                   left_index = True, right_index = True)
predict_Outcome_naver.to_csv(outcomeNaverCommentsPath+'outcome_comments_sentiment_naver_'+model3_ct_Name,index=None, encoding='utf-8')

1465it [00:00, 552179.67it/s]
1465it [00:00, 415375.88it/s]
1465it [00:00, 659014.95it/s]

CPU times: user 5.67 s, sys: 235 ms, total: 5.91 s
Wall time: 7.93 s





In [50]:
# Drop every model-3 (ct) object, plus the ct TF-IDF vectorizer, which no later
# cell in the ct section needs.
del model3_ct, model3_ct_Name
del classifier_by_model3, load_Classifier_by_model3_Dict
del daum_vecs_by_model3, naver_vecs_by_model3
del predict_Outcome_naver, predict_Outcome_daum
del tfidf_ct

### Mecab

#### Comments to tagged Document

In [51]:
%%time
tagged_by_mecab_daum_file = outcome_tagged_data+'fastText_tagged_data_by_mecab_for_daum_news_comment.pickled'
if os.path.isfile(tagged_by_mecab_daum_file):
    tagged_daum_mecab = pickle.load(open(tagged_by_mecab_daum_file, 'rb'))
else:
    tagged_daum_mecab = bm.MakeTaggedData_For_Comments(dfDaum, TaggedDocument, mecab, stopwords)
    pickle.dump(tagged_daum_mecab, open(tagged_by_mecab_daum_file, 'wb'))
    #del tagged_daum_mecab

tagged_by_mecab_naver_file = outcome_tagged_data+'fastText_tagged_data_by_mecab_for_naver_news_comment.pickled'
if os.path.isfile(tagged_by_mecab_naver_file):
    tagged_naver_mecab = pickle.load(open(tagged_by_mecab_naver_file, 'rb'))
else:
    tagged_naver_mecab = bm.MakeTaggedData_For_Comments(dfNaver, TaggedDocument, mecab, stopwords)
    pickle.dump(tagged_naver_mecab, open(tagged_by_mecab_naver_file, 'wb'))
    #del tagged_naver_mecab

100%|██████████| 903/903 [00:02<00:00, 370.01it/s]
100%|██████████| 1465/1465 [00:03<00:00, 404.45it/s]

CPU times: user 4.08 s, sys: 417 ms, total: 4.5 s
Wall time: 6.45 s





#### Train data set으로부터 TF-IDF Vectorizer를 만듦

In [None]:
# Build the TF-IDF vectorizer from the pickled `mecab` training set, then drop
# the training data. The pickle is read through a context manager so the file
# handle is closed right away.
trainName_mecab = './data/pre_data/train_test_Data/pre_data_train_for_fastText_sentiment_by_mecab.pickled'
with open(trainName_mecab, 'rb') as train_file:
    train_mecab = pickle.load(train_file)
tfidf_mecab = bm.Build_tfidf(train_mecab)
del train_mecab

100%|██████████| 442359/442359 [00:00<00:00, 813670.01it/s] 


#### model 1

##### Load Model

In [None]:
# Load the FastText model stored in the `..._cbow_mean-0_..._by-mecab.model`
# file (sg-0) and ask bm.Return_ModelName for the name that the following cells
# append to cache, classifier and output file names.
taggerName_mecab = 'mecab'
print(taggerName_mecab)
model1_mecab = FastText.load(os.path.join(
    loadModelPath,
    'fastText_size-1000_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-mecab.model'))
model1_mecab_Name = bm.Return_ModelName('fastText', model1_mecab, taggerName_mecab)

##### Vectorization

In [None]:
# Vectorize the Daum comments with model 1 (mecab), reusing the pickle cache
# when present. Context managers close the cache file handles deterministically.
# 1000 is presumably the embedding size (the model file name says size-1000).
daum_vecs_file = os.path.join(outcome_vectorized_data, model1_mecab_Name + '-daum')
if os.path.isfile(daum_vecs_file):
    with open(daum_vecs_file, 'rb') as cache_file:
        daum_vecs_by_model1 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, daum_vecs_by_model1 = bm.Make_Pre_Data_For_DAUM(model1_mecab, tfidf_mecab, 1000, tagged_daum_mecab)
    with open(daum_vecs_file, 'wb') as cache_file:
        pickle.dump(daum_vecs_by_model1, cache_file)
    del wv1

In [None]:
# Vectorize the Naver comments with model 1 (mecab), reusing the pickle cache
# when present. Context managers close the cache file handles deterministically.
# NOTE(review): the helper is named ..._For_DAUM but is applied to Naver data
# here; presumably it is source-agnostic — confirm.
naver_vecs_file = os.path.join(outcome_vectorized_data, model1_mecab_Name + '-naver')
if os.path.isfile(naver_vecs_file):
    with open(naver_vecs_file, 'rb') as cache_file:
        naver_vecs_by_model1 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, naver_vecs_by_model1 = bm.Make_Pre_Data_For_DAUM(model1_mecab, tfidf_mecab, 1000, tagged_naver_mecab)
    with open(naver_vecs_file, 'wb') as cache_file:
        pickle.dump(naver_vecs_by_model1, cache_file)
    del wv1

##### Load Classifier

In [None]:
# Collect every classifier file whose name ends with this model's name and load
# each one. bm.LoadClassifier must return a (key, value) pair per path, since
# the results are fed straight into dict().
classifier_by_model1 = glob(os.path.join(classifierPath, '*' + model1_mecab_Name))
load_Classifier_by_model1_Dict = dict(
    bm.LoadClassifier(clf_path) for clf_path in classifier_by_model1
)

In [None]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_daum = dict(map(lambda x: bm.PredictSentiment(daum_vecs_by_model1, x, load_Classifier_by_model1_Dict[x]), load_Classifier_by_model1_Dict))
predict_Outcome_daum = pd.DataFrame.from_dict(predict_Outcome_daum)
predict_Outcome_daum = extDfDaum.merge(predict_Outcome_daum,
                                   left_index = True, right_index = True)
predict_Outcome_daum.to_csv(outcomeDaumCommentsPath+'outcome_comments_sentiment_daum_'+model1_mecab_Name,index=None, encoding='utf-8')

In [None]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_naver = dict(map(lambda x: bm.PredictSentiment(naver_vecs_by_model1, x, load_Classifier_by_model1_Dict[x]), load_Classifier_by_model1_Dict))
predict_Outcome_naver = pd.DataFrame.from_dict(predict_Outcome_naver)
predict_Outcome_naver = extDfNaver.merge(predict_Outcome_naver,
                                   left_index = True, right_index = True)
predict_Outcome_naver.to_csv(outcomeNaverCommentsPath+'outcome_comments_sentiment_naver_'+model1_mecab_Name,index=None, encoding='utf-8')

In [None]:
# Drop every model-1 (mecab) object so the names can be reused by the next model.
del model1_mecab, model1_mecab_Name
del classifier_by_model1, load_Classifier_by_model1_Dict
del daum_vecs_by_model1, naver_vecs_by_model1
del predict_Outcome_naver, predict_Outcome_daum

#### model 2

##### Load Model

In [None]:
# Load the FastText model stored in the `..._cbow_mean-1_..._by-mecab.model`
# file (sg-0) and ask bm.Return_ModelName for the name that the following cells
# append to cache, classifier and output file names.
taggerName_mecab = 'mecab'
print(taggerName_mecab)
model2_mecab = FastText.load(os.path.join(
    loadModelPath,
    'fastText_size-1000_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-mecab.model'))
model2_mecab_Name = bm.Return_ModelName('fastText', model2_mecab, taggerName_mecab)

##### Vectorization

In [None]:
# Vectorize the Daum comments with model 2 (mecab), reusing the pickle cache
# when present. Context managers close the cache file handles deterministically.
# 1000 is presumably the embedding size (the model file name says size-1000).
daum_vecs_file = os.path.join(outcome_vectorized_data, model2_mecab_Name + '-daum')
if os.path.isfile(daum_vecs_file):
    with open(daum_vecs_file, 'rb') as cache_file:
        daum_vecs_by_model2 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, daum_vecs_by_model2 = bm.Make_Pre_Data_For_DAUM(model2_mecab, tfidf_mecab, 1000, tagged_daum_mecab)
    with open(daum_vecs_file, 'wb') as cache_file:
        pickle.dump(daum_vecs_by_model2, cache_file)
    del wv1

In [None]:
# Vectorize the Naver comments with model 2 (mecab), reusing the pickle cache
# when present. Context managers close the cache file handles deterministically.
# NOTE(review): the helper is named ..._For_DAUM but is applied to Naver data
# here; presumably it is source-agnostic — confirm.
naver_vecs_file = os.path.join(outcome_vectorized_data, model2_mecab_Name + '-naver')
if os.path.isfile(naver_vecs_file):
    with open(naver_vecs_file, 'rb') as cache_file:
        naver_vecs_by_model2 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, naver_vecs_by_model2 = bm.Make_Pre_Data_For_DAUM(model2_mecab, tfidf_mecab, 1000, tagged_naver_mecab)
    with open(naver_vecs_file, 'wb') as cache_file:
        pickle.dump(naver_vecs_by_model2, cache_file)
    del wv1

##### Load Classifier

In [None]:
# Collect every classifier file whose name ends with this model's name and load
# each one. bm.LoadClassifier must return a (key, value) pair per path, since
# the results are fed straight into dict().
classifier_by_model2 = glob(os.path.join(classifierPath, '*' + model2_mecab_Name))
load_Classifier_by_model2_Dict = dict(
    bm.LoadClassifier(clf_path) for clf_path in classifier_by_model2
)

In [None]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_daum = dict(map(lambda x: bm.PredictSentiment(daum_vecs_by_model2, x, load_Classifier_by_model2_Dict[x]), load_Classifier_by_model2_Dict))
predict_Outcome_daum = pd.DataFrame.from_dict(predict_Outcome_daum)
predict_Outcome_daum = extDfDaum.merge(predict_Outcome_daum,
                                   left_index = True, right_index = True)
predict_Outcome_daum.to_csv(outcomeDaumCommentsPath+'outcome_comments_sentiment_daum_'+model2_mecab_Name,index=None, encoding='utf-8')

In [None]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_naver = dict(map(lambda x: bm.PredictSentiment(naver_vecs_by_model2, x, load_Classifier_by_model2_Dict[x]), load_Classifier_by_model2_Dict))
predict_Outcome_naver = pd.DataFrame.from_dict(predict_Outcome_naver)
predict_Outcome_naver = extDfNaver.merge(predict_Outcome_naver,
                                   left_index = True, right_index = True)
predict_Outcome_naver.to_csv(outcomeNaverCommentsPath+'outcome_comments_sentiment_naver_'+model2_mecab_Name,index=None, encoding='utf-8')

In [None]:
# Drop every model-2 (mecab) object so the names can be reused by the next model.
del model2_mecab, model2_mecab_Name
del classifier_by_model2, load_Classifier_by_model2_Dict
del daum_vecs_by_model2, naver_vecs_by_model2
del predict_Outcome_naver, predict_Outcome_daum

#### model 3

##### Load Model

In [None]:
# Load the FastText model stored in the `..._sg-1_cbow_mean-0_..._by-mecab.model`
# file and ask bm.Return_ModelName for the name that the following cells append
# to cache, classifier and output file names.
taggerName_mecab = 'mecab'
print(taggerName_mecab)
model3_mecab = FastText.load(os.path.join(
    loadModelPath,
    'fastText_size-1000_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-mecab.model'))
model3_mecab_Name = bm.Return_ModelName('fastText', model3_mecab, taggerName_mecab)
print('Load Model')

##### Vectorization

In [None]:
# Vectorize the Daum comments with model 3 (mecab), reusing the pickle cache
# when present. Context managers close the cache file handles deterministically.
# 1000 is presumably the embedding size (the model file name says size-1000).
daum_vecs_file = os.path.join(outcome_vectorized_data, model3_mecab_Name + '-daum')
if os.path.isfile(daum_vecs_file):
    with open(daum_vecs_file, 'rb') as cache_file:
        daum_vecs_by_model3 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, daum_vecs_by_model3 = bm.Make_Pre_Data_For_DAUM(model3_mecab, tfidf_mecab, 1000, tagged_daum_mecab)
    with open(daum_vecs_file, 'wb') as cache_file:
        pickle.dump(daum_vecs_by_model3, cache_file)
    del wv1

In [None]:
# Vectorize the Naver comments with model 3 (mecab), reusing the pickle cache
# when present. Context managers close the cache file handles deterministically.
# NOTE(review): the helper is named ..._For_DAUM but is applied to Naver data
# here; presumably it is source-agnostic — confirm.
naver_vecs_file = os.path.join(outcome_vectorized_data, model3_mecab_Name + '-naver')
if os.path.isfile(naver_vecs_file):
    with open(naver_vecs_file, 'rb') as cache_file:
        naver_vecs_by_model3 = pickle.load(cache_file)
else:
    # The helper returns two values; only the second is kept.
    wv1, naver_vecs_by_model3 = bm.Make_Pre_Data_For_DAUM(model3_mecab, tfidf_mecab, 1000, tagged_naver_mecab)
    with open(naver_vecs_file, 'wb') as cache_file:
        pickle.dump(naver_vecs_by_model3, cache_file)
    del wv1

##### Load Classifier

In [None]:
# Collect every classifier file whose name ends with this model's name and load
# each one. bm.LoadClassifier must return a (key, value) pair per path, since
# the results are fed straight into dict().
classifier_by_model3 = glob(os.path.join(classifierPath, '*' + model3_mecab_Name))
load_Classifier_by_model3_Dict = dict(
    bm.LoadClassifier(clf_path) for clf_path in classifier_by_model3
)

In [None]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_daum = dict(map(lambda x: bm.PredictSentiment(daum_vecs_by_model3, x, load_Classifier_by_model3_Dict[x]), load_Classifier_by_model3_Dict))
predict_Outcome_daum = pd.DataFrame.from_dict(predict_Outcome_daum)
predict_Outcome_daum = extDfDaum.merge(predict_Outcome_daum,
                                   left_index = True, right_index = True)
predict_Outcome_daum.to_csv(outcomeDaumCommentsPath+'outcome_comments_sentiment_daum_'+model3_mecab_Name,index=None, encoding='utf-8')

In [None]:
%%time
warnings.filterwarnings('ignore')
predict_Outcome_naver = dict(map(lambda x: bm.PredictSentiment(naver_vecs_by_model3, x, load_Classifier_by_model3_Dict[x]), load_Classifier_by_model3_Dict))
predict_Outcome_naver = pd.DataFrame.from_dict(predict_Outcome_naver)
predict_Outcome_naver = extDfNaver.merge(predict_Outcome_naver,
                                   left_index = True, right_index = True)
predict_Outcome_naver.to_csv(outcomeNaverCommentsPath+'outcome_comments_sentiment_naver_'+model3_mecab_Name,index=None, encoding='utf-8')

In [None]:
# Drop every model-3 (mecab) object, plus the mecab TF-IDF vectorizer.
del model3_mecab, model3_mecab_Name
del classifier_by_model3, load_Classifier_by_model3_Dict
del daum_vecs_by_model3, naver_vecs_by_model3
del predict_Outcome_naver, predict_Outcome_daum
del tfidf_mecab