# 데이터베이스에 저장되어 있는 댓글들을 기사별로 파일로 만들자

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob
import warnings

os.environ['KERAS_BACKEND']='tensorflow'

from numba import jit
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import Database_Handler as dh
import Basic_Module as bm

In [3]:
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
# Create one instance of each tagger class imported above:
# Twitter (from ckonlpy) and Mecab (from konlpy).
ct = Twitter()
mecab = Mecab()

## Stopwords

In [4]:
# Load the stopword list: one entry per line, with surrounding whitespace
# (including the trailing newline) stripped from each entry.
# The context manager closes the file handle, which a bare open() call
# would leave open until garbage collection.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

## Load Data

### Path

In [5]:
# Platform-specific locations of the models, classifier data, sentiment
# analysis outputs and crawled comment files.
if sys.platform == 'darwin':
    loadModelPath = '/Volumes/disk1/model/'
    classifierPath = '/Volumes/disk1/data/pre_data/classifier/'
    # NOTE(review): newsPath is only assigned in the win32 branch below;
    # confirm whether macOS needs an equivalent before relying on it.
    news_senti_outcome = '/Volumes/disk1/outcome_for_News_sentiment_analysis/'
    daumCommentPath = './data/daum_Comments/'
    naverCommentsPath = './data/naver_Comments/'
elif sys.platform == 'win32':
    loadModelPath = 'd:/model/'
    classifierPath = 'd:/data/pre_data/classifier/'
    newsPath = './data/pre_data/news_sentiment/'
    news_senti_outcome = './outcome_for_News_sentiment_analysis/'
    daumCommentPath = 'd:/data/daum_Comments/'
    naverCommentsPath = 'd:/data/naver_Comments/'
else:
    # No paths are configured for other platforms. Say so here instead of
    # letting a later cell fail with an unexplained NameError.
    warnings.warn(
        'No data paths are configured for platform %r; '
        'path variables such as news_senti_outcome are undefined.' % sys.platform
    )

### News

In [6]:
# List the files present in the sentiment-analysis output directory.
os.listdir(news_senti_outcome)

['daum_news_sentiment_analysis.csv', 'naver_news_sentiment_analysis.csv']

In [7]:
# Naver
# Read the Naver news sentiment table; the first CSV column becomes the
# index and the first row supplies the column names.
naver_csv_path = os.path.join(news_senti_outcome, 'naver_news_sentiment_analysis.csv')
naverData = pd.read_csv(
    naver_csv_path,
    index_col=0,
    header=0,
    encoding='utf-8',
)
# Tag every row with the site it came from.
naverData['site'] = 'Naver'
#naverData.head()

#### 크롤링된 댓글수가 0개인 것을 찾아보자

In [8]:
# Show the rows whose number_of_crawled_comment is 0.
naverData[naverData.number_of_crawled_comment == 0]

Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a32e268588c136a189e66d0,IT/과학,2017.12.14,노컷뉴스,1,0,28,"드론, 하천측량 업무에 본격 활용된다","국토부, 드론기반 하상변동조사·하천측량 시범사업 착수 [CBS노컷뉴스 곽영식 기자...","['드론', '시범사업', '국토교통부']","{'기존', '활용', '드론', '시범 사업', '국토부', '하천 측량'}",28.0,80.0,positive,Naver
5a44bcf3588c13269cc40b83,생활/문화,2017.12.27,연합뉴스,0,0,29,임실 변천사 한눈에…1730년 작성한 운수지 공개,(임실=연합뉴스) 이윤승 기자 = 1730년대 전북 임실군의 변천사를 한눈에 파악할...,"['조선시대', '달리']","{'중요한', '작성', '참여', '운수', '설명', '확보', '임실군', '...",38.0,70.0,positive,Naver
5a6668ee588c131c881e0a35,IT/과학,2018.01.19,뉴스1,0,0,21,CJ헬로 매각 부인한 CJ오쇼핑 '3개월내 매각못한다'…왜?,© News1 안은나 기자 계속 제기되는 매각설에 CJ오쇼핑과 CJ헬로 '곤혹'...,"['이슈 · LGU+, CJ헬로 인수설']","{'CJ 헬로', '공시', '결정', 'CJ오쇼핑', '매각설'}",85.0,23.0,negative,Naver


Naver 뉴스를 크롤링한 데이터베이스에는 크롤링된 댓글 수(number_of_crawled_comment)가 0개인 뉴스가 3건 존재함

### 새롭게 정의된 뉴스목록

In [9]:
# Keep only the articles whose crawled-comment count is non-zero.
has_crawled_comments = naverData['number_of_crawled_comment'] != 0
reNaverData = naverData[has_crawled_comments]

In [10]:
# Preview the first rows of the filtered table.
reNaverData.head()

Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a29c445588c132954d1973a,정치,2017.12.07,연합뉴스,1713,1465,1,"北외무성 ""전쟁 바라지 않지만 결코 피하지 않을 것""","美고위인사 대북언급 비난하며 ""전쟁 기정사실화"" 위협 며칠 새 이어지는 북한 군민...","['외무성', '핵전쟁', '대변인']","{'중앙', '핵전쟁', '조선반도', '고위', '미국', '북한', '대변인',...",86.0,22.0,negative,Naver
5a29c445588c132954d1973b,정치,2017.12.07,한국일보,2551,2062,2,"예산전쟁, 예결위 간사ㆍ호남이 웃었다",예결위 간사들이 최대 수혜자..당 지도부 내 몫 챙기기도 여전 황주홍ㆍ김도읍 등...,"['예산', '예결위', 'soc']","{'호남', '증액', '지역구', '예산안', '정부안', '의원', '국민의당'}",46.0,62.0,positive,Naver
5a29c445588c132954d1973c,정치,2017.12.07,뉴시스,610,536,3,"혐의 부인에 20시간 조사…檢, 최경환 구속 카드 꺼내나",【서울=뉴시스】 최진석 기자 = 박근혜 정부 시절 국가정보원 특수활동비 수수 의혹 ...,"['최경환', '구속영장', '국가정보원']","{'정기국회', '혐의', '검찰', '국정원장', '의원', '구속영장 청구', ...",77.0,31.0,negative,Naver
5a29c445588c132954d1973d,정치,2017.12.07,연합뉴스,145,133,4,"최재형 감사원장 후보자 ""독립성 강화는 임명권자의 뜻""",감사원장에 내정된 최재형 사법연수원장(고양=연합뉴스) 이희열 기자 = 7일 감사원장...,"['이슈 · 최재형 감사원장 내정', '감사원장', '최재형', '감사원']","{'감사원장', '공직 사회', '지명', '생활', '법관', '후보자'}",39.0,69.0,positive,Naver
5a29c445588c132954d1973e,정치,2017.12.07,동아일보,1074,932,5,"B-1B 한반도에 뜨자, 평양 비운 김정은",[동아일보] 북중 접경지 양강도 삼지연 시찰… 방북 유엔 사무차장 면담 안할듯 B-...,"['김정은', 'b-1b', '한반도']","{'접경', '훈련', '펠트먼', '시찰', '양강도', '사무차장', '공장',...",72.0,36.0,negative,Naver


### 댓글을 dataframe으로 만들어 저장
* 읽어들이는 데 시간이 오래 걸림.

In [11]:
import dask.dataframe as dd

In [None]:
# Naver
def Naver_Comment(row):
    """Forward one article row to ``bm.Make_Comments_File2`` with the Naver comments path.

    The return value is whatever ``bm.Make_Comments_File2`` returns.
    """
    return bm.Make_Comments_File2(naverCommentsPath, row)


# Split the filtered article table into 30 dask partitions.
dd_naver = dd.from_pandas(reNaverData, npartitions=30)

# Build the lazy row-wise apply first, then trigger the actual work.
# meta=int declares the expected output type to dask up front
# (presumably the helper returns an int -- TODO confirm).
lazy_naver_result = dd_naver.apply(Naver_Comment, axis=1, meta=int)
lazy_naver_result.compute()

(231, 8)
(12, 8)
(493, 8)
(21, 8)
(171, 8)
(7, 8)
(669, 8)
(1503, 8)
(97, 8)
(710, 8)
(1120, 8)
(1465, 8)
(72, 8)
(35, 8)
(2108, 8)
(627, 8)
(1772, 8)
(333, 8)
(378, 8)
(2036, 8)
(44, 8)
(718, 8)
(2041, 8)
(2062, 8)
(14, 8)
(15, 8)
(228, 8)
(551, 8)
(358, 8)
(164, 8)
(450, 8)
(719, 8)
(296, 8)
(240, 8)
(1481, 8)
(536, 8)
(436, 8)
(64, 8)
(12, 8)
(241, 8)
(543, 8)
(105, 8)
(495, 8)
(326, 8)
(16, 8)
(390, 8)
(722, 8)
(133, 8)
(833, 8)
(594, 8)
(104, 8)
(89, 8)
(421, 8)
(64, 8)
(763, 8)
(950, 8)
(179, 8)
(416, 8)
(2535, 8)
(932, 8)
(7, 8)
(180, 8)
(104, 8)
(67, 8)
(786, 8)
(45, 8)
(386, 8)
(2711, 8)
(19, 8)
(154, 8)
(362, 8)
(1291, 8)
(15, 8)
(81, 8)
(68, 8)
(95, 8)
(347, 8)
(16, 8)
(176, 8)
(23, 8)
(79, 8)
(3411, 8)
(2039, 8)
(19, 8)
(236, 8)
(692, 8)
(1953, 8)
(55, 8)
(71, 8)
(10, 8)
(164, 8)
(1231, 8)
(40, 8)
(483, 8)
(1259, 8)
(35, 8)
(137, 8)
(320, 8)
(37, 8)
(148, 8)
(93, 8)
(408, 8)
(209, 8)
(998, 8)
(76, 8)
(683, 8)
(1012, 8)
(11, 8)
(2473, 8)
(55, 8)
(3072, 8)
(55, 8)
(224, 8)
(6

(807, 8)
(365, 8)
(137, 8)
(7, 8)
(411, 8)
(23, 8)
(509, 8)
(81, 8)
(1134, 8)
(7817, 8)
(1024, 8)
(2110, 8)
(350, 8)
(148, 8)
(417, 8)
(185, 8)
(607, 8)
(149, 8)
(196, 8)
(536, 8)
(623, 8)
(3862, 8)
(457, 8)
(167, 8)
(268, 8)
(48, 8)
(29, 8)
(215, 8)
(88, 8)
(281, 8)
(567, 8)
(979, 8)
(311, 8)
(184, 8)
(663, 8)
(383, 8)
(463, 8)
(68, 8)
(263, 8)
(222, 8)
(247, 8)
(143, 8)
(30, 8)
(1054, 8)
(4585, 8)
(183, 8)
(1024, 8)
(2286, 8)
(370, 8)
(19, 8)
(138, 8)
(1359, 8)
(43, 8)
(376, 8)
(893, 8)
(297, 8)
(2925, 8)
(1432, 8)
(178, 8)
(273, 8)
(796, 8)
(20, 8)
(360, 8)
(681, 8)
(49, 8)
(36, 8)
(33, 8)
(185, 8)
(244, 8)
(2263, 8)
(394, 8)
(890, 8)
(1264, 8)
(31, 8)
(9, 8)
(78, 8)
(1833, 8)
(48, 8)
(85, 8)
(7, 8)
(467, 8)
(1724, 8)
(1092, 8)
(363, 8)
(27, 8)
(738, 8)
(234, 8)
(1913, 8)
(11, 8)
(575, 8)
(539, 8)
(737, 8)
(758, 8)
(453, 8)
(186, 8)
(2215, 8)
(17, 8)
(253, 8)
(43, 8)
(10, 8)
(87, 8)
(277, 8)
(443, 8)
(377, 8)
(1449, 8)
(1213, 8)
(428, 8)
(48, 8)
(14, 8)
(71, 8)
(114, 8)
(26, 8)
(228

(1433, 8)
(2591, 8)
(103, 8)
(68, 8)
(229, 8)
(31, 8)
(5999, 8)
(41, 8)
(568, 8)
(10, 8)
(580, 8)
(311, 8)
(777, 8)
(1167, 8)
(118, 8)
(331, 8)
(264, 8)
(25, 8)
(189, 8)
(64, 8)
(2138, 8)
(228, 8)
(263, 8)
(222, 8)
(1295, 8)
(61, 8)
(911, 8)
(1287, 8)
(219, 8)
(16, 8)
(72, 8)
(270, 8)
(1280, 8)
(1412, 8)
(224, 8)
(302, 8)
(94, 8)
(19, 8)
(1020, 8)
(252, 8)
(60, 8)
(44, 8)
(19, 8)
(43, 8)
(2082, 8)
(275, 8)
(101, 8)
(150, 8)
(59, 8)
(831, 8)
(441, 8)
(235, 8)
(687, 8)
(55, 8)
(77, 8)
(7, 8)
(807, 8)
(2556, 8)
(256, 8)
(294, 8)
(103, 8)
(56, 8)
(268, 8)
(6, 8)
(221, 8)
(947, 8)
(1020, 8)
(81, 8)
(2119, 8)
(1153, 8)
(267, 8)
(334, 8)
(138, 8)
(2101, 8)
(170, 8)
(264, 8)
(139, 8)
(361, 8)
(1205, 8)
(1432, 8)
(2402, 8)
(746, 8)
(254, 8)
(1153, 8)
(50, 8)
(10, 8)
(185, 8)
(449, 8)
(214, 8)
(183, 8)
(1875, 8)
(183, 8)
(239, 8)
(3890, 8)
(381, 8)
(1717, 8)
(104, 8)
(28, 8)
(172, 8)
(193, 8)
(259, 8)
(1184, 8)
(137, 8)
(249, 8)
(603, 8)
(218, 8)
(515, 8)
(3253, 8)
(127, 8)
(196, 8)
(106, 8)
(91