# 데이터베이스에 저장되어 있는 댓글들을 기사별로 파일로 만들자

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob
import warnings

os.environ['KERAS_BACKEND']='tensorflow'

from numba import jit
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import Database_Handler as dh
import Basic_Module as bm

In [3]:
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
# Instantiate the two taggers imported above: the ckonlpy Twitter tagger
# and the konlpy Mecab tagger.
ct = Twitter()
mecab = Mecab()

## Stopwords

In [4]:
# Load the stopword list: one stopword per line, surrounding whitespace removed.
# The context manager guarantees the file handle is closed after reading.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

## Load Data

### Path

In [58]:
# Platform-specific locations of models, input data and output directories.
# NOTE(review): only macOS ('darwin') and Windows ('win32') are handled; on
# any other platform none of these names are defined.
if sys.platform == 'darwin':
    loadModelPath = '/Volumes/disk1/model/'
    classifierPath = '/Volumes/disk1/data/pre_data/classifier/'
    news_senti_outcome = '/Volumes/disk1/outcome_for_News_sentiment_analysis/'
    daumCommentsPath = '/Volumes/disk1/data/daum_Comments/'
    naverCommentsPath = '/Volumes/disk1/data/naver_Comments/'
    outcomeDaumCommentsPath = '/Volumes/disk1/outcome_comments_for_daum/'
    outcomeNaverCommentsPath = '/Volumes/disk1/outcome_comments_for_naver/'
    # Absolute path like every other macOS entry (the leading '/' was missing,
    # which made this path relative to the current working directory).
    outcome_predata = '/Volumes/disk1/pre_data_for_comments/'
elif sys.platform == 'win32':
    loadModelPath = 'd:/model/'
    classifierPath = 'd:/data/pre_data/classifier/'
    # NOTE(review): newsPath is only defined in this Windows branch.
    newsPath = './data/pre_data/news_sentiment/'
    news_senti_outcome = './outcome_for_News_sentiment_analysis/'
    daumCommentsPath = 'd:/data/daum_Comments/'
    naverCommentsPath = 'd:/data/naver_Comments/'
    outcomeDaumCommentsPath = 'd:/outcome_comments_for_daum/'
    outcomeNaverCommentsPath = 'd:/outcome_comments_for_naver/'
    outcome_predata = 'd:/pre_data_for_comments/'

### News

In [10]:
# Show which files exist in the news sentiment-analysis output directory.
os.listdir(news_senti_outcome)

['naver_news_sentiment_analysis.csv', 'daum_news_sentiment_analysis.csv']

In [11]:
# Naver: read the news sentiment-analysis table (first CSV column becomes the
# index) and tag every row with the site it came from.
naver_news_csv = os.path.join(news_senti_outcome, 'naver_news_sentiment_analysis.csv')
naverData = pd.read_csv(
    naver_news_csv,
    index_col=0,
    header=0,
    encoding='utf-8',
)
naverData['site'] = ['Naver'] * len(naverData)
# naverData.head()

#### 크롤링된 댓글수가 0개인 것을 찾아보자

In [12]:
# Display the articles for which no comments were crawled.
no_crawled_comments = naverData['number_of_crawled_comment'] == 0
naverData[no_crawled_comments]

Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a29d24d588c1367a042440e,IT/과학,2017.12.02,이데일리,195,0,1,"‘액정이 두개’..삼성, 중국 특화 고사양 폴더폰 공개",[이데일리 정병묵 기자] 삼성전자가 중국 시장에 특화된 고사양 폴더폰을 선보였다. ...,"['삼성', '삼성전자', '빅스비']","{'삼성전자', '탑재', '고사양', '폴더폰', '스마트폰', '중국', '시장'}",41.0,67.0,positive,Naver
5a29d24d588c1367a042440f,IT/과학,2017.12.02,연합뉴스,907,0,2,"한국, GDP 대비 R&D 지출비율 전 세계 1위 국가…4.23%","R&D[연합뉴스TV 제공] 일본 3.29%, 미국 2.79%, 중국 2.07%,...","['gdp', '세계1위', '국']","{'GDP 대비', '기준', 'EU', '회원국', '지출'}",37.0,71.0,positive,Naver
5a29d24d588c1367a0424410,IT/과학,2017.12.02,머니S,348,0,3,"[잇츠IT] 아이폰X, 일주일 써봤습니다",본문 이미지 영역 지난달 24일 정식 출시된 아이폰X. /사진=박흥순 기자 아이폰...,"['이슈 · 아이폰X・아이폰8', '아이폰x', '페이스id', '배터리']","{'박흥순', '페이스 ID', '모드', '느낌', '노치', '배터리', '아이...",65.0,43.0,negative,Naver
5a29d24d588c1367a0424411,IT/과학,2017.12.02,뉴스1,237,0,4,상한제 폐지 두달됐는데…'요지부동' 지원금은 여전히 '쥐꼬리',서울 시내 휴대폰 판매점의 모습/뉴스1 © News1 10월1일 '상한제' 폐지...,"['지원금', '폐지', '요금할인']","{'만원', '지원금', '단말기', '고객', '이통', '상한제 폐지'}",66.0,42.0,negative,Naver
5a29d24d588c1367a0424412,IT/과학,2017.12.02,한겨레,224,0,5,물리학자들이 ‘콜라 깡통 찌그러트리기’에 열올린 이유는?,[한겨레] 원통형 금속외피 찌그러짐 언제 일어나나 축 방향 힘과 움푹한 자국의 상...,"['로켓', '누르', '알루미늄']","{'외피', '금속', '측정', '움푹', '원통형', '깡통', '크기', '연...",48.0,60.0,positive,Naver
5a29d24d588c1367a0424413,IT/과학,2017.12.02,한국일보,87,0,6,"[SF, 미래에서 온 이야기] 비주얼 퓨처리스트, 미래를 디자인하다",<38> 미래 이미지 디자이너 시드 미드 #1 ‘스타 트렉’ 극장판의 거대 우주...,"['우주선', '바로', '육군']","{'영화', '시드 미드', '그림', '작업', 'SF', '미래', '디자인',...",21.0,87.0,positive,Naver
5a29d24d588c1367a0424414,IT/과학,2017.12.02,아이뉴스24,140,0,7,"삼성, 3Q 스마트폰 판매량 1위…전년比 19% 늘어",<아이뉴스24> [아이뉴스24 강민경기자] 3분기 전 세계 스마트폰 시장에서 삼성...,"['스마트폰', '삼성전자', '화웨이']","{'가트너', '샤오미', '시장', '판매량', '삼성전자', '수요', '스마트...",34.0,74.0,positive,Naver
5a29d24d588c1367a0424415,IT/과학,2017.12.02,서울신문,23,0,8,[와우! 과학] NASA가 ‘바퀴’를 새로 개발하는 이유는?,[서울신문 나우뉴스] 바퀴의 발명은 종종 문자나 불의 발명에 비교될 만큼 인류 ...,"['이슈 · 화성 탐사', 'nasa', '화성', '타이어']","{'장시간', '개발', '금속 바퀴', '금속판', 'NASA', '화성', '환...",37.0,71.0,positive,Naver
5a29d24d588c1367a0424416,IT/과학,2017.12.02,채널A,280,0,9,[슈퍼컴 ‘제3의 국력’②]미국 제친 중국 ‘슈퍼컴 굴기’,슈퍼컴퓨터 시리즈 오늘이 두번째 순서입니다. 전 세계가 사활을 걸고 있는 슈퍼컴퓨...,"['슈퍼컴퓨터', '굴기', '예산']","{'미국', '슈퍼컴퓨터 개발', '반격', '효율', '세계', '전력', '중국'}",33.0,75.0,positive,Naver
5a29d24d588c1367a0424417,IT/과학,2017.12.02,아이뉴스24,12,0,10,나홀로 놀 때 유용한 앱,<아이뉴스24> [아이뉴스24 민혜정기자] 겨울인데 스키장도 가지 못한다고 울상 ...,"['나홀로', '겨울나기', '다이어트']","{'게임', '기능', '댄스', '운동', '터치', '듀엣곡', '민혜정', '...",26.0,82.0,positive,Naver


Naver 뉴스를 크롤링한 데이터베이스에는 크롤링된 댓글 수(number_of_crawled_comment)가 0개인 뉴스가 105개 존재함

### 새롭게 정의된 뉴스목록

In [13]:
# Keep only the articles that have at least one crawled comment.
has_crawled_comments = naverData['number_of_crawled_comment'] != 0
reNaverData = naverData[has_crawled_comments]

In [14]:
# Preview the first rows of the filtered news table.
reNaverData.head()

Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a29c445588c132954d1973a,정치,2017.12.07,연합뉴스,1713,1465,1,"北외무성 ""전쟁 바라지 않지만 결코 피하지 않을 것""","美고위인사 대북언급 비난하며 ""전쟁 기정사실화"" 위협 며칠 새 이어지는 북한 군민...","['외무성', '핵전쟁', '대변인']","{'미국', '조선반도', '핵전쟁', '북한', '중앙', '도화선', '고위',...",86.0,22.0,negative,Naver
5a29c445588c132954d1973b,정치,2017.12.07,한국일보,2551,2062,2,"예산전쟁, 예결위 간사ㆍ호남이 웃었다",예결위 간사들이 최대 수혜자..당 지도부 내 몫 챙기기도 여전 황주홍ㆍ김도읍 등...,"['예산', '예결위', 'soc']","{'호남', '예산안', '의원', '정부안', '증액', '지역구', '국민의당'}",46.0,62.0,positive,Naver
5a29c445588c132954d1973c,정치,2017.12.07,뉴시스,610,536,3,"혐의 부인에 20시간 조사…檢, 최경환 구속 카드 꺼내나",【서울=뉴시스】 최진석 기자 = 박근혜 정부 시절 국가정보원 특수활동비 수수 의혹 ...,"['최경환', '구속영장', '국가정보원']","{'구속영장 청구', '조사', '의원', '정기국회', '국정원장', '검찰', ...",77.0,31.0,negative,Naver
5a29c445588c132954d1973d,정치,2017.12.07,연합뉴스,145,133,4,"최재형 감사원장 후보자 ""독립성 강화는 임명권자의 뜻""",감사원장에 내정된 최재형 사법연수원장(고양=연합뉴스) 이희열 기자 = 7일 감사원장...,"['이슈 · 최재형 감사원장 내정', '감사원장', '최재형', '감사원']","{'공직 사회', '법관', '생활', '감사원장', '지명', '후보자'}",39.0,69.0,positive,Naver
5a29c445588c132954d1973e,정치,2017.12.07,동아일보,1074,932,5,"B-1B 한반도에 뜨자, 평양 비운 김정은",[동아일보] 북중 접경지 양강도 삼지연 시찰… 방북 유엔 사무차장 면담 안할듯 B-...,"['김정은', 'b-1b', '한반도']","{'공장', '양강도', '사무차장', '삼지연', '시찰', '훈련', '접경',...",72.0,36.0,negative,Naver


### 댓글을 dataframe으로 만들어 저장. 
* 읽어들이는데 시간이 오래 걸림. 

In [11]:
import dask.dataframe as dd

In [12]:
# Naver
def Naver_Comment(row):
    """Call bm.Make_Comments_File2 for one news row with the Naver comments path.

    Returns whatever the helper returns.
    NOTE(review): presumably the helper writes that article's comments to a
    file under naverCommentsPath -- confirm against Basic_Module.
    """
    return bm.Make_Comments_File2(naverCommentsPath, row)


# Split the news table into 30 partitions and run the helper on every row.
dd_naver = dd.from_pandas(reNaverData, npartitions=30)
dd_naver.apply(Naver_Comment, axis=1, meta=int).compute()

(231, 8)
(12, 8)
(493, 8)
(21, 8)
(171, 8)
(7, 8)
(669, 8)
(1503, 8)
(97, 8)
(710, 8)
(1120, 8)
(1465, 8)
(72, 8)
(35, 8)
(2108, 8)
(627, 8)
(1772, 8)
(333, 8)
(378, 8)
(2036, 8)
(44, 8)
(718, 8)
(2041, 8)
(2062, 8)
(14, 8)
(15, 8)
(228, 8)
(551, 8)
(358, 8)
(164, 8)
(450, 8)
(719, 8)
(296, 8)
(240, 8)
(1481, 8)
(536, 8)
(436, 8)
(64, 8)
(12, 8)
(241, 8)
(543, 8)
(105, 8)
(495, 8)
(326, 8)
(16, 8)
(390, 8)
(722, 8)
(133, 8)
(833, 8)
(594, 8)
(104, 8)
(89, 8)
(421, 8)
(64, 8)
(763, 8)
(950, 8)
(179, 8)
(416, 8)
(2535, 8)
(932, 8)
(7, 8)
(180, 8)
(104, 8)
(67, 8)
(786, 8)
(45, 8)
(386, 8)
(2711, 8)
(19, 8)
(154, 8)
(362, 8)
(1291, 8)
(15, 8)
(81, 8)
(68, 8)
(95, 8)
(347, 8)
(16, 8)
(176, 8)
(23, 8)
(79, 8)
(3411, 8)
(2039, 8)
(19, 8)
(236, 8)
(692, 8)
(1953, 8)
(55, 8)
(71, 8)
(10, 8)
(164, 8)
(1231, 8)
(40, 8)
(483, 8)
(1259, 8)
(35, 8)
(137, 8)
(320, 8)
(37, 8)
(148, 8)
(93, 8)
(408, 8)
(209, 8)
(998, 8)
(76, 8)
(683, 8)
(1012, 8)
(11, 8)
(2473, 8)
(55, 8)
(3072, 8)
(55, 8)
(224, 8)
(6

(1433, 8)
(2591, 8)
(103, 8)
(68, 8)
(229, 8)
(31, 8)
(5999, 8)
(41, 8)
(568, 8)
(10, 8)
(580, 8)
(311, 8)
(777, 8)
(1167, 8)
(118, 8)
(331, 8)
(264, 8)
(25, 8)
(189, 8)
(64, 8)
(2138, 8)
(228, 8)
(263, 8)
(222, 8)
(1295, 8)
(61, 8)
(911, 8)
(1287, 8)
(219, 8)
(16, 8)
(72, 8)
(270, 8)
(1280, 8)
(1412, 8)
(224, 8)
(302, 8)
(94, 8)
(19, 8)
(1020, 8)
(252, 8)
(60, 8)
(44, 8)
(19, 8)
(43, 8)
(2082, 8)
(275, 8)
(101, 8)
(150, 8)
(59, 8)
(831, 8)
(441, 8)
(235, 8)
(687, 8)
(55, 8)
(77, 8)
(7, 8)
(807, 8)
(2556, 8)
(256, 8)
(294, 8)
(103, 8)
(56, 8)
(268, 8)
(6, 8)
(221, 8)
(947, 8)
(1020, 8)
(81, 8)
(2119, 8)
(1153, 8)
(267, 8)
(334, 8)
(138, 8)
(2101, 8)
(170, 8)
(264, 8)
(139, 8)
(361, 8)
(1205, 8)
(1432, 8)
(2402, 8)
(746, 8)
(254, 8)
(1153, 8)
(50, 8)
(10, 8)
(185, 8)
(449, 8)
(214, 8)
(183, 8)
(1875, 8)
(183, 8)
(239, 8)
(3890, 8)
(381, 8)
(1717, 8)
(104, 8)
(28, 8)
(172, 8)
(193, 8)
(259, 8)
(1184, 8)
(137, 8)
(249, 8)
(603, 8)
(218, 8)
(515, 8)
(3253, 8)
(127, 8)
(196, 8)
(106, 8)
(91

(203, 8)
(403, 8)
(9, 8)
(5, 8)
(3047, 8)
(1225, 8)
(652, 8)
(279, 8)
(13, 8)
(385, 8)
(460, 8)
(4, 8)
(348, 8)
(59, 8)
(295, 8)
(21, 8)
(41, 8)
(1019, 8)
(2651, 8)
(415, 8)
(501, 8)
(1226, 8)
(5885, 8)
(35, 8)
(24, 8)
(6, 8)
(21, 8)
(22, 8)
(308, 8)
(1575, 8)
(809, 8)
(901, 8)
(1331, 8)
(4574, 8)
(2925, 8)
(16, 8)
(12, 8)
(143, 8)
(176, 8)
(417, 8)
(27, 8)
(148, 8)
(1848, 8)
(1880, 8)
(2145, 8)
(9428, 8)
(509, 8)
(15, 8)
(227, 8)
(212, 8)
(108, 8)
(23, 8)
(41, 8)
(177, 8)
(5095, 8)
(339, 8)
(860, 8)
(323, 8)
(101, 8)
(75, 8)
(306, 8)
(316, 8)
(5098, 8)
(194, 8)
(61, 8)
(973, 8)
(1842, 8)
(448, 8)
(437, 8)
(202, 8)
(159, 8)
(1396, 8)
(509, 8)
(375, 8)
(64, 8)
(256, 8)
(697, 8)
(1025, 8)
(1259, 8)
(894, 8)
(152, 8)
(518, 8)
(2265, 8)
(57, 8)
(340, 8)
(457, 8)
(146, 8)
(91, 8)
(5598, 8)
(6015, 8)
(275, 8)
(42, 8)
(773, 8)
(421, 8)
(117, 8)
(326, 8)
(96, 8)
(211, 8)
(56, 8)
(3254, 8)
(8575, 8)
(8, 8)
(253, 8)
(1645, 8)
(1684, 8)
(261, 8)
(29, 8)
(128, 8)
(493, 8)
(698, 8)
(54, 8)
(1213, 8

(1883, 8)
(223, 8)
(1592, 8)
(980, 8)
(9, 8)
(964, 8)
(270, 8)
(63, 8)
(19, 8)
(110, 8)
(14, 8)
(96, 8)
(1730, 8)
(52, 8)
(4364, 8)
(1150, 8)
(32, 8)
(176, 8)
(967, 8)
(69, 8)
(916, 8)
(4, 8)
(7, 8)
(234, 8)
(508, 8)
(133, 8)
(2212, 8)
(1035, 8)
(640, 8)
(633, 8)
(3652, 8)
(3187, 8)
(1984, 8)
(10, 8)
(8, 8)
(27, 8)
(74, 8)
(492, 8)
(1827, 8)
(836, 8)
(274, 8)
(2253, 8)
(4485, 8)
(51, 8)
(579, 8)
(16, 8)
(57, 8)
(33, 8)
(61, 8)
(1374, 8)
(35, 8)
(521, 8)
(437, 8)
(904, 8)
(1332, 8)
(91, 8)
(147, 8)
(11, 8)
(548, 8)
(2658, 8)
(160, 8)
(1610, 8)
(45, 8)
(224, 8)
(112, 8)
(329, 8)
(10, 8)
(2886, 8)
(28, 8)
(638, 8)
(854, 8)
(95, 8)
(519, 8)
(634, 8)
(2162, 8)
(2039, 8)
(233, 8)
(1339, 8)
(10, 8)
(761, 8)
(85, 8)
(170, 8)
(272, 8)
(1455, 8)
(587, 8)
(1709, 8)
(2267, 8)
(3473, 8)
(1388, 8)
(2429, 8)
(707, 8)
(4681, 8)
(69, 8)(38, 8)

(492, 8)
(777, 8)
(408, 8)
(936, 8)
(713, 8)
(979, 8)
(128, 8)
(67, 8)
(2059, 8)
(1198, 8)
(109, 8)
(21, 8)
(547, 8)
(522, 8)
(191, 8)
(7248, 8)
(540, 8)
(3715,

(1480, 8)
(22, 8)
(2901, 8)
(243, 8)
(316, 8)
(35, 8)
(75, 8)
(1328, 8)
(4, 8)
(93, 8)
(260, 8)(880, 8)

(608, 8)
(122, 8)
(88, 8)
(257, 8)
(227, 8)
(166, 8)
(2707, 8)
(233, 8)
(114, 8)
(158, 8)
(559, 8)
(669, 8)
(960, 8)
(37, 8)
(102, 8)
(167, 8)
(2748, 8)
(23, 8)
(923, 8)
(184, 8)
(80, 8)
(17, 8)
(453, 8)
(732, 8)
(489, 8)
(163, 8)
(162, 8)
(244, 8)
(1503, 8)
(612, 8)
(463, 8)
(12, 8)
(640, 8)
(1085, 8)
(252, 8)
(769, 8)
(33, 8)
(1292, 8)
(115, 8)
(132, 8)
(1265, 8)
(4726, 8)
(2947, 8)
(5, 8)
(66, 8)
(1429, 8)
(817, 8)
(282, 8)
(358, 8)
(592, 8)
(260, 8)
(396, 8)
(1210, 8)
(975, 8)
(3111, 8)
(12, 8)
(9, 8)
(126, 8)
(32, 8)
(611, 8)
(1479, 8)
(694, 8)
(298, 8)
(369, 8)
(67, 8)
(151, 8)
(273, 8)
(2984, 8)
(928, 8)
(262, 8)
(281, 8)
(2103, 8)
(146, 8)
(229, 8)
(7134, 8)
(18588, 8)
(46, 8)
(46, 8)
(12, 8)
(47, 8)
(8553, 8)
(20, 8)
(154, 8)
(424, 8)
(143, 8)
(234, 8)
(542, 8)
(1087, 8)
(48, 8)
(10, 8)
(5, 8)
(551, 8)
(846, 8)
(213, 8)
(142, 8)
(349, 8)
(708, 8)
(287, 8)
(896, 8)
(154, 8)


(345, 8)
(412, 8)
(450, 8)
(49, 8)
(8, 8)
(887, 8)
(10, 8)
(26, 8)
(235, 8)
(130, 8)
(177, 8)
(364, 8)
(306, 8)
(10, 8)(1467, 8)

(536, 8)
(19, 8)
(1058, 8)
(42, 8)
(24, 8)
(311, 8)
(97, 8)
(342, 8)
(648, 8)
(568, 8)
(38, 8)
(688, 8)
(2861, 8)
(29, 8)
(786, 8)
(15, 8)
(39, 8)
(161, 8)
(180, 8)
(202, 8)
(77, 8)
(10, 8)
(218, 8)
(2248, 8)
(1866, 8)
(200, 8)
(25, 8)
(332, 8)
(33, 8)
(310, 8)
(49, 8)
(1403, 8)
(415, 8)
(252, 8)
(551, 8)
(619, 8)
(1746, 8)
(17, 8)
(5, 8)
(243, 8)
(747, 8)
(1103, 8)
(343, 8)
(2906, 8)
(585, 8)
(253, 8)
(740, 8)
(385, 8)
(1695, 8)
(501, 8)
(30, 8)
(58, 8)
(30, 8)
(8, 8)
(1147, 8)
(1984, 8)
(20, 8)
(10, 8)
(127, 8)
(608, 8)
(241, 8)
(11, 8)
(29, 8)
(48, 8)
(5, 8)
(427, 8)
(10, 8)
(444, 8)
(2090, 8)
(372, 8)
(161, 8)
(382, 8)
(890, 8)
(17, 8)
(35, 8)
(12, 8)
(255, 8)
(174, 8)
(75, 8)
(488, 8)
(64, 8)
(386, 8)
(1074, 8)
(350, 8)
(392, 8)
(20, 8)
(14, 8)
(1811, 8)
(116, 8)
(26, 8)
(88, 8)
(1016, 8)
(131, 8)
(10, 8)
(316, 8)
(159, 8)
(59, 8)
(20, 8)
(536, 8)
(2474

(413, 8)
(114, 8)
(357, 8)
(69, 8)
(125, 8)
(100, 8)
(402, 8)
(1272, 8)
(572, 8)
(307, 8)
(3179, 8)
(5136, 8)
(330, 8)
(331, 8)
(68, 8)
(158, 8)
(18, 8)
(17, 8)
(1092, 8)
(30, 8)
(81, 8)
(1295, 8)
(1175, 8)
(3449, 8)
(112, 8)
(160, 8)
(57, 8)
(200, 8)
(220, 8)
(29, 8)
(775, 8)
(683, 8)
(67, 8)
(342, 8)
(247, 8)
(985, 8)
(2648, 8)
(43, 8)
(240, 8)
(452, 8)
(22, 8)
(57, 8)
(598, 8)
(226, 8)
(1700, 8)
(542, 8)
(46, 8)
(304, 8)
(833, 8)
(7171, 8)
(557, 8)
(3536, 8)
(117, 8)
(33, 8)
(1106, 8)
(227, 8)
(2488, 8)
(104, 8)
(9, 8)
(346, 8)
(152, 8)
(274, 8)
(65, 8)
(189, 8)
(249, 8)
(1107, 8)
(276, 8)
(645, 8)
(4175, 8)
(160, 8)
(42, 8)
(178, 8)
(113, 8)
(22, 8)
(84, 8)
(587, 8)
(18, 8)
(489, 8)
(706, 8)
(3214, 8)
(5956, 8)
(149, 8)(5, 8)

(1502, 8)
(142, 8)
(232, 8)
(322, 8)
(264, 8)
(2341, 8)
(290, 8)
(2351, 8)
(455, 8)
(2336, 8)
(18, 8)
(598, 8)
(1487, 8)
(34, 8)
(1122, 8)
(274, 8)
(222, 8)
(488, 8)
(115, 8)
(3615, 8)
(1880, 8)
(132, 8)
(75, 8)
(1980, 8)
(509, 8)
(7, 8)
(390, 8)
(2653, 8)
(3

(144, 8)
(16, 8)
(200, 8)
(249, 8)
(28, 8)
(57, 8)
(50, 8)
(381, 8)
(511, 8)
(5, 8)
(28, 8)
(122, 8)
(831, 8)
(1256, 8)
(1289, 8)
(431, 8)
(1, 8)
(473, 8)
(1007, 8)
(482, 8)
(303, 8)
(22, 8)
(92, 8)
(328, 8)
(139, 8)
(955, 8)
(3072, 8)
(1383, 8)
(1440, 8)
(3241, 8)
(315, 8)
(1206, 8)
(832, 8)
(119, 8)(306, 8)

(306, 8)
(284, 8)
(486, 8)
(1729, 8)
(356, 8)
(2043, 8)
(3025, 8)
(1885, 8)
(768, 8)
(1244, 8)
(52, 8)
(516, 8)
(608, 8)
(528, 8)
(1480, 8)
(819, 8)
(3336, 8)
(2023, 8)
(746, 8)
(373, 8)
(141, 8)
(7624, 8)
(222, 8)
(31, 8)
(268, 8)
(5, 8)
(234, 8)
(1238, 8)
(829, 8)
(966, 8)
(549, 8)
(96, 8)
(624, 8)
(586, 8)
(10, 8)
(77, 8)
(27, 8)
(210, 8)
(459, 8)
(10, 8)
(27, 8)
(186, 8)
(2022, 8)
(1101, 8)
(328, 8)
(74, 8)
(100, 8)
(457, 8)
(484, 8)
(1911, 8)
(731, 8)
(855, 8)
(33, 8)
(1117, 8)
(1375, 8)
(6091, 8)
(126, 8)
(172, 8)
(41, 8)
(172, 8)
(378, 8)
(90, 8)
(10, 8)
(92, 8)
(35, 8)
(1389, 8)
(522, 8)
(3196, 8)
(239, 8)
(893, 8)
(138, 8)
(6, 8)
(22, 8)
(10, 8)
(10, 8)
(717, 8)
(57, 8)


(486, 8)
(5064, 8)
(451, 8)
(17, 8)(15, 8)

(10, 8)
(504, 8)
(2571, 8)
(3945, 8)
(15, 8)
(17, 8)
(249, 8)
(539, 8)
(1175, 8)
(1394, 8)
(27, 8)
(24, 8)
(4736, 8)
(1739, 8)
(7709, 8)(5527, 8)

(21, 8)
(68, 8)
(293, 8)
(410, 8)
(1160, 8)
(7362, 8)
(39, 8)(39, 8)

(439, 8)
(660, 8)
(2013, 8)
(2323, 8)
(80, 8)
(63, 8)
(10, 8)
(1015, 8)
(979, 8)
(415, 8)
(13, 8)
(33, 8)
(10, 8)
(1406, 8)
(333, 8)
(4991, 8)
(27, 8)
(20, 8)
(10, 8)
(37, 8)
(804, 8)
(57, 8)
(45, 8)
(880, 8)
(9, 8)
(151, 8)
(17, 8)
(654, 8)
(1452, 8)
(595, 8)
(37, 8)
(162, 8)
(395, 8)
(630, 8)
(1849, 8)
(3027, 8)
(5, 8)
(186, 8)
(640, 8)
(1064, 8)
(85, 8)
(1586, 8)
(28, 8)
(46, 8)
(63, 8)
(3067, 8)
(3464, 8)
(983, 8)
(15, 8)
(9, 8)
(244, 8)
(1086, 8)
(156, 8)
(1176, 8)
(4, 8)
(358, 8)
(261, 8)
(550, 8)
(649, 8)
(3557, 8)
(77, 8)
(70, 8)
(1206, 8)
(271, 8)
(196, 8)
(509, 8)
(16, 8)
(2, 8)
(98, 8)
(1184, 8)
(397, 8)
(1029, 8)
(19, 8)
(5, 8)
(303, 8)
(1048, 8)
(547, 8)
(247, 8)
(10, 8)
(900, 8)
(1006, 8)
(808, 8)
(1514, 8)
(197, 8)

(719, 8)
(288, 8)
(193, 8)
(2422, 8)
(1340, 8)
(786, 8)
(67, 8)
(326, 8)
(1420, 8)
(1675, 8)
(183, 8)
(935, 8)
(748, 8)
(523, 8)
(3279, 8)
(279, 8)
(497, 8)
(123, 8)
(203, 8)
(88, 8)
(12975, 8)
(51, 8)
(469, 8)
(954, 8)
(292, 8)
(189, 8)
(163, 8)
(85, 8)(59, 8)

(223, 8)
(297, 8)
(1939, 8)
(597, 8)
(29, 8)
(267, 8)
(248, 8)
(700, 8)
(1074, 8)
(1214, 8)
(19, 8)
(173, 8)
(513, 8)
(391, 8)
(411, 8)
(437, 8)
(104, 8)
(97, 8)
(112, 8)
(164, 8)
(831, 8)
(2226, 8)
(78, 8)
(65, 8)
(735, 8)
(144, 8)
(131, 8)
(182, 8)
(28, 8)
(804, 8)
(106, 8)
(371, 8)
(219, 8)
(494, 8)
(43, 8)
(175, 8)
(303, 8)
(294, 8)
(13077, 8)
(205, 8)
(282, 8)
(820, 8)
(234, 8)
(3606, 8)
(5724, 8)
(33, 8)
(448, 8)
(494, 8)
(2793, 8)
(2124, 8)
(5100, 8)
(33, 8)
(38, 8)
(483, 8)
(134, 8)
(4188, 8)
(15538, 8)
(32, 8)
(165, 8)
(47, 8)
(193, 8)
(3105, 8)
(9366, 8)
(651, 8)
(316, 8)
(2375, 8)
(1568, 8)
(2824, 8)
(1518, 8)
(38, 8)
(42, 8)(113, 8)

(1584, 8)
(309, 8)
(20, 8)
(150, 8)
(765, 8)(2062, 8)

(328, 8)
(1151, 8)
(71, 8)
(

id
5a29c445588c132954d1973a    None
5a29c445588c132954d1973b    None
5a29c445588c132954d1973c    None
5a29c445588c132954d1973d    None
5a29c445588c132954d1973e    None
5a29c445588c132954d1973f    None
5a29c445588c132954d19740    None
5a29c445588c132954d19741    None
5a29c445588c132954d19742    None
5a29c445588c132954d19743    None
5a29c445588c132954d19744    None
5a29c445588c132954d19745    None
5a29c445588c132954d19746    None
5a29c445588c132954d19747    None
5a29c445588c132954d19748    None
5a29c445588c132954d19749    None
5a29c445588c132954d1974a    None
5a29c445588c132954d1974b    None
5a29c445588c132954d1974c    None
5a29c445588c132954d1974d    None
5a29c445588c132954d1974e    None
5a29c445588c132954d1974f    None
5a29c445588c132954d19750    None
5a29c445588c132954d19751    None
5a29c445588c132954d19752    None
5a29c445588c132954d19753    None
5a29c445588c132954d19754    None
5a29c445588c132954d19755    None
5a29c445588c132954d19756    None
5a29c445588c132954d19757    None
       

### 뉴스의 댓글 파일을 하나로 합침

In [None]:
%%time
# Build one DataFrame holding the comments of every article in reNaverData,
# or reload it from the cached CSV when an earlier run already produced it.
pre_outfile_naver = os.path.join(outcome_predata, 'predata_for_naver_news_comment.csv')
if os.path.isfile(pre_outfile_naver):
    # NOTE(review): the CSV below is written with its index, and it is read
    # back here without index_col, so the cached frame likely carries one
    # extra unnamed column compared with a freshly built one -- confirm
    # whether downstream code depends on that.
    dfForNaverCommentsAnalysis = pd.read_csv(pre_outfile_naver, header=0, encoding='utf-8')
else:
    commentFrames = []
    for idx2 in reNaverData.index:
        commentFile = os.path.join(naverCommentsPath, idx2 + '.csv')
        tempDf = pd.read_csv(commentFile, header=0, encoding='utf-8')
        # Replace the comment's MongoDB id with the news article's MongoDB id
        # to make later analysis easier. Item assignment always targets the
        # '_id' column; attribute assignment would silently set a plain
        # attribute if the column were missing.
        tempDf['_id'] = idx2
        commentFrames.append(tempDf)
    # Concatenate once at the end: appending inside the loop copies the
    # accumulated frame on every iteration, and DataFrame.append no longer
    # exists in pandas 2.x.
    if commentFrames:
        dfForNaverCommentsAnalysis = pd.concat(commentFrames)
    else:
        # pd.concat raises on an empty list; keep the empty-frame result.
        dfForNaverCommentsAnalysis = pd.DataFrame()
    print(dfForNaverCommentsAnalysis.shape)
    dfForNaverCommentsAnalysis.to_csv(pre_outfile_naver, header=True, encoding='utf-8')