# 데이터베이스에 저장되어 있는 댓글들을 기사별로 파일로 만들자

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob
import warnings

os.environ['KERAS_BACKEND']='tensorflow'

from numba import jit
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
import Database_Handler as dh
import Basic_Module as bm

In [3]:
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
# Create the tagger instances used by the notebook:
# `ct` is the ckonlpy Twitter tagger, `mecab` is the konlpy Mecab tagger.
ct = Twitter()
mecab = Mecab()

## Stopwords

In [4]:
# Load the stopword list (one entry per line) and strip surrounding whitespace.
# The context manager closes the file handle as soon as reading finishes;
# the bare open(...).readlines() form left it open.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

## Load Data

### Path

In [5]:
# Choose the model / data / output directories according to the host OS.
# NOTE(review): `newsPath` is assigned only in the Windows branch, and no path
# variables are assigned at all on other platforms (e.g. Linux) — confirm
# that this is intended.
if sys.platform == 'win32':
    # Windows: models, classifier data and comments live on drive d:.
    naverCommentsPath = 'd:/data/naver_Comments/'
    daumCommentPath = 'd:/data/daum_Comments/'
    news_senti_outcome = './outcome_for_News_sentiment_analysis/'
    newsPath = './data/pre_data/news_sentiment/'
    classifierPath = 'd:/data/pre_data/classifier/'
    loadModelPath = 'd:/model/'
elif sys.platform == 'darwin':
    # macOS: models and outputs live on the external volume, comments locally.
    naverCommentsPath = './data/naver_Comments/'
    daumCommentPath = './data/daum_Comments/'
    news_senti_outcome = '/Volumes/disk1/outcome_for_News_sentiment_analysis/'
    classifierPath = '/Volumes/disk1/data/pre_data/classifier/'
    loadModelPath = '/Volumes/disk1/model/'

### News

In [6]:
# List the files found in the news sentiment-analysis output directory.
os.listdir(news_senti_outcome)

['daum_news_sentiment_analysis.csv', 'naver_news_sentiment_analysis.csv']

In [7]:
# Daum: read the news sentiment-analysis results from CSV.
# The first row is the header and the first column becomes the index.
daumData = pd.read_csv(
    os.path.join(news_senti_outcome, 'daum_news_sentiment_analysis.csv'),
    encoding='utf-8',
    header=0,
    index_col=0,
)
# Tag every row with its source portal; a scalar is broadcast to all rows.
daumData['site'] = 'daum'
# Uncomment to preview the table: daumData.head()

#### 크롤링된 댓글수가 0개인 것을 찾아보자

In [8]:
# Show the articles whose crawled-comment count is exactly 0.
daumData.loc[daumData['number_of_crawled_comment'] == 0]

Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a3c39e7588c132bc05289fb,뉴스,2017.12.21,노컷뉴스,17384,0,7,"[인터뷰] 서민 ""'문빠' 해악, 더 두고 볼 수 없었다""","- 낚싯배 사고 & 中 기자폭행 관련 '문빠' 반응, 도를 넘은 듯 - '보수 쪽도...","['정관용', '댓글', '낚싯배']","{'대통령', '정관용', '서민 교수', '낚싯배', '댓글', '문빠들', '폭행'}",95.0,13.0,negative,daum
5a64385d588c134738c38086,연예,2018.01.07,뉴스엔,0,0,45,"'언니네' 송은이 ""라이언전, 10년 전 박소현 맞선남으로 출연""",[뉴스엔 박수인 기자] 개그우먼 송은이가 작곡가 라이언전의 과거를 공개했다. 1월 ...,"['박소현', '언니네라디오', '김숙', '개그우먼', '게스트']","{'언니네', '민망', '방송', '박소현 맞선남', '송은이', '작곡가 라이언전'}",65.0,43.0,negative,daum
5a6472e8588c131b18d57abe,스포츠,2018.01.11,MK스포츠,0,0,38,"MLB 에이전트, 선수 샤워 장면 몰래 촬영해 '충격'",[매경닷컴 MK스포츠 김재호 기자] 다수의 메이저리그 선수를 고객으로 보유하고 있는...,,"{'메이저리그', '에이전시', '회사', '사장', 'CSE', '샤워', '선수...",39.0,69.0,positive,daum


daum 뉴스를 크롤링한 데이터베이스에는 크롤링된 댓글수가 0개인 뉴스가 3건 존재함.
> 그중 1건은 댓글수(글쓴이가 삭제한 댓글 포함)가 17,384건이지만, 크롤링된 댓글은 0개임.  
>> 2017.12.21 뉴스이며, 발견 시점은 2018년 3월 14일. 3달이나 지난 뒤에 댓글을 수집하더라도 삭제할 사람은 이미 삭제했을 것.  
>> * 아쉽지만 데이터베이스에 댓글이 존재하는 뉴스에 대해서만 진행하기로 함.

### 새롭게 정의된 뉴스목록

In [9]:
# Keep only the articles whose crawled-comment count is not 0.
reDaumData = daumData.loc[daumData['number_of_crawled_comment'] != 0]

In [10]:
# Preview the first rows of the filtered article table.
reDaumData.head()

Unnamed: 0_level_0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords,negative,positive,Decision,site
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
5a2a61bf588c13481c229d1e,뉴스,2017.12.07,세계일보,1093,903,1,"""밤이 무섭다""..비아그라 공장 연기에 남성들 부작용 호소","주민들은 공장에서 배출된 연기가 '남성이 매우 건강해지는 부작용'을 일으킨다며, ...","['부작용', '비아그라', '아일랜드']","{'지역', '남성들', '건강', '공장', '연기', '부작용', '세보 효과'}",50.0,58.0,positive,daum
5a2a61bf588c13481c229d1f,뉴스,2017.12.07,헬스조선,603,386,2,식후 커피·늦은 양치질..점심식사 후 하면 안 좋은 습관 3가지,점심식사를 마친 후 후식으로 커피를 마시는 사람들이 많다. 실제로 직장이 밀집돼 ...,"['커피', '낮잠', '음식물']","{'입냄새', '건강', '자세', '철분', '낮잠', '점심 식사', '치아',...",81.0,27.0,negative,daum
5a2a61bf588c13481c229d20,뉴스,2017.12.07,연합뉴스,1067,807,3,"'십년지기 생매장' 진짜 이유는..""'청부 통정' 알려질까 봐""",(성남=연합뉴스) 최해민 기자 = 십년지기 지인을 산 채로 묻어 살해한 50대 여...,"['살인혐의', '철원', '검찰송치']","{'진술', '경찰', '남편', '아들', '성관계', '주변', '지인', '철...",99.0,9.0,negative,daum
5a2a61bf588c13481c229d21,뉴스,2017.12.07,헤럴드경제,418,368,4,"신영자, 억 소리나는 갑질","신영자, 적용안된 혐의→검찰 상고에서 인정\n신영자, 얼마를 어떻게 받았나 [헤럴...","['신영자', '갑질', '롯데백화점']","{'롯데', '혐의', '매장', '검찰', '유통업체', '징역', '신영자 이사...",77.0,31.0,negative,daum
5a2a61bf588c13481c229d22,뉴스,2017.12.07,연합뉴스,434,367,5,"""배신하지마"" 20대女 살인 피의자 유치장서 공범 남친에 쪽지",(청주=연합뉴스) 이승민 기자 = 지난 9월 청주의 한 하천에서 20대 여성을 둔기...,"['공범', '살인', '과자']","{'과자', '혐의', '쪽지', '남자친구', '유치장', '경찰', '범행', ...",102.0,6.0,negative,daum


### 댓글을 dataframe으로 만들어 저장
* 읽어들이는 데 시간이 오래 걸림.

In [11]:
import dask.dataframe as dd

In [None]:
# Daum
# Wrap the filtered article table in a dask DataFrame split into 30 partitions.
dd_daum = dd.from_pandas(reDaumData, npartitions=30)
def Daum_Comment(row):
    """Pass one article row to bm.Make_Comments_File with the Daum comment path.

    NOTE(review): presumably this writes the article's comments to a file
    under daumCommentPath — confirm against Basic_Module, which is not
    visible here.
    """
    return bm.Make_Comments_File(daumCommentPath, row)
# Apply Daum_Comment to each row (axis=1) and trigger the computation.
# NOTE(review): meta=int assumes Make_Comments_File returns an integer per row;
# dask documents meta for a row-wise apply as a (name, dtype) pair — TODO confirm.
dd_daum.apply(Daum_Comment, axis = 1, meta = int).compute()

(96, 8)(107, 8)

(52, 8)
(93, 8)
(98, 8)
(134, 8)
(24, 8)
(195, 8)
(246, 8)
(554, 8)
(818, 8)
(1068, 8)
(39, 8)
(122, 8)
(138, 8)
(14, 8)
(4, 8)
(22, 8)
(66, 8)
(158, 8)(190, 8)

(132, 8)
(1410, 8)
(2241, 8)
(27, 8)
(55, 8)
(43, 8)
(52, 8)
(95, 8)
(134, 8)
(271, 8)
(415, 8)
(1219, 8)
(841, 8)
(585, 8)
(1732, 8)
(50, 8)
(50, 8)
(51, 8)
(94, 8)
(52, 8)
(101, 8)
(23, 8)
(102, 8)
(421, 8)
(41, 8)
(1096, 8)
(4172, 8)
(24, 8)
(50, 8)
(67, 8)(65, 8)

(20, 8)
(75, 8)
(64, 8)
(184, 8)
(389, 8)
(1041, 8)
(604, 8)
(806, 8)
