# Daum News Classification Using Doc2Vec

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob 
import warnings

os.environ['KERAS_BACKEND']='tensorflow'

import multiprocessing
import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential, load_model
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Print the local compute devices reported by TensorFlow's device_lib.
print (device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 2143610298218877933
]


In [3]:
# CPU count as reported by multiprocessing.cpu_count() (already an int).
cores = multiprocessing.cpu_count()

In [4]:
import Basic_Module as bm

In [5]:
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
# Tagger instances used below: `ct` is the ckonlpy Twitter tagger and
# `mecab` is the konlpy Mecab tagger.
ct = Twitter()
mecab = Mecab()
def nav_tokenizer(tagger, corpus, stopwords):
    """Tag *corpus* and return the kept items as ``'/'``-joined strings.

    Args:
        tagger: Object exposing ``pos(text)``. Every item it returns must be
            a sequence of strings whose first element is compared against
            *stopwords*.
        corpus: Text handed to ``tagger.pos``.
        stopwords: Container tested with ``in``; items whose first element
            is found in it are dropped.

    Returns:
        A list holding ``'/'.join(item)`` for each kept item, in the order
        produced by the tagger.
    """
    kept = []
    for token in tagger.pos(corpus):
        if token[0] in stopwords:
            continue
        kept.append('/'.join(token))
    return kept

## Load Data

In [6]:
# Daum
# Load the pickled Daum records. The file is opened in a `with` block so the
# handle is closed as soon as the data has been read.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as fin:
    daumData = pickle.load(fin)
# orient='index' turns every top-level key into one row; the key is then
# moved out of the index into a regular column called 'id'.
daumData = pd.DataFrame.from_dict(daumData, orient = 'index')
daumData.reset_index(inplace = True)
daumData.rename(columns = {'index' : 'id'}, inplace = True)
print ('Daum : {}'.format(daumData.shape))

Daum : (9372, 11)


## Stopwords

In [7]:
# Read the stopword list, one entry per line, stripping surrounding
# whitespace. The `with` block closes the file once it has been read.
with open('./data/stopwordsList.txt', encoding='utf-8') as fin:
    stopwords = [line.strip() for line in fin]

## Document Labeling

In [8]:
# Record type with the fields `words`, `tags` and `category`.
# This rebinds the `TaggedDocument` name imported from gensim above.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags', 'category'])

## Category

In [9]:
# Reuse the pickled category LabelEncoder when it exists; otherwise fit a new
# one on naverData['category'] and cache it for later runs.
labelEncoderPath = './data/pre_data/news_tagged_data/pre_data_category_label_encoder_by_ct_for_doc2vec_news_classification.pickled'
if os.path.isfile(labelEncoderPath):
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(labelEncoderPath, 'rb') as fin:
        le = pickle.load(fin)
else:
    # `naverData` is not created by any cell shown in this notebook, so explain
    # what is missing instead of failing with a bare NameError.
    if 'naverData' not in globals():
        raise NameError(
            "name 'naverData' is not defined: define it (with a 'category' "
            "column) before fitting the LabelEncoder, or provide the pickled "
            "encoder at " + labelEncoderPath
        )
    le = LabelEncoder()
    le.fit(naverData['category'])
    with open(labelEncoderPath, 'wb') as fout:
        pickle.dump(le, fout)
print (le.classes_)

['IT/과학' '경제' '사회' '생활/문화' '세계' '스포츠' '연예' '정치']


In [10]:
# Directory holding the trained Doc2Vec models; it depends on the machine.
if sys.platform =='darwin':
    loadModelPath = '/Volumes/disk1/news_model/'
elif sys.platform =='win32':
    loadModelPath = 'd:/news_model/'
else:
    # Any other platform previously left loadModelPath undefined, which only
    # surfaced later as a NameError when the models were loaded. Fall back to
    # a relative directory so the name always exists.
    loadModelPath = './news_model/'
# Where the inferred Daum news vectors are cached.
daumNewsPath = './data/pre_data/news_daum_news/'
# Where the classifier files are looked up.
classifierPath = './data/pre_data/news_classifier/'

In [11]:
# Preview the first rows of the Daum DataFrame.
daumData.head()

Unnamed: 0,id,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords
0,5a2a61bf588c13481c229d1e,뉴스,2017.12.07,세계일보,1093,911,1,"""밤이 무섭다""..비아그라 공장 연기에 남성들 부작용 호소","주민들은 공장에서 배출된 연기가 '남성이 매우 건강해지는 부작용'을 일으킨다며, ...","[부작용, 비아그라, 아일랜드]","{부작용, 남성들, 세보 효과, 공장, 건강, 연기, 지역}"
1,5a2a61bf588c13481c229d1f,뉴스,2017.12.07,헬스조선,603,386,2,식후 커피·늦은 양치질..점심식사 후 하면 안 좋은 습관 3가지,점심식사를 마친 후 후식으로 커피를 마시는 사람들이 많다. 실제로 직장이 밀집돼 ...,"[커피, 낮잠, 음식물]","{디스크, 건강, 철분, 식후, 점심 식사, 자세, 낮잠, 커피, 치아, 입냄새}"
2,5a2a61bf588c13481c229d20,뉴스,2017.12.07,연합뉴스,1067,811,3,"'십년지기 생매장' 진짜 이유는..""'청부 통정' 알려질까 봐""",(성남=연합뉴스) 최해민 기자 = 십년지기 지인을 산 채로 묻어 살해한 50대 여...,"[살인혐의, 철원, 검찰송치]","{철원, 지인, 성관계, 경찰, 진술, 주변, 남편, 아들, 앙심, 범행}"
3,5a2a61bf588c13481c229d21,뉴스,2017.12.07,헤럴드경제,418,369,4,"신영자, 억 소리나는 갑질","신영자, 적용안된 혐의→검찰 상고에서 인정\n신영자, 얼마를 어떻게 받았나 [헤럴...","[신영자, 갑질, 롯데백화점]","{유통업체, 검찰, 네이처리퍼블릭, 매장, 롯데, 징역, 신영자 이사장, 혐의}"
4,5a2a61bf588c13481c229d22,뉴스,2017.12.07,연합뉴스,434,368,5,"""배신하지마"" 20대女 살인 피의자 유치장서 공범 남친에 쪽지",(청주=연합뉴스) 이승민 기자 = 지난 9월 청주의 한 하천에서 20대 여성을 둔기...,"[공범, 살인, 과자]","{과자, 남자친구, 유치장, 경찰, 쪽지, 폭행, 혐의, 범행}"


## Doc2Vec Model

### Twitter

#### News to Tagged Document

In [12]:
# Tagged Daum documents produced with the `ct` tagger are cached on disk:
# load the cache when present, otherwise build the documents and store them.
taggedDaumPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_ct_for_doc2vec_news_classification.pickled'
if os.path.isfile(taggedDaumPath):
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(taggedDaumPath, 'rb') as fin:
        daumData2 = pickle.load(fin)
else:
    daumData2 = bm.MakeTaggedDataDAUM(daumData, TaggedDocument, ct, stopwords, 'daum')
    with open(taggedDaumPath, 'wb') as fout:
        pickle.dump(daumData2, fout)

#### Load Model

In [13]:
# Load the three Doc2Vec models whose file names end in `by-ct`. Judging by
# the file names they differ in window size and in the dm / dm_concat /
# dm_mean settings.
loadDoc2Vec = doc2vec.Doc2Vec.load
model1 = loadDoc2Vec(loadModelPath + 'doc2vec_size-500_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-ct.model')
model2 = loadDoc2Vec(loadModelPath + 'doc2vec_size-500_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-ct.model')
model3 = loadDoc2Vec(loadModelPath + 'doc2vec_size-500_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-ct.model')

#### Tagged Document to Vector

In [14]:
taggerName = 'ct'

# Build a file-name friendly identifier per model: '(', ')', ',' and '/' in
# str(model) are replaced by '-', then the tagger name is appended.
# The pattern is a raw string so its backslashes are not parsed as (invalid)
# string escape sequences.
name1 = '-'.join(re.split(r'[\(\),\/]', str(model1))) + taggerName
name2 = '-'.join(re.split(r'[\(\),\/]', str(model2))) + taggerName
name3 = '-'.join(re.split(r'[\(\),\/]', str(model3))) + taggerName

# Infer the document vectors with each model and cache them on disk; a model
# whose vector file already exists is skipped.
for vectorName, vectorModel in ((name1, model1), (name2, model2), (name3, model3)):
    vectorPath = daumNewsPath + 'daum_news_' + vectorName
    if not os.path.isfile(vectorPath):
        inferredVectors = bm.Get_Infer_Vector(daumData2, vectorModel)
        # Open the file only after inference succeeded, and close it right
        # after dumping.
        with open(vectorPath, 'wb') as fout:
            pickle.dump(inferredVectors, fout)
        # Release the vectors before moving on to the next model.
        del inferredVectors
# Do not keep extra references to the last model/name around.
del vectorName, vectorModel, vectorPath

# Cache the tags of every tagged document once.
tagsPath = daumNewsPath + 'daum_news_tags_y_ct_for_news_classification'
if not os.path.isfile(tagsPath):
    daum_news_tags_y = [doc.tags for doc in tqdm(daumData2)]
    with open(tagsPath, 'wb') as fout:
        pickle.dump(daum_news_tags_y, fout)
    del daum_news_tags_y

# The tagged documents are not used again in this section.
if 'daumData2' in locals():
    del daumData2

In [15]:
# Independent copy of the columns that are written out next to the
# predictions.
extDaumData = daumData[['id', 'title', 'extracted_keywords']].copy()

#### model1

In [16]:
# Paths under classifierPath whose names end with the model1 identifier.
classifierList = glob('{}*{}'.format(classifierPath, name1))

In [17]:
# bm.LoadClassifier is called once per path; its results are fed to dict(),
# so each one is used as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

SVC
RandomForestClassifier
XGBoost
NeuralNetwork_1
NeuralNetwork_2
LogisticRegression


In [18]:
# Load the cached vectors for model1; the `with` block closes the file
# handle after reading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
daum_news_by_m1_name = daumNewsPath+'daum_news_'+name1
with open(daum_news_by_m1_name, 'rb') as fin:
    daum_news_by_m1 = pickle.load(fin)

In [24]:
%%time
# Silence warnings for the rest of the session (filterwarnings is global).
warnings.filterwarnings('ignore')
# Run every loaded classifier on the model1 vectors. The results are fed to
# dict(), so bm.PredictNewsClassification must return a (key, value) pair.
predictOutcome = dict(
    bm.PredictNewsClassification(daum_news_by_m1, clfName, clf)
    for clfName, clf in loadClassifierDict.items()
)
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
# Decode the encoded labels one column at a time: inverse_transform expects a
# 1-D array rather than a scalar, and this avoids the deprecated
# DataFrame.applymap. Each value still goes through int() first.
predictOutcome = predictOutcome.apply(
    lambda col: pd.Series(le.inverse_transform(col.map(int)), index=col.index)
)
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
# index=False states explicitly that the row index is not written.
predictOutcome.to_csv('./outcome/outcome_news_classification_'+name1, index=False, encoding='utf-8')

9372it [00:00, 505666.76it/s]
9372it [00:00, 629417.60it/s]
9372it [00:00, 432133.43it/s]


CPU times: user 56.9 s, sys: 1.24 s, total: 58.2 s
Wall time: 58.7 s


#### model2

In [25]:
# Paths under classifierPath whose names end with the model2 identifier.
classifierList = glob('{}*{}'.format(classifierPath, name2))

In [26]:
# bm.LoadClassifier is called once per path; its results are fed to dict(),
# so each one is used as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

SVC
XGBoost
NeuralNetwork_2
RandomForestClassifier
NeuralNetwork_1
LogisticRegression


In [27]:
# Load the cached vectors for model2; the `with` block closes the file
# handle after reading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
daum_news_by_m2_name = daumNewsPath+'daum_news_'+name2
with open(daum_news_by_m2_name, 'rb') as fin:
    daum_news_by_m2 = pickle.load(fin)

In [28]:
%%time
# Silence warnings for the rest of the session (filterwarnings is global).
warnings.filterwarnings('ignore')
# Run every loaded classifier on the model2 vectors. The results are fed to
# dict(), so bm.PredictNewsClassification must return a (key, value) pair.
predictOutcome = dict(
    bm.PredictNewsClassification(daum_news_by_m2, clfName, clf)
    for clfName, clf in loadClassifierDict.items()
)
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
# Decode the encoded labels one column at a time: inverse_transform expects a
# 1-D array rather than a scalar, and this avoids the deprecated
# DataFrame.applymap. Each value still goes through int() first.
predictOutcome = predictOutcome.apply(
    lambda col: pd.Series(le.inverse_transform(col.map(int)), index=col.index)
)
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
# index=False states explicitly that the row index is not written.
predictOutcome.to_csv('./outcome/outcome_news_classification_'+name2, index=False, encoding='utf-8')

9372it [00:00, 744573.57it/s]
9372it [00:00, 932708.91it/s]
9372it [00:00, 705018.60it/s]


CPU times: user 1min 9s, sys: 998 ms, total: 1min 10s
Wall time: 1min 10s


#### model3

In [29]:
# Paths under classifierPath whose names end with the model3 identifier.
classifierList = glob('{}*{}'.format(classifierPath, name3))

In [30]:
# bm.LoadClassifier is called once per path; its results are fed to dict(),
# so each one is used as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

RandomForestClassifier
LogisticRegression
SVC
NeuralNetwork_1
XGBoost
NeuralNetwork_2


In [31]:
# Load the cached vectors for model3; the `with` block closes the file
# handle after reading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
daum_news_by_m3_name = daumNewsPath+'daum_news_'+name3
with open(daum_news_by_m3_name, 'rb') as fin:
    daum_news_by_m3 = pickle.load(fin)

In [32]:
%%time
# Silence warnings for the rest of the session (filterwarnings is global).
warnings.filterwarnings('ignore')
# Run every loaded classifier on the model3 vectors. The results are fed to
# dict(), so bm.PredictNewsClassification must return a (key, value) pair.
predictOutcome = dict(
    bm.PredictNewsClassification(daum_news_by_m3, clfName, clf)
    for clfName, clf in loadClassifierDict.items()
)
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
# Decode the encoded labels one column at a time: inverse_transform expects a
# 1-D array rather than a scalar, and this avoids the deprecated
# DataFrame.applymap. Each value still goes through int() first.
predictOutcome = predictOutcome.apply(
    lambda col: pd.Series(le.inverse_transform(col.map(int)), index=col.index)
)
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
# index=False states explicitly that the row index is not written.
predictOutcome.to_csv('./outcome/outcome_news_classification_'+name3, index=False, encoding='utf-8')

9372it [00:00, 160318.02it/s]
9372it [00:00, 458195.11it/s]
9372it [00:00, 279709.80it/s]


CPU times: user 1min 7s, sys: 1.25 s, total: 1min 8s
Wall time: 1min 12s


### Mecab

#### News to Tagged Document

In [33]:
# Tagged Daum documents produced with the `mecab` tagger are cached on disk:
# load the cache when present, otherwise build the documents and store them.
taggedDaumPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_mecab_for_doc2vec_news_classification.pickled'
if os.path.isfile(taggedDaumPath):
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(taggedDaumPath, 'rb') as fin:
        daumData2 = pickle.load(fin)
else:
    daumData2 = bm.MakeTaggedDataDAUM(daumData, TaggedDocument, mecab, stopwords, 'daum')
    with open(taggedDaumPath, 'wb') as fout:
        pickle.dump(daumData2, fout)



#### Load Model

In [34]:
# Load the three Doc2Vec models whose file names end in `by-mecab`; they
# replace the models previously bound to model1, model2 and model3.
loadDoc2Vec = doc2vec.Doc2Vec.load
model1 = loadDoc2Vec(loadModelPath + 'doc2vec_size-500_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-mecab.model')
model2 = loadDoc2Vec(loadModelPath + 'doc2vec_size-500_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-mecab.model')
model3 = loadDoc2Vec(loadModelPath + 'doc2vec_size-500_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-mecab.model')

#### Tagged Document to Vector

In [35]:
taggerName = 'mecab'

# Build a file-name friendly identifier per model: '(', ')', ',' and '/' in
# str(model) are replaced by '-', then the tagger name is appended.
# The pattern is a raw string so its backslashes are not parsed as (invalid)
# string escape sequences.
name1 = '-'.join(re.split(r'[\(\),\/]', str(model1))) + taggerName
name2 = '-'.join(re.split(r'[\(\),\/]', str(model2))) + taggerName
name3 = '-'.join(re.split(r'[\(\),\/]', str(model3))) + taggerName

# Infer the document vectors with each model and cache them on disk; a model
# whose vector file already exists is skipped.
for vectorName, vectorModel in ((name1, model1), (name2, model2), (name3, model3)):
    vectorPath = daumNewsPath + 'daum_news_' + vectorName
    if not os.path.isfile(vectorPath):
        inferredVectors = bm.Get_Infer_Vector(daumData2, vectorModel)
        # Open the file only after inference succeeded, and close it right
        # after dumping.
        with open(vectorPath, 'wb') as fout:
            pickle.dump(inferredVectors, fout)
        # Release the vectors before moving on to the next model.
        del inferredVectors
# Do not keep extra references to the last model/name around.
del vectorName, vectorModel, vectorPath

# Cache the tags of every tagged document once.
# NOTE(review): this file name still says 'ct' although taggerName is
# 'mecab', so the branch is skipped whenever the file already exists under
# that name - confirm that sharing the file between taggers is intended.
tagsPath = daumNewsPath + 'daum_news_tags_y_ct_for_news_classification'
if not os.path.isfile(tagsPath):
    daum_news_tags_y = [doc.tags for doc in tqdm(daumData2)]
    with open(tagsPath, 'wb') as fout:
        pickle.dump(daum_news_tags_y, fout)
    del daum_news_tags_y

# The tagged documents are not used again in this section.
if 'daumData2' in locals():
    del daumData2

In [36]:
# Independent copy of the columns that are written out next to the
# predictions.
extDaumData = daumData[['id', 'title', 'extracted_keywords']].copy()

#### model1

In [37]:
# Paths under classifierPath whose names end with the model1 identifier.
classifierList = glob('{}*{}'.format(classifierPath, name1))

In [38]:
# bm.LoadClassifier is called once per path; its results are fed to dict(),
# so each one is used as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

XGBoost
NeuralNetwork_1
SVC
RandomForestClassifier
LogisticRegression
NeuralNetwork_2


In [39]:
# Load the cached vectors for model1; the `with` block closes the file
# handle after reading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
daum_news_by_m1_name = daumNewsPath+'daum_news_'+name1
with open(daum_news_by_m1_name, 'rb') as fin:
    daum_news_by_m1 = pickle.load(fin)

In [40]:
%%time
# Silence warnings for the rest of the session (filterwarnings is global).
warnings.filterwarnings('ignore')
# Run every loaded classifier on the model1 vectors. The results are fed to
# dict(), so bm.PredictNewsClassification must return a (key, value) pair.
predictOutcome = dict(
    bm.PredictNewsClassification(daum_news_by_m1, clfName, clf)
    for clfName, clf in loadClassifierDict.items()
)
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
# Decode the encoded labels one column at a time: inverse_transform expects a
# 1-D array rather than a scalar, and this avoids the deprecated
# DataFrame.applymap. Each value still goes through int() first.
predictOutcome = predictOutcome.apply(
    lambda col: pd.Series(le.inverse_transform(col.map(int)), index=col.index)
)
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
# index=False states explicitly that the row index is not written.
predictOutcome.to_csv('./outcome/outcome_news_classification_'+name1, index=False, encoding='utf-8')

9372it [00:00, 603713.86it/s]
9372it [00:00, 370284.36it/s]
9372it [00:00, 869360.78it/s]


CPU times: user 54.3 s, sys: 827 ms, total: 55.1 s
Wall time: 58.4 s


#### model2

In [41]:
# Paths under classifierPath whose names end with the model2 identifier.
classifierList = glob('{}*{}'.format(classifierPath, name2))

In [42]:
# bm.LoadClassifier is called once per path; its results are fed to dict(),
# so each one is used as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_2
RandomForestClassifier
SVC
LogisticRegression
NeuralNetwork_1
XGBoost


In [43]:
# Load the cached vectors for model2; the `with` block closes the file
# handle after reading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
daum_news_by_m2_name = daumNewsPath+'daum_news_'+name2
with open(daum_news_by_m2_name, 'rb') as fin:
    daum_news_by_m2 = pickle.load(fin)

In [44]:
%%time
# Silence warnings for the rest of the session (filterwarnings is global).
warnings.filterwarnings('ignore')
# Run every loaded classifier on the model2 vectors. The results are fed to
# dict(), so bm.PredictNewsClassification must return a (key, value) pair.
predictOutcome = dict(
    bm.PredictNewsClassification(daum_news_by_m2, clfName, clf)
    for clfName, clf in loadClassifierDict.items()
)
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
# Decode the encoded labels one column at a time: inverse_transform expects a
# 1-D array rather than a scalar, and this avoids the deprecated
# DataFrame.applymap. Each value still goes through int() first.
predictOutcome = predictOutcome.apply(
    lambda col: pd.Series(le.inverse_transform(col.map(int)), index=col.index)
)
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
# index=False states explicitly that the row index is not written.
predictOutcome.to_csv('./outcome/outcome_news_classification_'+name2, index=False, encoding='utf-8')

9372it [00:00, 564776.61it/s]
9372it [00:00, 877923.33it/s]
9372it [00:00, 789686.55it/s]


CPU times: user 1min 9s, sys: 981 ms, total: 1min 10s
Wall time: 1min 12s


#### model3

In [45]:
# Paths under classifierPath whose names end with the model3 identifier.
classifierList = glob('{}*{}'.format(classifierPath, name3))

In [46]:
# bm.LoadClassifier is called once per path; its results are fed to dict(),
# so each one is used as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_2
XGBoost
SVC
LogisticRegression
NeuralNetwork_1
RandomForestClassifier


In [47]:
# Load the cached vectors for model3; the `with` block closes the file
# handle after reading.
# NOTE: unpickling can execute arbitrary code; only load trusted files.
daum_news_by_m3_name = daumNewsPath+'daum_news_'+name3
with open(daum_news_by_m3_name, 'rb') as fin:
    daum_news_by_m3 = pickle.load(fin)

In [48]:
%%time
# Silence warnings for the rest of the session (filterwarnings is global).
warnings.filterwarnings('ignore')
# Run every loaded classifier on the model3 vectors. The results are fed to
# dict(), so bm.PredictNewsClassification must return a (key, value) pair.
predictOutcome = dict(
    bm.PredictNewsClassification(daum_news_by_m3, clfName, clf)
    for clfName, clf in loadClassifierDict.items()
)
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
# Decode the encoded labels one column at a time: inverse_transform expects a
# 1-D array rather than a scalar, and this avoids the deprecated
# DataFrame.applymap. Each value still goes through int() first.
predictOutcome = predictOutcome.apply(
    lambda col: pd.Series(le.inverse_transform(col.map(int)), index=col.index)
)
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
# index=False states explicitly that the row index is not written.
predictOutcome.to_csv('./outcome/outcome_news_classification_'+name3, index=False, encoding='utf-8')

9372it [00:00, 529735.42it/s]
9372it [00:00, 624517.69it/s]
9372it [00:00, 802044.79it/s]


CPU times: user 1min 1s, sys: 928 ms, total: 1min 2s
Wall time: 1min 2s
