# Daum(다음) News Classification Using Word2Vec

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob
from numba import jit
import warnings

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Show the compute devices TensorFlow can see on this machine.
localDevices = device_lib.list_local_devices()
print(localDevices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 11749583939368616445
]


In [3]:
# Number of CPUs reported by the OS; cpu_count() already returns an int.
cores = multiprocessing.cpu_count()

In [4]:
import Basic_Module as bm

## Load Data

In [5]:
# Daum
# Load the crawled Daum news records: a pickled mapping whose keys become the `id` column.
daumPicklePath = './data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled'
# NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
with open(daumPicklePath, 'rb') as fh:  # context manager closes the handle deterministically
    daumRecords = pickle.load(fh)
# Build the frame in one chain instead of mutating it in place across statements.
daumData = (
    pd.DataFrame.from_dict(daumRecords, orient='index')
    .reset_index()
    .rename(columns={'index': 'id'})
)
# Columns kept next to the predictions when the results are exported later.
extDaumData = daumData.loc[:, ['id', 'title', 'extracted_keywords']].copy()
print('Daum : {}'.format(daumData.shape))

Daum : (9372, 11)


## Stopwords

In [6]:
# Read the stopword list (one entry per line) and strip surrounding whitespace/newlines.
# The `with` block closes the file handle, which the previous bare open() never did.
with open('./data/stopwordsList.txt', encoding='utf-8') as fh:
    stopwords = [line.strip() for line in fh]

## Document Labeling

In [7]:
# Record type for one tokenized article: (words, tags, category).
# NOTE: this rebinding shadows the TaggedDocument imported from gensim at the top of the notebook.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags', 'category'])

## Category

In [8]:
# Load the cached category LabelEncoder, or fit a new one and cache it.
labelEncoderPath = './data/pre_data/news_tagged_data/pre_data_category_label_encoder_by_ct_for_doc2vec_news_classification.pickled'
if os.path.isfile(labelEncoderPath):
    # NOTE(security): pickle.load can execute arbitrary code; only load caches produced by this project.
    with open(labelEncoderPath, 'rb') as fh:
        le = pickle.load(fh)
else:
    # Fitting needs the labelled frame `naverData`, which is not created by any cell visible
    # in this notebook. Fail with an actionable message instead of a bare NameError.
    if 'naverData' not in globals():
        raise FileNotFoundError(
            "Label encoder cache not found at {!r} and 'naverData' is not loaded; "
            "restore the cache file or load the labelled training data first.".format(labelEncoderPath)
        )
    le = LabelEncoder()
    le.fit(naverData['category'])
    # Close the handle explicitly so the cache is fully flushed to disk.
    with open(labelEncoderPath, 'wb') as fh:
        pickle.dump(le, fh)
print(le.classes_)

['IT/과학' '경제' '사회' '생활/문화' '세계' '스포츠' '연예' '정치']


In [9]:
# Directory holding the trained Word2Vec models. It lives outside the repository, so the
# default depends on the host OS; set the NEWS_MODEL_PATH environment variable to override it.
_defaultModelPaths = {
    'darwin': '/Volumes/disk1/news_model/',
    'win32': 'd:/news_model/',
}
_modelPathOverride = os.environ.get('NEWS_MODEL_PATH')
if _modelPathOverride:
    # Later cells build file names with `loadModelPath + name`, so guarantee a trailing separator.
    loadModelPath = os.path.join(_modelPathOverride, '')
else:
    # Any other platform (e.g. Linux) used to leave loadModelPath undefined, which surfaced
    # later as a NameError; fall back to a local directory instead.
    loadModelPath = _defaultModelPaths.get(sys.platform, './news_model/')
daumNewsPath = './data/pre_data/news_daum_news/'
classifierPath = './data/pre_data/news_classifier/'

In [10]:
# Preview the first five rows of the loaded Daum frame.
daumData.head(n=5)

Unnamed: 0,id,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords
0,5a2a61bf588c13481c229d1e,뉴스,2017.12.07,세계일보,1093,911,1,"""밤이 무섭다""..비아그라 공장 연기에 남성들 부작용 호소","주민들은 공장에서 배출된 연기가 '남성이 매우 건강해지는 부작용'을 일으킨다며, ...","[부작용, 비아그라, 아일랜드]","{공장, 건강, 남성들, 지역, 부작용, 연기, 세보 효과}"
1,5a2a61bf588c13481c229d1f,뉴스,2017.12.07,헬스조선,603,386,2,식후 커피·늦은 양치질..점심식사 후 하면 안 좋은 습관 3가지,점심식사를 마친 후 후식으로 커피를 마시는 사람들이 많다. 실제로 직장이 밀집돼 ...,"[커피, 낮잠, 음식물]","{치아, 건강, 입냄새, 커피, 점심 식사, 낮잠, 디스크, 철분, 식후, 자세}"
2,5a2a61bf588c13481c229d20,뉴스,2017.12.07,연합뉴스,1067,811,3,"'십년지기 생매장' 진짜 이유는..""'청부 통정' 알려질까 봐""",(성남=연합뉴스) 최해민 기자 = 십년지기 지인을 산 채로 묻어 살해한 50대 여...,"[살인혐의, 철원, 검찰송치]","{경찰, 성관계, 지인, 앙심, 진술, 철원, 아들, 주변, 남편, 범행}"
3,5a2a61bf588c13481c229d21,뉴스,2017.12.07,헤럴드경제,418,369,4,"신영자, 억 소리나는 갑질","신영자, 적용안된 혐의→검찰 상고에서 인정\n신영자, 얼마를 어떻게 받았나 [헤럴...","[신영자, 갑질, 롯데백화점]","{신영자 이사장, 혐의, 롯데, 검찰, 유통업체, 징역, 매장, 네이처리퍼블릭}"
4,5a2a61bf588c13481c229d22,뉴스,2017.12.07,연합뉴스,434,368,5,"""배신하지마"" 20대女 살인 피의자 유치장서 공범 남친에 쪽지",(청주=연합뉴스) 이승민 기자 = 지난 9월 청주의 한 하천에서 20대 여성을 둔기...,"[공범, 살인, 과자]","{경찰, 혐의, 폭행, 쪽지, 유치장, 남자친구, 과자, 범행}"


In [11]:
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab

# Tokenizers passed to MakeTaggedDataDAUM2 below: ckonlpy's Twitter tagger (`ct`) and Mecab.
ct, mecab = Twitter(), Mecab()

## Word2Vec Model

### Twitter

#### Train data set으로부터 TF-IDF Vectorizer를 만듦

In [12]:
# Build the TF-IDF vectorizer from the ct-tokenized training set.
trainName = './data/pre_data/news_train_test_Data/pre_data_word2vec_train_for_news_classification_by_ct.pickled'
# NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
with open(trainName, 'rb') as fh:  # closes the handle that a bare open() would leak
    train = pickle.load(fh)
tfidf = bm.Build_tfidf(train)
# The training documents are only needed to build the vectorizer; release the memory.
del train

100%|██████████| 12852/12852 [00:00<00:00, 271236.12it/s]


(12852, 73763)
vocab size : 73763


#### News to Tagged Document

In [13]:
# Tokenize the Daum articles with the ct tagger, reusing the pickled result when it exists.
# The recorded run of the tokenizing step took over an hour, hence the cache.
daumTaggedPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_ct_for_word2vec_news_classification.pickled'
if os.path.isfile(daumTaggedPath):
    # NOTE(security): pickle.load can execute arbitrary code; only load caches produced by this project.
    with open(daumTaggedPath, 'rb') as fh:
        daumData2 = pickle.load(fh)
else:
    daumData2 = bm.MakeTaggedDataDAUM2(daumData, TaggedDocument, ct, stopwords, 'daum')
    # Close the handle explicitly so the cache is fully written before it is read again.
    with open(daumTaggedPath, 'wb') as fh:
        pickle.dump(daumData2, fh)

100%|██████████| 9372/9372 [1:17:52<00:00,  2.01it/s]


#### Load Model

In [14]:
# Three Word2Vec models trained on ct-tokenized text. Per the file names they share
# size/epoch/window/negative/min_count and differ only in the sg / cbow_mean flags.
ctModelFiles = (
    'word2vec_size-500_epoch-20_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-ct.model',
    'word2vec_size-500_epoch-20_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-ct.model',
    'word2vec_size-500_epoch-20_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-ct.model',
)
model1, model2, model3 = [Word2Vec.load(loadModelPath + fileName) for fileName in ctModelFiles]

#### Model1

In [15]:
# Prepare the Daum article vectors with model1 (ct) and derive the name used to find its classifiers.
# 500 presumably is the embedding size (the model files are named size-500) — confirm in Basic_Module.
currentModel = model1
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(currentModel, tfidf, 500, daumData2)
modelName = bm.Return_ModelName('word2vec', currentModel, 'ct')

  6%|▌         | 4542/80534 [00:00<00:01, 45269.16it/s]

Word2Vec(vocab=80534, size=500, alpha=0.025)


100%|██████████| 80534/80534 [00:01<00:00, 42735.10it/s]
2it [00:00, 16.41it/s]

running time : 0:00:01.912634
Vectorizing Data


9372it [04:14, 36.79it/s]


scaling Data
total running time : 0:04:17.644067


In [16]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so the classifier
# order (and the column order of the exported CSV) is reproducible between runs.
classifierList = sorted(glob(classifierPath + '*' + modelName))

In [17]:
# Load every saved classifier; each LoadClassifier result is consumed by dict() as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_2
XGBoost
RandomForestClassifier
LogisticRegression
NeuralNetwork_1
SVC


In [18]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 377738.86it/s]
9372it [00:00, 330172.50it/s]
9372it [00:00, 377177.07it/s]


CPU times: user 48.6 s, sys: 1.22 s, total: 49.8 s
Wall time: 1min 20s


#### Model2

In [19]:
# Prepare the Daum article vectors with model2 (ct) and derive the name used to find its classifiers.
# 500 presumably is the embedding size (the model files are named size-500) — confirm in Basic_Module.
currentModel = model2
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(currentModel, tfidf, 500, daumData2)
modelName = bm.Return_ModelName('word2vec', currentModel, 'ct')

  3%|▎         | 2063/80534 [00:00<00:03, 20623.19it/s]

Word2Vec(vocab=80534, size=500, alpha=0.025)


100%|██████████| 80534/80534 [00:02<00:00, 30755.15it/s]
1it [00:00,  6.12it/s]

running time : 0:00:02.624976
Vectorizing Data


9372it [04:42, 33.21it/s]


scaling Data
total running time : 0:04:45.615299


In [20]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so the classifier
# order (and the column order of the exported CSV) is reproducible between runs.
classifierList = sorted(glob(classifierPath + '*' + modelName))

In [21]:
# Load every saved classifier; each LoadClassifier result is consumed by dict() as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_1
SVC
XGBoost
LogisticRegression
RandomForestClassifier
NeuralNetwork_2


In [22]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 151431.98it/s]
9372it [00:00, 75444.97it/s]
9372it [00:00, 401887.49it/s]


CPU times: user 41.7 s, sys: 1.31 s, total: 43 s
Wall time: 1min 37s


#### Model3

In [23]:
# Prepare the Daum article vectors with model3 (ct) and derive the name used to find its classifiers.
# 500 presumably is the embedding size (the model files are named size-500) — confirm in Basic_Module.
currentModel = model3
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(currentModel, tfidf, 500, daumData2)
modelName = bm.Return_ModelName('word2vec', currentModel, 'ct')

  2%|▏         | 1820/80534 [00:00<00:04, 18195.07it/s]

Word2Vec(vocab=80534, size=500, alpha=0.025)


100%|██████████| 80534/80534 [00:02<00:00, 30770.39it/s]
1it [00:00,  7.63it/s]

running time : 0:00:02.632258
Vectorizing Data


9372it [04:48, 32.44it/s]


scaling Data
total running time : 0:04:52.261539


In [24]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so the classifier
# order (and the column order of the exported CSV) is reproducible between runs.
classifierList = sorted(glob(classifierPath + '*' + modelName))

In [25]:
# Load every saved classifier; each LoadClassifier result is consumed by dict() as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_1
SVC
XGBoost
RandomForestClassifier
LogisticRegression
NeuralNetwork_2


In [26]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 134862.40it/s]
9372it [00:00, 164340.85it/s]
9372it [00:00, 148127.42it/s]


CPU times: user 1min 7s, sys: 1.33 s, total: 1min 9s
Wall time: 2min 26s


### Mecab

#### Train data set으로부터 TF-IDF Vectorizer를 만듦

In [27]:
# Build the TF-IDF vectorizer from the Mecab-tokenized training set.
# The record type is re-declared here with the same (words, tags, category) fields as before.
TaggedDocument = namedtuple('TaggedDocument', 'words tags category')
trainName = './data/pre_data/news_train_test_Data/pre_data_word2vec_train_for_news_classification_by_mecab.pickled'
# NOTE(security): pickle.load can execute arbitrary code; only load files produced by this project.
with open(trainName, 'rb') as fh:  # closes the handle that a bare open() would leak
    train = pickle.load(fh)
tfidf = bm.Build_tfidf(train)
# The training documents are only needed to build the vectorizer; release the memory.
del train

100%|██████████| 12852/12852 [00:00<00:00, 757414.57it/s]


(12852, 73416)
vocab size : 73416


#### News to Tagged Document

In [28]:
# Tokenize the Daum articles with Mecab, reusing the pickled result when it exists.
# The recorded run of the tokenizing step took about ten minutes, hence the cache.
daumTaggedPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_mecab_for_word2vec_news_classification.pickled'
if os.path.isfile(daumTaggedPath):
    # NOTE(security): pickle.load can execute arbitrary code; only load caches produced by this project.
    with open(daumTaggedPath, 'rb') as fh:
        daumData2 = pickle.load(fh)
else:
    daumData2 = bm.MakeTaggedDataDAUM2(daumData, TaggedDocument, mecab, stopwords, 'daum')
    # Close the handle explicitly so the cache is fully written before it is read again.
    with open(daumTaggedPath, 'wb') as fh:
        pickle.dump(daumData2, fh)

100%|██████████| 9372/9372 [10:34<00:00, 14.78it/s]


#### Load Model

In [29]:
# Three Word2Vec models trained on Mecab-tokenized text. Per the file names they share
# size/epoch/window/negative/min_count and differ only in the sg / cbow_mean flags.
mecabModelFiles = (
    'word2vec_size-500_epoch-20_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-mecab.model',
    'word2vec_size-500_epoch-20_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-mecab.model',
    'word2vec_size-500_epoch-20_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-mecab.model',
)
model1, model2, model3 = [Word2Vec.load(loadModelPath + fileName) for fileName in mecabModelFiles]

#### Model1

In [30]:
# Prepare the Daum article vectors with model1 (mecab) and derive the name used to find its classifiers.
# 500 presumably is the embedding size (the model files are named size-500) — confirm in Basic_Module.
currentModel = model1
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(currentModel, tfidf, 500, daumData2)
modelName = bm.Return_ModelName('word2vec', currentModel, 'mecab')

  2%|▏         | 1912/80260 [00:00<00:04, 18783.38it/s]

Word2Vec(vocab=80260, size=500, alpha=0.025)


100%|██████████| 80260/80260 [00:03<00:00, 21288.49it/s]
0it [00:00, ?it/s]

running time : 0:00:03.796981
Vectorizing Data


9372it [05:20, 29.20it/s]


scaling Data
total running time : 0:05:30.228746


In [31]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so the classifier
# order (and the column order of the exported CSV) is reproducible between runs.
classifierList = sorted(glob(classifierPath + '*' + modelName))

In [32]:
# Load every saved classifier; each LoadClassifier result is consumed by dict() as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

RandomForestClassifier
XGBoost
NeuralNetwork_2
LogisticRegression
NeuralNetwork_1
SVC


In [None]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 317284.55it/s]
9372it [00:00, 79645.30it/s]
9372it [00:00, 165258.37it/s]


CPU times: user 54.1 s, sys: 1.7 s, total: 55.8 s
Wall time: 2min 17s


#### Model2

In [None]:
# Prepare the Daum article vectors with model2 (mecab) and derive the name used to find its classifiers.
# 500 presumably is the embedding size (the model files are named size-500) — confirm in Basic_Module.
currentModel = model2
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(currentModel, tfidf, 500, daumData2)
modelName = bm.Return_ModelName('word2vec', currentModel, 'mecab')

  0%|          | 0/80260 [00:00<?, ?it/s]

Word2Vec(vocab=80260, size=500, alpha=0.025)


100%|██████████| 80260/80260 [00:03<00:00, 25197.30it/s]
1it [00:00,  8.43it/s]

running time : 0:00:03.194765
Vectorizing Data


8511it [04:56, 28.70it/s]

In [None]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so the classifier
# order (and the column order of the exported CSV) is reproducible between runs.
classifierList = sorted(glob(classifierPath + '*' + modelName))

In [36]:
# Load every saved classifier; each LoadClassifier result is consumed by dict() as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

In [37]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 223441.90it/s]
9372it [00:00, 103733.32it/s]
9372it [00:00, 112154.78it/s]


CPU times: user 44.1 s, sys: 1.59 s, total: 45.6 s
Wall time: 1min 49s


#### Model3

In [38]:
# Prepare the Daum article vectors with model3 (mecab) and derive the name used to find its classifiers.
# 500 presumably is the embedding size (the model files are named size-500) — confirm in Basic_Module.
currentModel = model3
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(currentModel, tfidf, 500, daumData2)
modelName = bm.Return_ModelName('word2vec', currentModel, 'mecab')

  1%|          | 917/80260 [00:00<00:09, 8752.59it/s]

Word2Vec(vocab=80260, size=500, alpha=0.025)


100%|██████████| 80260/80260 [00:03<00:00, 26628.90it/s]
0it [00:00, ?it/s]

running time : 0:00:03.023462
Vectorizing Data


9372it [04:42, 33.18it/s]


scaling Data
total running time : 0:04:46.134652


In [39]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so the classifier
# order (and the column order of the exported CSV) is reproducible between runs.
classifierList = sorted(glob(classifierPath + '*' + modelName))

In [40]:
# Load every saved classifier; each LoadClassifier result is consumed by dict() as a (key, value) pair.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_2
LogisticRegression
SVC
NeuralNetwork_1
RandomForestClassifier
XGBoost


In [41]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 393996.36it/s]
9372it [00:00, 415515.54it/s]
9372it [00:00, 337633.82it/s]


CPU times: user 44.4 s, sys: 872 ms, total: 45.3 s
Wall time: 1min 22s
