# 다음(Daum) News Classification Using FastText

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob
import warnings 
from numba import jit

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import FastText, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.multiclass import OneVsRestClassifier

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt


import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Show the compute devices TensorFlow can see (the recorded run lists a single CPU device).
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 3394246986896511570
]


In [3]:
import Basic_Module as bm

## Load Data

In [4]:
# Daum: load the pickled article dump and reshape it into a DataFrame.
# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only load pickles produced by a trusted pipeline.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as daumFile:
    # The context manager closes the handle as soon as loading finishes
    # (the bare open(...) used before never closed it explicitly).
    daumData = pickle.load(daumFile)

# orient='index' turns the top-level keys into the row index; reset_index() moves them
# into a column called 'index', which is then renamed to 'id'.
daumData = (
    pd.DataFrame.from_dict(daumData, orient='index')
    .reset_index()
    .rename(columns={'index': 'id'})
)

# Independent copy of the three columns that are merged with the predictions later on.
extDaumData = daumData.loc[:, ['id', 'title', 'extracted_keywords']].copy()
print ('Daum : {}'.format(daumData.shape))

Daum : (9372, 11)


## Stopwords

In [5]:
# Read the stop-word list, one entry per line, stripping surrounding whitespace
# (including the trailing newline). The context manager closes the file handle,
# which the previous open(...).readlines() one-liner never did explicitly.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopwordFile:
    stopwords = [line.strip() for line in stopwordFile]

## Document Labeling

In [6]:
# Lightweight record type with the fields `words`, `tags` and `category`.
# NOTE: this rebinds the name `TaggedDocument` that the import cell pulled in from
# gensim.models.doc2vec, so the gensim class is no longer reachable under that name.
TaggedDocument = namedtuple(typename='TaggedDocument', field_names='words tags category')

## Category

In [7]:
# Load the cached LabelEncoder if it exists; otherwise fit a new one and cache it.
# The path is kept in one place instead of being repeated for the check, load and dump.
labelEncoderPath = './data/pre_data/news_tagged_data/pre_data_category_label_encoder_by_ct_for_doc2vec_news_classification.pickled'
if os.path.isfile(labelEncoderPath):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(labelEncoderPath, 'rb') as encoderFile:
        le = pickle.load(encoderFile)
else:
    # NOTE(review): `naverData` is not defined in the visible part of this notebook, so on a
    # fresh kernel this branch raises NameError unless the cached encoder file is present.
    # Confirm where the Naver training frame is supposed to come from.
    le = LabelEncoder()
    le.fit(naverData['category'])
    # Context manager guarantees the pickle is flushed and the handle closed.
    with open(labelEncoderPath, 'wb') as encoderFile:
        pickle.dump(le, encoderFile)
print (le.classes_)

['IT/과학' '경제' '사회' '생활/문화' '세계' '스포츠' '연예' '정치']


In [8]:
# Directory that holds the trained FastText models, chosen per operating system.
if sys.platform == 'darwin':
    loadModelPath = '/Volumes/disk1/news_model/'
elif sys.platform == 'win32':
    loadModelPath = 'd:/news_model/'
else:
    # Previously no branch matched on other platforms (e.g. Linux), leaving loadModelPath
    # undefined and producing a NameError only later, in the model-loading cells.
    # Fall back to the NEWS_MODEL_PATH environment variable or a relative directory;
    # os.path.join(..., '') guarantees the trailing separator that the later
    # `loadModelPath + file_name` concatenations rely on.
    loadModelPath = os.path.join(os.environ.get('NEWS_MODEL_PATH', './news_model/'), '')
    warnings.warn(
        "No model directory is configured for platform {!r}; using {!r}. "
        "Set the NEWS_MODEL_PATH environment variable to override.".format(sys.platform, loadModelPath)
    )

# Project-relative directories for the Daum news data and the saved classifiers.
daumNewsPath = './data/pre_data/news_daum_news/'
classifierPath = './data/pre_data/news_classifier/'

In [9]:
# Preview the first five rows of the loaded Daum frame (bare expression -> rich display).
daumData.head(n=5)

Unnamed: 0,id,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords
0,5a2a61bf588c13481c229d1e,뉴스,2017.12.07,세계일보,1093,911,1,"""밤이 무섭다""..비아그라 공장 연기에 남성들 부작용 호소","주민들은 공장에서 배출된 연기가 '남성이 매우 건강해지는 부작용'을 일으킨다며, ...","[부작용, 비아그라, 아일랜드]","{건강, 남성들, 공장, 부작용, 세보 효과, 연기, 지역}"
1,5a2a61bf588c13481c229d1f,뉴스,2017.12.07,헬스조선,603,386,2,식후 커피·늦은 양치질..점심식사 후 하면 안 좋은 습관 3가지,점심식사를 마친 후 후식으로 커피를 마시는 사람들이 많다. 실제로 직장이 밀집돼 ...,"[커피, 낮잠, 음식물]","{커피, 철분, 건강, 입냄새, 식후, 자세, 치아, 낮잠, 점심 식사, 디스크}"
2,5a2a61bf588c13481c229d20,뉴스,2017.12.07,연합뉴스,1067,811,3,"'십년지기 생매장' 진짜 이유는..""'청부 통정' 알려질까 봐""",(성남=연합뉴스) 최해민 기자 = 십년지기 지인을 산 채로 묻어 살해한 50대 여...,"[살인혐의, 철원, 검찰송치]","{성관계, 경찰, 범행, 진술, 철원, 앙심, 지인, 주변, 남편, 아들}"
3,5a2a61bf588c13481c229d21,뉴스,2017.12.07,헤럴드경제,418,369,4,"신영자, 억 소리나는 갑질","신영자, 적용안된 혐의→검찰 상고에서 인정\n신영자, 얼마를 어떻게 받았나 [헤럴...","[신영자, 갑질, 롯데백화점]","{유통업체, 검찰, 혐의, 매장, 롯데, 네이처리퍼블릭, 신영자 이사장, 징역}"
4,5a2a61bf588c13481c229d22,뉴스,2017.12.07,연합뉴스,434,368,5,"""배신하지마"" 20대女 살인 피의자 유치장서 공범 남친에 쪽지",(청주=연합뉴스) 이승민 기자 = 지난 9월 청주의 한 하천에서 20대 여성을 둔기...,"[공범, 살인, 과자]","{폭행, 경찰, 범행, 남자친구, 쪽지, 혐의, 과자, 유치장}"


In [10]:
# Create the two tagger objects used by this notebook: `ct` is passed to
# bm.MakeTaggedDataDAUM2 in the Twitter section and `mecab` in the Mecab section.
# The two constructions are kept as separate statements so that `ct` stays bound
# even if constructing Mecab raises.
# NOTE(review): these imports sit outside the top import cell; consider moving them there.
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
ct = Twitter()
mecab = Mecab()

## fastText Model

### Twitter

#### Train data set으로부터 TF-IDF Vectorizer를 만듦

In [11]:
# Build the TF-IDF object from the pickled training split tokenised with `ct`.
trainName = './data/pre_data/news_train_test_Data/pre_data_fastText_train_for_news_classification_by_ct.pickled'
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the handle, which the bare open(...) never did explicitly.
with open(trainName, 'rb') as trainFile:
    train = pickle.load(trainFile)
tfidf = bm.Build_tfidf(train)
# Drop the reference to the training documents; only `tfidf` is used by the cells below.
del train

100%|██████████| 12852/12852 [00:00<00:00, 805769.82it/s]


(12852, 73912)
vocab size : 73912


#### News to Tagged Document

In [12]:
# Record type handed to bm.MakeTaggedDataDAUM2 (fields: words, tags, category).
TaggedDocument = namedtuple('TaggedDocument', 'words tags category')

# Tagging the Daum articles with `ct` is slow (the recorded run took over an hour),
# so the result is cached on disk and reloaded when the cache file exists.
daumTaggedPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_ct_for_fastText_news_classification.pickled'
if os.path.isfile(daumTaggedPath):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(daumTaggedPath, 'rb') as taggedFile:
        daumData2 = pickle.load(taggedFile)
else:
    daumData2 = bm.MakeTaggedDataDAUM2(daumData, TaggedDocument, ct, stopwords, 'daum')
    # Context manager guarantees the cache is flushed and the handle closed.
    with open(daumTaggedPath, 'wb') as taggedFile:
        pickle.dump(daumData2, taggedFile)

100%|██████████| 9372/9372 [1:17:24<00:00,  2.02it/s]


#### Load Model

In [13]:
# Load the three FastText models trained on `ct`-tokenised text. The file names differ
# only in the `sg-*` / `cbow_mean-*` part; each load is its own statement so that the
# models loaded so far stay bound if a later file is missing.

# File name variant: sg-0, cbow_mean-0
model1 = FastText.load(
    loadModelPath
    + 'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7'
      '_hs-0_sg-0_cbow_mean-0_min_count-2_by-ct.model'
)
# File name variant: sg-0, cbow_mean-1
model2 = FastText.load(
    loadModelPath
    + 'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7'
      '_hs-0_sg-0_cbow_mean-1_min_count-2_by-ct.model'
)
# File name variant: sg-1, cbow_mean-0
model3 = FastText.load(
    loadModelPath
    + 'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7'
      '_hs-0_sg-1_cbow_mean-0_min_count-2_by-ct.model'
)

#### Model1

In [14]:
# Prepare the classifier input for model1 from the tagged Daum articles and the TF-IDF object.
# The helper returns two values; of these, only `vecs_w2v` is used by the visible cells below.
# NOTE(review): 500 presumably is the embedding dimension (the model file name says size-500)
# — confirm against Basic_Module.
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model1, tfidf, 500, daumData2)
# `modelName` is used below as the glob suffix for classifier files and as the CSV file-name suffix.
modelName = bm.Return_ModelName('fastText', model1,'ct')

  2%|▏         | 1383/80610 [00:00<00:05, 13403.77it/s]

FastText(vocab=80610, size=500, alpha=0.025)


100%|██████████| 80610/80610 [00:02<00:00, 35105.57it/s]
0it [00:00, ?it/s]

running time : 0:00:02.320463
Vectorizing Data


9372it [05:04, 30.79it/s]


scaling Data
total running time : 0:05:07.633408


In [15]:
# Collect the saved classifier files whose names end with the current model name.
# glob() returns matches in arbitrary, filesystem-dependent order (the recorded runs show
# a different load order in every section); sorting makes the load order, and everything
# derived from it, reproducible across runs and machines.
classifierList = sorted(glob(classifierPath+'*'+modelName))

In [16]:
# Load every classifier file; bm.LoadClassifier must yield a (key, value) pair per file
# because dict() consumes its results directly.
loadClassifierDict = dict(
    bm.LoadClassifier(classifierFile) for classifierFile in classifierList
)

SVC
NeuralNetwork_1
XGBoost
LogisticRegression
RandomForestClassifier
NeuralNetwork_2


In [17]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 148324.16it/s]
9372it [00:00, 399234.39it/s]
9372it [00:00, 382558.34it/s]


CPU times: user 51.2 s, sys: 1.34 s, total: 52.6 s
Wall time: 1min 54s


#### Model2

In [18]:
# Prepare the classifier input for model2; this rebinds `wv1`, `vecs_w2v` and `modelName`
# from the previous model's run. Only `vecs_w2v` and `modelName` are used by the visible cells below.
# NOTE(review): 500 presumably is the embedding dimension (the model file name says size-500)
# — confirm against Basic_Module.
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model2, tfidf, 500, daumData2)
# `modelName` is used below as the glob suffix for classifier files and as the CSV file-name suffix.
modelName = bm.Return_ModelName('fastText', model2,'ct')

  2%|▏         | 1357/80610 [00:00<00:06, 12700.28it/s]

FastText(vocab=80610, size=500, alpha=0.025)


100%|██████████| 80610/80610 [00:02<00:00, 28450.30it/s]
0it [00:00, ?it/s]

running time : 0:00:02.849950
Vectorizing Data


9372it [05:28, 28.52it/s]


scaling Data
total running time : 0:05:32.445265


In [19]:
# Collect the saved classifier files whose names end with the current model name.
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes the load
# order, and everything derived from it, reproducible across runs and machines.
classifierList = sorted(glob(classifierPath+'*'+modelName))

In [20]:
# Load every classifier file; bm.LoadClassifier must yield a (key, value) pair per file
# because dict() consumes its results directly.
loadClassifierDict = dict(
    bm.LoadClassifier(classifierFile) for classifierFile in classifierList
)

NeuralNetwork_2
XGBoost
RandomForestClassifier
LogisticRegression
SVC
NeuralNetwork_1


In [21]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 72528.67it/s]
9372it [00:00, 253392.40it/s]
9372it [00:00, 320803.51it/s]


CPU times: user 40 s, sys: 895 ms, total: 40.9 s
Wall time: 1min 18s


#### Model3

In [22]:
# Prepare the classifier input for model3; this rebinds `wv1`, `vecs_w2v` and `modelName`
# from the previous model's run. Only `vecs_w2v` and `modelName` are used by the visible cells below.
# NOTE(review): 500 presumably is the embedding dimension (the model file name says size-500)
# — confirm against Basic_Module.
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model3, tfidf, 500, daumData2)
# `modelName` is used below as the glob suffix for classifier files and as the CSV file-name suffix.
modelName = bm.Return_ModelName('fastText', model3,'ct')

  3%|▎         | 2788/80610 [00:00<00:02, 26828.09it/s]

FastText(vocab=80610, size=500, alpha=0.025)


100%|██████████| 80610/80610 [00:01<00:00, 43018.18it/s]
0it [00:00, ?it/s]

running time : 0:00:01.879819
Vectorizing Data


9372it [05:51, 26.67it/s]


scaling Data
total running time : 0:05:53.770223


In [23]:
# Collect the saved classifier files whose names end with the current model name.
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes the load
# order, and everything derived from it, reproducible across runs and machines.
classifierList = sorted(glob(classifierPath+'*'+modelName))

In [24]:
# Load every classifier file; bm.LoadClassifier must yield a (key, value) pair per file
# because dict() consumes its results directly.
loadClassifierDict = dict(
    bm.LoadClassifier(classifierFile) for classifierFile in classifierList
)

NeuralNetwork_2
XGBoost
LogisticRegression
RandomForestClassifier
SVC
NeuralNetwork_1


In [25]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 374774.92it/s]
9372it [00:00, 185470.63it/s]
9372it [00:00, 423784.91it/s]


CPU times: user 40.1 s, sys: 643 ms, total: 40.7 s
Wall time: 1min 1s


### Mecab

#### Train data set으로부터 TF-IDF Vectorizer를 만듦

In [26]:
# Build the TF-IDF object from the pickled training split tokenised with Mecab.
# This rebinds `tfidf`, replacing the one built for the Twitter section.
trainName = './data/pre_data/news_train_test_Data/pre_data_fastText_train_for_news_classification_by_mecab.pickled'
# NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
# The context manager closes the handle, which the bare open(...) never did explicitly.
with open(trainName, 'rb') as trainFile:
    train = pickle.load(trainFile)
tfidf = bm.Build_tfidf(train)
# Drop the reference to the training documents; only `tfidf` is used by the cells below.
del train

100%|██████████| 12852/12852 [00:00<00:00, 790248.12it/s]


(12852, 73347)
vocab size : 73347


#### News to Tagged Document

In [27]:
# Tag the Daum articles with Mecab, caching the result on disk (the recorded run took
# about eleven minutes). This rebinds `daumData2`, replacing the Twitter-tagged version.
# Relies on `TaggedDocument` as defined in an earlier cell.
daumTaggedPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_mecab_for_fastText_news_classification.pickled'
if os.path.isfile(daumTaggedPath):
    # NOTE(security): pickle.load can execute arbitrary code; only load trusted files.
    with open(daumTaggedPath, 'rb') as taggedFile:
        daumData2 = pickle.load(taggedFile)
else:
    daumData2 = bm.MakeTaggedDataDAUM2(daumData, TaggedDocument, mecab, stopwords, 'daum')
    # Context manager guarantees the cache is flushed and the handle closed.
    with open(daumTaggedPath, 'wb') as taggedFile:
        pickle.dump(daumData2, taggedFile)

100%|██████████| 9372/9372 [11:05<00:00, 14.08it/s]


#### Load Model

In [28]:
# Load the three FastText models trained on Mecab-tokenised text, rebinding model1..model3.
# The file names differ only in the `sg-*` / `cbow_mean-*` part; each load is its own
# statement so that the models loaded so far stay bound if a later file is missing.

# File name variant: sg-0, cbow_mean-0
model1 = FastText.load(
    loadModelPath
    + 'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7'
      '_hs-0_sg-0_cbow_mean-0_min_count-2_by-mecab.model'
)
# File name variant: sg-0, cbow_mean-1
model2 = FastText.load(
    loadModelPath
    + 'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7'
      '_hs-0_sg-0_cbow_mean-1_min_count-2_by-mecab.model'
)
# File name variant: sg-1, cbow_mean-0
model3 = FastText.load(
    loadModelPath
    + 'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7'
      '_hs-0_sg-1_cbow_mean-0_min_count-2_by-mecab.model'
)

#### Model1

In [29]:
# Prepare the classifier input for the Mecab model1 from the Mecab-tagged articles and TF-IDF object.
# The helper returns two values; of these, only `vecs_w2v` is used by the visible cells below.
# NOTE(review): 500 presumably is the embedding dimension (the model file name says size-500)
# — confirm against Basic_Module.
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model1, tfidf, 500, daumData2)
# `modelName` is used below as the glob suffix for classifier files and as the CSV file-name suffix.
modelName = bm.Return_ModelName('fastText', model1,'mecab')

  3%|▎         | 2268/80210 [00:00<00:03, 22466.14it/s]

FastText(vocab=80210, size=500, alpha=0.025)


100%|██████████| 80210/80210 [00:03<00:00, 25410.46it/s]
0it [00:00, ?it/s]

running time : 0:00:03.184548
Vectorizing Data


9372it [06:57, 22.46it/s]


scaling Data
total running time : 0:07:01.275496


In [30]:
# Collect the saved classifier files whose names end with the current model name.
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes the load
# order, and everything derived from it, reproducible across runs and machines.
classifierList = sorted(glob(classifierPath+'*'+modelName))

In [31]:
# Load every classifier file; bm.LoadClassifier must yield a (key, value) pair per file
# because dict() consumes its results directly.
loadClassifierDict = dict(
    bm.LoadClassifier(classifierFile) for classifierFile in classifierList
)

RandomForestClassifier
XGBoost
NeuralNetwork_2
LogisticRegression
NeuralNetwork_1
SVC


In [32]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 375734.97it/s]
9372it [00:00, 240988.11it/s]
9372it [00:00, 214120.06it/s]


CPU times: user 54.1 s, sys: 1.56 s, total: 55.7 s
Wall time: 2min 12s


#### Model2

In [33]:
# Prepare the classifier input for the Mecab model2; this rebinds `wv1`, `vecs_w2v` and `modelName`
# from the previous model's run. Only `vecs_w2v` and `modelName` are used by the visible cells below.
# NOTE(review): 500 presumably is the embedding dimension (the model file name says size-500)
# — confirm against Basic_Module.
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model2, tfidf, 500, daumData2)
# `modelName` is used below as the glob suffix for classifier files and as the CSV file-name suffix.
modelName = bm.Return_ModelName('fastText', model2,'mecab')

  0%|          | 0/80210 [00:00<?, ?it/s]

FastText(vocab=80210, size=500, alpha=0.025)


100%|██████████| 80210/80210 [00:05<00:00, 14661.13it/s]
0it [00:00, ?it/s]

running time : 0:00:05.486503
Vectorizing Data


9372it [06:11, 25.21it/s]


scaling Data
total running time : 0:06:17.729918


In [34]:
# Collect the saved classifier files whose names end with the current model name.
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes the load
# order, and everything derived from it, reproducible across runs and machines.
classifierList = sorted(glob(classifierPath+'*'+modelName))

In [35]:
# Load every classifier file; bm.LoadClassifier must yield a (key, value) pair per file
# because dict() consumes its results directly.
loadClassifierDict = dict(
    bm.LoadClassifier(classifierFile) for classifierFile in classifierList
)

RandomForestClassifier
XGBoost
NeuralNetwork_2
LogisticRegression
SVC
NeuralNetwork_1


In [36]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 406963.56it/s]
9372it [00:00, 139695.36it/s]
9372it [00:00, 289204.88it/s]


CPU times: user 41.3 s, sys: 868 ms, total: 42.1 s
Wall time: 1min 17s


#### Model3

In [37]:
# Prepare the classifier input for the Mecab model3; this rebinds `wv1`, `vecs_w2v` and `modelName`
# from the previous model's run. Only `vecs_w2v` and `modelName` are used by the visible cells below.
# NOTE(review): 500 presumably is the embedding dimension (the model file name says size-500)
# — confirm against Basic_Module.
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model3, tfidf, 500, daumData2)
# `modelName` is used below as the glob suffix for classifier files and as the CSV file-name suffix.
modelName = bm.Return_ModelName('fastText', model3,'mecab')

  3%|▎         | 2778/80210 [00:00<00:02, 27489.68it/s]

FastText(vocab=80210, size=500, alpha=0.025)


100%|██████████| 80210/80210 [00:01<00:00, 41266.08it/s]
0it [00:00, ?it/s]

running time : 0:00:01.951271
Vectorizing Data


9372it [03:52, 40.23it/s]


scaling Data
total running time : 0:03:55.253834


In [38]:
# Collect the saved classifier files whose names end with the current model name.
# glob() returns matches in arbitrary, filesystem-dependent order; sorting makes the load
# order, and everything derived from it, reproducible across runs and machines.
classifierList = sorted(glob(classifierPath+'*'+modelName))

In [39]:
# Load every classifier file; bm.LoadClassifier must yield a (key, value) pair per file
# because dict() consumes its results directly.
loadClassifierDict = dict(
    bm.LoadClassifier(classifierFile) for classifierFile in classifierList
)

NeuralNetwork_2
LogisticRegression
SVC
NeuralNetwork_1
RandomForestClassifier
XGBoost


In [40]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome_news_classification/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

9372it [00:00, 374610.63it/s]
9372it [00:00, 424631.82it/s]
9372it [00:00, 418599.63it/s]


CPU times: user 41.3 s, sys: 683 ms, total: 42 s
Wall time: 49.2 s
