# 다음의 News Classification Using FastText

In [19]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os
from glob import glob
import warnings 
from numba import jit

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import FastText, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE
from sklearn.multiclass import OneVsRestClassifier

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt


import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [2]:
# Show which compute devices TensorFlow reports for this session.
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 12801240253654533557
]


In [3]:
import Basic_Module as bm

## Load Data

In [4]:
# Daum
# Load the crawled Daum news (a pickled dict keyed by document id) and turn it
# into a DataFrame whose former index becomes a regular 'id' column.
# NOTE: pickle can execute arbitrary code while loading; only open trusted files.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as fin:
    daumData = pickle.load(fin)
daumData = pd.DataFrame.from_dict(daumData, orient = 'index')
daumData.reset_index(inplace = True)
daumData.rename(columns = {'index' : 'id'}, inplace = True)
# Keep only the columns that are written out together with the predictions.
extDaumData = daumData.loc[:,['id','title','extracted_keywords']].copy()
print ('Daum : {}'.format(daumData.shape))

Daum : (9372, 11)


## Stopwords

In [5]:
# Read the stopword list, one entry per line, with surrounding whitespace removed.
# The context manager closes the file once the list has been built.
with open('./data/stopwordsList.txt', encoding='utf-8') as fin:
    stopwords = [line.strip() for line in fin]

## Document Labeling

In [30]:
# Record type for one tokenized document: its words, its tags and its category.
# NOTE: this rebinds the name TaggedDocument, shadowing the class imported from
# gensim.models.doc2vec at the top of the notebook.
TaggedDocument = namedtuple('TaggedDocument', 'words tags category')

## Category

In [7]:
# Reuse the cached LabelEncoder when the pickle exists; otherwise fit a new one
# and cache it. Files are opened through context managers so that the handles
# are closed (and the dump is flushed to disk) deterministically.
labelEncoderPath = './data/pre_data/news_tagged_data/pre_data_category_label_encoder_by_ct_for_doc2vec_news_classification.pickled'
if os.path.isfile(labelEncoderPath):
    with open(labelEncoderPath, 'rb') as fin:
        le = pickle.load(fin)
else:
    # NOTE(review): `naverData` is not defined in the visible part of this
    # notebook, so this branch raises NameError unless it was loaded elsewhere.
    le = LabelEncoder()
    le.fit(naverData['category'])
    with open(labelEncoderPath, 'wb') as fout:
        pickle.dump(le, fout)
print (le.classes_)

['IT/과학' '경제' '사회' '생활/문화' '세계' '스포츠' '연예' '정치']


In [8]:
# Data directories, identical on every platform.
classifierPath = './data/pre_data/news_classifier/'
daumNewsPath = './data/pre_data/news_daum_news/'

# The embedding models are stored on a machine-specific drive.
# NOTE: on any platform other than Windows/macOS loadModelPath stays undefined.
if sys.platform == 'win32':
    loadModelPath = 'd:/news_model/'
elif sys.platform == 'darwin':
    loadModelPath = '/Volumes/disk1/news_model/'

In [9]:
# Preview the first rows of the loaded Daum news to check the columns.
daumData.head()

Unnamed: 0,id,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords
0,5a2a61bf588c13481c229d1e,뉴스,2017.12.07,세계일보,1093,911,1,"""밤이 무섭다""..비아그라 공장 연기에 남성들 부작용 호소","주민들은 공장에서 배출된 연기가 '남성이 매우 건강해지는 부작용'을 일으킨다며, ...","[부작용, 비아그라, 아일랜드]","{세보 효과, 지역, 남성들, 건강, 부작용, 연기, 공장}"
1,5a2a61bf588c13481c229d1f,뉴스,2017.12.07,헬스조선,603,386,2,식후 커피·늦은 양치질..점심식사 후 하면 안 좋은 습관 3가지,점심식사를 마친 후 후식으로 커피를 마시는 사람들이 많다. 실제로 직장이 밀집돼 ...,"[커피, 낮잠, 음식물]","{식후, 디스크, 낮잠, 치아, 자세, 입냄새, 건강, 커피, 점심 식사, 철분}"
2,5a2a61bf588c13481c229d20,뉴스,2017.12.07,연합뉴스,1067,811,3,"'십년지기 생매장' 진짜 이유는..""'청부 통정' 알려질까 봐""",(성남=연합뉴스) 최해민 기자 = 십년지기 지인을 산 채로 묻어 살해한 50대 여...,"[살인혐의, 철원, 검찰송치]","{철원, 진술, 아들, 앙심, 경찰, 성관계, 범행, 남편, 지인, 주변}"
3,5a2a61bf588c13481c229d21,뉴스,2017.12.07,헤럴드경제,418,369,4,"신영자, 억 소리나는 갑질","신영자, 적용안된 혐의→검찰 상고에서 인정\n신영자, 얼마를 어떻게 받았나 [헤럴...","[신영자, 갑질, 롯데백화점]","{징역, 매장, 네이처리퍼블릭, 혐의, 롯데, 신영자 이사장, 유통업체, 검찰}"
4,5a2a61bf588c13481c229d22,뉴스,2017.12.07,연합뉴스,434,368,5,"""배신하지마"" 20대女 살인 피의자 유치장서 공범 남친에 쪽지",(청주=연합뉴스) 이승민 기자 = 지난 9월 청주의 한 하천에서 20대 여성을 둔기...,"[공범, 살인, 과자]","{남자친구, 폭행, 쪽지, 경찰, 유치장, 혐의, 범행, 과자}"


In [10]:
# Two Korean morphological analyzers are compared in this notebook:
# the Twitter tagger from ckonlpy and MeCab from konlpy.
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
ct = Twitter()    # used in the "Twitter" section (model files suffixed "by-ct")
mecab = Mecab()   # used in the "Mecab" section (model files suffixed "by-mecab")

## Train data set으로부터 TF-IDF Vectorizer를 만듦

In [11]:
# Build the TF-IDF vectorizer from the pickled training set, then drop the
# training data again because only the vectorizer is needed afterwards.
trainName = './data/pre_data/news_train_test_Data/pre_data_word2vec_train_for_news_classification_by_mecab.pickled'
with open(trainName, 'rb') as fin:  # close the handle as soon as loading is done
    train = pickle.load(fin)
tfidf = bm.Build_tfidf(train)
del train

100%|██████████| 12852/12852 [00:00<00:00, 1206202.62it/s]


(12852, 73416)
vocab size : 73416


## FastText Model

### Twitter

#### News to Tagged Document

In [12]:
# Tokenize the Daum news with the Twitter tagger, or reuse the cached result
# when it exists. Context managers make sure the pickle files are closed
# (and the cache fully written) before the notebook moves on.
taggedDataPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_ct_for_fastText_news_classification.pickled'
if os.path.isfile(taggedDataPath):
    with open(taggedDataPath, 'rb') as fin:
        daumData2 = pickle.load(fin)
else:
    daumData2 = bm.MakeTaggedDataDAUM2(daumData, TaggedDocument, ct, stopwords, 'daum')
    with open(taggedDataPath, 'wb') as fout:
        pickle.dump(daumData2, fout)

100%|██████████| 9372/9372 [14:19<00:00, 10.91it/s]


#### Load Model

In [13]:
# Load three pretrained FastText models for the Twitter ("by-ct") tokenization.
# Per the file names they differ only in the sg / cbow_mean settings:
# model1: sg-0, cbow_mean-0; model2: sg-0, cbow_mean-1; model3: sg-1, cbow_mean-0.
model1 = FastText.load(loadModelPath+'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-ct.model')
model2 = FastText.load(loadModelPath+'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-ct.model')
model3 = FastText.load(loadModelPath+'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-ct.model')

#### Model1

In [14]:
# Prepare the prediction inputs from model1, the TF-IDF vectorizer and the
# tagged Daum documents (500 presumably is the embedding size, matching the
# "size-500" model file name; confirm in Basic_Module).
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model1, tfidf, 500, daumData2)
# Identifier used below to find the matching classifier files and to name the output CSV.
modelName = bm.Return_ModelName('fastText', model1,'ct')

  3%|▎         | 2649/80610 [00:00<00:02, 26482.83it/s]

FastText(vocab=80610, size=500, alpha=0.025)


100%|██████████| 80610/80610 [00:01<00:00, 60756.39it/s]
0it [00:00, ?it/s]

running time : 0:00:01.348332
Vectorizing Data


9372it [04:46, 32.76it/s]


scaling Data
total running time : 0:04:47.701595


In [15]:
# Collect every saved classifier file whose name ends with the current model name.
classifierList = glob(classifierPath+'*'+modelName)

In [16]:
# Load each classifier file; bm.LoadClassifier yields one (key, value) pair per path.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

SVC
NeuralNetwork_1
XGBoost
LogisticRegression
RandomForestClassifier
NeuralNetwork_2


In [20]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

4130it [02:00, 34.31it/s]
0it [00:00, ?it/s][A
9372it [00:00, 619957.37it/s][A
0it [00:00, ?it/s][A
9372it [00:00, 695931.89it/s][A
0it [00:00, ?it/s][A
9372it [00:00, 622884.85it/s][A

CPU times: user 42.9 s, sys: 743 ms, total: 43.6 s
Wall time: 43.1 s


#### Model2

In [21]:
# Prepare the prediction inputs from model2, the TF-IDF vectorizer and the
# tagged Daum documents (500 presumably is the embedding size, matching the
# "size-500" model file name; confirm in Basic_Module).
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model2, tfidf, 500, daumData2)
# Identifier used below to find the matching classifier files and to name the output CSV.
modelName = bm.Return_ModelName('fastText', model2,'ct')


  0%|          | 0/80610 [00:00<?, ?it/s][A
  9%|▊         | 7005/80610 [00:00<00:01, 68263.68it/s][A

FastText(vocab=80610, size=500, alpha=0.025)



 16%|█▌        | 13078/80610 [00:00<00:01, 64917.15it/s][A
 25%|██▍       | 19985/80610 [00:00<00:00, 66029.89it/s][A
 36%|███▋      | 29231/80610 [00:00<00:00, 72594.25it/s][A
 45%|████▍     | 36252/80610 [00:00<00:00, 72379.51it/s][A
 55%|█████▌    | 44603/80610 [00:00<00:00, 74239.99it/s][A
 66%|██████▌   | 53286/80610 [00:00<00:00, 76014.60it/s][A
 77%|███████▋  | 62211/80610 [00:00<00:00, 77656.69it/s][A
 91%|█████████ | 73239/80610 [00:00<00:00, 81292.58it/s][A
100%|██████████| 80610/80610 [00:00<00:00, 83030.52it/s][A
0it [00:00, ?it/s][A
4it [00:00, 36.61it/s][A

running time : 0:00:00.976118
Vectorizing Data



8it [00:00, 36.84it/s][A
13it [00:00, 34.84it/s][A
17it [00:00, 34.36it/s][A
20it [00:00, 33.29it/s][A
25it [00:00, 35.03it/s][A
30it [00:00, 35.99it/s][A
34it [00:00, 34.43it/s][A
38it [00:01, 34.45it/s][A
42it [00:01, 33.99it/s][A
47it [00:01, 34.67it/s][A
51it [00:01, 34.42it/s][A
58it [00:01, 36.03it/s][A
64it [00:01, 37.18it/s][A
69it [00:01, 37.53it/s][A
75it [00:01, 38.24it/s][A
80it [00:02, 38.01it/s][A
85it [00:02, 38.47it/s][A
90it [00:02, 38.66it/s][A
95it [00:02, 38.42it/s][A
100it [00:02, 37.54it/s][A
104it [00:02, 37.60it/s][A
109it [00:02, 37.95it/s][A
114it [00:03, 37.11it/s][A
118it [00:03, 36.94it/s][A
123it [00:03, 37.15it/s][A
128it [00:03, 36.85it/s][A
132it [00:03, 36.58it/s][A
137it [00:03, 36.92it/s][A
141it [00:03, 36.18it/s][A
145it [00:04, 35.99it/s][A
149it [00:04, 35.59it/s][A
153it [00:04, 34.56it/s][A
157it [00:04, 34.41it/s][A
160it [00:04, 33.99it/s][A
164it [00:04, 34.09it/s][A
171it [00:04, 34.78it/s][A
176it [00:0

1431it [00:39, 36.59it/s][A
1435it [00:39, 36.51it/s][A
1439it [00:39, 36.51it/s][A
1443it [00:39, 36.47it/s][A
1447it [00:39, 36.43it/s][A
1451it [00:39, 36.43it/s][A
1455it [00:39, 36.43it/s][A
1461it [00:40, 36.48it/s][A
1466it [00:40, 36.51it/s][A
1472it [00:40, 36.55it/s][A
1477it [00:40, 36.53it/s][A
1482it [00:40, 36.55it/s][A
1489it [00:40, 36.62it/s][A
1494it [00:40, 36.64it/s][A
1501it [00:40, 36.72it/s][A
1507it [00:40, 36.77it/s][A
1513it [00:41, 36.81it/s][A
1520it [00:41, 36.88it/s][A
1526it [00:41, 36.90it/s][A
1532it [00:41, 36.91it/s][A
1537it [00:41, 36.94it/s][A
1542it [00:41, 36.89it/s][A
1547it [00:41, 36.87it/s][A
1551it [00:42, 36.86it/s][A
1556it [00:42, 36.89it/s][A
1561it [00:42, 36.89it/s][A
1568it [00:42, 36.94it/s][A
1573it [00:42, 36.93it/s][A
1578it [00:42, 36.96it/s][A
1583it [00:42, 36.94it/s][A
1588it [00:42, 36.96it/s][A
1594it [00:43, 37.01it/s][A
1599it [00:43, 37.02it/s][A
1604it [00:43, 36.96it/s][A
1608it [00:43,

2862it [01:16, 37.25it/s][A
2867it [01:16, 37.24it/s][A
2872it [01:17, 37.24it/s][A
2876it [01:17, 37.20it/s][A
2880it [01:17, 37.19it/s][A
2884it [01:17, 37.18it/s][A
2888it [01:17, 37.18it/s][A
2892it [01:17, 37.17it/s][A
2896it [01:18, 37.11it/s][A
2899it [01:18, 37.09it/s][A
2904it [01:18, 37.11it/s][A
2909it [01:18, 37.12it/s][A
2913it [01:18, 37.12it/s][A
2917it [01:18, 37.12it/s][A
2921it [01:18, 37.07it/s][A
2925it [01:18, 37.05it/s][A
2929it [01:19, 37.02it/s][A
2933it [01:19, 37.01it/s][A
2938it [01:19, 37.02it/s][A
2942it [01:19, 37.01it/s][A
2946it [01:19, 37.00it/s][A
2950it [01:19, 37.00it/s][A
2955it [01:19, 37.00it/s][A
2959it [01:20, 36.97it/s][A
2963it [01:20, 36.96it/s][A
2967it [01:20, 36.91it/s][A
2973it [01:20, 36.92it/s][A
2977it [01:20, 36.92it/s][A
2981it [01:20, 36.91it/s][A
2987it [01:20, 36.93it/s][A
2991it [01:21, 36.92it/s][A
2995it [01:21, 36.93it/s][A
2999it [01:21, 36.92it/s][A
3004it [01:21, 36.93it/s][A
3008it [01:21,

4075it [01:55, 35.29it/s][A
4081it [01:55, 35.29it/s][A
4086it [01:55, 35.30it/s][A
4091it [01:55, 35.31it/s][A
4096it [01:56, 35.31it/s][A
4101it [01:56, 35.29it/s][A
4105it [01:56, 35.27it/s][A
4109it [01:56, 35.27it/s][A
4114it [01:56, 35.28it/s][A
4118it [01:56, 35.28it/s][A
4122it [01:56, 35.28it/s][A
4126it [01:56, 35.27it/s][A
4130it [01:57, 35.27it/s][A
4134it [01:57, 35.26it/s][A
4138it [01:57, 35.22it/s][A
4142it [01:57, 35.22it/s][A
4145it [01:57, 35.21it/s][A
4152it [01:57, 35.24it/s][A
4156it [01:57, 35.24it/s][A
4160it [01:58, 35.23it/s][A
4164it [01:58, 35.22it/s][A
4168it [01:58, 35.22it/s][A
4172it [01:58, 35.21it/s][A
4176it [01:58, 35.20it/s][A
4181it [01:58, 35.21it/s][A
4186it [01:58, 35.21it/s][A
4192it [01:58, 35.23it/s][A
4197it [01:59, 35.23it/s][A
4203it [01:59, 35.25it/s][A
4208it [01:59, 35.26it/s][A
4213it [01:59, 35.25it/s][A
4218it [01:59, 35.26it/s][A
4223it [01:59, 35.26it/s][A
4228it [01:59, 35.25it/s][A
4232it [02:00,

5419it [02:32, 35.61it/s][A
5425it [02:32, 35.62it/s][A
5430it [02:32, 35.61it/s][A
5434it [02:32, 35.61it/s][A
5438it [02:32, 35.60it/s][A
5442it [02:32, 35.60it/s][A
5446it [02:33, 35.59it/s][A
5450it [02:33, 35.59it/s][A
5454it [02:33, 35.58it/s][A
5458it [02:33, 35.58it/s][A
5462it [02:33, 35.58it/s][A
5466it [02:33, 35.58it/s][A
5470it [02:33, 35.58it/s][A
5475it [02:33, 35.58it/s][A
5479it [02:34, 35.56it/s][A
5483it [02:34, 35.56it/s][A
5487it [02:34, 35.56it/s][A
5492it [02:34, 35.56it/s][A
5496it [02:34, 35.56it/s][A
5500it [02:34, 35.55it/s][A
5504it [02:34, 35.54it/s][A
5508it [02:34, 35.54it/s][A
5514it [02:35, 35.55it/s][A
5518it [02:35, 35.55it/s][A
5523it [02:35, 35.56it/s][A
5527it [02:35, 35.55it/s][A
5531it [02:35, 35.56it/s][A
5537it [02:35, 35.57it/s][A
5545it [02:35, 35.59it/s][A
5551it [02:35, 35.60it/s][A
5556it [02:36, 35.61it/s][A
5561it [02:36, 35.61it/s][A
5567it [02:36, 35.62it/s][A
5574it [02:36, 35.64it/s][A
5580it [02:36,

6756it [03:10, 35.53it/s][A
6761it [03:10, 35.54it/s][A
6766it [03:10, 35.54it/s][A
6771it [03:10, 35.55it/s][A
6776it [03:10, 35.56it/s][A
6781it [03:10, 35.55it/s][A
6786it [03:10, 35.56it/s][A
6791it [03:10, 35.56it/s][A
6796it [03:11, 35.56it/s][A
6802it [03:11, 35.58it/s][A
6807it [03:11, 35.58it/s][A
6812it [03:11, 35.58it/s][A
6817it [03:11, 35.58it/s][A
6822it [03:11, 35.59it/s][A
6827it [03:11, 35.58it/s][A
6832it [03:11, 35.59it/s][A
6837it [03:12, 35.60it/s][A
6842it [03:12, 35.58it/s][A
6849it [03:12, 35.60it/s][A
6854it [03:12, 35.60it/s][A
6859it [03:12, 35.60it/s][A
6864it [03:12, 35.59it/s][A
6868it [03:12, 35.59it/s][A
6872it [03:13, 35.59it/s][A
6878it [03:13, 35.60it/s][A
6885it [03:13, 35.62it/s][A
6890it [03:13, 35.61it/s][A
6895it [03:13, 35.61it/s][A
6902it [03:13, 35.62it/s][A
6907it [03:13, 35.63it/s][A
6913it [03:13, 35.64it/s][A
6919it [03:14, 35.65it/s][A
6925it [03:14, 35.66it/s][A
6931it [03:14, 35.65it/s][A
6936it [03:14,

8120it [03:47, 35.64it/s][A
8124it [03:48, 35.63it/s][A
8128it [03:48, 35.62it/s][A
8133it [03:48, 35.63it/s][A
8137it [03:48, 35.62it/s][A
8141it [03:48, 35.62it/s][A
8145it [03:48, 35.60it/s][A
8148it [03:48, 35.59it/s][A
8151it [03:49, 35.58it/s][A
8155it [03:49, 35.58it/s][A
8158it [03:49, 35.58it/s][A
8164it [03:49, 35.58it/s][A
8168it [03:49, 35.58it/s][A
8172it [03:49, 35.58it/s][A
8176it [03:49, 35.58it/s][A
8180it [03:49, 35.58it/s][A
8184it [03:50, 35.58it/s][A
8188it [03:50, 35.58it/s][A
8192it [03:50, 35.58it/s][A
8197it [03:50, 35.58it/s][A
8201it [03:50, 35.58it/s][A
8205it [03:50, 35.57it/s][A
8209it [03:50, 35.55it/s][A
8214it [03:51, 35.55it/s][A
8220it [03:51, 35.56it/s][A
8225it [03:51, 35.56it/s][A
8232it [03:51, 35.58it/s][A
8238it [03:51, 35.59it/s][A
8243it [03:51, 35.59it/s][A
8249it [03:51, 35.60it/s][A
8255it [03:51, 35.61it/s][A
8261it [03:51, 35.62it/s][A
8267it [03:52, 35.62it/s][A
8273it [03:52, 35.63it/s][A
8278it [03:52,

scaling Data
total running time : 0:04:25.232707


In [22]:
# Collect every saved classifier file whose name ends with the current model name.
classifierList = glob(classifierPath+'*'+modelName)

In [23]:
# Load each classifier file; bm.LoadClassifier yields one (key, value) pair per path.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_2
XGBoost
RandomForestClassifier
LogisticRegression
SVC
NeuralNetwork_1


In [24]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome/outcome_news_classification_'+modelName,index=None, encoding='utf-8')


0it [00:00, ?it/s][A
9372it [00:00, 551365.01it/s][A
0it [00:00, ?it/s][A
9372it [00:00, 667329.04it/s][A
0it [00:00, ?it/s][A
9372it [00:00, 672592.86it/s][A

CPU times: user 33.7 s, sys: 542 ms, total: 34.3 s
Wall time: 34 s


#### Model3

In [25]:
# Prepare the prediction inputs from model3, the TF-IDF vectorizer and the
# tagged Daum documents (500 presumably is the embedding size, matching the
# "size-500" model file name; confirm in Basic_Module).
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model3, tfidf, 500, daumData2)
# Identifier used below to find the matching classifier files and to name the output CSV.
modelName = bm.Return_ModelName('fastText', model3,'ct')


  0%|          | 0/80610 [00:00<?, ?it/s][A
 10%|█         | 8070/80610 [00:00<00:00, 79806.17it/s][A

FastText(vocab=80610, size=500, alpha=0.025)



 18%|█▊        | 14842/80610 [00:00<00:00, 73456.04it/s][A
 26%|██▌       | 21052/80610 [00:00<00:00, 69928.10it/s][A
 36%|███▌      | 29028/80610 [00:00<00:00, 72387.04it/s][A
 46%|████▌     | 36879/80610 [00:00<00:00, 73317.95it/s][A
 55%|█████▍    | 44109/80610 [00:00<00:00, 73390.95it/s][A
 65%|██████▍   | 52013/80610 [00:00<00:00, 74161.36it/s][A
 75%|███████▍  | 60409/80610 [00:00<00:00, 75403.14it/s][A
 88%|████████▊ | 70559/80610 [00:00<00:00, 78306.76it/s][A
 98%|█████████▊| 79349/80610 [00:01<00:00, 79264.03it/s][A
100%|██████████| 80610/80610 [00:01<00:00, 78701.49it/s][A
0it [00:00, ?it/s][A
2it [00:00, 17.49it/s][A

running time : 0:00:01.031335
Vectorizing Data



5it [00:00, 20.49it/s][A
8it [00:00, 22.18it/s][A
11it [00:00, 23.82it/s][A
14it [00:00, 24.57it/s][A
17it [00:00, 25.09it/s][A
20it [00:00, 25.43it/s][A
24it [00:00, 26.87it/s][A
28it [00:00, 28.14it/s][A
32it [00:01, 28.93it/s][A
36it [00:01, 26.98it/s][A
39it [00:01, 26.21it/s][A
43it [00:01, 27.00it/s][A
47it [00:01, 27.72it/s][A
51it [00:01, 27.88it/s][A
57it [00:01, 29.46it/s][A
61it [00:02, 29.44it/s][A
68it [00:02, 31.13it/s][A
73it [00:02, 31.48it/s][A
78it [00:02, 31.87it/s][A
83it [00:02, 32.44it/s][A
88it [00:02, 33.03it/s][A
93it [00:02, 32.54it/s][A
97it [00:02, 32.69it/s][A
101it [00:03, 31.96it/s][A
107it [00:03, 32.65it/s][A
111it [00:03, 32.67it/s][A
115it [00:03, 32.15it/s][A
119it [00:03, 32.27it/s][A
123it [00:03, 32.43it/s][A
128it [00:03, 32.60it/s][A
132it [00:04, 32.58it/s][A
137it [00:04, 32.98it/s][A
141it [00:04, 32.66it/s][A
145it [00:04, 32.66it/s][A
149it [00:04, 32.50it/s][A
153it [00:04, 31.86it/s][A
156it [00:04, 31

1352it [00:39, 34.60it/s][A
1359it [00:39, 34.69it/s][A
1364it [00:39, 34.68it/s][A
1369it [00:39, 34.70it/s][A
1377it [00:39, 34.78it/s][A
1382it [00:39, 34.78it/s][A
1387it [00:39, 34.78it/s][A
1392it [00:40, 34.80it/s][A
1397it [00:40, 34.81it/s][A
1402it [00:40, 34.79it/s][A
1406it [00:40, 34.78it/s][A
1410it [00:40, 34.77it/s][A
1414it [00:40, 34.76it/s][A
1418it [00:40, 34.76it/s][A
1423it [00:40, 34.80it/s][A
1427it [00:41, 34.80it/s][A
1431it [00:41, 34.81it/s][A
1435it [00:41, 34.76it/s][A
1440it [00:41, 34.80it/s][A
1444it [00:41, 34.76it/s][A
1448it [00:41, 34.73it/s][A
1452it [00:41, 34.73it/s][A
1456it [00:41, 34.74it/s][A
1462it [00:42, 34.80it/s][A
1468it [00:42, 34.85it/s][A
1473it [00:42, 34.88it/s][A
1478it [00:42, 34.87it/s][A
1485it [00:42, 34.94it/s][A
1491it [00:42, 34.98it/s][A
1499it [00:42, 35.08it/s][A
1505it [00:42, 35.13it/s][A
1511it [00:42, 35.18it/s][A
1517it [00:43, 35.23it/s][A
1523it [00:43, 35.28it/s][A
1529it [00:43,

2434it [01:21, 29.75it/s][A
2436it [01:22, 29.67it/s][A
2438it [01:22, 29.60it/s][A
2440it [01:22, 29.57it/s][A
2442it [01:22, 29.55it/s][A
2444it [01:22, 29.52it/s][A
2446it [01:22, 29.48it/s][A
2448it [01:23, 29.47it/s][A
2451it [01:23, 29.46it/s][A
2455it [01:23, 29.47it/s][A
2459it [01:23, 29.48it/s][A
2463it [01:23, 29.48it/s][A
2466it [01:23, 29.47it/s][A
2470it [01:23, 29.48it/s][A
2473it [01:23, 29.46it/s][A
2476it [01:24, 29.41it/s][A
2480it [01:24, 29.42it/s][A
2485it [01:24, 29.43it/s][A
2488it [01:24, 29.42it/s][A
2491it [01:24, 29.33it/s][A
2494it [01:25, 29.29it/s][A
2500it [01:25, 29.32it/s][A
2504it [01:25, 29.32it/s][A
2510it [01:25, 29.34it/s][A
2514it [01:25, 29.33it/s][A
2518it [01:25, 29.32it/s][A
2521it [01:25, 29.32it/s][A
2524it [01:26, 29.30it/s][A
2527it [01:26, 29.29it/s][A
2531it [01:26, 29.29it/s][A
2535it [01:26, 29.30it/s][A
2539it [01:26, 29.31it/s][A
2543it [01:26, 29.31it/s][A
2547it [01:26, 29.31it/s][A
2551it [01:27,

3524it [02:02, 28.87it/s][A
3528it [02:02, 28.87it/s][A
3531it [02:02, 28.87it/s][A
3535it [02:02, 28.87it/s][A
3538it [02:02, 28.86it/s][A
3542it [02:02, 28.86it/s][A
3545it [02:02, 28.86it/s][A
3548it [02:02, 28.86it/s][A
3551it [02:03, 28.85it/s][A
3554it [02:03, 28.85it/s][A
3559it [02:03, 28.84it/s][A
3562it [02:03, 28.83it/s][A
3565it [02:03, 28.82it/s][A
3569it [02:03, 28.83it/s][A
3573it [02:03, 28.83it/s][A
3577it [02:04, 28.84it/s][A
3581it [02:04, 28.85it/s][A
3585it [02:04, 28.85it/s][A
3589it [02:04, 28.82it/s][A
3592it [02:04, 28.79it/s][A
3598it [02:04, 28.81it/s][A
3602it [02:05, 28.79it/s][A
3605it [02:05, 28.79it/s][A
3608it [02:05, 28.78it/s][A
3611it [02:05, 28.77it/s][A
3615it [02:05, 28.77it/s][A
3621it [02:05, 28.80it/s][A
3625it [02:05, 28.80it/s][A
3629it [02:05, 28.81it/s][A
3633it [02:06, 28.80it/s][A
3637it [02:06, 28.76it/s][A
3640it [02:06, 28.76it/s][A
3643it [02:06, 28.75it/s][A
3646it [02:07, 28.71it/s][A
3649it [02:07,

4627it [02:42, 28.43it/s][A
4632it [02:42, 28.44it/s][A
4636it [02:43, 28.43it/s][A
4640it [02:43, 28.43it/s][A
4645it [02:43, 28.44it/s][A
4650it [02:43, 28.45it/s][A
4656it [02:43, 28.47it/s][A
4660it [02:43, 28.47it/s][A
4669it [02:43, 28.51it/s][A
4675it [02:43, 28.52it/s][A
4681it [02:44, 28.54it/s][A
4687it [02:44, 28.55it/s][A
4693it [02:44, 28.56it/s][A
4699it [02:44, 28.58it/s][A
4704it [02:44, 28.58it/s][A
4709it [02:44, 28.58it/s][A
4714it [02:44, 28.59it/s][A
4718it [02:45, 28.59it/s][A
4722it [02:45, 28.59it/s][A
4726it [02:45, 28.59it/s][A
4733it [02:45, 28.61it/s][A
4738it [02:45, 28.62it/s][A
4743it [02:45, 28.60it/s][A
4747it [02:45, 28.60it/s][A
4751it [02:46, 28.61it/s][A
4755it [02:46, 28.61it/s][A
4760it [02:46, 28.63it/s][A
4764it [02:46, 28.63it/s][A
4768it [02:46, 28.63it/s][A
4772it [02:46, 28.63it/s][A
4777it [02:46, 28.64it/s][A
4783it [02:46, 28.66it/s][A
4789it [02:47, 28.67it/s][A
4795it [02:47, 28.69it/s][A
4802it [02:47,

6052it [03:20, 30.25it/s][A
6059it [03:20, 30.27it/s][A
6065it [03:20, 30.27it/s][A
6071it [03:20, 30.28it/s][A
6077it [03:20, 30.29it/s][A
6082it [03:20, 30.30it/s][A
6087it [03:21, 30.28it/s][A
6092it [03:21, 30.29it/s][A
6097it [03:21, 30.30it/s][A
6102it [03:21, 30.29it/s][A
6106it [03:21, 30.29it/s][A
6110it [03:21, 30.29it/s][A
6114it [03:21, 30.29it/s][A
6119it [03:21, 30.30it/s][A
6124it [03:22, 30.31it/s][A
6129it [03:22, 30.32it/s][A
6135it [03:22, 30.33it/s][A
6140it [03:22, 30.34it/s][A
6145it [03:22, 30.34it/s][A
6150it [03:22, 30.34it/s][A
6154it [03:22, 30.35it/s][A
6160it [03:22, 30.36it/s][A
6165it [03:23, 30.36it/s][A
6170it [03:23, 30.37it/s][A
6175it [03:23, 30.38it/s][A
6181it [03:23, 30.39it/s][A
6186it [03:23, 30.40it/s][A
6192it [03:23, 30.41it/s][A
6198it [03:23, 30.42it/s][A
6204it [03:23, 30.43it/s][A
6210it [03:24, 30.44it/s][A
6215it [03:24, 30.44it/s][A
6220it [03:24, 30.44it/s][A
6224it [03:24, 30.44it/s][A
6228it [03:24,

7441it [03:58, 31.25it/s][A
7445it [03:58, 31.25it/s][A
7449it [03:58, 31.24it/s][A
7455it [03:58, 31.25it/s][A
7459it [03:58, 31.25it/s][A
7463it [03:58, 31.25it/s][A
7467it [03:58, 31.25it/s][A
7471it [03:59, 31.25it/s][A
7476it [03:59, 31.25it/s][A
7481it [03:59, 31.26it/s][A
7487it [03:59, 31.27it/s][A
7493it [03:59, 31.28it/s][A
7498it [03:59, 31.29it/s][A
7503it [03:59, 31.29it/s][A
7509it [03:59, 31.30it/s][A
7518it [04:00, 31.32it/s][A
7524it [04:00, 31.33it/s][A
7530it [04:00, 31.32it/s][A
7536it [04:00, 31.33it/s][A
7541it [04:00, 31.34it/s][A
7546it [04:00, 31.34it/s][A
7551it [04:01, 31.33it/s][A
7555it [04:01, 31.33it/s][A
7559it [04:01, 31.33it/s][A
7563it [04:01, 31.32it/s][A
7567it [04:01, 31.32it/s][A
7571it [04:01, 31.32it/s][A
7577it [04:01, 31.34it/s][A
7581it [04:01, 31.33it/s][A
7585it [04:02, 31.34it/s][A
7589it [04:02, 31.33it/s][A
7593it [04:02, 31.33it/s][A
7597it [04:02, 31.33it/s][A
7601it [04:02, 31.33it/s][A
7605it [04:02,

8862it [04:36, 32.09it/s][A
8867it [04:36, 32.09it/s][A
8872it [04:36, 32.10it/s][A
8877it [04:36, 32.09it/s][A
8882it [04:36, 32.09it/s][A
8886it [04:36, 32.09it/s][A
8890it [04:37, 32.09it/s][A
8894it [04:37, 32.09it/s][A
8898it [04:37, 32.09it/s][A
8903it [04:37, 32.09it/s][A
8907it [04:37, 32.09it/s][A
8911it [04:37, 32.09it/s][A
8917it [04:37, 32.10it/s][A
8921it [04:37, 32.10it/s][A
8925it [04:38, 32.10it/s][A
8929it [04:38, 32.08it/s][A
8934it [04:38, 32.09it/s][A
8938it [04:38, 32.09it/s][A
8944it [04:38, 32.09it/s][A
8951it [04:38, 32.10it/s][A
8956it [04:39, 32.10it/s][A
8961it [04:39, 32.10it/s][A
8965it [04:39, 32.10it/s][A
8969it [04:39, 32.10it/s][A
8973it [04:39, 32.10it/s][A
8978it [04:39, 32.11it/s][A
8982it [04:39, 32.11it/s][A
8989it [04:39, 32.12it/s][A
8997it [04:39, 32.14it/s][A
9003it [04:40, 32.15it/s][A
9009it [04:40, 32.14it/s][A
9014it [04:40, 32.14it/s][A
9020it [04:40, 32.15it/s][A
9025it [04:40, 32.16it/s][A
9030it [04:40,

scaling Data
total running time : 0:04:51.990420


In [26]:
# Collect every saved classifier file whose name ends with the current model name.
classifierList = glob(classifierPath+'*'+modelName)

In [27]:
# Load each classifier file; bm.LoadClassifier yields one (key, value) pair per path.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

NeuralNetwork_2
XGBoost
LogisticRegression
RandomForestClassifier
SVC
NeuralNetwork_1


In [28]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome/outcome_news_classification_'+modelName,index=None, encoding='utf-8')


0it [00:00, ?it/s][A
9372it [00:00, 604958.86it/s][A
0it [00:00, ?it/s][A
9372it [00:00, 536532.00it/s][A
0it [00:00, ?it/s][A
9372it [00:00, 727863.89it/s][A

CPU times: user 34.8 s, sys: 476 ms, total: 35.3 s
Wall time: 34.6 s


### Mecab

#### News to Tagged Document

In [None]:
# Tokenize the Daum news with MeCab, or reuse the cached result when it
# exists. Context managers make sure the pickle files are closed (and the
# cache fully written) before the notebook moves on.
taggedDataPath = './data/pre_data/news_tagged_data/pre_data_daum_news_by_mecab_for_fastText_news_classification.pickled'
if os.path.isfile(taggedDataPath):
    with open(taggedDataPath, 'rb') as fin:
        daumData2 = pickle.load(fin)
else:
    daumData2 = bm.MakeTaggedDataDAUM2(daumData, TaggedDocument, mecab, stopwords, 'daum')
    with open(taggedDataPath, 'wb') as fout:
        pickle.dump(daumData2, fout)


  0%|          | 0/9372 [00:00<?, ?it/s][A
  0%|          | 2/9372 [00:00<35:11,  4.44it/s][A
  0%|          | 3/9372 [00:00<35:26,  4.40it/s][A
  0%|          | 4/9372 [00:00<31:32,  4.95it/s][A
  0%|          | 6/9372 [00:00<25:07,  6.21it/s][A
  0%|          | 8/9372 [00:01<22:47,  6.85it/s][A
  0%|          | 10/9372 [00:01<20:47,  7.51it/s][A
  0%|          | 12/9372 [00:01<19:05,  8.17it/s][A
  0%|          | 14/9372 [00:01<18:36,  8.38it/s][A
  0%|          | 16/9372 [00:01<17:42,  8.80it/s][A
  0%|          | 18/9372 [00:01<16:48,  9.28it/s][A
  0%|          | 20/9372 [00:02<16:17,  9.56it/s][A
  0%|          | 23/9372 [00:02<15:13, 10.23it/s][A
  0%|          | 25/9372 [00:02<14:56, 10.42it/s][A
  0%|          | 28/9372 [00:02<14:07, 11.03it/s][A
  0%|          | 30/9372 [00:02<13:49, 11.27it/s][A
  0%|          | 33/9372 [00:02<13:21, 11.66it/s][A
  0%|          | 35/9372 [00:03<13:27, 11.56it/s][A
  0%|          | 38/9372 [00:03<12:55, 12.04it/s][A
  0%| 

  6%|▌         | 533/9372 [00:22<06:20, 23.22it/s][A
  6%|▌         | 538/9372 [00:23<06:18, 23.33it/s][A
  6%|▌         | 542/9372 [00:23<06:19, 23.28it/s][A
  6%|▌         | 546/9372 [00:23<06:19, 23.27it/s][A
  6%|▌         | 549/9372 [00:23<06:18, 23.29it/s][A
  6%|▌         | 554/9372 [00:23<06:16, 23.40it/s][A
  6%|▌         | 558/9372 [00:23<06:16, 23.44it/s][A
  6%|▌         | 562/9372 [00:23<06:14, 23.49it/s][A
  6%|▌         | 566/9372 [00:24<06:13, 23.56it/s][A
  6%|▌         | 570/9372 [00:24<06:12, 23.62it/s][A
  6%|▌         | 574/9372 [00:24<06:11, 23.65it/s][A
  6%|▌         | 578/9372 [00:24<06:10, 23.72it/s][A
  6%|▌         | 582/9372 [00:24<06:10, 23.73it/s][A
  6%|▋         | 588/9372 [00:24<06:07, 23.87it/s][A
  6%|▋         | 593/9372 [00:24<06:06, 23.97it/s][A
  6%|▋         | 598/9372 [00:24<06:04, 24.05it/s][A
  6%|▋         | 603/9372 [00:25<06:03, 24.10it/s][A
  6%|▋         | 607/9372 [00:25<06:02, 24.15it/s][A
  7%|▋         | 611/9372 [0

 14%|█▎        | 1274/9372 [00:44<04:41, 28.77it/s][A
 14%|█▎        | 1279/9372 [00:44<04:41, 28.77it/s][A
 14%|█▎        | 1285/9372 [00:44<04:40, 28.83it/s][A
 14%|█▍        | 1290/9372 [00:44<04:39, 28.87it/s][A
 14%|█▍        | 1295/9372 [00:44<04:39, 28.86it/s][A
 14%|█▍        | 1299/9372 [00:45<04:39, 28.86it/s][A
 14%|█▍        | 1303/9372 [00:45<04:40, 28.79it/s][A
 14%|█▍        | 1307/9372 [00:45<04:39, 28.81it/s][A
 14%|█▍        | 1311/9372 [00:45<04:39, 28.81it/s][A
 14%|█▍        | 1315/9372 [00:45<04:39, 28.83it/s][A
 14%|█▍        | 1319/9372 [00:45<04:39, 28.80it/s][A
 14%|█▍        | 1324/9372 [00:45<04:39, 28.83it/s][A
 14%|█▍        | 1329/9372 [00:46<04:38, 28.88it/s][A
 14%|█▍        | 1335/9372 [00:46<04:37, 28.95it/s][A
 14%|█▍        | 1342/9372 [00:46<04:36, 29.01it/s][A
 14%|█▍        | 1348/9372 [00:46<04:35, 29.07it/s][A
 14%|█▍        | 1353/9372 [00:46<04:35, 29.08it/s][A
 15%|█▍        | 1362/9372 [00:46<04:34, 29.16it/s][A
 15%|█▍   

#### Load Model

In [None]:
# Load three pretrained FastText models for the MeCab ("by-mecab") tokenization.
# Per the file names they differ only in the sg / cbow_mean settings:
# model1: sg-0, cbow_mean-0; model2: sg-0, cbow_mean-1; model3: sg-1, cbow_mean-0.
# These rebind model1..model3, replacing the Twitter-based models loaded earlier.
model1 = FastText.load(loadModelPath+'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-mecab.model')
model2 = FastText.load(loadModelPath+'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-mecab.model')
model3 = FastText.load(loadModelPath+'fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-mecab.model')

#### Model1

In [None]:
# Prepare the prediction inputs from model1, the TF-IDF vectorizer and the
# MeCab-tagged Daum documents (500 presumably is the embedding size, matching
# the "size-500" model file name; confirm in Basic_Module).
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model1, tfidf, 500, daumData2)
# Identifier used below to find the matching classifier files and to name the output CSV.
modelName = bm.Return_ModelName('fastText', model1,'mecab')

In [None]:
# Collect every saved classifier file whose name ends with the current model name.
classifierList = glob(classifierPath+'*'+modelName)

In [None]:
# Load each classifier file; bm.LoadClassifier yields one (key, value) pair per path.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

In [None]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

#### Model2

In [None]:
# Prepare the prediction inputs from model2, the TF-IDF vectorizer and the
# MeCab-tagged Daum documents (500 presumably is the embedding size, matching
# the "size-500" model file name; confirm in Basic_Module).
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model2, tfidf, 500, daumData2)
# Identifier used below to find the matching classifier files and to name the output CSV.
modelName = bm.Return_ModelName('fastText', model2,'mecab')

In [None]:
# Collect every saved classifier file whose name ends with the current model name.
classifierList = glob(classifierPath+'*'+modelName)

In [None]:
# Load each classifier file; bm.LoadClassifier yields one (key, value) pair per path.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

In [None]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome/outcome_news_classification_'+modelName,index=None, encoding='utf-8')

#### Model3

In [None]:
# Prepare the prediction inputs from model3, the TF-IDF vectorizer and the
# MeCab-tagged Daum documents (500 presumably is the embedding size, matching
# the "size-500" model file name; confirm in Basic_Module).
wv1, vecs_w2v = bm.Make_Pre_Data_For_DAUM(model3, tfidf, 500, daumData2)
# Identifier used below to find the matching classifier files and to name the output CSV.
modelName = bm.Return_ModelName('fastText', model3,'mecab')

In [None]:
# Collect every saved classifier file whose name ends with the current model name.
classifierList = glob(classifierPath+'*'+modelName)

In [None]:
# Load each classifier file; bm.LoadClassifier yields one (key, value) pair per path.
loadClassifierDict = dict(bm.LoadClassifier(path) for path in classifierList)

In [None]:
%%time
warnings.filterwarnings('ignore')
predictOutcome = dict(map(lambda x: bm.PredictNewsClassification(vecs_w2v, x, loadClassifierDict[x]), loadClassifierDict))
predictOutcome = pd.DataFrame.from_dict(predictOutcome)
predictOutcome = predictOutcome.applymap(lambda x: le.inverse_transform(int(x)))
predictOutcome = extDaumData.merge(predictOutcome,
                                   left_index = True, right_index = True)
predictOutcome.to_csv('./outcome/outcome_news_classification_'+modelName,index=None, encoding='utf-8')