# News Classification Using fastText
> * 네이버 뉴스 기사로 모델을 만들고 평가를 실시한 뒤, 다음(Daum) 뉴스 기사를 분류해 보도록 한다.

In [14]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os

from numba import jit

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import FastText, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [15]:
# List the compute devices TensorFlow can see (the recorded output below shows a single CPU).
print (device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 15308735070563717577
]


In [16]:
def Make_Roc_Curve(x, y, model1, model2, model3, model4):
    """Plot the ROC curves of four fitted classifiers on a single figure.

    Parameters
    ----------
    x : feature matrix handed to each model's ``predict``; for ``model4`` it
        is wrapped in ``xgb.DMatrix`` first.
    y : true labels, passed as the first argument of ``roc_curve``.
    model1, model2, model3 : fitted estimators exposing ``predict(x)``; drawn
        as "Logistic Regression", "RandomForest" and "Kernel SVM".
    model4 : fitted XGBoost model whose ``predict`` accepts a ``DMatrix``;
        drawn as "XGBoost".

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.

    Notes
    -----
    The curves are computed from ``predict`` output rather than from explicit
    probability scores.
    NOTE(review): ``roc_curve`` is presumably restricted to binary targets --
    confirm before using this with the multi-class news categories.
    """
    print ('Logistic Regression')
    fpr1, tpr1, _ = roc_curve(y, model1.predict(x))
    print ('Random Forest')
    fpr2, tpr2, _ = roc_curve(y, model2.predict(x))
    print ('Kernel SVM')
    fpr3, tpr3, _ = roc_curve(y, model3.predict(x))
    print ('XGBoost')
    import xgboost as xgb
    fpr4, tpr4, _ = roc_curve(y, model4.predict(xgb.DMatrix(x)))

    plt.plot(fpr1, tpr1, label="Logistic Regression")
    plt.plot(fpr2, tpr2, label="RandomForest")
    plt.plot(fpr3, tpr3, label="Kernel SVM")
    plt.plot(fpr4, tpr4, label='XGBoost')
    plt.plot([0, 1], [0, 1], 'k--', label="random guess")
    # The legend is built from the lines that exist when it is called, so it
    # must come after the diagonal for "random guess" to be listed.
    plt.legend()
    plt.xlabel('False Positive Rate (Fall-Out)')
    plt.ylabel('True Positive Rate (Recall)')
    plt.title('Receiver operating characteristic example')
    plt.show()

In [17]:
def plot_history(history):
    """Draw the training curves recorded by a Keras-style ``fit()`` call.

    Two figures are shown one after the other: accuracy, then loss. Each one
    overlays the training series and the validation series.

    Parameters
    ----------
    history : object whose ``history`` attribute maps the keys ``'acc'``,
        ``'val_acc'``, ``'loss'`` and ``'val_loss'`` to per-epoch sequences.
    """
    # (training key, validation key, figure title, y-axis label)
    panels = (
        ('acc', 'val_acc', 'model accuracy', 'accuracy'),
        ('loss', 'val_loss', 'model loss', 'loss'),
    )
    for trainKey, validKey, figureTitle, yLabel in panels:
        plt.plot(history.history[trainKey])
        plt.plot(history.history[validKey])
        plt.title(figureTitle)
        plt.ylabel(yLabel)
        plt.xlabel('epoch')
        plt.legend(['train', 'valid'], loc='upper left')
        plt.show()

In [18]:
import multiprocessing
# Number of CPUs; passed as ``workers`` by the training cells further down.
cores = multiprocessing.cpu_count()
def Make_FastText_Model(modelPath, data, size, epoch, sg, window, min_count, cbow_mean,
                        workers, negative,word_ngrams, hs, tagger):
    """Train a gensim FastText model, save it under ``modelPath`` and return it.

    The file name encodes every hyper-parameter, so each configuration is
    written to its own file.

    Parameters
    ----------
    modelPath : directory in which the model file is written.
    data : corpus to train on. It is iterated once for ``build_vocab`` and
        again by ``train``, so it must be re-iterable (the notebook passes a
        list of token lists).
    size, sg, window, min_count, cbow_mean, workers, negative, word_ngrams, hs :
        forwarded unchanged to the ``FastText`` constructor.
    epoch : forwarded to the constructor as ``iter`` and reused as the number
        of training epochs.
    tagger : label of the tokenizer that produced ``data``; it only appears in
        the saved file name.

    Returns
    -------
    The trained model, after ``init_sims(replace=True)`` and ``save``.

    Notes
    -----
    NOTE(review): the ``size`` / ``iter`` keywords and the ``.iter`` attribute
    are gensim 3.x names (gensim 4 renamed them to ``vector_size`` /
    ``epochs``) -- confirm the installed version before upgrading.
    """
    import os
    from tqdm import tqdm
    tqdm.pandas(desc="progress-bar")
    from datetime import datetime
    from gensim.models import FastText
    start = datetime.now()
    modelName = 'fastText_size-{}_epoch-{}_ngrams-{}_window-{}_negative-{}_hs-{}_sg-{}_cbow_mean-{}_min_count-{}_by-{}.model'.format(
        size, epoch, word_ngrams, window, negative, hs, sg, cbow_mean, min_count, tagger)
    # os.path.join yields the same path as plain concatenation when modelPath
    # already ends with a separator, and adds the separator when it does not.
    modelName = os.path.join(modelPath, modelName)
    print (modelName)
    fastText_model = FastText(size = size, sg = sg, cbow_mean = cbow_mean,
                                  negative = negative, hs = hs, window = window, word_ngrams=word_ngrams,
                                  workers = workers, iter=epoch, min_count = min_count)
    fastText_model.build_vocab(tqdm(data))
    fastText_model.train(tqdm(data), total_examples=fastText_model.corpus_count, epochs=fastText_model.iter)
    # NOTE(review): per the gensim docs, replace=True keeps only the normalized
    # vectors, so the saved model is presumably not meant for further training.
    fastText_model.init_sims(replace = True)
    fastText_model.save(modelName)
    end = datetime.now()
    print ("Total running time: ", end-start)
    return fastText_model

In [19]:
def nav_tokenizer(tagger, corpus, stopwords):
    """Tokenize ``corpus`` with ``tagger`` and drop stopwords.

    Parameters
    ----------
    tagger : object exposing ``pos(text)``, which yields indexable items whose
        first element is the token.
    corpus : text handed to ``tagger.pos``.
    stopwords : container of tokens to exclude (tested with ``in``).

    Returns
    -------
    list of the first elements of the tagged items, in order, without the
    ones found in ``stopwords``.
    """
    kept = []
    for tagged in tagger.pos(corpus):
        surface = tagged[0]
        if surface in stopwords:
            continue
        kept.append(surface)
    return kept

def MakeTaggedData(df, taggedDoc, tagger, stopwords, labelEncoder):
    """Tokenize every row of ``df`` and wrap it in a ``taggedDoc`` record.

    Parameters
    ----------
    df : table indexed through ``.loc`` with ``'title'``, ``'mainText'`` and
        ``'category'`` columns.
    taggedDoc : record factory called as ``taggedDoc(words, tags, category)``.
    tagger : tokenizer exposing ``pos(text)`` (see ``nav_tokenizer``).
    stopwords : iterable of hashable tokens to drop.
    labelEncoder : fitted encoder exposing ``transform``; it is called with a
        one-element list holding the row's category.

    Returns
    -------
    list with one ``taggedDoc`` per row: the tokens of title + ".\\n" + main
    text, the tag ``['news_<index>']`` and the encoded category as returned
    by ``labelEncoder.transform``.
    """
    # Membership is tested once per token, so look stopwords up in a set
    # instead of scanning the original list every time.
    stopwordSet = frozenset(stopwords)
    w2v_docs = list()
    for idx in tqdm(df.index):
        text = df.loc[idx,'title']+'.\n'+df.loc[idx,'mainText']
        pos = nav_tokenizer(tagger, text, stopwordSet)
        category = df.loc[idx, 'category']
        encodeCategory = labelEncoder.transform([category])
        label = ['news_'+str(idx)]
        # Build the record with the factory the caller passed in rather than
        # whatever the global name TaggedDocument is currently bound to.
        w2v_docs.append(taggedDoc(pos, label, encodeCategory))
    return w2v_docs

## Load Data

In [20]:
#Naver
naverData = pickle.load(open('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled','rb'))
naverData = pd.DataFrame.from_dict(naverData, orient = 'index')
naverData.reset_index(inplace = True)
naverData.rename(columns = {'index' : 'id'}, inplace = True)
#Daum
daumData = pickle.load(open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled','rb'))
daumData = pd.DataFrame.from_dict(daumData, orient = 'index')
daumData.reset_index(inplace = True)
daumData.rename(columns = {'index' : 'id'}, inplace = True)

print ('Naver : {}'.format(naverData.shape))
print ('Daum : {}'.format(daumData.shape))

Naver : (15120, 11)
Daum : (9372, 11)


## Stopwords

In [21]:
# Read one stopword per line, stripping surrounding whitespace; the context
# manager closes the file once the list is built.
with open('./data/stopwordsList.txt',encoding='utf-8') as stopwordFile:
    stopwords = [line.strip() for line in stopwordFile]

## Document Labeling

In [22]:
# Three-field document record (tokens, document tag, encoded category).
# From here on this rebinds the name imported from gensim.models.doc2vec.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags', 'category'])

> * words : 기사에서 나온 단어들 or keywords
> * tags : 문서 tag
> * category : 기사 category (LabelEncoder로 인코딩한 값)
>> 기사 분류가 daum보다 naver에서 더 세분화되어 있기 때문에 네이버의 category 분류를 이용하기로 함

## Category

In [23]:
# Reuse the cached label encoder when it exists; otherwise fit it on the Naver
# categories and cache it so later runs get the same encoding.
labelEncoderPath = './data/pre_data/news_tagged_data/pre_data_category_label_encoder_by_ct_for_fastText_news_classification.pickled'
if os.path.isfile(labelEncoderPath):
    with open(labelEncoderPath, 'rb') as pickledFile:
        le = pickle.load(pickledFile)
else:
    le = LabelEncoder()
    le.fit(naverData['category'])
    with open(labelEncoderPath, 'wb') as pickledFile:
        pickle.dump(le, pickledFile)
print (le.classes_)

['IT/과학' '경제' '사회' '생활/문화' '세계' '스포츠' '연예' '정치']


## Tagging Twitter

In [24]:
from ckonlpy.tag import Twitter
# Twitter tagger from ckonlpy, used below to tokenize the Naver articles.
ct = Twitter()

### fastText 기본 포맷

In [25]:
# Reuse the cached tagged documents when they exist; otherwise tokenize the
# Naver articles with the Twitter tagger and cache the result.
taggedDataPath = './data/pre_data/news_tagged_data/pre_data_by_ct_for_fastText_news_classification.pickled'
if os.path.isfile(taggedDataPath):
    with open(taggedDataPath, 'rb') as pickledFile:
        w2v_docs = pickle.load(pickledFile)
else:
    w2v_docs = MakeTaggedData(naverData, TaggedDocument, ct, stopwords, le)
    with open(taggedDataPath, 'wb') as pickledFile:
        pickle.dump(w2v_docs, pickledFile)

In [26]:
# Drop the tagger reference once the tagged documents exist; the guard keeps
# the cell from raising NameError when it is re-run.
if 'ct' in locals():
    del ct

### Train dataset & test dataset

In [27]:
# Cache files for the train / test split of the `ct`-tokenized documents.
trainName = './data/pre_data/news_train_test_Data/pre_data_fastText_train_for_news_classification_by_ct.pickled'
testName = './data/pre_data/news_train_test_Data/pre_data_fastText_test_for_news_classification_by_ct.pickled'

In [28]:
# Reuse the cached split only when both files exist; otherwise hold out 15 %
# of the documents as the test set and cache both parts.
if os.path.isfile(trainName) and os.path.isfile(testName):
    with open(trainName, 'rb') as pickledFile:
        train = pickle.load(pickledFile)
    with open(testName, 'rb') as pickledFile:
        test = pickle.load(pickledFile)
else:
    train, test = train_test_split(w2v_docs, test_size = 0.15)
    with open(trainName, 'wb') as pickledFile:
        pickle.dump(train, pickledFile)
    with open(testName, 'wb') as pickledFile:
        pickle.dump(test, pickledFile)

In [29]:
# The full document list is no longer referenced after the split; drop it if
# it was loaded (it is absent when the split came from the cache in a fresh session).
if 'w2v_docs' in locals():
    del w2v_docs

In [30]:
# Keep only the token lists (the `words` field) of each document.
x_train = [ x.words for x in tqdm(train)]
x_test = [x.words for x in tqdm(test)]

100%|██████████| 12852/12852 [00:00<00:00, 217113.65it/s]
100%|██████████| 2268/2268 [00:00<00:00, 569860.51it/s]


In [31]:
# The token lists were extracted above, so drop the document records;
# guarded so the cell can be re-run.
if 'train' in locals() and 'test' in locals():
    del train
    del test

### Model 1

In [32]:
# Directory under which Make_FastText_Model saves the trained models.
modelPath = './news_model/'

In [33]:
from konlpy.utils import pprint

In [38]:
%%time
# Model 1 on the `ct`-tokenized training split: sg = 0 with cbow_mean = 1
# (in gensim: CBOW, averaging the context vectors). `tagger` only labels the file name.
model = Make_FastText_Model(modelPath = modelPath, data = x_train, size = 500, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2, word_ngrams = 3, 
                    cbow_mean = 1, hs = 0, tagger = 'ct')

  1%|▏         | 185/12852 [00:00<00:06, 1843.31it/s]

./news_model/fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-ct.model


100%|██████████| 12852/12852 [00:11<00:00, 1088.80it/s]
100%|██████████| 12852/12852 [02:10<00:00, 98.21it/s]


Total running time:  0:49:22.404405
CPU times: user 1h 11min 25s, sys: 1min 23s, total: 1h 12min 48s
Wall time: 49min 22s


In [39]:
# Discard the in-memory model; Make_FastText_Model already saved it to disk.
del model

###  Model 2

In [40]:
%%time
# Model 2 on the `ct`-tokenized training split: same settings as Model 1 except
# cbow_mean = 0 (in gensim: CBOW, summing the context vectors).
model = Make_FastText_Model(modelPath = modelPath, data = x_train, size = 500, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2, word_ngrams = 3, 
                    cbow_mean = 0, hs = 0, tagger = 'ct')

  1%|          | 127/12852 [00:00<00:10, 1262.09it/s]

./news_model/fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-ct.model


100%|██████████| 12852/12852 [00:07<00:00, 1663.64it/s]
100%|██████████| 12852/12852 [01:59<00:00, 107.14it/s]


Total running time:  0:55:06.021759
CPU times: user 1h 8min 17s, sys: 1min 21s, total: 1h 9min 39s
Wall time: 55min 6s


In [41]:
# Discard the in-memory model; Make_FastText_Model already saved it to disk.
del model

### Model 3

In [42]:
%%time
# Model 3 on the `ct`-tokenized training split: sg = 1 (in gensim: skip-gram).
# NOTE(review): per the gensim docs cbow_mean only applies to CBOW, so it presumably has no effect here.
model = Make_FastText_Model(modelPath = modelPath, data = x_train, size = 500, epoch = 20, 
                   sg = 1, window = 10, workers = cores, negative = 7, min_count = 2, word_ngrams = 3, 
                    cbow_mean = 0, hs = 0, tagger = 'ct')


  3%|▎         | 377/12852 [00:00<00:03, 3391.88it/s]

./news_model/fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-ct.model


100%|██████████| 12852/12852 [00:05<00:00, 2415.60it/s]
100%|██████████| 12852/12852 [07:35<00:00, 28.24it/s]


Total running time:  2:12:42.513252
CPU times: user 3h 21min 17s, sys: 2min 31s, total: 3h 23min 48s
Wall time: 2h 12min 42s


In [43]:
# Discard the in-memory model; Make_FastText_Model already saved it to disk.
del model

## Tagging Mecab

In [44]:
from konlpy.tag import Mecab
# Mecab tagger from KoNLPy, the second tokenizer compared in this notebook.
mecab = Mecab()

### fastText 기본 포맷

In [45]:
# Reuse the cached Mecab-tagged documents when they exist; otherwise tokenize
# the Naver articles and cache the result.
# NOTE(review): a cache file written by an earlier run may not be
# Mecab-tokenized; delete it to force a rebuild.
taggedDataPath = './data/pre_data/news_tagged_data/pre_data_by_mecab_for_fastText_news_classification.pickled'
if os.path.isfile(taggedDataPath):
    with open(taggedDataPath, 'rb') as pickledFile:
        w2v_docs = pickle.load(pickledFile)
else:
    # Tokenize with the Mecab tagger, not the Twitter tagger `ct`, which has
    # already been deleted by this point in the notebook.
    w2v_docs = MakeTaggedData(naverData, TaggedDocument, mecab, stopwords, le)
    with open(taggedDataPath, 'wb') as pickledFile:
        pickle.dump(w2v_docs, pickledFile)

In [46]:
# Drop the tagger reference once the tagged documents exist; the guard keeps
# the cell from raising NameError when it is re-run.
if 'mecab' in locals():
    del mecab

### Train dataset & test dataset

In [47]:
# Cache files for the train / test split of the Mecab-tokenized documents.
trainName = './data/pre_data/news_train_test_Data/pre_data_fastText_train_for_news_classification_by_mecab.pickled'
testName = './data/pre_data/news_train_test_Data/pre_data_fastText_test_for_news_classification_by_mecab.pickled'

In [48]:
# Reuse the cached split only when both files exist; otherwise hold out 15 %
# of the documents as the test set and cache both parts.
if os.path.isfile(trainName) and os.path.isfile(testName):
    with open(trainName, 'rb') as pickledFile:
        train = pickle.load(pickledFile)
    with open(testName, 'rb') as pickledFile:
        test = pickle.load(pickledFile)
else:
    train, test = train_test_split(w2v_docs, test_size = 0.15)
    with open(trainName, 'wb') as pickledFile:
        pickle.dump(train, pickledFile)
    with open(testName, 'wb') as pickledFile:
        pickle.dump(test, pickledFile)

In [49]:
# The full document list is no longer referenced after the split; drop it if
# it is still bound. Guarded so the cell can be re-run.
if 'w2v_docs' in locals():
    del w2v_docs

In [50]:
# Keep only the token lists (the `words` field) of each document.
x_train = [ x.words for x in tqdm(train)]
x_test = [x.words for x in tqdm(test)]

100%|██████████| 12852/12852 [00:00<00:00, 1167006.45it/s]
100%|██████████| 2268/2268 [00:00<00:00, 1121646.21it/s]


In [51]:
# The token lists were extracted above, so drop the document records;
# guarded so the cell can be re-run.
if 'train' in locals() and 'test' in locals():
    del train
    del test

### Model 1

In [52]:
# Directory under which Make_FastText_Model saves the trained models.
modelPath = './news_model/'

In [53]:
from konlpy.utils import pprint

In [54]:
%%time
# Model 1 on the Mecab-tokenized training split: sg = 0 with cbow_mean = 1
# (in gensim: CBOW, averaging the context vectors). `tagger` only labels the file name.
model = Make_FastText_Model(modelPath = modelPath, data = x_train, size = 500, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2, word_ngrams = 3, 
                    cbow_mean = 1, hs = 0, tagger = 'mecab')

 11%|█         | 1384/12852 [00:00<00:01, 6911.88it/s]

./news_model/fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-1_min_count-2_by-mecab.model


100%|██████████| 12852/12852 [00:01<00:00, 6471.69it/s]
100%|██████████| 12852/12852 [00:53<00:00, 242.42it/s]


Total running time:  0:37:34.288760
CPU times: user 1h 6min 52s, sys: 1min 15s, total: 1h 8min 7s
Wall time: 37min 34s


In [55]:
# Discard the in-memory model; Make_FastText_Model already saved it to disk.
del model

###  Model 2

In [56]:
%%time
# Model 2 on the Mecab-tokenized training split: same settings as Model 1 except
# cbow_mean = 0 (in gensim: CBOW, summing the context vectors).
model = Make_FastText_Model(modelPath = modelPath, data = x_train, size = 500, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2, word_ngrams = 3, 
                    cbow_mean = 0, hs = 0, tagger = 'mecab')

  0%|          | 53/12852 [00:00<00:24, 512.28it/s]

./news_model/fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-0_cbow_mean-0_min_count-2_by-mecab.model


100%|██████████| 12852/12852 [00:14<00:00, 899.46it/s]
100%|██████████| 12852/12852 [03:18<00:00, 64.78it/s]


Total running time:  0:55:05.351004
CPU times: user 1h 8min 19s, sys: 1min 42s, total: 1h 10min 1s
Wall time: 55min 5s


In [57]:
# Discard the in-memory model; Make_FastText_Model already saved it to disk.
del model

### Model 3

In [60]:
%%time
# Model 3 on the Mecab-tokenized training split: sg = 1 (in gensim: skip-gram).
# NOTE(review): per the gensim docs cbow_mean only applies to CBOW, so it presumably has no effect here.
model = Make_FastText_Model(modelPath = modelPath, data = x_train, size = 500, epoch = 20, 
                   sg = 1, window = 10, workers = cores, negative = 7, min_count = 2, word_ngrams = 3, 
                    cbow_mean = 0, hs = 0, tagger = 'mecab')



  0%|          | 0/12852 [00:00<?, ?it/s][A
  5%|▌         | 649/12852 [00:00<00:01, 6410.00it/s][A

./news_model/fastText_size-500_epoch-20_ngrams-3_window-10_negative-7_hs-0_sg-1_cbow_mean-0_min_count-2_by-mecab.model



  7%|▋         | 905/12852 [00:00<00:03, 3935.06it/s][A
  9%|▉         | 1147/12852 [00:00<00:03, 3595.79it/s][A
 11%|█         | 1363/12852 [00:00<00:03, 2949.40it/s][A
 12%|█▏        | 1558/12852 [00:00<00:04, 2621.05it/s][A
 14%|█▍        | 1798/12852 [00:00<00:04, 2585.51it/s][A
 16%|█▋        | 2097/12852 [00:00<00:04, 2639.55it/s][A
 18%|█▊        | 2331/12852 [00:00<00:04, 2606.21it/s][A
 20%|██        | 2616/12852 [00:00<00:03, 2622.78it/s][A
 22%|██▏       | 2863/12852 [00:01<00:03, 2552.05it/s][A
 24%|██▍       | 3094/12852 [00:01<00:03, 2493.62it/s][A
 26%|██▌       | 3314/12852 [00:01<00:03, 2427.34it/s][A
 27%|██▋       | 3521/12852 [00:01<00:03, 2381.47it/s][A
 29%|██▉       | 3722/12852 [00:02<00:05, 1779.27it/s][A
 30%|███       | 3873/12852 [00:02<00:05, 1651.39it/s][A
 31%|███       | 3997/12852 [00:02<00:05, 1617.08it/s][A
 32%|███▏      | 4113/12852 [00:02<00:05, 1508.54it/s][A
 33%|███▎      | 4208/12852 [00:02<00:05, 1479.64it/s][A
 33%|███▎     

 16%|█▌        | 2043/12852 [00:26<02:18, 78.03it/s][A
 16%|█▌        | 2084/12852 [00:26<02:16, 78.62it/s][A
 16%|█▋        | 2102/12852 [00:26<02:17, 78.17it/s][A
 17%|█▋        | 2123/12852 [00:27<02:16, 78.44it/s][A
 17%|█▋        | 2144/12852 [00:27<02:16, 78.44it/s][A
 17%|█▋        | 2162/12852 [00:27<02:16, 78.55it/s][A
 17%|█▋        | 2179/12852 [00:27<02:16, 77.94it/s][A
 17%|█▋        | 2198/12852 [00:28<02:16, 78.20it/s][A
 17%|█▋        | 2222/12852 [00:28<02:15, 78.21it/s][A
 17%|█▋        | 2244/12852 [00:28<02:15, 78.41it/s][A
 18%|█▊        | 2259/12852 [00:29<02:16, 77.75it/s][A
 18%|█▊        | 2279/12852 [00:29<02:15, 77.99it/s][A
 18%|█▊        | 2295/12852 [00:29<02:15, 77.89it/s][A
 18%|█▊        | 2311/12852 [00:29<02:15, 77.85it/s][A
 18%|█▊        | 2327/12852 [00:30<02:16, 77.32it/s][A
 18%|█▊        | 2348/12852 [00:30<02:15, 77.57it/s][A
 18%|█▊        | 2368/12852 [00:30<02:15, 77.43it/s][A
 19%|█▊        | 2387/12852 [00:30<02:14, 77.59i

 42%|████▏     | 5342/12852 [01:12<01:42, 73.27it/s][A
 42%|████▏     | 5365/12852 [01:13<01:42, 73.26it/s][A
 42%|████▏     | 5402/12852 [01:13<01:41, 73.24it/s][A
 42%|████▏     | 5423/12852 [01:14<01:41, 73.19it/s][A
 42%|████▏     | 5448/12852 [01:14<01:41, 73.29it/s][A
 42%|████▏     | 5457/12852 [01:14<01:40, 73.31it/s][A
 43%|████▎     | 5481/12852 [01:14<01:40, 73.13it/s][A
 43%|████▎     | 5497/12852 [01:15<01:40, 73.08it/s][A
 43%|████▎     | 5513/12852 [01:15<01:40, 72.89it/s][A
 43%|████▎     | 5558/12852 [01:16<01:39, 73.01it/s][A
 43%|████▎     | 5576/12852 [01:16<01:39, 73.02it/s][A
 44%|████▎     | 5596/12852 [01:16<01:39, 73.08it/s][A
 44%|████▎     | 5616/12852 [01:16<01:38, 73.11it/s][A
 44%|████▍     | 5631/12852 [01:17<01:39, 72.83it/s][A
 44%|████▍     | 5650/12852 [01:17<01:38, 72.93it/s][A
 44%|████▍     | 5666/12852 [01:17<01:38, 72.88it/s][A
 44%|████▍     | 5688/12852 [01:18<01:38, 72.89it/s][A
 44%|████▍     | 5710/12852 [01:18<01:38, 72.66i

 68%|██████▊   | 8685/12852 [02:03<00:59, 70.42it/s][A
 68%|██████▊   | 8699/12852 [02:03<00:58, 70.40it/s][A
 68%|██████▊   | 8719/12852 [02:03<00:58, 70.39it/s][A
 68%|██████▊   | 8737/12852 [02:04<00:58, 70.45it/s][A
 68%|██████▊   | 8755/12852 [02:04<00:58, 70.32it/s][A
 68%|██████▊   | 8768/12852 [02:04<00:58, 70.32it/s][A
 68%|██████▊   | 8784/12852 [02:04<00:57, 70.28it/s][A
 69%|██████▊   | 8806/12852 [02:05<00:57, 70.35it/s][A
 69%|██████▊   | 8829/12852 [02:05<00:57, 70.31it/s][A
 69%|██████▉   | 8848/12852 [02:05<00:56, 70.33it/s][A
 69%|██████▉   | 8866/12852 [02:06<00:56, 70.30it/s][A
 69%|██████▉   | 8888/12852 [02:06<00:56, 70.37it/s][A
 69%|██████▉   | 8903/12852 [02:06<00:56, 70.28it/s][A
 69%|██████▉   | 8920/12852 [02:06<00:55, 70.27it/s][A
 69%|██████▉   | 8930/12852 [02:07<00:55, 70.17it/s][A
 70%|██████▉   | 8946/12852 [02:07<00:55, 70.19it/s][A
 70%|██████▉   | 8964/12852 [02:07<00:55, 70.13it/s][A
 70%|██████▉   | 8986/12852 [02:08<00:55, 70.20i

 92%|█████████▏| 11808/12852 [02:46<00:14, 70.80it/s][A
 92%|█████████▏| 11853/12852 [02:46<00:14, 71.00it/s][A
 92%|█████████▏| 11873/12852 [02:47<00:13, 71.02it/s][A
 93%|█████████▎| 11893/12852 [02:47<00:13, 70.91it/s][A
 93%|█████████▎| 11912/12852 [02:48<00:13, 70.90it/s][A
 93%|█████████▎| 11921/12852 [02:48<00:13, 70.95it/s][A
 93%|█████████▎| 11954/12852 [02:48<00:12, 71.08it/s][A
 93%|█████████▎| 11977/12852 [02:48<00:12, 70.97it/s][A
 93%|█████████▎| 11998/12852 [02:48<00:12, 71.02it/s][A
 93%|█████████▎| 12015/12852 [02:49<00:11, 71.04it/s][A
 94%|█████████▎| 12034/12852 [02:49<00:11, 71.09it/s][A
 94%|█████████▍| 12058/12852 [02:50<00:11, 70.93it/s][A
 94%|█████████▍| 12067/12852 [02:50<00:11, 70.90it/s][A
 94%|█████████▍| 12135/12852 [02:50<00:10, 71.02it/s][A
 95%|█████████▍| 12149/12852 [02:51<00:09, 71.03it/s][A
 95%|█████████▍| 12186/12852 [02:51<00:09, 70.89it/s][A
 95%|█████████▍| 12207/12852 [02:52<00:09, 70.92it/s][A
 95%|█████████▌| 12217/12852 [0

Total running time:  2:07:36.505677
CPU times: user 3h 30min 52s, sys: 1min 32s, total: 3h 32min 25s
Wall time: 2h 7min 36s


In [61]:
# Discard the in-memory model; Make_FastText_Model already saved it to disk.
del model