# News Classification Using Word2Vec
> * 네이버(Naver)의 뉴스 기사를 이용하여 모델을 만들고 평가를 실시한 뒤, 다음(Daum)의 뉴스 기사를 이용하여 분류해보도록 한다.

In [11]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os

from numba import jit

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import Word2Vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

In [3]:
# List the compute devices TensorFlow reports (the recorded output below shows a single CPU device).
print (device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1213504241553485166
]


In [4]:
def Make_Roc_Curve(x, y, model1, model2, model3, model4):
    """Plot the ROC curves of four fitted classifiers on one figure.

    Parameters
    ----------
    x : features handed to ``predict`` of ``model1``..``model3`` and wrapped
        in an ``xgboost.DMatrix`` for ``model4``.
    y : true labels, passed to ``roc_curve`` as the first argument.
    model1, model2, model3 : fitted estimators exposing ``predict(x)``;
        drawn as "Logistic Regression", "RandomForest" and "Kernel SVM".
    model4 : fitted XGBoost model exposing ``predict(DMatrix)``.

    Returns
    -------
    None.  The figure is displayed with ``plt.show()``.

    Notes
    -----
    NOTE(review): the curves are built from ``predict`` output.  If that
    output is hard class labels rather than scores, every curve collapses to
    a single operating point; ``roc_curve`` is also documented for binary
    targets only.  Confirm that this matches how the function is called.
    """
    import xgboost as xgb

    # (progress banner, legend label, deferred prediction) per model.  The
    # predictions stay lazy so each banner is printed before its model runs.
    candidates = (
        ('Logistic Regression', "Logistic Regression", lambda: model1.predict(x)),
        ('Random Forest', "RandomForest", lambda: model2.predict(x)),
        ('Kernel SVM', "Kernel SVM", lambda: model3.predict(x)),
        ('XGBoost', 'XGBoost', lambda: model4.predict(xgb.DMatrix(x))),
    )

    # Compute every curve first so nothing is drawn if a model fails.
    curves = []
    for banner, legend_label, predict in candidates:
        print(banner)
        fpr, tpr, _thresholds = roc_curve(y, predict())
        curves.append((legend_label, fpr, tpr))

    for legend_label, fpr, tpr in curves:
        plt.plot(fpr, tpr, label=legend_label)
    plt.plot([0, 1], [0, 1], 'k--', label="random guess")
    # The legend must be built after the diagonal is drawn; otherwise the
    # "random guess" entry is missing from it.
    plt.legend()
    plt.xlabel('False Positive Rate (Fall-Out)')
    plt.ylabel('True Positive Rate (Recall)')
    plt.title('Receiver operating characteristic example')
    plt.show()

In [5]:
def _plot_train_valid(records, key, label):
    """Draw the train/validation curves of one metric and show the figure."""
    plt.plot(records[key])
    plt.plot(records['val_' + key])
    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    plt.legend(['train', 'valid'], loc='upper left')
    plt.show()


def plot_history(history):
    """Plot model history after `fit()`.

    Two figures are shown one after the other: accuracy, then loss, each
    with a training and a validation curve.

    Parameters
    ----------
    history : object whose ``history`` attribute maps metric names to
        per-epoch value lists (as returned by Keras ``fit()``).

    Raises
    ------
    KeyError
        If an accuracy or loss series, or its ``val_`` counterpart, is absent.
    """
    records = history.history

    # Depending on the Keras version and on the metric name given to
    # ``compile``, accuracy is logged as 'acc' or as 'accuracy'.
    accuracy_key = 'acc' if 'acc' in records else 'accuracy'

    # summarize history for accuracy
    _plot_train_valid(records, accuracy_key, 'accuracy')

    # summarize history for loss
    _plot_train_valid(records, 'loss', 'loss')

In [1]:
import multiprocessing

# CPU count reported by the OS; the training cells pass it as `workers`.
cores = int(multiprocessing.cpu_count())


def Make_Word2Vec_Model(modelPath, data, size, epoch, sg, window, min_count,
                        cbow_mean, workers, negative, hs, tagger):
    """Train a gensim Word2Vec model on *data*, save it and return it.

    Parameters
    ----------
    modelPath : str
        Prefix prepended verbatim to the generated file name (include the
        trailing path separator when it is a directory).
    data : re-iterable collection of token lists; it is iterated once to
        build the vocabulary and again during training.
    size : int
        Dimensionality of the word vectors.
    epoch : int
        Number of training passes over *data*.
    sg : {0, 1}
        Training algorithm: 1 selects skip-gram, anything else CBOW.
    window, min_count, cbow_mean, workers, negative, hs :
        Forwarded unchanged to ``gensim.models.Word2Vec``.
    tagger : str
        Tokenizer label; only used inside the saved file name.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model, with vectors already normalised in place.
    """
    from datetime import datetime

    from gensim.models import Word2Vec
    from tqdm import tqdm

    tqdm.pandas(desc="progress-bar")

    start = datetime.now()
    modelName = 'word2vec_size-{}_epoch-{}_window-{}_negative-{}_hs-{}_sg-{}_cbow_mean-{}_min_count-{}_by-{}.model'.format(
        size, epoch, window, negative, hs, sg, cbow_mean, min_count, tagger)
    modelName = modelPath + modelName
    print(modelName)

    w2v_model = Word2Vec(size=size, sg=sg, cbow_mean=cbow_mean, negative=negative,
                         hs=hs, window=window, workers=workers, iter=epoch,
                         min_count=min_count)
    w2v_model.build_vocab(tqdm(data))
    # `epoch` is the value stored by `iter=` above; passing it directly avoids
    # reading the deprecated `Word2Vec.iter` alias.
    w2v_model.train(tqdm(data), total_examples=w2v_model.corpus_count,
                    epochs=epoch, compute_loss=True)
    # Keeps only the L2-normalised vectors; per the gensim docs the model
    # cannot be trained further after this call.
    w2v_model.init_sims(replace=True)
    w2v_model.save(modelName)

    end = datetime.now()
    print("Total running time: ", end - start)
    return w2v_model

In [26]:
def nav_tokenizer(tagger, corpus, stopwords):
    """Tokenize *corpus* with *tagger* and drop the stopwords.

    Parameters
    ----------
    tagger : object with a ``pos(text)`` method returning an iterable of
        sequences whose first item is the token text.
    corpus : str
        Text to tokenize.
    stopwords : collection of str
        Tokens to discard.

    Returns
    -------
    list
        Token texts in their original order, stopwords removed.
    """
    # Build the lookup once: testing membership against a list would rescan
    # the whole stopword list for every token.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)
    return [token[0] for token in tagger.pos(corpus) if token[0] not in stopwords]

def MakeTaggedData(df, taggedDoc, tagger, stopwords, labelEncoder):
    """Turn every row of *df* into a tagged, tokenized document.

    Parameters
    ----------
    df : pandas.DataFrame with 'title', 'mainText' and 'category' columns.
    taggedDoc : callable ``(words, tags, category)`` that builds one document
        record.  When ``None``, the module-level ``TaggedDocument`` is used.
    tagger : tokenizer handed to ``nav_tokenizer``.
    stopwords : collection of str
        Tokens to drop.
    labelEncoder : fitted encoder whose ``transform`` accepts a one-element
        list holding the row's category.

    Returns
    -------
    list
        One record per row, in index order, tagged ``'news_<index>'``.
    """
    # Honour the factory passed by the caller instead of silently using the
    # global name.
    make_doc = taggedDoc if taggedDoc is not None else TaggedDocument
    # Build the stopword lookup once for the whole frame.
    if not isinstance(stopwords, (set, frozenset)):
        stopwords = set(stopwords)

    w2v_docs = []
    for idx in tqdm(df.index):
        # Title and body are tokenized together as one text.
        text = df.loc[idx, 'title'] + '.\n' + df.loc[idx, 'mainText']
        words = nav_tokenizer(tagger, text, stopwords)
        encodeCategory = labelEncoder.transform([df.loc[idx, 'category']])
        w2v_docs.append(make_doc(words, ['news_' + str(idx)], encodeCategory))
    return w2v_docs

## Load Data

In [7]:
# Load the pickled Naver and Daum article dumps into DataFrames.
def _load_news_frame(path):
    """Load a pickled ``{id: record}`` mapping as a DataFrame with an 'id' column."""
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(path, 'rb') as fp:
        records = pickle.load(fp)
    frame = pd.DataFrame.from_dict(records, orient='index')
    # Move the mapping keys out of the index into a regular 'id' column.
    frame.reset_index(inplace=True)
    frame.rename(columns={'index': 'id'}, inplace=True)
    return frame


#Naver
naverData = _load_news_frame('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled')
#Daum
daumData = _load_news_frame('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled')

print('Naver : {}'.format(naverData.shape))
print('Daum : {}'.format(daumData.shape))

Naver : (15120, 11)
Daum : (9372, 11)


## Stopwords

In [8]:
# One stopword per line; surrounding whitespace and the newline are stripped.
# The context manager closes the file as soon as the list is built.
with open('./data/stopwordsList.txt', encoding='utf-8') as fp:
    stopwords = [line.strip() for line in fp]

## Document Labeling

In [23]:
# Record type for one article: `words` (tokens), `tags` (document tag list) and
# `category` (encoded label), filled in by MakeTaggedData.
# NOTE: this rebinds the TaggedDocument name imported from gensim.models.doc2vec.
TaggedDocument = namedtuple('TaggedDocument', 'words tags category')

> * words : 기사에서 나온 단어들 or keywords
> * tags : 문서 tag
> * category : 기사의 category (LabelEncoder로 인코딩한 값)
>> 기사분류가 daum보다 naver에서 더 세분화되어 있기 때문에 네이버의 category 분류를 이용하기로 함

## Category

In [13]:
# Fit the category encoder on the Naver labels (fit() returns the encoder
# itself), then display the classes it learned.
le = LabelEncoder().fit(naverData['category'])
le.classes_

array(['IT/과학', '경제', '사회', '생활/문화', '세계', '스포츠', '연예', '정치'],
      dtype=object)

In [14]:
# Persist the fitted encoder; the context manager flushes and closes the file.
with open('./data/pre_data/news_tagged_data/pre_data_category_label_encoder_by_ct_for_word2vec_news_classification.pickled', 'wb') as fp:
    pickle.dump(le, fp)

## Tagging Twitter

In [15]:
from ckonlpy.tag import Twitter
# Tagger instance from ckonlpy; passed to MakeTaggedData below and labelled 'ct' in file names.
ct = Twitter()

### Word2Vec 기본 포맷

In [None]:
# Reuse the cached tagged documents when present; otherwise tag the Naver
# articles with `ct` and cache the result.
taggedName = './data/pre_data/news_tagged_data/pre_data_by_ct_for_word2vec_news_classification.pickled'
if os.path.isfile(taggedName):
    with open(taggedName, 'rb') as fp:
        w2v_docs = pickle.load(fp)
else:
    w2v_docs = MakeTaggedData(naverData, TaggedDocument, ct, stopwords, le)
    with open(taggedName, 'wb') as fp:
        pickle.dump(w2v_docs, fp)

 35%|███▍      | 5284/15120 [09:47<18:12,  9.00it/s]

### Train dataset & test dataset

In [None]:
# Cache files for the train / test split of the `ct`-tagged documents.
trainName = './data/pre_data/news_train_test_Data/pre_data_word2vec_train_for_news_classification_by_ct.pickled'
testName = './data/pre_data/news_train_test_Data/pre_data_word2vec_test_for_news_classification_by_ct.pickled'

In [None]:
# Load the cached split only when both halves exist; otherwise create and
# cache a new one.
if os.path.isfile(trainName) and os.path.isfile(testName):
    with open(trainName, 'rb') as fp:
        train = pickle.load(fp)
    with open(testName, 'rb') as fp:
        test = pickle.load(fp)
else:
    # NOTE(review): no random_state is given, so a regenerated split differs
    # from run to run; the cached pickles are what keep it stable.
    train, test = train_test_split(w2v_docs, test_size=0.15)
    with open(trainName, 'wb') as fp:
        pickle.dump(train, fp)
    with open(testName, 'wb') as fp:
        pickle.dump(test, fp)

In [None]:
# The full document list is no longer needed once the split exists; drop the reference.
del w2v_docs

In [None]:
# Keep only the token lists of each document, with a progress bar per split.
x_train = [doc.words for doc in tqdm(train)]
x_test = [doc.words for doc in tqdm(test)]

In [None]:
# Only the token lists are used from here on; drop both split references.
del train, test

### Model 1

In [None]:
# Prefix for saved model files; Make_Word2Vec_Model concatenates it verbatim, so keep the trailing slash.
modelPath = './news_model/'

In [None]:
from konlpy.utils import pprint

In [None]:
%%time
# Model 1: CBOW (sg=0) using the mean of the context vectors (cbow_mean=1),
# trained with negative sampling (hs=0, negative=7); saved with the 'ct' label.
model = Make_Word2Vec_Model(modelPath = modelPath, data = x_train, size = 1000, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2,
                    cbow_mean = 1, hs = 0, tagger = 'ct')

In [None]:
# Make_Word2Vec_Model already saved the model to disk; drop the in-memory reference.
del model

###  Model 2

In [None]:
%%time
# Model 2: CBOW (sg=0) using the sum of the context vectors (cbow_mean=0),
# trained with negative sampling (hs=0, negative=7); saved with the 'ct' label.
model = Make_Word2Vec_Model(modelPath = modelPath, data = x_train, size = 1000, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2,
                    cbow_mean = 0, hs = 0, tagger = 'ct')

In [None]:
# Make_Word2Vec_Model already saved the model to disk; drop the in-memory reference.
del model

### Model 3

In [None]:
%%time
# Model 3: skip-gram (sg=1) with negative sampling (hs=0, negative=7).
# Per the gensim docs cbow_mean only applies to CBOW, so it has no effect here.
model = Make_Word2Vec_Model(modelPath = modelPath, data = x_train, size = 1000, epoch = 20, 
                   sg = 1, window = 10, workers = cores, negative = 7, min_count = 2,
                    cbow_mean = 0, hs = 0, tagger = 'ct')


## Tagging Mecab

In [15]:
from konlpy.tag import Mecab
# KoNLPy Mecab tagger instance for the second tokenization pass.
mecab = Mecab()

### Word2Vec 기본 포맷

In [None]:
# Reuse the cached Mecab-tagged documents when present; otherwise tag the
# Naver articles with `mecab` and cache the result.
# NOTE(review): an existing cache file is loaded as-is; remove it if it was
# produced with a different tagger.
taggedName = './data/pre_data/news_tagged_data/pre_data_by_mecab_for_word2vec_news_classification.pickled'
if os.path.isfile(taggedName):
    with open(taggedName, 'rb') as fp:
        w2v_docs = pickle.load(fp)
else:
    # This section is the Mecab run, so the Mecab tagger must be passed here
    # (not `ct`), matching the 'by_mecab' cache name.
    w2v_docs = MakeTaggedData(naverData, TaggedDocument, mecab, stopwords, le)
    with open(taggedName, 'wb') as fp:
        pickle.dump(w2v_docs, fp)

 26%|██▌       | 3944/15120 [07:15<20:34,  9.06it/s]

### Train dataset & test dataset

In [None]:
# Cache files for the train / test split of the Mecab-tagged documents.
trainName = './data/pre_data/news_train_test_Data/pre_data_word2vec_train_for_news_classification_by_mecab.pickled'
testName = './data/pre_data/news_train_test_Data/pre_data_word2vec_test_for_news_classification_by_mecab.pickled'

In [None]:
# Load the cached split only when both halves exist; otherwise create and
# cache a new one.
if os.path.isfile(trainName) and os.path.isfile(testName):
    with open(trainName, 'rb') as fp:
        train = pickle.load(fp)
    with open(testName, 'rb') as fp:
        test = pickle.load(fp)
else:
    # NOTE(review): no random_state is given, so a regenerated split differs
    # from run to run; the cached pickles are what keep it stable.
    train, test = train_test_split(w2v_docs, test_size=0.15)
    with open(trainName, 'wb') as fp:
        pickle.dump(train, fp)
    with open(testName, 'wb') as fp:
        pickle.dump(test, fp)

In [None]:
# The full document list is no longer needed once the split exists; drop the reference.
del w2v_docs

In [None]:
# Keep only the token lists of each document, with a progress bar per split.
x_train = [doc.words for doc in tqdm(train)]
x_test = [doc.words for doc in tqdm(test)]

In [None]:
# Only the token lists are used from here on; drop both split references.
del train, test

### Model 1

In [None]:
# Prefix for saved model files; Make_Word2Vec_Model concatenates it verbatim, so keep the trailing slash.
modelPath = './news_model/'

In [None]:
from konlpy.utils import pprint

In [None]:
%%time
# Model 1: CBOW (sg=0) using the mean of the context vectors (cbow_mean=1),
# trained with negative sampling (hs=0, negative=7); saved with the 'mecab' label.
model = Make_Word2Vec_Model(modelPath = modelPath, data = x_train, size = 1000, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2,
                    cbow_mean = 1, hs = 0, tagger = 'mecab')

In [None]:
# Make_Word2Vec_Model already saved the model to disk; drop the in-memory reference.
del model

###  Model 2

In [None]:
%%time
# Model 2: CBOW (sg=0) using the sum of the context vectors (cbow_mean=0),
# trained with negative sampling (hs=0, negative=7); saved with the 'mecab' label.
model = Make_Word2Vec_Model(modelPath = modelPath, data = x_train, size = 1000, epoch = 20, 
                   sg = 0, window = 10, workers = cores, negative = 7, min_count = 2,
                    cbow_mean = 0, hs = 0, tagger = 'mecab')

In [None]:
# Make_Word2Vec_Model already saved the model to disk; drop the in-memory reference.
del model

### Model 3

In [None]:
%%time
# Model 3: skip-gram (sg=1) with negative sampling (hs=0, negative=7).
# Per the gensim docs cbow_mean only applies to CBOW, so it has no effect here.
model = Make_Word2Vec_Model(modelPath = modelPath, data = x_train, size = 1000, epoch = 20, 
                   sg = 1, window = 10, workers = cores, negative = 7, min_count = 2,
                    cbow_mean = 0, hs = 0, tagger = 'mecab')
