# 뉴스기사 Classification Using Doc2Vec
> * 네이버의 뉴스 기사를 이용하여 모델을 만들고 평가를 실시한 뒤, 다음(Daum)의 뉴스 기사를 이용하여 분류해 보도록 한다.

In [1]:
import pickle
import html
import multiprocessing
from collections import namedtuple, OrderedDict
import re
import sys
import os

os.environ['KERAS_BACKEND']='tensorflow'

import numpy as np
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
import pandas as pd

from gensim.models import doc2vec, KeyedVectors
from gensim.models.doc2vec import TaggedDocument

from konlpy.utils import pprint

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, roc_curve,  accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import scale, MinMaxScaler, LabelEncoder
from sklearn.manifold import TSNE

import keras.backend.tensorflow_backend as K
from keras.preprocessing import sequence
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Flatten, Dense, Embedding, embeddings, merge, Dropout, Activation,  LSTM, Bidirectional, SimpleRNN, GRU
from keras.layers.convolutional import Conv1D, Conv2D
from keras.layers.pooling import MaxPooling1D, GlobalMaxPooling1D
from keras.layers.normalization import BatchNormalization
from keras.layers.core import SpatialDropout1D
from keras.utils import np_utils
from tensorflow.python.client import device_lib
from keras.layers.merge import dot

import xgboost as xgb

import matplotlib.pyplot as plt

import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Print the local compute devices reported by TensorFlow's device_lib.
print (device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9804077932699667221
]


In [3]:
def Make_Roc_Curve(x, y, model1, model2, model3, model4):
    """Plot the ROC curves of four fitted models on a single figure.

    Args:
        x: Feature matrix handed to each model's ``predict``.
        y: True labels, forwarded unchanged to ``roc_curve``.
        model1: Fitted model drawn as "Logistic Regression".
        model2: Fitted model drawn as "RandomForest".
        model3: Fitted model drawn as "Kernel SVM".
        model4: Fitted model drawn as "XGBoost"; it receives ``xgb.DMatrix(x)``.

    NOTE(review): models 1-3 are scored with ``predict``. If they are
    scikit-learn classifiers this yields hard labels rather than scores, which
    collapses each curve to a single interior point -- consider
    ``predict_proba`` / ``decision_function``; confirm with the callers.
    """
    import xgboost as xgb

    # (console message, legend label, model, whether to wrap x in a DMatrix)
    curve_specs = [
        ('Logistic Regression', 'Logistic Regression', model1, False),
        ('Random Forest', 'RandomForest', model2, False),
        ('Kernel SVM', 'Kernel SVM', model3, False),
        ('XGBoost', 'XGBoost', model4, True),
    ]

    curves = []
    for message, legend_label, model, needs_dmatrix in curve_specs:
        print (message)
        features = xgb.DMatrix(x) if needs_dmatrix else x
        fpr, tpr, _thresholds = roc_curve(y, model.predict(features))
        curves.append((fpr, tpr, legend_label))

    for fpr, tpr, legend_label in curves:
        plt.plot(fpr, tpr, label=legend_label)
    plt.plot([0, 1], [0, 1], 'k--', label="random guess")
    # The legend is built after every line exists so "random guess" is listed too.
    plt.legend()
    plt.xlabel('False Positive Rate (Fall-Out)')
    plt.ylabel('True Positive Rate (Recall)')
    plt.title('Receiver operating characteristic example')
    plt.show()

In [4]:
def _plot_history_metric(metrics, train_keys, valid_keys, title, ylabel):
    """Draw one train/validation figure from a Keras-style history dict."""
    def _first_present(keys):
        # Return the series stored under the first key that exists, else None.
        for key in keys:
            if key in metrics:
                return metrics[key]
        return None

    train_series = _first_present(train_keys)
    if train_series is None:
        # Same exception type the plain dict lookup would have raised.
        raise KeyError(train_keys[0])

    legend_labels = ['train']
    plt.plot(train_series)

    valid_series = _first_present(valid_keys)
    if valid_series is not None:
        plt.plot(valid_series)
        legend_labels.append('valid')

    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('epoch')
    plt.legend(legend_labels, loc='upper left')
    plt.show()


def plot_history(history):
    """Plot model history after `fit()`.

    Shows two figures: accuracy per epoch and loss per epoch.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences. Accuracy is read from ``'acc'`` or
            ``'accuracy'`` (validation: ``'val_acc'`` / ``'val_accuracy'``);
            loss from ``'loss'`` / ``'val_loss'``. Validation series are
            optional and are skipped when absent.

    Raises:
        KeyError: If the training accuracy or training loss series is missing.
    """
    metrics = history.history

    # summarize history for accuracy
    _plot_history_metric(metrics, ('acc', 'accuracy'), ('val_acc', 'val_accuracy'),
                         'model accuracy', 'accuracy')

    # summarize history for loss
    _plot_history_metric(metrics, ('loss',), ('val_loss',), 'model loss', 'loss')

## Load Data

In [5]:
def _load_news_frame(path):
    """Load a pickled mapping and return it as a DataFrame with an `id` column.

    The mapping's keys become the rows (``orient='index'``); the index is then
    moved into a regular column named ``id``.
    """
    # NOTE: unpickling can execute arbitrary code -- only load trusted files.
    with open(path, 'rb') as fh:  # context manager closes the handle promptly
        records = pickle.load(fh)
    frame = pd.DataFrame.from_dict(records, orient = 'index')
    frame.reset_index(inplace = True)
    frame.rename(columns = {'index' : 'id'}, inplace = True)
    return frame


#Naver
naverData = _load_news_frame('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled')
#Daum
daumData = _load_news_frame('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled')

print ('Naver : {}'.format(naverData.shape))
print ('Daum : {}'.format(daumData.shape))

Naver : (15120, 11)
Daum : (9372, 11)


## Stopwords

In [6]:
# Read one stopword per line, stripping surrounding whitespace; the context
# manager closes the file, which the bare open() call never did.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

## document Labeling

In [7]:
# Record type for one labelled document: its tokens, its tag list and its category.
# NOTE: this rebinds the `TaggedDocument` name imported from gensim.models.doc2vec.
TaggedDocument = namedtuple('TaggedDocument', ['words', 'tags', 'category'])

> * words : 기사에서 나온 단어들 or keywords
> * tags : 문서 tag
> * category : 기사의 category (namedtuple의 `category` 필드)
>> 기사분류가 daum보다 naver에서 더 세분화되어 있기 때문에 네이버의 category 분류를 이용하기로 함

## Category 

In [8]:
# Fit the label encoder on the Naver category column (fit() returns the encoder),
# then display the learned classes as the cell output.
le = LabelEncoder().fit(naverData['category'])
le.classes_

array(['IT/과학', '경제', '사회', '생활/문화', '세계', '스포츠', '연예', '정치'],
      dtype=object)

In [9]:
from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
# Instantiate the two taggers; `ct` is the one passed to MakeTaggedData below.
ct = Twitter()
mecab = Mecab()
def nav_tokenizer(tagger, corpus, stopwords):
    """Tokenize `corpus` into '/'-joined tagger pairs, dropping stopwords.

    Args:
        tagger: Object exposing ``pos(text)`` that returns an iterable of
            string tuples (first element is compared against `stopwords`).
        corpus: Text to analyse.
        stopwords: Iterable of strings; any pair whose first element is in it
            is discarded.

    Returns:
        list[str]: One ``'/'.join(pair)`` string per kept pair, in tagger order.
    """
    # Hash the stopwords once so each membership test is O(1) instead of a
    # linear scan of the list for every token.
    blocked = frozenset(stopwords)
    return ['/'.join(token) for token in tagger.pos(corpus) if token[0] not in blocked]

In [10]:
def MakeTaggedData(df, taggedDoc, tagger, stopwords, labelEncoder):
    """Build one tagged document per row of `df`.

    Args:
        df: DataFrame with `title`, `mainText` and `category` columns.
        taggedDoc: Callable taking ``(words, tags, category)`` and returning
            the document record (e.g. the `TaggedDocument` namedtuple).
        tagger: POS tagger forwarded to `nav_tokenizer`.
        stopwords: Stopword collection forwarded to `nav_tokenizer`.
        labelEncoder: Fitted encoder whose ``transform`` maps a category
            to its encoded form.

    Returns:
        list: One `taggedDoc` record per row, in index order. Each record holds
        the tokens of ``title + '.\\n' + mainText``, the tag list
        ``['news_<index>']`` and the result of ``labelEncoder.transform([category])``.
    """
    w2v_docs = []
    for idx in tqdm(df.index):
        text = df.loc[idx, 'title'] + '.\n' + df.loc[idx, 'mainText']
        pos = nav_tokenizer(tagger, text, stopwords)
        encodeCategory = labelEncoder.transform([df.loc[idx, 'category']])
        label = ['news_' + str(idx)]
        # Use the factory the caller passed in rather than the module-level
        # TaggedDocument, so the `taggedDoc` argument is actually honoured.
        w2v_docs.append(taggedDoc(pos, label, encodeCategory))
    return w2v_docs

In [None]:
%%time
# Tokenize every Naver article with the `ct` tagger and wrap it as a TaggedDocument.
w2v_docs = MakeTaggedData(naverData, TaggedDocument, ct, stopwords, le)

  2%|▏         | 345/15120 [00:39<28:28,  8.65it/s] 

In [None]:
# Hold out 15% of the tagged documents as the test set.
# NOTE(review): no random_state is passed, so the split is not reproducible across runs.
train, test = train_test_split(w2v_docs, test_size = 0.15)

In [None]:
# Persist the split. The context managers guarantee the files are flushed and
# closed; the bare open() calls left that to garbage collection.
# NOTE(review): these paths (kc_train_test_Data / *_by_ct) differ from the ones
# read back later in the notebook (train_test_Data / *_by_mecab) -- confirm intent.
with open('./data/pre_data/kc_train_test_Data/pre_data_doc2vec_train_for_keywords_classification_by_ct.pickled','wb') as train_file:
    pickle.dump(train, train_file)
with open('./data/pre_data/kc_train_test_Data/pre_data_doc2vec_test_for_keywords_classification_by_ct.pickled','wb') as test_file:
    pickle.dump(test, test_file)

## doc2vec

In [None]:
# The notebook imports the `doc2vec` module, not the bare `Doc2Vec` class, so
# the class is reached through the module to avoid a NameError.
d2v_model = doc2vec.Doc2Vec(alpha=0.025, min_alpha=0.025, iter=10)
d2v_model.build_vocab(tqdm(train))
# Manual learning-rate schedule: 10 outer passes, each calling train() for
# `d2v_model.iter` epochs at a fixed alpha, then lowering alpha by 0.002.
# NOTE(review): gensim can decay alpha itself within a single train() call;
# this hand-rolled loop is kept as written -- confirm it is still intended.
for epoch in tqdm(range(10)):
    d2v_model.train(tqdm(train),total_examples=d2v_model.corpus_count, epochs=d2v_model.iter)
    d2v_model.alpha -= 0.002 
    d2v_model.min_alpha = d2v_model.alpha

In [None]:
# Save the trained Doc2Vec model to disk.
# NOTE(review): the file name says "by_mecab" while the documents above were tokenized with `ct` -- confirm.
d2v_model.save('./model/keywords_classification_naver_by_doc2vec_size100_window5_iter10_by_mecab.model')

In [None]:
# Reload the saved model; `Doc2Vec` is accessed via the imported `doc2vec`
# module because the bare class name is never imported in this notebook.
d2v_model = doc2vec.Doc2Vec.load('./model/keywords_classification_naver_by_doc2vec_size100_window5_iter10_by_mecab.model')

In [None]:
# Load the pickled train/test document lists, closing each file after reading.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('./data/pre_data/train_test_Data/pre_data_doc2vec_train_for_keywords_classification_by_mecab.pickled','rb') as train_file:
    train = pickle.load(train_file)
with open('./data/pre_data/train_test_Data/pre_data_doc2vec_test_for_keywords_classification_by_mecab.pickled','rb') as test_file:
    test = pickle.load(test_file)

In [None]:
# Features are the token lists; targets are the encoded categories.
# `tags` holds the per-document id (e.g. ['news_12']), not the class label,
# so the labels must come from the `category` field.
# NOTE(review): assumes the loaded records carry the same (words, tags, category)
# fields as the TaggedDocument namedtuple defined in this notebook.
x_train = [doc.words for doc in train]
y_train = [doc.category for doc in train]
x_test = [doc.words for doc in test]
y_test = [doc.category for doc in test]

In [None]:
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler


def _infer_matrix(token_lists):
    """Infer one Doc2Vec vector per token list and stack them as matrix rows."""
    return np.vstack([d2v_model.infer_vector(tokens) for tokens in tqdm(token_lists)])


# Standardize with statistics learned from the training vectors only, and apply
# that same transform to the test vectors. Scaling the test set with its own
# mean/std would put train and test features on different scales.
w2v_scaler = StandardScaler()
train_vecs_w2v = w2v_scaler.fit_transform(_infer_matrix(x_train))

test_vecs_w2v = w2v_scaler.transform(_infer_matrix(x_test))

In [None]:
from keras.utils import np_utils
# One-hot encode both label lists over 8 classes.
y_train2, y_test2 = (np_utils.to_categorical(labels, 8) for labels in (y_train, y_test))

In [None]:
from keras_tqdm import TQDMCallback, TQDMNotebookCallback
from keras.models import Model, Sequential
from keras.regularizers import l2
from keras.optimizers import SGD
from keras.utils import np_utils
from keras.layers import Input, Dense, Embedding, embeddings, merge, Dropout

# Small feed-forward classifier: 100-d document vector -> 8 category probabilities.
model = Sequential()
model.add(Dense(32, activation='relu', input_dim=100))
model.add(Dense(16, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.1))
# Softmax (not sigmoid): the targets are one-hot and the loss is
# categorical_crossentropy, so the outputs should form one distribution over
# the 8 mutually exclusive classes -- the same head model2 uses.
model.add(Dense(8, activation='softmax'))
model.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train2, epochs=100, verbose=0, callbacks=[TQDMNotebookCallback(leave_inner=True, leave_outer=True)])
score = model.evaluate(test_vecs_w2v, y_test2, verbose=2)
# score is [loss, accuracy]; report the test accuracy.
print (score[1])

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.optimizers import SGD

# Second classifier: two 64-unit ReLU layers with dropout and a softmax head.
model2 = Sequential([
    Dense(64, activation='relu', input_dim=100),
    Dropout(0.25),
    Dense(64, activation='relu'),
    Dropout(0.125),
    Dense(8, activation='softmax'),
])
model2.compile(
    loss='categorical_crossentropy',
    optimizer='adadelta',
    metrics=['accuracy'],
)

progress_callback = TQDMNotebookCallback(leave_inner=True, leave_outer=True)
model2.fit(train_vecs_w2v, y_train2, epochs=100, verbose=0, callbacks=[progress_callback])

# evaluate() returns [loss, accuracy]; print the accuracy.
score = model2.evaluate(test_vecs_w2v, y_test2)
print (score[1])

In [None]:
# Load the Daum articles and their extracted keywords, closing each file after
# reading (the bare open() calls leaked the handles).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled','rb') as data_file:
    dataDict2 = pickle.load(data_file)
print (len(dataDict2))

with open('./data/pre_data/keywords/keywords_daum.pickled','rb') as keywords_file:
    keywordsDict2 = pickle.load(keywords_file)

# Attach the keywords to each article record (both mappings share the same keys).
for idx in dataDict2:
    dataDict2[idx]['extracted_keywords'] = keywordsDict2[idx]

df2 = pd.DataFrame.from_dict(dataDict2, orient='index')
# Keep only the calendar date of each timestamp.
df2['date'] = pd.to_datetime(df2['date']).dt.date
df2.reset_index(inplace = True)
df2.rename(columns={'index':'id'}, inplace=True)
print (df2.shape)
df2.head()

In [None]:
# Build each Daum document the way the training documents were built in
# MakeTaggedData: title + '.\n' + body, then POS-tokenize. infer_vector expects
# a list of tokens; a raw string would be consumed character by character, and
# the separator was previously mistyped as '\.n'.
# NOTE(review): `mecab` is used because the loaded model and train/test pickles
# are named "*_by_mecab"; switch to `ct` if the model was trained on `ct` tokens.
daum_tagger = mecab
daum_texts = df2.title.values + '.\n' + df2.mainText.values
daum_tokens = [nav_tokenizer(daum_tagger, text, stopwords) for text in tqdm(daum_texts)]
daum_w2v = np.vstack([d2v_model.infer_vector(tokens) for tokens in daum_tokens])
# NOTE(review): this standardizes with the Daum set's own statistics, not the
# training set's -- reuse the training scaler if one is available.
daum_w2v = scale(daum_w2v)

# Predicted class index per Daum article from each classifier.
model_pre = model.predict_classes(daum_w2v)
model2_pre = model2.predict_classes(daum_w2v)

In [None]:
# Map model 1's predicted class indices back to category names.
mp1 = le.inverse_transform(model_pre)

In [None]:
# Map model 2's predicted class indices back to category names.
mp2 = le.inverse_transform(model2_pre)

In [None]:
# Fraction of Daum articles on which the two classifiers predict the same category.
np.sum(mp1 == mp2) / len(daum_w2v)

In [None]:
# Copy every vocabulary word's vector into a dense matrix whose row i holds the
# vector of index2word[i]. The width comes from the model instead of a
# hard-coded 100, and the former `is not None` test is dropped because the
# lookup always returns a vector for an in-vocabulary word.
vocab_size = len(d2v_model.wv.vocab)
embedding_matrix = np.zeros((vocab_size, d2v_model.vector_size))
for row, word in enumerate(d2v_model.wv.index2word[:vocab_size]):
    embedding_matrix[row] = d2v_model.wv[word]

In [None]:
valid_size = 50  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
# input words - in this case we do sample by sample evaluations of the similarity
valid_word = Input((1,), dtype='int32')
other_word = Input((1,), dtype='int32')
# setup the embedding layer, initialised with the Doc2Vec word vectors
embeddings = Embedding(input_dim=embedding_matrix.shape[0], output_dim=embedding_matrix.shape[1],
                      weights=[embedding_matrix])
embedded_a = embeddings(valid_word)
embedded_b = embeddings(other_word)
# Cosine similarity via the Keras 2 functional `dot` layer (already imported at
# the top of the notebook): normalize=True L2-normalizes both inputs along the
# dot axis. This replaces the legacy Keras 1 `merge(mode='cos', dot_axes=2)`.
similarity = dot([embedded_a, embedded_b], axes=2, normalize=True)
# create the Keras model (Keras 2 keyword names: inputs / outputs)
k_model = Model(inputs=[valid_word, other_word], outputs=similarity)

def get_sim(valid_word_idx, vocab_size):
    """Score one word against every vocabulary index with `k_model`.

    Args:
        valid_word_idx: Integer index of the reference word.
        vocab_size: Number of vocabulary indices (0 .. vocab_size - 1) to score.

    Returns:
        numpy.ndarray: 1-D float array of length `vocab_size`; element i is the
        model output for the pair (valid_word_idx, i).
    """
    if vocab_size == 0:
        return np.zeros((0,))
    # Score all pairs in one batched predict instead of issuing one
    # predict_on_batch call per vocabulary word.
    reference = np.full((vocab_size, 1), valid_word_idx, dtype='int32')
    candidates = np.arange(vocab_size, dtype='int32').reshape(-1, 1)
    scores = k_model.predict([reference, candidates], batch_size=1024)
    # The model emits one value per pair; flatten any singleton axes.
    return np.asarray(scores, dtype=float).reshape(vocab_size)

# For each sampled word, score it against the whole vocabulary and print the
# top_k highest-scoring words (position 0 of the ranking is skipped).
for i in range(valid_size):
    top_k = 8  # number of nearest neighbors
    valid_word = d2v_model.wv.index2word[valid_examples[i]]
    sim = get_sim(valid_examples[i], len(d2v_model.wv.vocab))
    nearest = (-sim).argsort()[1:top_k + 1]
    neighbour_words = [d2v_model.wv.index2word[nearest[k]] for k in range(top_k)]
    log_str = 'Nearest to %s:' % valid_word + ''.join(' %s,' % word for word in neighbour_words)
    print(log_str)