In [None]:
import pickle
import itertools
import warnings
import sys 
import re
import os
import logging
from multiprocessing import cpu_count
from collections import namedtuple
import html

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ckonlpy.tag import Twitter
from konlpy.tag import Mecab
from konlpy.utils import pprint

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, ldaseqmodel, LdaMulticore, lda_dispatcher, doc2vec
from gensim.models.wrappers import LdaMallet, DtmModel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim import corpora, models, similarities
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric
from gensim.models.doc2vec import TaggedDocument

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.decomposition import PCA

import pyLDAvis.gensim

import pylab as pl

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

warnings.filterwarnings('ignore')

In [None]:
# Instantiate the two part-of-speech taggers used in this notebook:
# ckonlpy's Twitter tagger and konlpy's Mecab tagger.
ct = Twitter()
mecab = Mecab()
def nav_tokenizer(tagger, corpus, stopwords):
    """Tokenize ``corpus`` with ``tagger`` and drop stopword tokens.

    Args:
        tagger: Object exposing ``pos(text)`` that returns an iterable of
            string tuples whose first item is the token text.
        corpus: Text to analyse.
        stopwords: Collection of token texts to discard.

    Returns:
        list[str]: One ``'/'``-joined string per kept token, in the order the
        tagger produced them.
    """
    # A list/tuple is scanned linearly on every membership test; hashing it
    # once makes the per-token check constant time. Any other container is
    # used as given so its own ``in`` semantics are preserved.
    if isinstance(stopwords, (list, tuple)):
        try:
            stopwords = frozenset(stopwords)
        except TypeError:
            pass  # unhashable entries: keep the original container
    return ['/'.join(token) for token in tagger.pos(corpus) if token[0] not in stopwords]

In [None]:
def MakeTaggedData(df, taggedDoc, tagger, stopwords):
    """Build one tagged document per row of ``df``.

    For every index label the ``title`` and ``mainText`` columns are joined
    with ``'.\\n'``, tokenized with ``nav_tokenizer`` and wrapped in a tagged
    document whose single tag is ``'news_<index label>'``.

    Args:
        df: DataFrame with ``title`` and ``mainText`` columns.
        taggedDoc: Callable ``taggedDoc(words, tags)`` used to build each
            document. Previously this argument was ignored; a non-callable
            value falls back to the module-level ``TaggedDocument``.
        tagger: Tagger forwarded to ``nav_tokenizer``.
        stopwords: Stopword collection forwarded to ``nav_tokenizer``.

    Returns:
        list: Tagged documents in the order of ``df.index``.
    """
    # Honour the factory the caller passed instead of silently using the global.
    make_doc = taggedDoc if callable(taggedDoc) else TaggedDocument
    w2v_docs = []
    for idx in tqdm(df.index):
        text = df.loc[idx, 'title'] + '.\n' + df.loc[idx, 'mainText']
        tokens = nav_tokenizer(tagger, text, stopwords)
        w2v_docs.append(make_doc(tokens, ['news_' + str(idx)]))
    return w2v_docs

In [None]:
import multiprocessing

# Worker count passed to Doc2Vec training: the CPU count reported by
# multiprocessing.
cores = multiprocessing.cpu_count()
def Make_Doc2Vec_Model(modelPath, data, size, dm, dm_concat, dm_mean, hs, negative, epoch, window, alpha, min_alpha, workers, tagger):
    """Train a Doc2Vec model on ``data`` and save it under ``modelPath``.

    The saved file name encodes the hyper-parameters and the tagger name, so
    callers can rebuild the same path to check for an existing model.

    Args:
        modelPath: Directory prefix (including trailing separator) that the
            generated file name is appended to.
        data: Tagged documents; iterated once to build the vocabulary and
            again for training.
        size: Passed to ``Doc2Vec`` as ``vector_size``.
        dm, dm_concat, dm_mean, hs, negative, alpha, min_alpha, workers:
            Passed to ``Doc2Vec`` under the same names.
        epoch: Passed to ``Doc2Vec`` as ``epochs``.
        window: Passed to ``Doc2Vec`` as ``window``; when ``None`` the
            argument is omitted so the library default applies.
        tagger: Label used only in the file name.

    Returns:
        The trained model (also saved to disk).
    """
    from datetime import datetime

    start = datetime.now()
    modelName = 'doc2vec_size-{}_epoch-{}_window-{}_negative-{}_hs-{}_dm-{}_dm_concat-{}_dm_mean-{}_by-{}.model'.format(
        size, epoch, window, negative, hs, dm, dm_concat, dm_mean, tagger)
    modelName = modelPath + modelName
    print (modelName)

    # Build the constructor arguments once; ``window`` is only supplied when
    # the caller actually chose a value.
    params = dict(vector_size=size, dm=dm, dm_concat=dm_concat, dm_mean=dm_mean,
                  negative=negative, hs=hs, alpha=alpha, min_alpha=min_alpha,
                  workers=workers, epochs=epoch)
    if window is not None:
        params['window'] = window
    d2v_model = doc2vec.Doc2Vec(**params)

    d2v_model.build_vocab(tqdm(data))
    # Kept from the original code ("do not train labels of words").
    # NOTE(review): confirm the installed gensim still reads this attribute.
    d2v_model.train_lbls = False
    # Read the epoch count through ``epochs`` (the name the constructor was
    # given) rather than the deprecated ``iter`` alias.
    d2v_model.train(tqdm(data), total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

    end = datetime.now()  # the reported time excludes saving the model
    d2v_model.save(modelName)
    print ("Total running time: ", end-start)
    return d2v_model
# Show how many workers the training runs will use.
print(cores)

In [None]:
# NOTE(review): this cell repeats the tagger setup and nav_tokenizer
# definition from an earlier cell; running it builds both taggers again and
# rebinds ct / mecab.
ct = Twitter()
mecab = Mecab()
def nav_tokenizer(tagger, corpus, stopwords):
    """Tokenize ``corpus`` with ``tagger`` and drop stopword tokens.

    Args:
        tagger: Object exposing ``pos(text)`` that returns an iterable of
            string tuples whose first item is the token text.
        corpus: Text to analyse.
        stopwords: Collection of token texts to discard.

    Returns:
        list[str]: One ``'/'``-joined string per kept token, in the order the
        tagger produced them.
    """
    # A list/tuple is scanned linearly on every membership test; hashing it
    # once makes the per-token check constant time. Any other container is
    # used as given so its own ``in`` semantics are preserved.
    if isinstance(stopwords, (list, tuple)):
        try:
            stopwords = frozenset(stopwords)
        except TypeError:
            pass  # unhashable entries: keep the original container
    return ['/'.join(token) for token in tagger.pos(corpus) if token[0] not in stopwords]

In [None]:
def Print_Similar_Doc(model, df, infer_vecs, target):
    """Print a summary of ``target`` and return the documents most similar to it.

    Args:
        model: Object whose ``docvecs.most_similar(positive=[vec])`` returns
            an iterable of hits with the document tag as first item; tags are
            expected in the form ``'news_<index label>'``.
        df: DataFrame indexed by the labels the tags were built from.
        infer_vecs: Sequence of vectors aligned positionally with ``df.index``.
        target: Row of ``df`` (a Series whose ``name`` is its index label)
            with ``title``, ``press``, ``date`` and ``site`` entries.

    Returns:
        pandas.DataFrame: The rows of ``df`` for the similar documents,
        restricted to the category/date/press/title/keywords/
        extracted_keywords/site columns. Empty (with those columns) when the
        model returns no hits.

    Raises:
        ValueError: If ``target.name`` is not in ``df.index``.
        KeyError: If a returned tag does not map to a label in ``df.index``.
    """
    columns = ['category', 'date', 'press', 'title', 'keywords', 'extracted_keywords', 'site']

    # Position of the target row, used to pick its inferred vector.
    intIndex = df.index.tolist().index(target.name)
    print ('* News : {}'.format(target['title']))
    print ('* Press : {}'.format(target['press']))
    print ('* Date : {}'.format(target['date']))
    print ('* Site : {}'.format(target['site']))
    print ()

    infer_vec = infer_vecs[intIndex]
    simDocs = model.docvecs.most_similar(positive = [infer_vec])
    print (' * Similar Document : {}'.format(len(simDocs)))

    # Strip only the leading 'news_' prefix: splitting on every underscore
    # would truncate index labels that themselves contain '_'.
    rows = [df.loc[hit[0].split('_', 1)[1]] for hit in simDocs]
    if not rows:
        return pd.DataFrame(columns=columns)
    return pd.DataFrame(rows).loc[:, columns]
    

In [None]:
def ScatterPlot_by_kmeans(n_cluster, model, random_state=None):
    """Cluster the model's document vectors with KMeans and plot them in 2-D.

    The document vectors (``model.docvecs.doctag_syn0``) are clustered, then
    projected onto two PCA components. Points are coloured by cluster label
    and the projected cluster centres are drawn as black triangles. The
    elapsed time is printed.

    Args:
        n_cluster: Number of KMeans clusters.
        model: Trained model exposing ``docvecs.doctag_syn0``.
        random_state: Optional seed forwarded to KMeans and PCA. The default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        None. The plot is drawn on a new matplotlib figure.
    """
    import timeit

    print ('KMeans Clustering')
    print ('Number of Cluster : {}'.format(n_cluster))
    start = timeit.default_timer()

    doc_vectors = model.docvecs.doctag_syn0
    kmeans = KMeans(n_clusters = n_cluster,
                    init = 'k-means++',
                    max_iter = 500,
                    random_state = random_state)
    # Fit exactly once so the labels and the centroids come from the same
    # run. The original fitted twice, taking labels from the first run and
    # centroids from the second.
    labels = kmeans.fit_predict(doc_vectors)

    pca = PCA(n_components = 2, random_state = random_state).fit(doc_vectors)
    datapoint = pca.transform(doc_vectors)
    centroidpoint = pca.transform(kmeans.cluster_centers_)

    plt.figure()  # the original referenced plt.figure without calling it
    plt.scatter(datapoint[:, 0], datapoint[:, 1],
                c = labels)
    plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1],
                marker = '^', s = 150, c = '#000000')

    end = timeit.default_timer()
    execution_time = end - start
    print ('Running Time : {}'.format(execution_time))
    

# Stopwords

In [None]:
# Read the stopword list (one entry per line), stripping surrounding
# whitespace. The context manager closes the file, which the original bare
# open() never did.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# News

## Naver

In [None]:
# Load the pickled Naver article dict and turn it into a frame with one row
# per key. The context manager closes the file handle the original leaked.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled', 'rb') as naver_file:
    dictNaver = pickle.load(naver_file)
dfNaver = pd.DataFrame.from_dict(dictNaver, orient='index')
dfNaver['site'] = 'Naver'  # scalar broadcast, same result as a repeated list
print (dfNaver.shape)

## Daum

In [None]:
# Load the pickled Daum article dict and turn it into a frame with one row
# per key. The context manager closes the file handle the original leaked.
# NOTE: pickle can execute arbitrary code on load; only open trusted files.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as daum_file:
    dictDaum = pickle.load(daum_file)
dfDaum = pd.DataFrame.from_dict(dictDaum, orient='index')
dfDaum['site'] = 'Daum'  # scalar broadcast, same result as a repeated list
print (dfDaum.shape)

## 뉴스기사 통합

In [None]:
# Stack the Naver and Daum frames row-wise; the 'site' column set in the
# loading cells records which source each row came from.
combinedDf = pd.concat([dfNaver, dfDaum])
# Preview of the combined frame.
combinedDf.head()

## Path

In [None]:
# Directory prefix for the cached tagged documents, trained models and
# inferred vectors used by the cells below.
if sys.platform == 'darwin':
    clusteringPath = '/Volumes/disk1/Clustering_doc2vec/'
elif sys.platform == 'win32':
    clusteringPath = 'd:/Clustering_doc2vec/'
else:
    # Other platforms (e.g. Linux) previously left clusteringPath undefined,
    # so the first cell that used it failed with NameError. Fall back to a
    # directory relative to the notebook; it must exist before running the
    # caching cells.
    clusteringPath = './Clustering_doc2vec/'

## Document Labeling

In [None]:
# Rebinds TaggedDocument to a plain namedtuple with fields `words` and `tags`,
# shadowing the class imported from gensim.models.doc2vec at the top of the
# notebook.
# NOTE(review): the cached predata_doc2vec_* pickles presumably reference this
# notebook-level definition; confirm before removing it.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

In [None]:
# Display the rows whose category equals '스포츠'.
combinedDf.loc[combinedDf['category'] == '스포츠']

## Twitter

### Doc2Vec 기본 포맷으로 변경

In [None]:
# Tagged documents built with the Twitter tagger, cached as a pickle under
# clusteringPath. Files are opened with context managers so the handles are
# closed (and the written cache flushed), which the original never did.
tagger_ct = 'ct'
filename_ct = clusteringPath + 'predata_doc2vec_{}'.format(tagger_ct)
if os.path.isfile(filename_ct):
    with open(filename_ct, 'rb') as cache_file:
        w2v_docs_ct = pickle.load(cache_file)
else:
    w2v_docs_ct = MakeTaggedData(combinedDf, TaggedDocument, ct, stopwords)
    with open(filename_ct, 'wb') as cache_file:
        pickle.dump(w2v_docs_ct, cache_file)

### Doc2Vec Model1 Using Tagger Twitter

In [None]:
%%time
# PV-DM configuration (dm=1, dm_concat=1, dm_mean=0), window 5, Twitter tagger.
# Load the saved model when its file exists; otherwise train and save it.
modelName1_ct = clusteringPath + 'doc2vec_size-300_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-ct.model'
if os.path.isfile(modelName1_ct):
    d2v_model1_ct = doc2vec.Doc2Vec.load(modelName1_ct)
else:
    d2v_model1_ct = Make_Doc2Vec_Model(
        modelPath=clusteringPath, data=w2v_docs_ct,
        size=300, dm=1, dm_concat=1, dm_mean=0,
        negative=7, hs=0, epoch=20, window=5,
        alpha=0.025, min_alpha=0.025, workers=cores, tagger='ct')

In [None]:
# Infer one vector per tagged document with model 1 (Twitter tagger) and
# cache the list next to the model file. Context managers close the pickle
# files the original left open.
m1_ct_name = modelName1_ct + '-infer_vector'
if os.path.isfile(m1_ct_name):
    with open(m1_ct_name, 'rb') as vec_file:
        X_d2v_1_ct = pickle.load(vec_file)
else:
    X_d2v_1_ct = [d2v_model1_ct.infer_vector(doc.words) for doc in tqdm(w2v_docs_ct)]
    with open(m1_ct_name, 'wb') as vec_file:
        pickle.dump(X_d2v_1_ct, vec_file)

In [None]:
# Spot-check model 1 (Twitter tagger) on one fixed reference article.
test_1_ct = Print_Similar_Doc(
    model=d2v_model1_ct,
    df=combinedDf,
    infer_vecs=X_d2v_1_ct,
    target=combinedDf.loc['5a381bb0588c13417c9a01a3'],
)
test_1_ct

In [None]:
# Cluster model 1's document vectors into 30 groups and plot them.
ScatterPlot_by_kmeans(n_cluster=30, model=d2v_model1_ct)

### Doc2Vec Model2 Using Tagger Twitter

In [None]:
%%time
# PV-DM configuration (dm=1, dm_concat=0, dm_mean=1), window 10, Twitter tagger.
# Load the saved model when its file exists; otherwise train and save it.
modelName2_ct = clusteringPath + 'doc2vec_size-300_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-ct.model'
if os.path.isfile(modelName2_ct):
    d2v_model2_ct = doc2vec.Doc2Vec.load(modelName2_ct)
else:
    d2v_model2_ct = Make_Doc2Vec_Model(
        modelPath=clusteringPath, data=w2v_docs_ct,
        size=300, dm=1, dm_concat=0, dm_mean=1,
        negative=7, hs=0, epoch=20, window=10,
        alpha=0.025, min_alpha=0.025, workers=cores, tagger='ct')

In [None]:
# Infer one vector per tagged document with model 2 (Twitter tagger) and
# cache the list next to the model file. Context managers close the pickle
# files the original left open.
m2_ct_name = modelName2_ct + '-infer_vector'
if os.path.isfile(m2_ct_name):
    with open(m2_ct_name, 'rb') as vec_file:
        X_d2v_2_ct = pickle.load(vec_file)
else:
    X_d2v_2_ct = [d2v_model2_ct.infer_vector(doc.words) for doc in tqdm(w2v_docs_ct)]
    with open(m2_ct_name, 'wb') as vec_file:
        pickle.dump(X_d2v_2_ct, vec_file)

In [None]:
# Spot-check model 2 (Twitter tagger) on the same fixed reference article.
test_2_ct = Print_Similar_Doc(
    model=d2v_model2_ct,
    df=combinedDf,
    infer_vecs=X_d2v_2_ct,
    target=combinedDf.loc['5a381bb0588c13417c9a01a3'],
)
test_2_ct

### Doc2Vec Model3 Using Tagger Twitter

In [None]:
%%time
# PV-DBOW configuration (dm=0), no explicit window, Twitter tagger.
# Load the saved model when its file exists; otherwise train and save it.
modelName3_ct = clusteringPath + 'doc2vec_size-300_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-ct.model'
if not os.path.isfile(modelName3_ct):
    # The original bound the freshly trained model to `d2v_model3_Ct`
    # (capital C), so the following cells failed with NameError on
    # `d2v_model3_ct` whenever the model had to be trained.
    d2v_model3_ct = Make_Doc2Vec_Model(modelPath=clusteringPath, data=w2v_docs_ct, size = 300, dm = 0, dm_concat = 0,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = None,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
else:
    d2v_model3_ct = doc2vec.Doc2Vec.load(modelName3_ct)

In [None]:
# Infer one vector per tagged document with model 3 (Twitter tagger) and
# cache the list next to the model file. Context managers close the pickle
# files the original left open.
m3_ct_name = modelName3_ct + '-infer_vector'
if os.path.isfile(m3_ct_name):
    with open(m3_ct_name, 'rb') as vec_file:
        X_d2v_3_ct = pickle.load(vec_file)
else:
    X_d2v_3_ct = [d2v_model3_ct.infer_vector(doc.words) for doc in tqdm(w2v_docs_ct)]
    with open(m3_ct_name, 'wb') as vec_file:
        pickle.dump(X_d2v_3_ct, vec_file)
    

In [None]:
# Spot-check model 3 (Twitter tagger) on the same fixed reference article.
test_3_ct = Print_Similar_Doc(
    model=d2v_model3_ct,
    df=combinedDf,
    infer_vecs=X_d2v_3_ct,
    target=combinedDf.loc['5a381bb0588c13417c9a01a3'],
)
test_3_ct

## Mecab

In [None]:
# Tagged documents built with the Mecab tagger, cached as a pickle under
# clusteringPath. Files are opened with context managers so the handles are
# closed (and the written cache flushed), which the original never did.
tagger_mecab = 'mecab'
filename_mecab = clusteringPath + 'predata_doc2vec_{}'.format(tagger_mecab)
if os.path.isfile(filename_mecab):
    with open(filename_mecab, 'rb') as cache_file:
        w2v_docs_mecab = pickle.load(cache_file)
else:
    w2v_docs_mecab = MakeTaggedData(combinedDf, TaggedDocument, mecab, stopwords)
    with open(filename_mecab, 'wb') as cache_file:
        pickle.dump(w2v_docs_mecab, cache_file)

### Doc2Vec Model1 Using Tagger Mecab

In [None]:
%%time
# PV-DM configuration (dm=1, dm_concat=1, dm_mean=0), window 5, Mecab tagger.
# Load the saved model when its file exists; otherwise train and save it.
modelName1_mecab = clusteringPath + 'doc2vec_size-300_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-mecab.model'
if os.path.isfile(modelName1_mecab):
    d2v_model1_mecab = doc2vec.Doc2Vec.load(modelName1_mecab)
else:
    d2v_model1_mecab = Make_Doc2Vec_Model(
        modelPath=clusteringPath, data=w2v_docs_mecab,
        size=300, dm=1, dm_concat=1, dm_mean=0,
        negative=7, hs=0, epoch=20, window=5,
        alpha=0.025, min_alpha=0.025, workers=cores, tagger='mecab')

In [None]:
# Infer one vector per tagged document with model 1 (Mecab tagger) and cache
# the list next to the model file. Context managers close the pickle files
# the original left open.
m1_mecab_name = modelName1_mecab + '-infer_vector'
if os.path.isfile(m1_mecab_name):
    with open(m1_mecab_name, 'rb') as vec_file:
        X_d2v_1_mecab = pickle.load(vec_file)
else:
    X_d2v_1_mecab = [d2v_model1_mecab.infer_vector(doc.words) for doc in tqdm(w2v_docs_mecab)]
    with open(m1_mecab_name, 'wb') as vec_file:
        pickle.dump(X_d2v_1_mecab, vec_file)

In [None]:
# Spot-check model 1 (Mecab tagger) on the same fixed reference article.
test_1_mecab = Print_Similar_Doc(
    model=d2v_model1_mecab,
    df=combinedDf,
    infer_vecs=X_d2v_1_mecab,
    target=combinedDf.loc['5a381bb0588c13417c9a01a3'],
)
test_1_mecab

### Doc2Vec Model2 Using Tagger Mecab

In [None]:
%%time
# PV-DM configuration (dm=1, dm_concat=0, dm_mean=1), window 10, Mecab tagger.
# Load the saved model when its file exists; otherwise train and save it.
modelName2_mecab = clusteringPath + 'doc2vec_size-300_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-mecab.model'
if os.path.isfile(modelName2_mecab):
    d2v_model2_mecab = doc2vec.Doc2Vec.load(modelName2_mecab)
else:
    d2v_model2_mecab = Make_Doc2Vec_Model(
        modelPath=clusteringPath, data=w2v_docs_mecab,
        size=300, dm=1, dm_concat=0, dm_mean=1,
        negative=7, hs=0, epoch=20, window=10,
        alpha=0.025, min_alpha=0.025, workers=cores, tagger='mecab')

In [None]:
# Infer one vector per tagged document with model 2 (Mecab tagger) and cache
# the list next to the model file. Context managers close the pickle files
# the original left open.
m2_mecab_name = modelName2_mecab + '-infer_vector'
if os.path.isfile(m2_mecab_name):
    with open(m2_mecab_name, 'rb') as vec_file:
        X_d2v_2_mecab = pickle.load(vec_file)
else:
    X_d2v_2_mecab = [d2v_model2_mecab.infer_vector(doc.words) for doc in tqdm(w2v_docs_mecab)]
    with open(m2_mecab_name, 'wb') as vec_file:
        pickle.dump(X_d2v_2_mecab, vec_file)

In [None]:
# Spot-check model 2 (Mecab tagger) on the same fixed reference article.
test_2_mecab = Print_Similar_Doc(
    model=d2v_model2_mecab,
    df=combinedDf,
    infer_vecs=X_d2v_2_mecab,
    target=combinedDf.loc['5a381bb0588c13417c9a01a3'],
)
test_2_mecab

### Doc2Vec Model3 Using Tagger Mecab

In [None]:
%%time
# PV-DBOW configuration (dm=0), no explicit window, Mecab tagger.
# Load the saved model when its file exists; otherwise train and save it.
modelName3_mecab = clusteringPath + 'doc2vec_size-300_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-mecab.model'
if os.path.isfile(modelName3_mecab):
    d2v_model3_mecab = doc2vec.Doc2Vec.load(modelName3_mecab)
else:
    d2v_model3_mecab = Make_Doc2Vec_Model(
        modelPath=clusteringPath, data=w2v_docs_mecab,
        size=300, dm=0, dm_concat=0, dm_mean=0,
        negative=7, hs=0, epoch=20, window=None,
        alpha=0.025, min_alpha=0.025, workers=cores, tagger='mecab')

In [None]:
# Infer one vector per tagged document with model 3 (Mecab tagger) and cache
# the list next to the model file. Context managers close the pickle files
# the original left open.
m3_mecab_name = modelName3_mecab + '-infer_vector'
if os.path.isfile(m3_mecab_name):
    with open(m3_mecab_name, 'rb') as vec_file:
        X_d2v_3_mecab = pickle.load(vec_file)
else:
    X_d2v_3_mecab = [d2v_model3_mecab.infer_vector(doc.words) for doc in tqdm(w2v_docs_mecab)]
    with open(m3_mecab_name, 'wb') as vec_file:
        pickle.dump(X_d2v_3_mecab, vec_file)

In [None]:
# Spot-check model 3 (Mecab tagger) on the same fixed reference article.
test_3_mecab = Print_Similar_Doc(
    model=d2v_model3_mecab,
    df=combinedDf,
    infer_vecs=X_d2v_3_mecab,
    target=combinedDf.loc['5a381bb0588c13417c9a01a3'],
)
test_3_mecab

## 중간 점검
* 만들어진 Doc2vec 모델을 하나의 뉴스(제목 : 안보전략硏 "황병서·김원홍 '처벌'…공포통치 끝 아닌 시작")로 확인을 해본 결과, 전체적으로 북한 관련 뉴스를 찾아주는 것으로 확인됨