In [12]:
import pickle
import itertools
import warnings
import sys 
import os
import logging
from multiprocessing import cpu_count
from collections import namedtuple

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ckonlpy.tag import Twitter
from konlpy.tag import Mecab

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, ldaseqmodel, LdaMulticore, lda_dispatcher, doc2vec
from gensim.models.wrappers import LdaMallet, DtmModel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim import corpora, models, similarities
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric
from gensim.models.doc2vec import TaggedDocument

import pyLDAvis.gensim

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

warnings.filterwarnings('ignore')

In [2]:
# Instantiate the two morphological analyzers that are passed around as
# `tagger` below: ckonlpy's Twitter wrapper and konlpy's Mecab.
ct = Twitter()
mecab = Mecab()
def nav_tokenizer(tagger, corpus, stopwords):
    """Tokenize ``corpus`` with ``tagger`` and return stopword-filtered tokens.

    ``tagger.pos(corpus)`` is expected to yield sequences of strings whose
    first element is the surface form. Entries whose surface form is contained
    in ``stopwords`` are skipped; every remaining entry is joined with ``'/'``
    into a single string.

    Returns a list of the joined strings, in tagger order.
    """
    tokens = []
    for morph in tagger.pos(corpus):
        if morph[0] in stopwords:
            # Surface form is a stopword: leave it out.
            continue
        tokens.append('/'.join(morph))
    return tokens

In [3]:
def MakeTaggedData(df, taggedDoc, tagger, stopwords):
    """Turn every row of ``df`` into a tagged document for Doc2Vec training.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the string columns ``title`` and ``mainText``.
    taggedDoc : callable
        Factory invoked as ``taggedDoc(words, tags)`` for each row.
    tagger : object
        Morphological analyzer forwarded to ``nav_tokenizer``.
    stopwords : collection of str
        Surface forms to drop during tokenization.

    Returns
    -------
    list
        One ``taggedDoc`` instance per row, in row order, tagged with
        ``['news_<index label>']``.
    """
    # Membership tests on a list are linear; build a set once for the whole
    # run. Other container types are passed through untouched so their own
    # ``in`` semantics are preserved.
    stopword_lookup = stopwords
    if isinstance(stopwords, (list, tuple)):
        try:
            stopword_lookup = frozenset(stopwords)
        except TypeError:
            # Unhashable entries: keep the original container.
            stopword_lookup = stopwords

    w2v_docs = []
    # Iterate the columns positionally instead of ``df.loc[idx, col]``: a
    # label lookup returns a Series (not a str) when index labels repeat,
    # which can happen after concatenating several frames.
    rows = zip(df.index, df['title'], df['mainText'])
    for idx, title, main_text in tqdm(rows, total=len(df)):
        text = title + '.\n' + main_text
        pos = nav_tokenizer(tagger, text, stopword_lookup)
        label = ['news_' + str(idx)]
        # Use the factory supplied by the caller rather than a global name.
        w2v_docs.append(taggedDoc(pos, label))
    return w2v_docs

In [4]:
import multiprocessing
# CPU count reported by multiprocessing; passed as `workers` to the
# Doc2Vec training calls further down.
cores = int(multiprocessing.cpu_count())
def Make_Doc2Vec_Model(modelPath, data, size, dm, dm_concat, dm_mean, hs, negative, epoch, window, alpha, min_alpha, workers, tagger):
    """Train a gensim Doc2Vec model on ``data``, save it and return it.

    The output file name encodes the hyper-parameters and the ``tagger``
    label, so each configuration is written to its own file.

    Parameters
    ----------
    modelPath : str
        Prefix prepended verbatim to the generated file name (callers pass a
        directory string ending in a path separator).
    data : iterable of tagged documents
        Iterated for vocabulary building and again for training, so it must
        be re-iterable (e.g. a list).
    size : int
        Forwarded as ``vector_size``.
    dm, dm_concat, dm_mean, hs, negative, alpha, min_alpha, workers
        Forwarded unchanged to ``doc2vec.Doc2Vec``.
    epoch : int
        Forwarded as ``epochs``.
    window : int or None
        Forwarded as ``window``; when ``None`` the argument is omitted so the
        library default applies.
    tagger : str
        Label used only in the output file name.

    Returns
    -------
    The trained model, after it has been saved to disk.
    """
    # Function-scope imports keep this helper usable on its own.
    from datetime import datetime
    from tqdm import tqdm
    from gensim.models import doc2vec

    start = datetime.now()
    modelName = 'doc2vec_size-{}_epoch-{}_window-{}_negative-{}_hs-{}_dm-{}_dm_concat-{}_dm_mean-{}_by-{}.model'.format(
        size, epoch, window, negative, hs, dm, dm_concat, dm_mean, tagger)
    modelName = modelPath+modelName
    print (modelName)

    # Build the constructor arguments once; `window` is only passed when the
    # caller supplied a value.
    params = dict(vector_size=size, dm=dm, dm_concat=dm_concat,
                  dm_mean=dm_mean, negative=negative, hs=hs,
                  alpha=alpha, min_alpha=min_alpha, workers=workers,
                  epochs=epoch)
    if window is not None:
        params['window'] = window
    d2v_model = doc2vec.Doc2Vec(**params)

    d2v_model.build_vocab(tqdm(data))
    # NOTE(review): `train_lbls` looks like a leftover from an old gensim API;
    # kept as-is so the saved model object is unchanged — confirm and remove.
    d2v_model.train_lbls = False # do not train labels of words
    # `epochs` is the attribute matching the `epochs=` constructor argument;
    # the former `iter` alias is deprecated.
    d2v_model.train(tqdm(data), total_examples=d2v_model.corpus_count, epochs=d2v_model.epochs)

    # Timing deliberately excludes the save below.
    end = datetime.now()
    d2v_model.save(modelName)
    print ("Total running time: ", end-start)
    return d2v_model
# Show the detected CPU count used for the `workers` argument.
print (cores)

4


In [5]:
# NOTE(review): this cell repeats the tagger instantiation from the earlier
# cell; both analyzers are simply created again here.
ct = Twitter()
mecab = Mecab()
# NOTE(review): same name and behavior as the nav_tokenizer defined in the
# earlier cell; this definition replaces it when the cell runs.
def nav_tokenizer(tagger, corpus, stopwords):
    """Tokenize ``corpus`` with ``tagger`` and return stopword-filtered tokens.

    Each entry produced by ``tagger.pos(corpus)`` whose first element is not
    in ``stopwords`` is joined with ``'/'``; the joined strings are returned
    as a list in tagger order.
    """
    kept = []
    for entry in tagger.pos(corpus):
        surface = entry[0]
        if surface not in stopwords:
            kept.append('/'.join(entry))
    return kept

# Stopwords

In [6]:
# Read the stopword list (one entry per line, surrounding whitespace
# stripped). The context manager closes the file, which the previous bare
# `open(...).readlines()` never did.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

# News

## Naver

In [7]:
# Load the pickled Naver article dict and build a frame with one row per key.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled', 'rb') as naver_file:
    dictNaver = pickle.load(naver_file)
dfNaver = pd.DataFrame.from_dict(dictNaver, orient='index')
print (dfNaver.shape)

(15120, 10)


## Daum

In [8]:
# Load the pickled Daum article dict and build a frame with one row per key.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as daum_file:
    dictDaum = pickle.load(daum_file)
dfDaum = pd.DataFrame.from_dict(dictDaum, orient='index')
print (dfDaum.shape)

(9372, 10)


## 뉴스기사 통합

In [9]:
# Stack the Naver and Daum frames row-wise into a single corpus frame.
# NOTE(review): the original index labels are kept (no ignore_index), so a
# label would repeat if both sources shared an id — confirm uniqueness.
combinedDf = pd.concat([dfNaver, dfDaum])
combinedDf.head()

Unnamed: 0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords
5a29c445588c132954d1973a,정치,2017.12.07,연합뉴스,1713,1465,1,"北외무성 ""전쟁 바라지 않지만 결코 피하지 않을 것""","美고위인사 대북언급 비난하며 ""전쟁 기정사실화"" 위협 며칠 새 이어지는 북한 군민...","[외무성, 핵전쟁, 대변인]","{조선반도, 핵전쟁, 미국, 북한, 고위, 중앙, 도화선, 대변인}"
5a29c445588c132954d1973b,정치,2017.12.07,한국일보,2551,2062,2,"예산전쟁, 예결위 간사ㆍ호남이 웃었다",예결위 간사들이 최대 수혜자..당 지도부 내 몫 챙기기도 여전 황주홍ㆍ김도읍 등...,"[예산, 예결위, soc]","{의원, 정부안, 증액, 지역구, 예산안, 호남, 국민의당}"
5a29c445588c132954d1973c,정치,2017.12.07,뉴시스,610,536,3,"혐의 부인에 20시간 조사…檢, 최경환 구속 카드 꺼내나",【서울=뉴시스】 최진석 기자 = 박근혜 정부 시절 국가정보원 특수활동비 수수 의혹 ...,"[최경환, 구속영장, 국가정보원]","{의원, 국정원장, 혐의, 구속영장 청구, 검찰, 조사, 정기국회}"
5a29c445588c132954d1973d,정치,2017.12.07,연합뉴스,145,133,4,"최재형 감사원장 후보자 ""독립성 강화는 임명권자의 뜻""",감사원장에 내정된 최재형 사법연수원장(고양=연합뉴스) 이희열 기자 = 7일 감사원장...,"[이슈 · 최재형 감사원장 내정, 감사원장, 최재형, 감사원]","{후보자, 법관, 감사원장, 생활, 지명, 공직 사회}"
5a29c445588c132954d1973e,정치,2017.12.07,동아일보,1074,932,5,"B-1B 한반도에 뜨자, 평양 비운 김정은",[동아일보] 북중 접경지 양강도 삼지연 시찰… 방북 유엔 사무차장 면담 안할듯 B-...,"[김정은, b-1b, 한반도]","{양강도, 사무차장, 훈련, 공장, 시찰, 펠트먼, 접경, 김정은, 삼지연}"


## Path

In [10]:
# Base location for cached tagged documents and trained models.
if sys.platform == 'darwin':
    clusteringPath = '/Volumes/disk1/Clustering_doc2vec/'
elif sys.platform == 'win32':
    clusteringPath = 'd:/Clustering_doc2vec/'
else:
    # Without this branch `clusteringPath` stayed undefined on any other
    # platform (e.g. Linux) and later cells failed with NameError.
    clusteringPath = './Clustering_doc2vec/'

## Document Labeling

In [13]:
# Lightweight (words, tags) record handed to MakeTaggedData as the document
# container.
# NOTE(review): this rebinds the name `TaggedDocument`, replacing the class
# imported from gensim.models.doc2vec in the import cell. Presumably the
# pickled caches refer to this definition — confirm before removing it.
TaggedDocument = namedtuple('TaggedDocument', 'words tags')

## Twitter

### Doc2Vec 기본 포맷으로 변경

In [14]:
# Build (or load from cache) the tagged documents produced with the Twitter
# tagger. Files are opened via context managers so the handles are closed and
# the dump is fully flushed to disk.
tagger_ct = 'ct'
filename_ct = clusteringPath+'predata_doc2vec_{}'.format(tagger_ct)
if os.path.isfile(filename_ct):
    # NOTE: pickle.load can execute arbitrary code; trusted cache files only.
    with open(filename_ct, 'rb') as cache_file:
        w2v_docs_ct = pickle.load(cache_file)
else:
    w2v_docs_ct = MakeTaggedData(combinedDf, TaggedDocument, ct, stopwords)
    with open(filename_ct, 'wb') as cache_file:
        pickle.dump(w2v_docs_ct, cache_file)

### Make Doc2Vec Model

In [15]:
%%time
#PV-DM W/
modelName = '/Volumes/disk1/Clustering_doc2vec/doc2vec_size-300_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-ct.model'
if not os.path.isfile(modelName):
    d2v_model = Make_Doc2Vec_Model(modelPath=clusteringPath, data=w2v_docs_ct, size = 300, dm = 1, dm_concat = 1,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = 5,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')
else:
    d2v_model = doc2vec.Doc2Vec.load(modelName)

  3%|▎         | 814/24492 [00:00<00:05, 4054.15it/s]

/Volumes/disk1/Clustering_doc2vec/doc2vec_size-300_epoch-20_window-5_negative-7_hs-0_dm-1_dm_concat-1_dm_mean-0_by-ct.model


100%|██████████| 24492/24492 [00:04<00:00, 5279.94it/s]
100%|██████████| 24492/24492 [02:49<00:00, 144.89it/s]


Total running time:  0:54:02.643884
CPU times: user 1h 30min 7s, sys: 1min 36s, total: 1h 31min 43s
Wall time: 54min 19s


In [16]:
%%time
#PV-DM w/
d2v_model = Make_Doc2Vec_Model(modelPath=clusteringPath, data=w2v_docs_ct, size = 300, dm = 1, dm_concat = 0,
                   dm_mean = 1, negative = 7, hs = 0, epoch = 20, window = 10,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')

  1%|          | 270/24492 [00:00<00:09, 2629.04it/s]

/Volumes/disk1/Clustering_doc2vec/doc2vec_size-300_epoch-20_window-10_negative-7_hs-0_dm-1_dm_concat-0_dm_mean-1_by-ct.model


100%|██████████| 24492/24492 [00:11<00:00, 2102.79it/s]
100%|██████████| 24492/24492 [00:47<00:00, 515.16it/s]


Total running time:  0:13:25.071448
CPU times: user 29min 35s, sys: 48.7 s, total: 30min 23s
Wall time: 13min 29s


In [None]:
%%time
# PV - DBOW
d2v_model = Make_Doc2Vec_Model(modelPath=clusteringPath, data=w2v_docs_ct, size = 300, dm = 0, dm_concat = 0,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = None,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')

  0%|          | 0/24492 [00:00<?, ?it/s]

/Volumes/disk1/Clustering_doc2vec/doc2vec_size-300_epoch-20_window-None_negative-7_hs-0_dm-0_dm_concat-0_dm_mean-0_by-ct.model


100%|██████████| 24492/24492 [00:03<00:00, 6764.50it/s]
100%|██████████| 24492/24492 [00:26<00:00, 937.08it/s] 


## Mecab

In [None]:
# Build (or load from cache) the tagged documents produced with the Mecab
# tagger. Mecab-specific names are used so the `tagger_ct` / `filename_ct`
# values of the Twitter section are not overwritten, and files are opened via
# context managers so the handles are closed.
tagger_mecab = 'mecab'
filename_mecab = clusteringPath+'predata_doc2vec_{}'.format(tagger_mecab)
if os.path.isfile(filename_mecab):
    # NOTE: pickle.load can execute arbitrary code; trusted cache files only.
    with open(filename_mecab, 'rb') as cache_file:
        w2v_docs_mecab = pickle.load(cache_file)
else:
    w2v_docs_mecab = MakeTaggedData(combinedDf, TaggedDocument, mecab, stopwords)
    with open(filename_mecab, 'wb') as cache_file:
        pickle.dump(w2v_docs_mecab, cache_file)

In [None]:
%%time
#PV-DM W/
d2v_model = Make_Doc2Vec_Model(modelPath=clusteringPath, data=w2v_docs_mecab, size = 300, dm = 1, dm_concat = 1,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = 5,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')

In [None]:
%%time
#PV-DM w/
d2v_model = Make_Doc2Vec_Model(modelPath=clusteringPath, data=w2v_docs_mecab, size = 300, dm = 1, dm_concat = 0,
                   dm_mean = 1, negative = 7, hs = 0, epoch = 20, window = 10,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')

In [None]:
%%time
# PV - DBOW
d2v_model = Make_Doc2Vec_Model(modelPath=clusteringPath, data=w2v_docs_mecab, size = 300, dm = 0, dm_concat = 0,
                   dm_mean = 0, negative = 7, hs = 0, epoch = 20, window = None,
                   alpha = 0.025, min_alpha = 0.025, workers = cores, tagger = 'ct')