# News Clustering

In [1]:
import pickle
import itertools
import warnings
import sys 
import os
import logging

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ckonlpy.tag import Twitter
from konlpy.tag import Mecab

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, ldaseqmodel, LdaMulticore, lda_dispatcher
from gensim.models.wrappers import LdaMallet, DtmModel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim import corpora, models, similarities
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

import pyLDAvis.gensim

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

warnings.filterwarnings('ignore')

In [2]:
def Nav_tokenizer(doc, tagger, stopwords):
    """Tokenize ``doc`` with ``tagger.pos`` and drop short or stop-listed words.

    Parameters
    ----------
    doc : text handed to ``tagger.pos``
    tagger : object exposing ``pos(doc)``; element ``[0]`` of every returned
        item is taken as the word
    stopwords : container of words to exclude (tested with ``in``)

    Returns
    -------
    list of words longer than one character that are not in ``stopwords``,
    in the order the tagger produced them
    """
    return [
        token[0]
        for token in tagger.pos(doc)
        # Logical `and` instead of bitwise `&`: same result, and the stop-word
        # lookup is skipped for words that are already too short.
        if len(token[0]) > 1 and token[0] not in stopwords
    ]

In [3]:
def Nav_tokenizer_noun(doc, tagger, stopwords):
    """Extract nouns from ``doc`` and drop short or stop-listed ones.

    Parameters
    ----------
    doc : text handed to ``tagger.nouns``
    tagger : object exposing ``nouns(doc)`` that yields words
    stopwords : container of words to exclude (tested with ``in``)

    Returns
    -------
    list of nouns longer than one character that are not in ``stopwords``,
    in the order the tagger produced them
    """
    return [
        noun
        for noun in tagger.nouns(doc)
        # Logical `and` instead of bitwise `&`: same result, and the stop-word
        # lookup is skipped for words that are already too short.
        if len(noun) > 1 and noun not in stopwords
    ]

In [4]:
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Train one LDA model per topic count and plot the c_v coherence of each.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : tokenized documents handed to CoherenceModel for the c_v score
    limit : topic limit (exclusive); models are trained for 1 .. limit - 1 topics

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(1, limit)
    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    plt.plot(topic_counts, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # The labels must be a sequence of strings: ("c_v") is just the string
    # "c_v", which legend() iterates character by character, so the curve
    # ended up labelled "c". The trailing comma makes it a one-element tuple.
    plt.legend(("c_v",), loc='best')
    plt.show()

    return lm_list, c_v

### Stopwords

In [5]:
# Read the stop-word list, one entry per line with surrounding whitespace
# stripped. The context manager closes the file, which the bare open() did not.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

## News

### Naver

In [6]:
# Load the pickled Naver article records and build a DataFrame with one row
# per top-level key (orient='index').
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled', 'rb') as naver_file:
    dictNaver = pickle.load(naver_file)  # file is closed when the block exits
dfNaver = pd.DataFrame.from_dict(dictNaver, orient='index')
print (dfNaver.shape)

(15120, 10)


### Daum

In [7]:
# Load the pickled Daum article records and build a DataFrame with one row
# per top-level key (orient='index').
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as daum_file:
    dictDaum = pickle.load(daum_file)  # file is closed when the block exits
dfDaum = pd.DataFrame.from_dict(dictDaum, orient='index')
print (dfDaum.shape)

(9372, 10)


## Daum

### 뉴스 기사 통합

In [8]:
# Stack the Naver rows followed by the Daum rows into one frame.
combinedDf = pd.concat([dfNaver, dfDaum])
# Preview the first rows of the combined frame.
combinedDf.head()

Unnamed: 0,category,date,press,number_of_comment,number_of_crawled_comment,rank,title,mainText,keywords,extracted_keywords
5a29c445588c132954d1973a,정치,2017.12.07,연합뉴스,1713,1465,1,"北외무성 ""전쟁 바라지 않지만 결코 피하지 않을 것""","美고위인사 대북언급 비난하며 ""전쟁 기정사실화"" 위협 며칠 새 이어지는 북한 군민...","[외무성, 핵전쟁, 대변인]","{고위, 북한, 미국, 조선반도, 핵전쟁, 중앙, 대변인, 도화선}"
5a29c445588c132954d1973b,정치,2017.12.07,한국일보,2551,2062,2,"예산전쟁, 예결위 간사ㆍ호남이 웃었다",예결위 간사들이 최대 수혜자..당 지도부 내 몫 챙기기도 여전 황주홍ㆍ김도읍 등...,"[예산, 예결위, soc]","{의원, 국민의당, 지역구, 호남, 증액, 예산안, 정부안}"
5a29c445588c132954d1973c,정치,2017.12.07,뉴시스,610,536,3,"혐의 부인에 20시간 조사…檢, 최경환 구속 카드 꺼내나",【서울=뉴시스】 최진석 기자 = 박근혜 정부 시절 국가정보원 특수활동비 수수 의혹 ...,"[최경환, 구속영장, 국가정보원]","{혐의, 조사, 의원, 구속영장 청구, 국정원장, 검찰, 정기국회}"
5a29c445588c132954d1973d,정치,2017.12.07,연합뉴스,145,133,4,"최재형 감사원장 후보자 ""독립성 강화는 임명권자의 뜻""",감사원장에 내정된 최재형 사법연수원장(고양=연합뉴스) 이희열 기자 = 7일 감사원장...,"[이슈 · 최재형 감사원장 내정, 감사원장, 최재형, 감사원]","{감사원장, 후보자, 공직 사회, 생활, 법관, 지명}"
5a29c445588c132954d1973e,정치,2017.12.07,동아일보,1074,932,5,"B-1B 한반도에 뜨자, 평양 비운 김정은",[동아일보] 북중 접경지 양강도 삼지연 시찰… 방북 유엔 사무차장 면담 안할듯 B-...,"[김정은, b-1b, 한반도]","{삼지연, 김정은, 양강도, 사무차장, 훈련, 접경, 공장, 시찰, 펠트먼}"


In [9]:
# One text per article: the title, a newline, then the article body.
rawData_text = combinedDf['title'] + '\n' + combinedDf['mainText']

In [10]:
# Show the first article by position. The Series is indexed by article id, so
# a plain [0] only works through pandas' deprecated positional fallback.
rawData_text.iloc[0]

'北외무성 "전쟁 바라지 않지만 결코 피하지 않을 것"\n美고위인사 대북언급 비난하며 "전쟁 기정사실화" 위협  며칠 새 이어지는 북한 군민연환대회(평양 조선중앙통신=연합뉴스) 북한이 \'화성-15형\' 발사 성공을 축하하는 군민연환대회를 지난 5일 황해북도, 강원도, 양강도 등 각지에서 열었다고 6일 조선중앙통신이 보도했다. 2017.12.6 [국내에서만 사용가능. 재배포 금지. For Use Only in the Republic of Korea. No Redistribution] photo@yna.co.kr   (서울=연합뉴스) 홍국기 기자 = 북한은 6일 미국 고위인사들의 대북 강경 발언들을 문제 삼으며 "미국은 매일과 같이 조선반도(한반도)에서의 핵전쟁을 광고하고 있다"면서 "우리는 전쟁을 바라지 않지만 결코 피하지 않을 것"이라고 밝혔다.  북한 외무성 대변인은 이날 조선중앙통신 기자와의 문답에서 "미국이 조선반도에서 우리를 겨냥한 사상 최대의 연합공중훈련을 강행하고 있는 가운데 최근 미국의 고위정객들이 줄줄이 나서서 호전적인 망발들을 늘어놓는 등 심상치 않은 움직임을 보이고 있다"면서 이같이 말했다고 중앙통신이 전했다.  외무성 대변인은 "조선반도에 언제 전쟁이 터질지 모를 일촉즉발의 초긴장상태가 조성되고있는 속에 미국의 고위정객들의 입에서 연달아 터져 나오는 전쟁 폭언으로 말미암아 조선반도에서의 전쟁은 기정사실화되고 이제 남은 것은 언제 전쟁이 터지는가 하는 시점상 문제"라고 위협했다.  대변인은 "백악관 국가안보보좌관과 공화당 소속 국회 상원의원이 북조선과의 전쟁 가능성이 매일 증대되고 있다느니, 선제공격 선택에 더욱 접근하고 있다느니, 남조선 주둔 미군 가족들을 철수시켜야 한다느니 하는 따위의 화약내 풍기는 대결 망발들을 늘어놓은 것은 우리에게 조선반도에서의 전쟁발발에 대비하라는 신호로밖에 달리 해석될 수 없다"고 주장했다.  이어 "지어(심지어) 미 중앙정보국장이란 놈이 우리의 심장인 최고 지도부까지 감히 걸고 들며 도발을 걸어온 것은 우리가

In [11]:
# Choose where tokenized text (clusteringPath) and trained models
# (clusteringModelPath) are cached, depending on the operating system.
if sys.platform == 'darwin':
    clusteringPath = '/Volumes/disk1/Clustering/'
    clusteringModelPath = '/Volumes/disk1/Clustering_model/'
elif sys.platform == 'win32':
    clusteringPath = 'd:/Clustering/'
    clusteringModelPath = 'd:/Clustering_model/'
else:
    # Previously no branch matched other platforms (e.g. Linux), leaving both
    # names undefined and making later cells fail with a NameError.
    # Fall back to directories next to the notebook; they must already exist.
    clusteringPath = './Clustering/'
    clusteringModelPath = './Clustering_model/'

### token

In [12]:
# Instantiate the two taggers compared in this notebook:
# konlpy's Mecab and ckonlpy's Twitter.
mecab = Mecab()
ct = Twitter()

#### 명사만

In [13]:
# Noun-tokenize every article with the ckonlpy Twitter tagger, caching the
# result on disk so that the tagging pass only runs once.
outfile_ct = clusteringPath + 'nouns_taggerd_news_text_by_ct.pickled'
if not os.path.isfile(outfile_ct):
    tagged_text_ct = [Nav_tokenizer_noun(doc, ct, stopwords) for doc in tqdm(rawData_text)]
    # Close the file explicitly so the cache is fully flushed to disk.
    with open(outfile_ct, 'wb') as cache_file:
        pickle.dump(tagged_text_ct, cache_file)
else:
    with open(outfile_ct, 'rb') as cache_file:
        tagged_text_ct = pickle.load(cache_file)

In [14]:
# Noun-tokenize every article with the Mecab tagger, caching the result on
# disk so that the tagging pass only runs once.
outfile_mecab = clusteringPath + 'nouns_taggerd_news_text_by_mecab.pickled'
if not os.path.isfile(outfile_mecab):
    tagged_text_mecab = [Nav_tokenizer_noun(doc, mecab, stopwords) for doc in tqdm(rawData_text)]
    # Close the file explicitly so the cache is fully flushed to disk.
    with open(outfile_mecab, 'wb') as cache_file:
        pickle.dump(tagged_text_mecab, cache_file)
else:
    with open(outfile_mecab, 'rb') as cache_file:
        tagged_text_mecab = pickle.load(cache_file)

In [15]:
# Nouns extracted from the first article by the Twitter tagger.
tagged_text_ct[0]

['외무성',
 '전쟁',
 '바라지',
 '결코',
 '피하',
 '고위',
 '인사',
 '대북',
 '언급',
 '비난',
 '전쟁',
 '기정',
 '위협',
 '며칠',
 '북한',
 '군민',
 '연환',
 '대회',
 '평양',
 '조선',
 '중앙',
 '통신',
 '북한',
 '화성',
 '15',
 '발사',
 '성공',
 '군민',
 '연환',
 '대회',
 '황해북도',
 '강원도',
 '양강도',
 '각지',
 '조선중앙통신',
 '2017',
 '12',
 '국내',
 '사용',
 '가능',
 '배포',
 '금지',
 '서울',
 '국기',
 '북한',
 '미국',
 '고위',
 '인사',
 '대북',
 '강경',
 '발언',
 '미국',
 '매일',
 '조선반도',
 '한반도',
 '핵전쟁',
 '광고',
 '전쟁',
 '바라지',
 '결코',
 '피하',
 '북한',
 '외무성',
 '대변인',
 '조선중앙통신',
 '문답',
 '미국',
 '조선반도',
 '겨냥',
 '사상',
 '최대',
 '연합',
 '공중',
 '훈련',
 '강행',
 '미국',
 '고위',
 '정객',
 '줄줄이',
 '호전',
 '적인',
 '망발',
 '상치',
 '움직임',
 '중앙통신',
 '외무성',
 '대변인',
 '조선반도',
 '전쟁',
 '일촉즉발',
 '초긴장',
 '상태',
 '조성',
 '미국',
 '고위',
 '정객',
 '연달',
 '전쟁',
 '폭언',
 '조선반도',
 '서의',
 '전쟁',
 '기정',
 '전쟁',
 '위협',
 '대변인',
 '백악관',
 '국가',
 '안보',
 '보좌',
 '공화당',
 '국회',
 '상원의원',
 '북조선',
 '전쟁',
 '가능성',
 '매일',
 '증대',
 '선제공격',
 '선택',
 '접근',
 '남조선',
 '주둔',
 '미군',
 '가족',
 '철수',
 '화약',
 '풍기',
 '대결',
 '망발',
 '조선반도',
 '서의',
 '전쟁',
 '발발',
 '대비',
 '신호'

In [16]:
# Nouns extracted from the first article by the Mecab tagger.
tagged_text_mecab[0]

['외무성',
 '전쟁',
 '고위',
 '인사',
 '대북',
 '언급',
 '비난',
 '전쟁',
 '기정사실화',
 '위협',
 '며칠',
 '북한',
 '군민',
 '연환',
 '대회',
 '평양',
 '조선중앙통신',
 '북한',
 '화성',
 '발사',
 '성공',
 '축하',
 '군민',
 '연환',
 '대회',
 '황해북도',
 '강원도',
 '양강도',
 '각지',
 '조선중앙통신',
 '보도',
 '국내',
 '사용',
 '가능',
 '재배포',
 '금지',
 '서울',
 '홍국',
 '북한',
 '미국',
 '고위',
 '인사',
 '대북',
 '강경',
 '발언',
 '미국',
 '매일',
 '조선반도',
 '한반도',
 '핵전쟁',
 '광고',
 '전쟁',
 '북한',
 '외무성',
 '대변인',
 '조선중앙통신',
 '문답',
 '미국',
 '조선반도',
 '겨냥',
 '사상',
 '최대',
 '연합',
 '공중',
 '훈련',
 '강행',
 '미국',
 '고위',
 '정객',
 '호전',
 '망발',
 '움직임',
 '중앙통신',
 '외무성',
 '대변인',
 '조선반도',
 '전쟁',
 '일촉즉발',
 '초긴장',
 '상태',
 '조성',
 '미국',
 '고위',
 '정객',
 '전쟁',
 '폭언',
 '조선반도',
 '전쟁',
 '기정사실',
 '전쟁',
 '위협',
 '대변인',
 '백악관',
 '국가안보',
 '보좌관',
 '공화당',
 '국회',
 '상원',
 '의원',
 '북조선',
 '전쟁',
 '가능',
 '증대',
 '선제공격',
 '선택',
 '접근',
 '남조선',
 '주둔',
 '미군',
 '가족',
 '철수',
 '화약내',
 '대결',
 '망발',
 '조선반도',
 '전쟁',
 '발발',
 '대비',
 '신호',
 '해석',
 '주장',
 '지어',
 '중앙',
 '정보',
 '국장',
 '심장',
 '최고',
 '지도부',
 '도발',
 '강경',
 '대응',
 '조치',
 '빌미',
 '조선반도',
 '핵

### 사전 데이터 제작

In [17]:
%%time
# Twitter-tagger vocabulary: reuse the saved dictionary when it exists,
# otherwise build it from the tokenized articles and save it.
dict_ct_name = clusteringModelPath + 'dictionary_ct'
dict_mecab_name = clusteringModelPath + 'dictionary_mecab'

if os.path.isfile(dict_ct_name):
    dictionary_ct = Dictionary.load(dict_ct_name)
else:
    dictionary_ct = Dictionary(tagged_text_ct)
    dictionary_ct.save(dict_ct_name)

# Same procedure for the Mecab vocabulary.
if os.path.isfile(dict_mecab_name):
    dictionary_mecab = Dictionary.load(dict_mecab_name)
else:
    dictionary_mecab = Dictionary(tagged_text_mecab)
    dictionary_mecab.save(dict_mecab_name)

CPU times: user 134 ms, sys: 37.8 ms, total: 172 ms
Wall time: 222 ms


In [18]:
%%time
# Bag-of-words corpora (one doc2bow vector per article), cached as pickles.
# Every file is opened in a `with` block so the handle is closed and a freshly
# written cache is fully flushed to disk.
corpus_ct_name = clusteringModelPath + 'corpus_ct.pickled'
corpus_mecab_name = clusteringModelPath + 'corpus_mecab.pickled'
if not os.path.isfile(corpus_ct_name):
    corpus_ct = [ dictionary_ct.doc2bow(text) for text in tqdm(tagged_text_ct)]
    with open(corpus_ct_name, 'wb') as cache_file:
        pickle.dump(corpus_ct, cache_file)
else:
    with open(corpus_ct_name, 'rb') as cache_file:
        corpus_ct = pickle.load(cache_file)
if not os.path.isfile(corpus_mecab_name):
    corpus_mecab = [ dictionary_mecab.doc2bow(text) for text in tqdm(tagged_text_mecab)]
    with open(corpus_mecab_name, 'wb') as cache_file:
        pickle.dump(corpus_mecab, cache_file)
else:
    with open(corpus_mecab_name, 'rb') as cache_file:
        corpus_mecab = pickle.load(cache_file)


CPU times: user 1.25 s, sys: 419 ms, total: 1.67 s
Wall time: 1.9 s


In [19]:
# Report vocabulary size and document count, first for the Twitter-tagger
# data and then for the Mecab data.
for vocabulary, bow_corpus in ((dictionary_ct, corpus_ct), (dictionary_mecab, corpus_mecab)):
    print('Number of unique tokens: %d' % len(vocabulary))
    print('Number of documents: %d' % len(bow_corpus))

Number of unique tokens: 98356
Number of documents: 24492
Number of unique tokens: 102314
Number of documents: 24492


### LSI (  Latent Semantic Indexing )
* an indexing and retrieval method that uses a mathematical technique called singular value decomposition (SVD) to identify patterns in the relationships between the terms and concepts contained in an unstructured collection of text  

In [20]:
%%time
# 20-topic LSI models, one per tagger; each is loaded from disk when a saved
# copy exists and trained and saved otherwise.
lsimodel_ct_name = clusteringModelPath + 'lsimodel_ct'
# NOTE: the file name is spelled 'lsimidel'; it is kept unchanged so that
# already-saved models are still found.
lsimodel_mecab_name = clusteringModelPath + 'lsimidel_mecab'

if os.path.isfile(lsimodel_ct_name):
    lsimodel_ct = LsiModel.load(lsimodel_ct_name)
else:
    lsimodel_ct = LsiModel(corpus = corpus_ct, num_topics = 20, id2word = dictionary_ct)
    lsimodel_ct.save(lsimodel_ct_name)

if os.path.isfile(lsimodel_mecab_name):
    lsimodel_mecab = LsiModel.load(lsimodel_mecab_name)
else:
    lsimodel_mecab = LsiModel(corpus = corpus_mecab, num_topics = 20, id2word = dictionary_mecab)
    lsimodel_mecab.save(lsimodel_mecab_name)

CPU times: user 372 ms, sys: 125 ms, total: 497 ms
Wall time: 1.08 s


In [21]:
# Display the 20 LSI topics of the Twitter-tagger model.
lsimodel_ct.show_topics(num_topics = 20)

[(0,
  '0.240*"대통령" + 0.238*"선수" + 0.231*"경기" + 0.228*"한국" + 0.155*"정부" + 0.151*"중국" + 0.150*"서울" + 0.149*"미국" + 0.145*"적인" + 0.138*"감독"'),
 (1,
  '-0.424*"대통령" + 0.379*"경기" + 0.354*"선수" + 0.212*"득점" + 0.203*"감독" + -0.163*"정부" + 0.139*"기록" + 0.118*"리그" + -0.109*"중국" + -0.106*"미국"'),
 (2,
  '-0.874*"게임" + 0.178*"대통령" + -0.170*"동아" + -0.168*"출처" + -0.142*"쉘룡" + -0.108*"딴지" + -0.091*"출시" + -0.088*"개발" + -0.068*"국산" + 0.050*"경기"'),
 (3,
  '-0.622*"대통령" + -0.210*"게임" + 0.166*"거래" + 0.155*"가상화폐" + -0.149*"경기" + 0.144*"비트코인" + -0.125*"검찰" + 0.121*"한국" + 0.117*"시장" + -0.113*"득점"'),
 (4,
  '-0.459*"중국" + -0.365*"북한" + -0.267*"미국" + -0.244*"한국" + 0.173*"서울" + 0.167*"의원" + -0.150*"선수" + 0.149*"검찰" + -0.135*"일본" + 0.134*"수사"'),
 (5,
  '-0.503*"선수" + 0.360*"득점" + 0.229*"경기" + 0.202*"중국" + -0.174*"계약" + -0.133*"구단" + -0.130*"감독" + 0.124*"북한" + 0.117*"쿼터" + 0.112*"리바운드"'),
 (6,
  '-0.289*"대통령" + -0.280*"거래" + -0.279*"가상화폐" + 0.275*"대표" + 0.266*"의원" + -0.246*"비트코인" + 0.193*"통합" + -0.180*"거래소" + 0.168*

In [22]:
# Display the 20 LSI topics of the Mecab model.
lsimodel_mecab.show_topics(num_topics = 20)

[(0,
  '0.292*"대통령" + 0.249*"선수" + 0.213*"한국" + 0.209*"경기" + 0.175*"정부" + 0.169*"중국" + 0.163*"미국" + 0.146*"서울" + 0.143*"대표" + 0.139*"감독"'),
 (1,
  '0.415*"대통령" + -0.408*"선수" + -0.389*"경기" + -0.220*"감독" + -0.199*"득점" + 0.148*"정부" + -0.144*"기록" + -0.111*"리그" + 0.100*"청와대" + 0.098*"검찰"'),
 (2,
  '-0.600*"대통령" + 0.207*"게임" + 0.169*"가상화폐" + -0.163*"경기" + 0.146*"중국" + 0.140*"비트코인" + -0.135*"검찰" + 0.133*"시장" + 0.131*"한국" + -0.129*"청와대"'),
 (3,
  '-0.490*"중국" + -0.356*"북한" + -0.275*"미국" + 0.212*"의원" + -0.187*"한국" + 0.179*"게임" + 0.168*"대표" + 0.155*"서울" + 0.150*"검찰" + 0.139*"수사"'),
 (4,
  '-0.840*"게임" + -0.196*"출처" + -0.160*"대통령" + -0.127*"꿀딴지" + 0.111*"가상화폐" + -0.105*"개발" + -0.099*"출시" + 0.093*"정부" + -0.088*"소개" + 0.084*"서울"'),
 (5,
  '0.452*"선수" + -0.367*"득점" + -0.303*"경기" + 0.215*"대표" + 0.169*"계약" + -0.140*"시티" + -0.136*"가상화폐" + -0.128*"기록" + 0.126*"의원" + 0.113*"구단"'),
 (6,
  '-0.343*"대표" + -0.303*"의원" + 0.288*"선수" + 0.256*"가상화폐" + 0.251*"대통령" + -0.215*"통합" + 0.201*"비트코인" + 0.176*"계약" + -0.17

In [23]:
# Keep the LSI topics in unformatted form (formatted = False) for later use.
lsitopics_ct = lsimodel_ct.show_topics(formatted = False)
lsitopics_mecab = lsimodel_mecab.show_topics(formatted = False)

### HDP (Hierarchical Dirichlet Process)
* a non-parametric Bayesian method (note that no number of topics is requested; HDP infers it from the data)

In [24]:
%%time
# HDP models, one per tagger; each is loaded from disk when a saved copy
# exists and trained and saved otherwise. No topic count is passed.
hdpmodel_ct_name = clusteringModelPath+'hdpmodel_ct'
hdpmodel_mecab_name = clusteringModelPath+'hdpmodel_mecab'

if os.path.isfile(hdpmodel_ct_name):
    hdpmodel_ct = HdpModel.load(hdpmodel_ct_name)
else:
    hdpmodel_ct = HdpModel(corpus = corpus_ct, id2word = dictionary_ct)
    # Same path as before, now taken from the variable rather than re-built.
    hdpmodel_ct.save(hdpmodel_ct_name)

if os.path.isfile(hdpmodel_mecab_name):
    hdpmodel_mecab = HdpModel.load(hdpmodel_mecab_name)
else:
    hdpmodel_mecab = HdpModel(corpus = corpus_mecab, id2word = dictionary_mecab)
    hdpmodel_mecab.save(hdpmodel_mecab_name)

CPU times: user 2.04 s, sys: 1.5 s, total: 3.54 s
Wall time: 10.9 s


In [25]:
# Display the HDP topics of the Twitter-tagger model.
hdpmodel_ct.show_topics()

[(0,
  '0.004*선수 + 0.004*한국 + 0.004*서울 + 0.003*사진 + 0.003*적인 + 0.003*경기 + 0.003*대통령 + 0.003*미국 + 0.002*금지 + 0.002*중국 + 0.002*감독 + 0.002*정부 + 0.002*12 + 0.002*배포 + 0.002*무단 + 0.002*상황 + 0.002*대표 + 0.002*모습 + 0.002*20 + 0.002*전재'),
 (1,
  '0.007*대통령 + 0.005*정부 + 0.005*서울 + 0.005*북한 + 0.004*한국 + 0.003*중국 + 0.003*미국 + 0.003*사진 + 0.003*금지 + 0.003*적인 + 0.002*배포 + 0.002*무단 + 0.002*검찰 + 0.002*전재 + 0.002*12 + 0.002*대표 + 0.002*상황 + 0.002*혐의 + 0.002*조사 + 0.002*자들'),
 (2,
  '0.020*경기 + 0.013*손흥민 + 0.011*토트넘 + 0.010*감독 + 0.010*선수 + 0.008*한국 + 0.007*기록 + 0.006*맨유 + 0.006*리그 + 0.005*케인 + 0.005*2017 + 0.005*전반 + 0.005*이적 + 0.005*공격 + 0.005*득점 + 0.005*맨체스터 + 0.005*맨시티 + 0.004*축구 + 0.004*레알 + 0.004*라운드'),
 (3,
  '0.005*미국 + 0.005*북한 + 0.004*중국 + 0.004*정부 + 0.004*한국 + 0.004*예산 + 0.004*인상 + 0.003*지진 + 0.003*서울 + 0.003*대통령 + 0.003*규모 + 0.003*예산안 + 0.003*달러 + 0.003*원내대표 + 0.003*적인 + 0.003*의원 + 0.003*금지 + 0.003*금리 + 0.002*합의 + 0.002*경제'),
 (4,
  '0.026*영하 + 0.026*서울 + 0.023*기온 + 0.019*날씨 + 0.015*한파 + 0.015*추

In [26]:
# Display the HDP topics of the Mecab model.
hdpmodel_mecab.show_topics()

[(0,
  '0.005*대통령 + 0.005*서울 + 0.004*정부 + 0.003*미국 + 0.003*중국 + 0.003*사진 + 0.003*한국 + 0.003*대표 + 0.003*금지 + 0.003*가능 + 0.002*무단 + 0.002*조사 + 0.002*북한 + 0.002*의원 + 0.002*재배포 + 0.002*시장 + 0.002*전재 + 0.002*상황 + 0.002*검찰 + 0.002*경찰'),
 (1,
  '0.013*선수 + 0.010*경기 + 0.007*감독 + 0.007*한국 + 0.005*계약 + 0.005*기록 + 0.004*사진 + 0.003*손흥민 + 0.003*리그 + 0.003*대표 + 0.003*모습 + 0.003*구단 + 0.003*상황 + 0.003*정현 + 0.003*일본 + 0.003*방송 + 0.003*가능 + 0.002*토트넘 + 0.002*영입 + 0.002*우승'),
 (2,
  '0.005*방송 + 0.004*무단 + 0.004*화유기 + 0.003*금지 + 0.003*사건 + 0.003*재배포 + 0.003*서울 + 0.003*드라마 + 0.003*영화 + 0.003*전재 + 0.002*사진 + 0.002*수사 + 0.002*손오공 + 0.002*혐의 + 0.002*사용 + 0.002*배우 + 0.002*진선미 + 0.002*SK텔레콤 + 0.002*검찰 + 0.002*이유'),
 (3,
  '0.008*한국 + 0.006*경기 + 0.006*감독 + 0.005*선수 + 0.005*맨유 + 0.004*호날두 + 0.004*월드컵 + 0.004*레알 + 0.004*독일 + 0.003*영입 + 0.003*마드리드 + 0.003*사진 + 0.003*기록 + 0.003*네이마르 + 0.003*스페인 + 0.003*금지 + 0.003*코치 + 0.002*무단 + 0.002*상대 + 0.002*재배포'),
 (4,
  '0.003*서울 + 0.003*교수 + 0.003*한국 + 0.003*사진 + 0.002*금지 + 0

In [27]:
# Keep the HDP topics in unformatted form (formatted = False) for later use.
hdptopics_ct = hdpmodel_ct.show_topics(formatted = False)
hdptopics_mecab = hdpmodel_mecab.show_topics(formatted = False)

### LDA (Latent Dirichlet allocation)
* a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar  

#### LDA model1
* basic

In [28]:
# Switch pyLDAvis to notebook mode before any visualization is prepared.
pyLDAvis.enable_notebook()

In [29]:
# Training-time metrics for the Twitter-tagger LDA model; all report through
# the 'shell' logger.
pl_ct = PerplexityMetric(
    corpus = corpus_ct,
    logger = 'shell',
    title = 'Perplexity (twitter)',
)
ch_umass_ct = CoherenceMetric(
    corpus = corpus_ct,
    coherence = 'u_mass',
    logger = 'shell',
    title = ' Coherence (u_mass)',
)
ch_cv_ct = CoherenceMetric(
    corpus = corpus_ct,
    logger = 'shell',
    texts = tagged_text_ct,
    coherence = 'c_v',
    title = 'Coherence (c_v)',
)
diff_kl_ct = DiffMetric(
    distance = 'kullback_leibler',
    logger = 'shell',
    title = 'Diff (kullback_leibler)',
)
# Despite the `_kl_` in its name, this metric uses the jaccard distance.
convergence_kl_ct = ConvergenceMetric(
    distance = 'jaccard',
    logger = 'shell',
    title = 'Convergence (jaccard)',
)

# Order in which the metrics are handed to LdaModel.
callbacks_ct = [pl_ct, ch_umass_ct, ch_cv_ct, diff_kl_ct, convergence_kl_ct]

In [None]:
%%time
# Configure logging at INFO level so training progress is printed.
logging.basicConfig(level = logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# 20-topic LDA on the Twitter-tagger corpus: load the saved model if there is
# one, otherwise train with the metric callbacks and save the result.
ldamodel_ct_name = clusteringModelPath + 'ldamodel_ct'
if os.path.isfile(ldamodel_ct_name):
    ldamodel_ct = LdaModel.load(ldamodel_ct_name)
else:
    ldamodel_ct = LdaModel(
        corpus = corpus_ct,
        num_topics = 20,
        id2word = dictionary_ct,
        passes = 50,
        chunksize = 12246,  # half of the 24492 documents
        iterations = 200,
        alpha = 'auto',
        callbacks = callbacks_ct,
    )
    ldamodel_ct.save(ldamodel_ct_name)

INFO:gensim.models.ldamodel:using autotuned alpha, starting with [0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05, 0.05]
INFO:gensim.models.ldamodel:using symmetric eta at 0.05
INFO:gensim.models.ldamodel:using serial LDA version on this node
INFO:gensim.models.ldamodel:running online (multi-pass) LDA training, 20 topics, 50 passes over the supplied corpus of 24492 documents, updating model once every 12246 documents, evaluating perplexity every 24492 documents, iterating 200x with a convergence threshold of 0.001000
INFO:gensim.models.ldamodel:PROGRESS: pass 0, at document #12246/24492
INFO:gensim.models.ldamodel:optimized alpha [0.04113103, 0.044442922, 0.041740872, 0.042542383, 0.04247296, 0.04590621, 0.036615603, 0.042415425, 0.040266722, 0.03936632, 0.041400168, 0.044339057, 0.04098651, 0.041123882, 0.04512896, 0.04370206, 0.043665458, 0.041070648, 0.039178677, 0.038598057]
INFO:gensim.models.ldamodel:merging changes

In [None]:
%%time
# u_mass coherence of the Twitter-tagger LDA model, cached on disk.
coherence1_um_ct_name = clusteringModelPath + 'coherence1_ct_u_mass'
if os.path.isfile(coherence1_um_ct_name):
    cm_ct = CoherenceModel.load(coherence1_um_ct_name)
else:
    cm_ct = CoherenceModel(
        model = ldamodel_ct,
        corpus = corpus_ct,
        dictionary = dictionary_ct,
        coherence = 'u_mass',
    )
    cm_ct.save(coherence1_um_ct_name)

In [None]:
# Print the u_mass coherence of the Twitter-tagger LDA model.
print ('Coherence : {}'.format(cm_ct.get_coherence()))

In [None]:
%%time
# c_v coherence of the Twitter-tagger LDA model (computed from the tokenized
# texts), cached on disk.
coherence1_cv_ct_name = clusteringModelPath + 'coherence1_ct_c_v'
if os.path.isfile(coherence1_cv_ct_name):
    cm_ct_cv = CoherenceModel.load(coherence1_cv_ct_name)
else:
    cm_ct_cv = CoherenceModel(
        model = ldamodel_ct,
        texts = tagged_text_ct,
        dictionary = dictionary_ct,
        coherence = 'c_v',
    )
    cm_ct_cv.save(coherence1_cv_ct_name)

In [None]:
# Print the c_v coherence of the Twitter-tagger LDA model.
print ('Coherence : {}'.format(cm_ct_cv.get_coherence()))

In [None]:
%%time
# Build the pyLDAvis visualization of the Twitter-tagger LDA model.
pyLDAvis.gensim.prepare(ldamodel_ct, corpus_ct, dictionary_ct)

In [None]:
# Keep the LDA topics in unformatted form (formatted = False) for later use.
ldatopics_ct = ldamodel_ct.show_topics(formatted = False)

In [None]:
# Training-time metrics for the Mecab LDA model; all report through the
# 'shell' logger.
pl_mecab = PerplexityMetric(
    corpus = corpus_mecab,
    logger = 'shell',
    title = 'Perplexity (Mecab)',
)
ch_umass_mecab = CoherenceMetric(
    corpus = corpus_mecab,
    coherence = 'u_mass',
    logger = 'shell',
    title = ' Coherence (u_mass)',
)
ch_cv_mecab = CoherenceMetric(
    corpus = corpus_mecab,
    logger = 'shell',
    texts = tagged_text_mecab,
    coherence = 'c_v',
    title = 'Coherence (c_v)',
)
diff_kl_mecab = DiffMetric(
    distance = 'kullback_leibler',
    logger = 'shell',
    title = 'Diff (kullback_leibler)',
)
# Despite the `_kl_` in its name, this metric uses the jaccard distance.
convergence_kl_mecab = ConvergenceMetric(
    distance = 'jaccard',
    logger = 'shell',
    title = 'Convergence (jaccard)',
)

# Order in which the metrics are handed to LdaModel.
callbacks_mecab = [pl_mecab, ch_umass_mecab, ch_cv_mecab, diff_kl_mecab, convergence_kl_mecab]

In [None]:
%%time
# Configure logging at INFO level so training progress is printed.
logging.basicConfig(level = logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# 20-topic LDA on the Mecab corpus: load the saved model if there is one,
# otherwise train with the metric callbacks and save the result.
ldamodel_mecab_name = clusteringModelPath + 'ldamodel_mecab'
if os.path.isfile(ldamodel_mecab_name):
    ldamodel_mecab = LdaModel.load(ldamodel_mecab_name)
else:
    ldamodel_mecab = LdaModel(
        corpus = corpus_mecab,
        num_topics = 20,
        id2word = dictionary_mecab,
        passes = 50,
        chunksize = 12246,  # half of the 24492 documents
        iterations = 200,
        alpha = 'auto',
        callbacks = callbacks_mecab,
    )
    ldamodel_mecab.save(ldamodel_mecab_name)

In [None]:
%%time
# u_mass coherence of the Mecab LDA model, cached on disk.
coherence1_um_mecab = clusteringModelPath + 'coherence1_mecab_u_mass'
if os.path.isfile(coherence1_um_mecab):
    cm_mecab = CoherenceModel.load(coherence1_um_mecab)
else:
    cm_mecab = CoherenceModel(
        model = ldamodel_mecab,
        corpus = corpus_mecab,
        dictionary = dictionary_mecab,
        coherence = 'u_mass',
    )
    cm_mecab.save(coherence1_um_mecab)

In [None]:
# Print the u_mass coherence of the Mecab LDA model.
print ('Coherence : {}'.format(cm_mecab.get_coherence()))

In [None]:
%%time
# c_v coherence of the Mecab LDA model (computed from the tokenized texts),
# cached on disk.
coherence1_cv_mecab = clusteringModelPath + 'coherence1_mecab_c_v'
if os.path.isfile(coherence1_cv_mecab):
    cm_mecab_cv = CoherenceModel.load(coherence1_cv_mecab)
else:
    cm_mecab_cv = CoherenceModel(
        model = ldamodel_mecab,
        texts = tagged_text_mecab,
        dictionary = dictionary_mecab,
        coherence = 'c_v',
    )
    cm_mecab_cv.save(coherence1_cv_mecab)

In [None]:
# Print the c_v coherence of the Mecab LDA model.
print ('Coherence : {}'.format(cm_mecab_cv.get_coherence()))

In [None]:
%%time
# Build the pyLDAvis visualization of the Mecab LDA model.
pyLDAvis.gensim.prepare(ldamodel_mecab, corpus_mecab, dictionary_mecab)

In [None]:
# Keep the LDA topics in unformatted form (formatted = False) for later use.
ldatopics_mecab = ldamodel_mecab.show_topics(formatted = False)

##### display num_topics - LDA graph using c_v coherence

In [None]:
%%time
# Train LDA models for 1..19 topics on the Twitter-tagger data and plot their c_v coherence.
lmlist_ct, c_v_ct = evaluate_graph(dictionary = dictionary_ct, corpus = corpus_ct, texts = tagged_text_ct, limit = 20)

In [None]:
%%time
# Train LDA models for 1..19 topics on the Mecab data and plot their c_v coherence.
lmlist_mecab, c_v_mecab = evaluate_graph(dictionary = dictionary_mecab, corpus = corpus_mecab, texts = tagged_text_mecab, limit = 20)

### LDASEQ
* The constructor estimates Dynamic Topic Model parameters based on a training corpus  

In [None]:
%%time
# 20-topic LdaSeqModel on the Twitter-tagger corpus, cached on disk.
# time_slice splits the 24492 documents into three consecutive slices of 8164.
# NOTE(review): the corpus is Naver rows followed by Daum rows, so these
# position-based slices may not correspond to time periods — confirm.
ldaseq_ct_name = clusteringModelPath + 'ldaseqmodel_ct'
if os.path.isfile(ldaseq_ct_name):
    ldaseq_ct = ldaseqmodel.LdaSeqModel.load(ldaseq_ct_name)
else:
    ldaseq_ct = ldaseqmodel.LdaSeqModel(
        corpus = corpus_ct,
        id2word = dictionary_ct,
        time_slice = [8164, 8164, 8164],
        num_topics = 20,
    )
    ldaseq_ct.save(ldaseq_ct_name)

In [None]:
%%time
# Pull the five pyLDAvis inputs for time slice 0 out of the Twitter-tagger
# LdaSeqModel, then prepare the visualization from them.
(doc_topic_ct,
 topic_term_ct,
 doc_lengths_ct,
 term_freq_ct,
 vocab_ct) = ldaseq_ct.dtm_vis(time = 0, corpus = corpus_ct)

vis_wrapper_ct = pyLDAvis.prepare(
    topic_term_dists = topic_term_ct,
    doc_topic_dists = doc_topic_ct,
    doc_lengths = doc_lengths_ct,
    vocab = vocab_ct,
    term_frequency = term_freq_ct,
)

In [None]:
%%time
# 20-topic LdaSeqModel on the Mecab corpus, cached on disk.
# time_slice splits the 24492 documents into three consecutive slices of 8164.
# NOTE(review): the corpus is Naver rows followed by Daum rows, so these
# position-based slices may not correspond to time periods — confirm.
ldaseq_mecab_name = clusteringModelPath + 'ldaseqmodel_mecab'
if os.path.isfile(ldaseq_mecab_name):
    ldaseq_mecab = ldaseqmodel.LdaSeqModel.load(ldaseq_mecab_name)
else:
    ldaseq_mecab = ldaseqmodel.LdaSeqModel(
        corpus = corpus_mecab,
        id2word = dictionary_mecab,
        time_slice = [8164, 8164, 8164],
        num_topics = 20,
    )
    ldaseq_mecab.save(ldaseq_mecab_name)

In [None]:
%%time
# Pull the five pyLDAvis inputs for time slice 0 out of the Mecab
# LdaSeqModel, then prepare the visualization from them.
(doc_topic_mecab,
 topic_term_mecab,
 doc_lengths_mecab,
 term_freq_mecab,
 vocab_mecab) = ldaseq_mecab.dtm_vis(time = 0, corpus = corpus_mecab)

vis_wrapper_mecab = pyLDAvis.prepare(
    topic_term_dists = topic_term_mecab,
    doc_topic_dists = doc_topic_mecab,
    doc_lengths = doc_lengths_mecab,
    vocab = vocab_mecab,
    term_frequency = term_freq_mecab,
)

### LDASEQ
* chain_variance : 0.05  
> * a constant that dictates how the beta values evolve; it is a Gaussian parameter defined in the beta distribution  

In [None]:
%%time
# Same LdaSeqModel setup for the Twitter-tagger corpus, but with
# chain_variance set to 0.05; cached on disk.
ldaseq_chain_ct_name = clusteringModelPath + 'ldaseqmodel_chain_ct'
if os.path.isfile(ldaseq_chain_ct_name):
    ldaseq_chain_ct = ldaseqmodel.LdaSeqModel.load(ldaseq_chain_ct_name)
else:
    ldaseq_chain_ct = ldaseqmodel.LdaSeqModel(
        corpus = corpus_ct,
        id2word = dictionary_ct,
        time_slice = [8164, 8164, 8164],
        num_topics = 20,
        chain_variance = 0.05,
    )
    ldaseq_chain_ct.save(ldaseq_chain_ct_name)

In [None]:
%%time
# Same LdaSeqModel setup for the Mecab corpus, but with chain_variance set
# to 0.05; cached on disk.
ldaseq_chain_mecab_name = clusteringModelPath + 'ldaseqmodel_chain_mecab'
if os.path.isfile(ldaseq_chain_mecab_name):
    ldaseq_chain_mecab = ldaseqmodel.LdaSeqModel.load(ldaseq_chain_mecab_name)
else:
    ldaseq_chain_mecab = ldaseqmodel.LdaSeqModel(
        corpus = corpus_mecab,
        id2word = dictionary_mecab,
        time_slice = [8164, 8164, 8164],
        num_topics = 20,
        chain_variance = 0.05,
    )
    ldaseq_chain_mecab.save(ldaseq_chain_mecab_name)

### DTM

In [None]:
# Path passed to DtmModel as its first argument.
# NOTE(review): absolute, user-specific path — presumably the DTM executable; adjust it for your machine.
dtm_path = '/Users/hyunyoun/Documents/GitHub/Private_Project/dtm-darwin64'

In [None]:
%%time
# DTM wrapper model on the Twitter-tagger corpus, cached on disk.
dtm_model_ct_name = clusteringModelPath + 'dtm_ct'
if not os.path.isfile(dtm_model_ct_name):
    # time_slices was missing: the wrapper needs it to train and checks that
    # the slices add up to the corpus length. The same three slices as in the
    # LdaSeqModel cells are used so the models stay comparable.
    dtm_model_ct = DtmModel(dtm_path, corpus = corpus_ct,
                       time_slices = [8164, 8164, 8164],
                       num_topics = 20,
                       id2word = dictionary_ct,
                       initialize_lda = True)
    dtm_model_ct.save(dtm_model_ct_name)
else:
    dtm_model_ct = DtmModel.load(dtm_model_ct_name)

In [None]:
%%time
# Pull the five pyLDAvis inputs for time slice 0 out of the Twitter-tagger
# DTM wrapper model, then prepare the visualization from them.
# These names are the same ones the LdaSeqModel visualization cell assigns.
(doc_topic_ct,
 topic_term_ct,
 doc_lengths_ct,
 term_freq_ct,
 vocab_ct) = dtm_model_ct.dtm_vis(time = 0, corpus = corpus_ct)

vis_wrapper_ct = pyLDAvis.prepare(
    topic_term_dists = topic_term_ct,
    doc_topic_dists = doc_topic_ct,
    doc_lengths = doc_lengths_ct,
    vocab = vocab_ct,
    term_frequency = term_freq_ct,
)

In [None]:
%%time
# DTM wrapper model on the Mecab corpus, cached on disk.
dtm_model_mecab_name = clusteringModelPath + 'dtm_mecab'
if not os.path.isfile(dtm_model_mecab_name):
    # time_slices was missing: the wrapper needs it to train and checks that
    # the slices add up to the corpus length. The same three slices as in the
    # LdaSeqModel cells are used so the models stay comparable.
    dtm_model_mecab = DtmModel(dtm_path, corpus = corpus_mecab,
                       time_slices = [8164, 8164, 8164],
                       num_topics = 20,
                       id2word = dictionary_mecab,
                       initialize_lda = True)

    dtm_model_mecab.save(dtm_model_mecab_name)
else:
    dtm_model_mecab = DtmModel.load(dtm_model_mecab_name)

In [None]:
%%time
# Pull the five pyLDAvis inputs for time slice 0 out of the Mecab DTM wrapper
# model, then prepare the visualization from them.
# These names are the same ones the LdaSeqModel visualization cell assigns.
(doc_topic_mecab,
 topic_term_mecab,
 doc_lengths_mecab,
 term_freq_mecab,
 vocab_mecab) = dtm_model_mecab.dtm_vis(time = 0, corpus = corpus_mecab)

vis_wrapper_mecab = pyLDAvis.prepare(
    topic_term_dists = topic_term_mecab,
    doc_topic_dists = doc_topic_mecab,
    doc_lengths = doc_lengths_mecab,
    vocab = vocab_mecab,
    term_frequency = term_freq_mecab,
)

In [None]:
%%time
# Compare the u_mass coherence of the three dynamic topic models trained on
# the Twitter-tagger corpus.
# NOTE(review): the wrapper topics are read at time = 0 while both
# LdaSeqModel variants are read at time = 2 — confirm this is intended.
topics_wrapper_ct = dtm_model_ct.dtm_coherence(time = 0)
topics_dtm_ct = ldaseq_ct.dtm_coherence(time = 2)
topics_dtm2_ct = ldaseq_chain_ct.dtm_coherence( time = 2)

# The keyword is `dictionary`; the misspelled `dictionaray` raised a
# TypeError before any coherence value was computed.
cm_wrapper_ct = CoherenceModel(topics = topics_wrapper_ct, corpus = corpus_ct,
                            dictionary = dictionary_ct, coherence = 'u_mass')

cm_dtm_ct = CoherenceModel(topics = topics_dtm_ct, corpus = corpus_ct,
                            dictionary = dictionary_ct, coherence = 'u_mass')

cm_dtm2_ct = CoherenceModel(topics = topics_dtm2_ct, corpus = corpus_ct,
                            dictionary = dictionary_ct, coherence = 'u_mass')

print ('U_mass topic coherence')
print ('Wrapper coherence is {}'.format(cm_wrapper_ct.get_coherence()))
print ('DTM Python coherence is {}'.format(cm_dtm_ct.get_coherence()))
print ('DTM (chain variance) Python coherence is {}'.format(cm_dtm2_ct.get_coherence()))


In [None]:
%%time
# Compare the u_mass coherence of the three dynamic topic models trained on
# the Mecab corpus.
# NOTE(review): the wrapper topics are read at time = 0 while both
# LdaSeqModel variants are read at time = 2 — confirm this is intended.
topics_wrapper_mecab = dtm_model_mecab.dtm_coherence(time = 0)
topics_dtm_mecab = ldaseq_mecab.dtm_coherence(time = 2)
topics_dtm2_mecab = ldaseq_chain_mecab.dtm_coherence( time = 2)

# The keyword is `dictionary`; the misspelled `dictionaray` raised a
# TypeError before any coherence value was computed.
cm_wrapper_mecab = CoherenceModel(topics = topics_wrapper_mecab, corpus = corpus_mecab,
                            dictionary = dictionary_mecab, coherence = 'u_mass')

cm_dtm_mecab = CoherenceModel(topics = topics_dtm_mecab, corpus = corpus_mecab,
                            dictionary = dictionary_mecab, coherence = 'u_mass')

cm_dtm2_mecab = CoherenceModel(topics = topics_dtm2_mecab, corpus = corpus_mecab,
                            dictionary = dictionary_mecab, coherence = 'u_mass')

print ('U_mass topic coherence')
print ('Wrapper coherence is {}'.format(cm_wrapper_mecab.get_coherence()))
print ('DTM Python coherence is {}'.format(cm_dtm_mecab.get_coherence()))
print ('DTM (chain variance) Python coherence is {}'.format(cm_dtm2_mecab.get_coherence()))
