# News Clustering

In [None]:
import pickle
import itertools
import warnings
import sys 
import os
import logging
from multiprocessing import cpu_count

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ckonlpy.tag import Twitter
from konlpy.tag import Mecab

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, ldaseqmodel, LdaMulticore, lda_dispatcher
from gensim.models.wrappers import LdaMallet, DtmModel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim import corpora, models, similarities
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

import pyLDAvis.gensim

from tqdm import tqdm
# Register tqdm's pandas integration; its progress bars are labelled "progress-bar".
tqdm.pandas(desc="progress-bar")

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

In [None]:
def Nav_tokenizer(doc, tagger, stopwords):
    """Tokenize ``doc`` with ``tagger.pos`` and keep the usable token texts.

    Parameters
    ----------
    doc : text handed to ``tagger.pos``
    tagger : object exposing ``pos(doc)``; element ``[0]`` of every returned
        item is used as the token text
    stopwords : container of token texts to drop (checked with ``in``)

    Returns
    -------
    list of token texts that are longer than one character and are not
    contained in ``stopwords``, in the order the tagger produced them
    """
    kept = []
    for item in tagger.pos(doc):
        token = item[0]
        # Single-character tokens and stopwords are discarded.
        if len(token) > 1 and token not in stopwords:
            kept.append(token)
    return kept

In [None]:
def Nav_tokenizer_noun(doc, tagger, stopwords):
    """Extract nouns from ``doc`` with ``tagger.nouns`` and filter them.

    Parameters
    ----------
    doc : text handed to ``tagger.nouns``
    tagger : object exposing ``nouns(doc)``
    stopwords : container of nouns to drop (checked with ``in``)

    Returns
    -------
    list of nouns that are longer than one character and are not contained
    in ``stopwords``, in the order the tagger produced them
    """
    kept = []
    for noun in tagger.nouns(doc):
        # Single-character nouns and stopwords are discarded.
        if len(noun) > 1 and noun not in stopwords:
            kept.append(noun)
    return kept

In [None]:
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Train one LDA model per topic count and plot the c_v coherence of each.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : tokenized documents passed to CoherenceModel for the c_v score
    limit : topic limit (exclusive); models are trained for 1 .. limit - 1 topics

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(1, limit)
    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    # The label is attached to the line itself. The earlier call
    # plt.legend(("c_v")) passed a plain string (the parentheses do not make a
    # tuple), which matplotlib consumed character by character, so the legend
    # entry read "c" instead of "c_v".
    plt.plot(topic_counts, c_v, label="c_v")
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    plt.legend(loc='best')
    plt.show()

    return lm_list, c_v

### Stopwords

In [None]:
# Load the stopword list: one entry per line, surrounding whitespace stripped.
# The context manager closes the file, which the bare open() call never did.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopword_file:
    stopwords = [line.strip() for line in stopword_file]

## News

### Naver

In [None]:
# Load the pickled Naver articles and turn them into a DataFrame with one row
# per dictionary key. The file is opened in a context manager so the handle is
# closed after loading.
# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
with open('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled', 'rb') as naver_file:
    dictNaver = pickle.load(naver_file)
dfNaver = pd.DataFrame.from_dict(dictNaver, orient='index')
print (dfNaver.shape)

### Daum

In [None]:
# Load the pickled Daum articles and turn them into a DataFrame with one row
# per dictionary key. The file is opened in a context manager so the handle is
# closed after loading.
# NOTE: unpickling can execute arbitrary code; only load pickles you trust.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as daum_file:
    dictDaum = pickle.load(daum_file)
dfDaum = pd.DataFrame.from_dict(dictDaum, orient='index')
print (dfDaum.shape)

## Naver + Daum

### 뉴스 기사 통합

In [None]:
# Stack the Naver rows on top of the Daum rows; each frame keeps its own index
# labels because concat is called without ignore_index.
combinedDf = pd.concat([dfNaver, dfDaum])
# Preview the first rows of the combined frame.
combinedDf.head()

In [None]:
# One text per article: the title, a newline, then the main text.
# NOTE(review): a missing title or mainText would presumably turn the whole
# entry into NaN — confirm the inputs have no gaps before tagging.
rawData_text = combinedDf.title + '\n' + combinedDf.mainText

In [None]:
# Show the first document by position. The index comes from the source
# dictionaries' keys, so a label lookup with [0] is ambiguous (missing label,
# duplicated label after concat, or deprecated positional fallback).
rawData_text.iloc[0]

In [None]:
# Choose the machine-specific cache directories from the host platform.
if sys.platform == 'darwin':
    clusteringPath = '/Volumes/disk1/Clustering/'
    clusteringModelPath = '/Volumes/disk1/Clustering_model/'
elif sys.platform == 'win32':
    clusteringPath = 'd:/Clustering/'
    clusteringModelPath = 'd:/Clustering_model/'
else:
    # Without this branch both names stayed undefined on other platforms and
    # the first cell using them failed with an unrelated-looking NameError.
    raise RuntimeError(
        'No clustering directories configured for platform %r; '
        'set clusteringPath and clusteringModelPath for this machine.' % sys.platform
    )

### token

In [None]:
# Instantiate the two taggers used side by side in the rest of the notebook:
# konlpy's Mecab and ckonlpy's Twitter, both with default arguments.
mecab = Mecab()
ct = Twitter()

#### 명사만

In [None]:
# Noun-tokenize every document with the ckonlpy tagger, caching the result as
# a pickle so the tagging only runs when the cache file is absent.
# Files are opened in context managers so they are flushed and closed
# deterministically.
outfile_ct = clusteringPath + 'nouns_taggerd_news_text_by_ct.pickled'
if os.path.isfile(outfile_ct):
    with open(outfile_ct, 'rb') as cache_file:
        tagged_text_ct = pickle.load(cache_file)
else:
    tagged_text_ct = [Nav_tokenizer_noun(doc, ct, stopwords) for doc in tqdm(rawData_text)]
    with open(outfile_ct, 'wb') as cache_file:
        pickle.dump(tagged_text_ct, cache_file)

In [None]:
# Noun-tokenize every document with the Mecab tagger, caching the result as a
# pickle so the tagging only runs when the cache file is absent.
# Files are opened in context managers so they are flushed and closed
# deterministically.
outfile_mecab = clusteringPath + 'nouns_taggerd_news_text_by_mecab.pickled'
if os.path.isfile(outfile_mecab):
    with open(outfile_mecab, 'rb') as cache_file:
        tagged_text_mecab = pickle.load(cache_file)
else:
    tagged_text_mecab = [Nav_tokenizer_noun(doc, mecab, stopwords) for doc in tqdm(rawData_text)]
    with open(outfile_mecab, 'wb') as cache_file:
        pickle.dump(tagged_text_mecab, cache_file)

In [None]:
# Inspect the ckonlpy noun tokens of the first document.
tagged_text_ct[0]

In [None]:
# Inspect the Mecab noun tokens of the first document.
tagged_text_mecab[0]

### 사전 데이터 제작

In [None]:
%%time
# Build (or reload) one gensim Dictionary per tagger. A dictionary is only
# rebuilt from the tokenized texts when its saved file does not exist yet.
dict_ct_name = clusteringModelPath + 'dictionary_ct'
dict_mecab_name = clusteringModelPath + 'dictionary_mecab'

if os.path.isfile(dict_ct_name):
    dictionary_ct = Dictionary.load(dict_ct_name)
else:
    dictionary_ct = Dictionary(tagged_text_ct)
    dictionary_ct.save(dict_ct_name)

if os.path.isfile(dict_mecab_name):
    dictionary_mecab = Dictionary.load(dict_mecab_name)
else:
    dictionary_mecab = Dictionary(tagged_text_mecab)
    dictionary_mecab.save(dict_mecab_name)

In [None]:
%%time
# Convert each tokenized document into a bag-of-words vector, caching both
# corpora as pickles. Files are opened in context managers so the handles are
# closed (and written data flushed) deterministically.
corpus_ct_name = clusteringModelPath + 'corpus_ct.pickled'
corpus_mecab_name = clusteringModelPath + 'corpus_mecab.pickled'

if os.path.isfile(corpus_ct_name):
    with open(corpus_ct_name, 'rb') as cache_file:
        corpus_ct = pickle.load(cache_file)
else:
    corpus_ct = [dictionary_ct.doc2bow(text) for text in tqdm(tagged_text_ct)]
    with open(corpus_ct_name, 'wb') as cache_file:
        pickle.dump(corpus_ct, cache_file)

if os.path.isfile(corpus_mecab_name):
    with open(corpus_mecab_name, 'rb') as cache_file:
        corpus_mecab = pickle.load(cache_file)
else:
    corpus_mecab = [dictionary_mecab.doc2bow(text) for text in tqdm(tagged_text_mecab)]
    with open(corpus_mecab_name, 'wb') as cache_file:
        pickle.dump(corpus_mecab, cache_file)


In [None]:
# Report vocabulary size and document count, first for the ckonlpy data and
# then for the Mecab data.
for _dictionary, _corpus in ((dictionary_ct, corpus_ct), (dictionary_mecab, corpus_mecab)):
    print('Number of unique tokens: %d' % len(_dictionary))
    print('Number of documents: %d' % len(_corpus))

### LSI (  Latent Semantic Indexing )
* an indexing and retrieval method that uses a mathematical technique called singular value decomposition (SVD) to identify patterns in the relationships between the terms and concepts contained in an unstructured collection of text  

In [None]:
%%time
# Train (or reload) a 20-topic LSI model per tagger; training only happens
# when the saved model file is missing.
lsimodel_ct_name = clusteringModelPath + 'lsimodel_ct'
# NOTE: the file name below is spelled 'lsimidel_mecab'; it is kept unchanged
# so that models saved under that name are still found.
lsimodel_mecab_name = clusteringModelPath + 'lsimidel_mecab'

if os.path.isfile(lsimodel_ct_name):
    lsimodel_ct = LsiModel.load(lsimodel_ct_name)
else:
    lsimodel_ct = LsiModel(corpus=corpus_ct, num_topics=20, id2word=dictionary_ct)
    lsimodel_ct.save(lsimodel_ct_name)

if os.path.isfile(lsimodel_mecab_name):
    lsimodel_mecab = LsiModel.load(lsimodel_mecab_name)
else:
    lsimodel_mecab = LsiModel(corpus=corpus_mecab, num_topics=20, id2word=dictionary_mecab)
    lsimodel_mecab.save(lsimodel_mecab_name)

In [None]:
# Display all 20 LSI topics of the ckonlpy model.
lsimodel_ct.show_topics(num_topics = 20)

In [None]:
# Display all 20 LSI topics of the Mecab model.
lsimodel_mecab.show_topics(num_topics = 20)

In [None]:
# Keep the LSI topics in unformatted form (formatted=False) for later use;
# num_topics is left at show_topics' default here.
lsitopics_ct = lsimodel_ct.show_topics(formatted = False)
lsitopics_mecab = lsimodel_mecab.show_topics(formatted = False)

### HDP (Hierarchical Dirichlet Process)
* a non-parametric Bayesian method: no number of topics is requested up front; the model infers it from the data

In [None]:
%%time
# Train (or reload) an HDP model per tagger; training only happens when the
# saved model file is missing. The models are saved to the same *_name path
# that the cache check inspects, so the two can no longer drift apart.
hdpmodel_ct_name = clusteringModelPath + 'hdpmodel_ct'
hdpmodel_mecab_name = clusteringModelPath + 'hdpmodel_mecab'

if os.path.isfile(hdpmodel_ct_name):
    hdpmodel_ct = HdpModel.load(hdpmodel_ct_name)
else:
    hdpmodel_ct = HdpModel(corpus=corpus_ct, id2word=dictionary_ct)
    hdpmodel_ct.save(hdpmodel_ct_name)

if os.path.isfile(hdpmodel_mecab_name):
    hdpmodel_mecab = HdpModel.load(hdpmodel_mecab_name)
else:
    hdpmodel_mecab = HdpModel(corpus=corpus_mecab, id2word=dictionary_mecab)
    hdpmodel_mecab.save(hdpmodel_mecab_name)

In [None]:
# Display the HDP topics of the ckonlpy model (default show_topics arguments).
hdpmodel_ct.show_topics()

In [None]:
# Display the HDP topics of the Mecab model (default show_topics arguments).
hdpmodel_mecab.show_topics()

In [None]:
# Keep the HDP topics in unformatted form (formatted=False) for later use.
hdptopics_ct = hdpmodel_ct.show_topics(formatted = False)
hdptopics_mecab = hdpmodel_mecab.show_topics(formatted = False)

### LDA (Latent Dirichlet allocation)
* a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar  

#### LDA model1
* basic

In [None]:
# Enable pyLDAvis output inside the notebook.
pyLDAvis.enable_notebook()

In [None]:
# Training callbacks for the ckonlpy LDA model; every metric reports through
# the 'shell' logger.
pl_ct = PerplexityMetric(
    corpus=corpus_ct,
    logger='shell',
    title='Perplexity (twitter)',
)
ch_umass_ct = CoherenceMetric(
    corpus=corpus_ct,
    coherence='u_mass',
    logger='shell',
    title=' Coherence (u_mass)',
)
ch_cv_ct = CoherenceMetric(
    corpus=corpus_ct,
    logger='shell',
    texts=tagged_text_ct,
    coherence='c_v',
    title='Coherence (c_v)',
)
diff_kl_ct = DiffMetric(
    distance='kullback_leibler',
    logger='shell',
    title='Diff (kullback_leibler)',
)
# Despite the "_kl_" in its name, this metric uses the jaccard distance.
convergence_kl_ct = ConvergenceMetric(
    distance='jaccard',
    logger='shell',
    title='Convergence (jaccard)',
)
callbacks_ct = [pl_ct, ch_umass_ct, ch_cv_ct, diff_kl_ct, convergence_kl_ct]

In [None]:
%%time
# Configure root logging at INFO and this module's logger at DEBUG before
# training.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Train (or reload) the 20-topic LDA model on the ckonlpy corpus; training
# only happens when the saved model file is missing.
ldamodel_ct_name = clusteringModelPath + 'ldamodel_ct'
if os.path.isfile(ldamodel_ct_name):
    ldamodel_ct = LdaModel.load(ldamodel_ct_name)
else:
    ldamodel_ct = LdaModel(
        corpus=corpus_ct,
        num_topics=20,
        id2word=dictionary_ct,
        passes=50,
        chunksize=6123,
        iterations=250,
        alpha='symmetric',
        callbacks=callbacks_ct,
    )
    ldamodel_ct.save(ldamodel_ct_name)

In [None]:
%%time
# Build (or reload) the u_mass coherence model for the ckonlpy LDA model.
coherence1_um_ct_name = clusteringModelPath + 'coherence1_ct_u_mass'
if os.path.isfile(coherence1_um_ct_name):
    cm_ct = CoherenceModel.load(coherence1_um_ct_name)
else:
    cm_ct = CoherenceModel(
        model=ldamodel_ct,
        corpus=corpus_ct,
        dictionary=dictionary_ct,
        coherence='u_mass',
    )
    cm_ct.save(coherence1_um_ct_name)

In [None]:
# Print the u_mass coherence of the ckonlpy LDA model.
print ('Coherence : {}'.format(cm_ct.get_coherence()))

In [None]:
%%time
# Build (or reload) the c_v coherence model for the ckonlpy LDA model; c_v is
# computed from the tokenized texts rather than the bag-of-words corpus.
coherence1_cv_ct_name = clusteringModelPath + 'coherence1_ct_c_v'
if os.path.isfile(coherence1_cv_ct_name):
    cm_ct_cv = CoherenceModel.load(coherence1_cv_ct_name)
else:
    cm_ct_cv = CoherenceModel(
        model=ldamodel_ct,
        texts=tagged_text_ct,
        dictionary=dictionary_ct,
        coherence='c_v',
    )
    cm_ct_cv.save(coherence1_cv_ct_name)

In [None]:
# Print the c_v coherence of the ckonlpy LDA model.
print ('Coherence : {}'.format(cm_ct_cv.get_coherence()))

In [None]:
%%time
# Prepare the pyLDAvis visualisation of the ckonlpy LDA model; as the cell's
# final expression its result is what the notebook renders.
pyLDAvis.gensim.prepare(ldamodel_ct, corpus_ct, dictionary_ct)

In [None]:
# Keep the ckonlpy LDA topics in unformatted form (formatted=False).
ldatopics_ct = ldamodel_ct.show_topics(formatted = False)

In [None]:
# Training callbacks for the Mecab LDA model; every metric reports through
# the 'shell' logger.
pl_mecab = PerplexityMetric(
    corpus=corpus_mecab,
    logger='shell',
    title='Perplexity (Mecab)',
)
ch_umass_mecab = CoherenceMetric(
    corpus=corpus_mecab,
    coherence='u_mass',
    logger='shell',
    title=' Coherence (u_mass)',
)
ch_cv_mecab = CoherenceMetric(
    corpus=corpus_mecab,
    logger='shell',
    texts=tagged_text_mecab,
    coherence='c_v',
    title='Coherence (c_v)',
)
diff_kl_mecab = DiffMetric(
    distance='kullback_leibler',
    logger='shell',
    title='Diff (kullback_leibler)',
)
# Despite the "_kl_" in its name, this metric uses the jaccard distance.
convergence_kl_mecab = ConvergenceMetric(
    distance='jaccard',
    logger='shell',
    title='Convergence (jaccard)',
)
callbacks_mecab = [pl_mecab, ch_umass_mecab, ch_cv_mecab, diff_kl_mecab, convergence_kl_mecab]

In [None]:
%%time
# Configure root logging at INFO and this module's logger at DEBUG before
# training.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# Train (or reload) the 20-topic LDA model on the Mecab corpus; training only
# happens when the saved model file is missing.
ldamodel_mecab_name = clusteringModelPath + 'ldamodel_mecab'
if os.path.isfile(ldamodel_mecab_name):
    ldamodel_mecab = LdaModel.load(ldamodel_mecab_name)
else:
    ldamodel_mecab = LdaModel(
        corpus=corpus_mecab,
        num_topics=20,
        id2word=dictionary_mecab,
        passes=100,
        chunksize=6123,
        iterations=200,
        alpha='symmetric',
        callbacks=callbacks_mecab,
    )
    ldamodel_mecab.save(ldamodel_mecab_name)

In [None]:
%%time
# Build (or reload) the u_mass coherence model for the Mecab LDA model.
coherence1_um_mecab = clusteringModelPath + 'coherence1_mecab_u_mass'
if os.path.isfile(coherence1_um_mecab):
    cm_mecab = CoherenceModel.load(coherence1_um_mecab)
else:
    cm_mecab = CoherenceModel(
        model=ldamodel_mecab,
        corpus=corpus_mecab,
        dictionary=dictionary_mecab,
        coherence='u_mass',
    )
    cm_mecab.save(coherence1_um_mecab)

In [None]:
# Print the u_mass coherence of the Mecab LDA model.
print ('Coherence : {}'.format(cm_mecab.get_coherence()))

In [None]:
%%time
# Build (or reload) the c_v coherence model for the Mecab LDA model; c_v is
# computed from the tokenized texts rather than the bag-of-words corpus.
coherence1_cv_mecab = clusteringModelPath + 'coherence1_mecab_c_v'
if os.path.isfile(coherence1_cv_mecab):
    cm_mecab_cv = CoherenceModel.load(coherence1_cv_mecab)
else:
    cm_mecab_cv = CoherenceModel(
        model=ldamodel_mecab,
        texts=tagged_text_mecab,
        dictionary=dictionary_mecab,
        coherence='c_v',
    )
    cm_mecab_cv.save(coherence1_cv_mecab)

In [None]:
# Print the c_v coherence of the Mecab LDA model.
print ('Coherence : {}'.format(cm_mecab_cv.get_coherence()))

In [None]:
%%time
# Prepare the pyLDAvis visualisation of the Mecab LDA model; as the cell's
# final expression its result is what the notebook renders.
pyLDAvis.gensim.prepare(ldamodel_mecab, corpus_mecab, dictionary_mecab)

In [None]:
# Keep the Mecab LDA topics in unformatted form (formatted=False).
ldatopics_mecab = ldamodel_mecab.show_topics(formatted = False)

##### display num_topics - LDA graph using c_v coherence

In [None]:
%%time
# Train LDA models for 1..19 topics (limit=20 is exclusive) on the ckonlpy
# data and plot their c_v coherence; keeps the models and their scores.
lmlist_ct, c_v_ct = evaluate_graph(dictionary = dictionary_ct, corpus = corpus_ct, texts = tagged_text_ct, limit = 20)

In [None]:
%%time
# Train LDA models for 1..19 topics (limit=20 is exclusive) on the Mecab data
# and plot their c_v coherence; keeps the models and their scores.
lmlist_mecab, c_v_mecab = evaluate_graph(dictionary = dictionary_mecab, corpus = corpus_mecab, texts = tagged_text_mecab, limit = 20)

### LDASEQ
* The constructor estimates Dynamic Topic Model parameters based on a training corpus  

In [None]:
%%time
# Train (or reload) the 20-topic LdaSeqModel on the ckonlpy corpus.
# time_slice gives the document count of each of the three consecutive slices.
# NOTE(review): presumably the slices must add up to the corpus size — confirm
# against len(corpus_ct) whenever the data changes.
ldaseq_ct_name = clusteringModelPath + 'ldaseqmodel_ct'
if os.path.isfile(ldaseq_ct_name):
    ldaseq_ct = ldaseqmodel.LdaSeqModel.load(ldaseq_ct_name)
else:
    ldaseq_ct = ldaseqmodel.LdaSeqModel(
        corpus=corpus_ct,
        id2word=dictionary_ct,
        time_slice=[8164, 8164, 8164],
        num_topics=20,
    )
    ldaseq_ct.save(ldaseq_ct_name)

In [None]:
%%time
# Pull the distributions for the first time slice (time=0) out of the ckonlpy
# LdaSeqModel and feed them to pyLDAvis. The result is stored, not displayed.
(doc_topic_ct, topic_term_ct, doc_lengths_ct,
 term_freq_ct, vocab_ct) = ldaseq_ct.dtm_vis(time=0, corpus=corpus_ct)
vis_wrapper_ct = pyLDAvis.prepare(
    topic_term_dists=topic_term_ct,
    doc_topic_dists=doc_topic_ct,
    doc_lengths=doc_lengths_ct,
    vocab=vocab_ct,
    term_frequency=term_freq_ct,
)

In [None]:
%%time
# Train (or reload) the 20-topic LdaSeqModel on the Mecab corpus.
# time_slice gives the document count of each of the three consecutive slices.
# NOTE(review): presumably the slices must add up to the corpus size — confirm
# against len(corpus_mecab) whenever the data changes.
ldaseq_mecab_name = clusteringModelPath + 'ldaseqmodel_mecab'
if os.path.isfile(ldaseq_mecab_name):
    ldaseq_mecab = ldaseqmodel.LdaSeqModel.load(ldaseq_mecab_name)
else:
    ldaseq_mecab = ldaseqmodel.LdaSeqModel(
        corpus=corpus_mecab,
        id2word=dictionary_mecab,
        time_slice=[8164, 8164, 8164],
        num_topics=20,
    )
    ldaseq_mecab.save(ldaseq_mecab_name)

In [None]:
%%time
# Pull the distributions for the first time slice (time=0) out of the Mecab
# LdaSeqModel and feed them to pyLDAvis. The result is stored, not displayed.
(doc_topic_mecab, topic_term_mecab, doc_lengths_mecab,
 term_freq_mecab, vocab_mecab) = ldaseq_mecab.dtm_vis(time=0, corpus=corpus_mecab)
vis_wrapper_mecab = pyLDAvis.prepare(
    topic_term_dists=topic_term_mecab,
    doc_topic_dists=doc_topic_mecab,
    doc_lengths=doc_lengths_mecab,
    vocab=vocab_mecab,
    term_frequency=term_freq_mecab,
)

### LDASEQ (with chain_variance)
* chain_variance : 0.05  
> * a constant which dictates how the beta values evolve - it is a gaussian parameter defined in the beta distribution  

In [None]:
%%time
# Train (or reload) a second ckonlpy LdaSeqModel that differs from the first
# only by an explicit chain_variance of 0.05.
ldaseq_chain_ct_name = clusteringModelPath + 'ldaseqmodel_chain_ct'
if os.path.isfile(ldaseq_chain_ct_name):
    ldaseq_chain_ct = ldaseqmodel.LdaSeqModel.load(ldaseq_chain_ct_name)
else:
    ldaseq_chain_ct = ldaseqmodel.LdaSeqModel(
        corpus=corpus_ct,
        id2word=dictionary_ct,
        time_slice=[8164, 8164, 8164],
        num_topics=20,
        chain_variance=0.05,
    )
    ldaseq_chain_ct.save(ldaseq_chain_ct_name)

In [None]:
%%time
# Train (or reload) a second Mecab LdaSeqModel that differs from the first
# only by an explicit chain_variance of 0.05.
ldaseq_chain_mecab_name = clusteringModelPath + 'ldaseqmodel_chain_mecab'
if os.path.isfile(ldaseq_chain_mecab_name):
    ldaseq_chain_mecab = ldaseqmodel.LdaSeqModel.load(ldaseq_chain_mecab_name)
else:
    ldaseq_chain_mecab = ldaseqmodel.LdaSeqModel(
        corpus=corpus_mecab,
        id2word=dictionary_mecab,
        time_slice=[8164, 8164, 8164],
        num_topics=20,
        chain_variance=0.05,
    )
    ldaseq_chain_mecab.save(ldaseq_chain_mecab_name)

### DTM

In [None]:
# Path handed to DtmModel as its first argument. It is an absolute,
# machine-specific location and has to be adjusted on any other machine.
dtm_path = '/Users/hyunyoun/Documents/GitHub/Private_Project/dtm-darwin64'

In [None]:
%%time
# Train (or reload) the 20-topic DTM wrapper model on the ckonlpy corpus.
dtm_model_ct_name = clusteringModelPath + 'dtm_ct'
if os.path.isfile(dtm_model_ct_name):
    dtm_model_ct = DtmModel.load(dtm_model_ct_name)
else:
    # time_slices is needed whenever a corpus is passed: the wrapper uses it
    # to split the documents into periods. The values are the same three
    # slices the LdaSeqModel cells use for this corpus.
    dtm_model_ct = DtmModel(
        dtm_path,
        corpus=corpus_ct,
        time_slices=[8164, 8164, 8164],
        num_topics=20,
        id2word=dictionary_ct,
        initialize_lda=True,
    )
    dtm_model_ct.save(dtm_model_ct_name)

In [None]:
%%time
# Pull the distributions for the first time slice (time=0) out of the ckonlpy
# DTM wrapper model and feed them to pyLDAvis. The assigned names are the same
# ones the LdaSeqModel visualisation cell uses, so their values are replaced.
(doc_topic_ct, topic_term_ct, doc_lengths_ct,
 term_freq_ct, vocab_ct) = dtm_model_ct.dtm_vis(time=0, corpus=corpus_ct)
vis_wrapper_ct = pyLDAvis.prepare(
    topic_term_dists=topic_term_ct,
    doc_topic_dists=doc_topic_ct,
    doc_lengths=doc_lengths_ct,
    vocab=vocab_ct,
    term_frequency=term_freq_ct,
)

In [None]:
%%time
# Train (or reload) the 20-topic DTM wrapper model on the Mecab corpus.
dtm_model_mecab_name = clusteringModelPath + 'dtm_mecab'
if os.path.isfile(dtm_model_mecab_name):
    dtm_model_mecab = DtmModel.load(dtm_model_mecab_name)
else:
    # time_slices is needed whenever a corpus is passed: the wrapper uses it
    # to split the documents into periods. The values are the same three
    # slices the LdaSeqModel cells use for this corpus.
    dtm_model_mecab = DtmModel(
        dtm_path,
        corpus=corpus_mecab,
        time_slices=[8164, 8164, 8164],
        num_topics=20,
        id2word=dictionary_mecab,
        initialize_lda=True,
    )
    dtm_model_mecab.save(dtm_model_mecab_name)

In [None]:
%%time
# Pull the distributions for the first time slice (time=0) out of the Mecab
# DTM wrapper model and feed them to pyLDAvis. The assigned names are the same
# ones the LdaSeqModel visualisation cell uses, so their values are replaced.
(doc_topic_mecab, topic_term_mecab, doc_lengths_mecab,
 term_freq_mecab, vocab_mecab) = dtm_model_mecab.dtm_vis(time=0, corpus=corpus_mecab)
vis_wrapper_mecab = pyLDAvis.prepare(
    topic_term_dists=topic_term_mecab,
    doc_topic_dists=doc_topic_mecab,
    doc_lengths=doc_lengths_mecab,
    vocab=vocab_mecab,
    term_frequency=term_freq_mecab,
)

In [None]:
%%time
# Compare the u_mass coherence of the three dynamic topic models trained on
# the ckonlpy corpus: the DTM wrapper, LdaSeqModel, and LdaSeqModel with
# chain_variance.
# NOTE(review): the wrapper topics come from time=0 while both LdaSeqModel
# topic sets come from time=2 — confirm that comparing different slices is
# intended.
topics_wrapper_ct = dtm_model_ct.dtm_coherence(time=0)
topics_dtm_ct = ldaseq_ct.dtm_coherence(time=2)
topics_dtm2_ct = ldaseq_chain_ct.dtm_coherence(time=2)

# The keyword is `dictionary`; it was previously misspelled `dictionaray`,
# which CoherenceModel rejects as an unexpected keyword argument.
cm_wrapper_ct = CoherenceModel(topics=topics_wrapper_ct, corpus=corpus_ct,
                               dictionary=dictionary_ct, coherence='u_mass')

cm_dtm_ct = CoherenceModel(topics=topics_dtm_ct, corpus=corpus_ct,
                           dictionary=dictionary_ct, coherence='u_mass')

cm_dtm2_ct = CoherenceModel(topics=topics_dtm2_ct, corpus=corpus_ct,
                            dictionary=dictionary_ct, coherence='u_mass')

print ('U_mass topic coherence')
print ('Wrapper coherence is {}'.format(cm_wrapper_ct.get_coherence()))
print ('DTM Python coherence is {}'.format(cm_dtm_ct.get_coherence()))
print ('DTM (chain variance) Python coherence is {}'.format(cm_dtm2_ct.get_coherence()))


In [None]:
%%time
# Compare the u_mass coherence of the three dynamic topic models trained on
# the Mecab corpus: the DTM wrapper, LdaSeqModel, and LdaSeqModel with
# chain_variance.
# NOTE(review): the wrapper topics come from time=0 while both LdaSeqModel
# topic sets come from time=2 — confirm that comparing different slices is
# intended.
topics_wrapper_mecab = dtm_model_mecab.dtm_coherence(time=0)
topics_dtm_mecab = ldaseq_mecab.dtm_coherence(time=2)
topics_dtm2_mecab = ldaseq_chain_mecab.dtm_coherence(time=2)

# The keyword is `dictionary`; it was previously misspelled `dictionaray`,
# which CoherenceModel rejects as an unexpected keyword argument.
cm_wrapper_mecab = CoherenceModel(topics=topics_wrapper_mecab, corpus=corpus_mecab,
                                  dictionary=dictionary_mecab, coherence='u_mass')

cm_dtm_mecab = CoherenceModel(topics=topics_dtm_mecab, corpus=corpus_mecab,
                              dictionary=dictionary_mecab, coherence='u_mass')

cm_dtm2_mecab = CoherenceModel(topics=topics_dtm2_mecab, corpus=corpus_mecab,
                               dictionary=dictionary_mecab, coherence='u_mass')

print ('U_mass topic coherence')
print ('Wrapper coherence is {}'.format(cm_wrapper_mecab.get_coherence()))
print ('DTM Python coherence is {}'.format(cm_dtm_mecab.get_coherence()))
print ('DTM (chain variance) Python coherence is {}'.format(cm_dtm2_mecab.get_coherence()))
