# News Clustering

In [None]:
import pickle
import itertools
import warnings
import sys 
import os
import logging
from multiprocessing import cpu_count

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from ckonlpy.tag import Twitter
from konlpy.tag import Mecab

from gensim.models import CoherenceModel, LdaModel, LsiModel, HdpModel, ldaseqmodel, LdaMulticore, lda_dispatcher
from gensim.models.wrappers import LdaMallet, DtmModel
from gensim.corpora import Dictionary, bleicorpus
from gensim.matutils import hellinger
from gensim import corpora, models, similarities
from gensim.models.callbacks import CoherenceMetric, DiffMetric, PerplexityMetric, ConvergenceMetric

import pyLDAvis.gensim

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

warnings.filterwarnings('ignore')

In [None]:
def Nav_tokenizer(doc, tagger, stopwords):
    """Tokenize *doc* with *tagger* and keep the informative token texts.

    Parameters
    ----------
    doc : text handed to ``tagger.pos``
    tagger : object exposing ``pos(doc)``; element ``[0]`` of every returned
        item is used as the token text
    stopwords : container of token texts to drop (tested with ``in``)

    Returns
    -------
    list
        Token texts longer than one character that are not in *stopwords*,
        in the order produced by the tagger.
    """
    # `and` short-circuits, so the stopword lookup is skipped for tokens the
    # length check already rejects (the original combined both with `&`).
    return [
        token[0]
        for token in tagger.pos(doc)
        if len(token[0]) > 1 and token[0] not in stopwords
    ]

In [None]:
def Nav_tokenizer_noun(doc, tagger, stopwords):
    """Extract nouns from *doc* with *tagger* and keep the informative ones.

    Parameters
    ----------
    doc : text handed to ``tagger.nouns``
    tagger : object exposing ``nouns(doc)`` that yields the noun strings
    stopwords : container of nouns to drop (tested with ``in``)

    Returns
    -------
    list
        Nouns longer than one character that are not in *stopwords*,
        in the order produced by the tagger.
    """
    # `and` short-circuits, so the stopword lookup is skipped for nouns the
    # length check already rejects (the original combined both with `&`).
    return [
        noun
        for noun in tagger.nouns(doc)
        if len(noun) > 1 and noun not in stopwords
    ]

In [None]:
def evaluate_graph(dictionary, corpus, texts, limit):
    """
    Train one LDA model per topic count and plot the c_v coherence of each.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : tokenized documents handed to CoherenceModel for the c_v measure
    limit : topic limit (exclusive); models are trained for 1 .. limit - 1 topics

    Returns:
    -------
    lm_list : List of LDA topic models
    c_v : Coherence values corresponding to the LDA model with respective number of topics
    """
    topic_counts = range(1, limit)
    c_v = []
    lm_list = []
    for num_topics in topic_counts:
        lm = LdaModel(corpus=corpus, num_topics=num_topics, id2word=dictionary)
        lm_list.append(lm)
        cm = CoherenceModel(model=lm, texts=texts, dictionary=dictionary, coherence='c_v')
        c_v.append(cm.get_coherence())

    # Show graph
    plt.plot(topic_counts, c_v)
    plt.xlabel("num_topics")
    plt.ylabel("Coherence score")
    # The labels must be a sequence: ("c_v") without a trailing comma is just
    # the string "c_v", which is consumed character by character and labelled
    # the curve "c" instead of "c_v".
    plt.legend(("c_v",), loc='best')
    plt.show()

    return lm_list, c_v

### Stopwords

In [None]:
# Read the stopword file line by line and strip surrounding whitespace from
# each entry. The context manager closes the handle, which the original bare
# open() call never did.
with open('./data/stopwordsList.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]

## News

### Naver

In [None]:
# Load the pickled Naver article dictionary and turn it into a DataFrame with
# one row per dictionary key. The context manager closes the file handle that
# the original inline open() leaked.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted pipeline.
with open('./data/pre_data/stastics/for_statistics_Naver_from_mongodb.pickled', 'rb') as naver_file:
    dictNaver = pickle.load(naver_file)
dfNaver = pd.DataFrame.from_dict(dictNaver, orient='index')
print(dfNaver.shape)

### Daum

In [None]:
# Load the pickled Daum article dictionary and turn it into a DataFrame with
# one row per dictionary key. The context manager closes the file handle that
# the original inline open() leaked.
# NOTE(review): pickle.load can execute arbitrary code; only load files that
# come from a trusted pipeline.
with open('./data/pre_data/stastics/for_statistics_daum_from_mongodb.pickled', 'rb') as daum_file:
    dictDaum = pickle.load(daum_file)
dfDaum = pd.DataFrame.from_dict(dictDaum, orient='index')
print(dfDaum.shape)

## Daum

### 뉴스 기사 통합

In [None]:
# Concatenate the Naver and Daum frames into a single frame and preview it.
combinedDf = pd.concat((dfNaver, dfDaum))
combinedDf.head()

In [None]:
# Pull the extracted-keywords column out as a plain Python list.
# Presumably each entry is a list of keyword tokens, since the list is fed to
# Dictionary / doc2bow below — verify against the upstream extraction step.
extKeywords = combinedDf['extracted_keywords'].tolist()

In [None]:
# Pick the directories used for clustering data and for cached models.
if sys.platform == 'darwin':
    clusteringPath = '/Volumes/disk1/Clustering/'
    clusteringModelPath = '/Volumes/disk1/Clustering_model/'
elif sys.platform == 'win32':
    clusteringPath = 'd:/Clustering/'
    clusteringModelPath = 'd:/Clustering_model/'
else:
    # Every other platform (e.g. Linux) used to leave both names undefined, so
    # the first cell that used them failed with a NameError. Fall back to
    # directories relative to the working directory, like the './data/' inputs.
    # NOTE(review): these directories must exist before any model is saved.
    clusteringPath = './Clustering/'
    clusteringModelPath = './Clustering_model/'

### 사전 데이터 제작

In [None]:
%%time
dict_keywords_name = clusteringModelPath + 'dictionary_keywords'
# Reuse the saved dictionary when the file exists; otherwise build it from the
# keyword lists and persist it for the next run.
if os.path.isfile(dict_keywords_name):
    dict_keywords = Dictionary.load(dict_keywords_name)
else:
    dict_keywords = Dictionary(extKeywords)
    dict_keywords.save(dict_keywords_name)

In [None]:
%%time
corpus_keywords_name = clusteringModelPath + 'corpus_keywords.pickled'
if not os.path.isfile(corpus_keywords_name):
    # Convert every keyword list into its bag-of-words form and cache the result.
    corpus_keywords = [dict_keywords.doc2bow(text) for text in tqdm(extKeywords)]
    # Closing the handle guarantees the pickle is flushed to disk; the original
    # passed an anonymous open() handle that was never closed.
    with open(corpus_keywords_name, 'wb') as corpus_file:
        pickle.dump(corpus_keywords, corpus_file)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load cache
    # files written by this notebook.
    with open(corpus_keywords_name, 'rb') as corpus_file:
        corpus_keywords = pickle.load(corpus_file)

In [None]:
# Report the vocabulary size and the number of bag-of-words documents.
for message_format, collection in (
    ('Number of unique tokens: %d', dict_keywords),
    ('Number of documents: %d', corpus_keywords),
):
    print(message_format % len(collection))

### LSI (Latent Semantic Indexing)
* an indexing and retrieval method that uses a mathematical technique called singular value decomposition (SVD) to identify patterns in the relationships between the terms and concepts contained in an unstructured collection of text  

In [None]:
%%time
lsimodel_keywords_name = clusteringModelPath + 'lsimodel_keywords'
# Load the cached LSI model when present; otherwise fit a 20-topic model on
# the keyword corpus and save it.
if os.path.isfile(lsimodel_keywords_name):
    lsimodel_keywords = LsiModel.load(lsimodel_keywords_name)
else:
    lsimodel_keywords = LsiModel(
        corpus=corpus_keywords,
        num_topics=20,
        id2word=dict_keywords,
    )
    lsimodel_keywords.save(lsimodel_keywords_name)

In [None]:
# Display the 20 LSI topics as the cell output.
lsimodel_keywords.show_topics(num_topics=20)

In [None]:
# Keep the LSI topics in unformatted form for later programmatic use.
lsitopics_keywords = lsimodel_keywords.show_topics(formatted=False)

### HDP (Hierarchical Dirichlet Process)
* a non-parametric Bayesian method (note that, unlike LSI and LDA, no number of topics is requested)

In [None]:
%%time
hdpmodel_keywords_name = clusteringModelPath + 'hdpmodel_keywords'
if not os.path.isfile(hdpmodel_keywords_name):
    # No num_topics argument: HDP is fitted without a requested topic count.
    hdpmodel_keywords = HdpModel(corpus=corpus_keywords, id2word=dict_keywords)
    # Save under the same name that is checked and loaded, so the cache path
    # cannot drift (the original rebuilt the path by hand here).
    hdpmodel_keywords.save(hdpmodel_keywords_name)
else:
    hdpmodel_keywords = HdpModel.load(hdpmodel_keywords_name)

In [None]:
# Display the HDP topics (default show_topics settings) as the cell output.
hdpmodel_keywords.show_topics()

In [None]:
# Keep the HDP topics in unformatted form for later programmatic use.
hdptopics_keywords = hdpmodel_keywords.show_topics(formatted=False)

### LDA (Latent Dirichlet allocation)
* a generative statistical model that allows sets of observations to be explained by unobserved groups that explain why some parts of the data are similar  

#### LDA model1
* basic

In [None]:
# Switch pyLDAvis to notebook mode so prepared visualisations can render inline.
pyLDAvis.enable_notebook()

In [None]:
# Training callbacks for the LDA model; every metric reports through the
# 'shell' logger.
pl_keywords = PerplexityMetric(
    corpus=corpus_keywords,
    logger='shell',
    title='Perplexity (twitter)',
)
ch_umass_keywords = CoherenceMetric(
    corpus=corpus_keywords,
    coherence='u_mass',
    logger='shell',
    title=' Coherence (u_mass)',
)
ch_cv_keywords = CoherenceMetric(
    corpus=corpus_keywords,
    logger='shell',
    texts=extKeywords,
    coherence='c_v',
    title='Coherence (c_v)',
)
diff_kl_keywords = DiffMetric(
    distance='kullback_leibler',
    logger='shell',
    title='Diff (kullback_leibler)',
)
# NOTE(review): despite the "_kl_" in its name, this metric uses the Jaccard distance.
convergence_kl_keywords = ConvergenceMetric(
    distance='jaccard',
    logger='shell',
    title='Convergence (jaccard)',
)
callbacks_keywords = [
    pl_keywords,
    ch_umass_keywords,
    ch_cv_keywords,
    diff_kl_keywords,
    convergence_kl_keywords,
]

In [None]:
%%time
# Configure root logging at INFO and this module's logger at DEBUG.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

ldamodel_keywords_name = clusteringModelPath + 'ldamodel_keywords'
# Load the cached LDA model when present; otherwise train a 20-topic model
# with the metric callbacks attached and save it.
if os.path.isfile(ldamodel_keywords_name):
    ldamodel_keywords = LdaModel.load(ldamodel_keywords_name)
else:
    ldamodel_keywords = LdaModel(
        corpus=corpus_keywords,
        num_topics=20,
        id2word=dict_keywords,
        passes=50,
        chunksize=6123,
        iterations=250,
        alpha='auto',
        callbacks=callbacks_keywords,
    )
    ldamodel_keywords.save(ldamodel_keywords_name)

In [None]:
%%time
coherence1_um_keywords_name = clusteringModelPath + 'coherence1_keywords_u_mass'
# Load the cached u_mass coherence model when present; otherwise build it for
# the LDA model from the bag-of-words corpus and save it.
if os.path.isfile(coherence1_um_keywords_name):
    cm_keywords = CoherenceModel.load(coherence1_um_keywords_name)
else:
    cm_keywords = CoherenceModel(
        model=ldamodel_keywords,
        corpus=corpus_keywords,
        dictionary=dict_keywords,
        coherence='u_mass',
    )
    cm_keywords.save(coherence1_um_keywords_name)

In [None]:
# Report the u_mass coherence of the LDA model.
print('Coherence : {}'.format(cm_keywords.get_coherence()))

In [None]:
%%time
coherence1_cv_keywords_name = clusteringModelPath + 'coherence1_keywords_c_v'
# Load the cached c_v coherence model when present; otherwise build it for the
# LDA model from the tokenized keyword texts and save it.
if os.path.isfile(coherence1_cv_keywords_name):
    cm_keywords_cv = CoherenceModel.load(coherence1_cv_keywords_name)
else:
    cm_keywords_cv = CoherenceModel(
        model=ldamodel_keywords,
        texts=extKeywords,
        dictionary=dict_keywords,
        coherence='c_v',
    )
    cm_keywords_cv.save(coherence1_cv_keywords_name)

In [None]:
# Report the c_v coherence of the LDA model. The model is stored in
# cm_keywords_cv; the original referenced the undefined name cm_extKeywords_cv.
print('Coherence : {}'.format(cm_keywords_cv.get_coherence()))

In [None]:
%%time
# Build the interactive pyLDAvis view of the LDA model. The corpus and
# dictionary in this notebook are corpus_keywords / dict_keywords; the original
# passed the undefined names corpus_extKeywords / dict_extKeywords.
pyLDAvis.gensim.prepare(ldamodel_keywords, corpus_keywords, dict_keywords)

In [None]:
# Keep the LDA topics in unformatted form for later programmatic use.
ldatopics_keywords = ldamodel_keywords.show_topics(formatted=False)

##### display num_topics - LDA graph using c_v coherence

In [None]:
%%time
# Train and score LDA models for 1..19 topics (evaluate_graph iterates
# range(1, limit)) and plot their c_v coherence.
lmlist_keywords, c_v_keywords = evaluate_graph(
    dictionary=dict_keywords,
    corpus=corpus_keywords,
    texts=extKeywords,
    limit=20,
)

### LDASEQ
* The constructor estimates Dynamic Topic Model parameters based on a training corpus  

In [None]:
%%time
ldaseq_keywords_name = clusteringModelPath + 'ldaseqmodel_keywords'
# Load the cached dynamic topic model when present; otherwise fit a 20-topic
# LdaSeqModel over three time slices and save it.
if os.path.isfile(ldaseq_keywords_name):
    ldaseq_keywords = ldaseqmodel.LdaSeqModel.load(ldaseq_keywords_name)
else:
    # NOTE(review): the slices are hard-coded (3 x 8164 documents); presumably
    # they must add up to len(corpus_keywords) — confirm.
    ldaseq_keywords = ldaseqmodel.LdaSeqModel(
        corpus=corpus_keywords,
        id2word=dict_keywords,
        time_slice=[8164, 8164, 8164],
        num_topics=20,
    )
    ldaseq_keywords.save(ldaseq_keywords_name)

In [None]:
%%time
# Extract the pyLDAvis inputs for the first time slice (time = 0) of the
# LdaSeq model, then prepare the visualisation data from them.
(
    doc_topic_keywords,
    topic_term_keywords,
    doc_lengths_keywords,
    term_freq_keywords,
    vocab_keywords,
) = ldaseq_keywords.dtm_vis(time=0, corpus=corpus_keywords)
vis_wrapper_keywords = pyLDAvis.prepare(
    topic_term_dists=topic_term_keywords,
    doc_topic_dists=doc_topic_keywords,
    doc_lengths=doc_lengths_keywords,
    vocab=vocab_keywords,
    term_frequency=term_freq_keywords,
)

### LDASEQ
* chain_variance : 0.05  
> * a constant which dictates how the beta values evolve over time; it is a Gaussian parameter defined in the beta distribution  

In [None]:
%%time
ldaseq_chain_keywords_name = clusteringModelPath + 'ldaseqmodel_chain_keywords'
if not os.path.isfile(ldaseq_chain_keywords_name):
    # Same setup as the plain LdaSeq model, but with chain_variance = 0.05.
    # NOTE(review): the slices are hard-coded (3 x 8164 documents); presumably
    # they must add up to len(corpus_keywords) — confirm.
    ldaseq_chain_keywords = ldaseqmodel.LdaSeqModel(
        corpus=corpus_keywords,
        id2word=dict_keywords,
        time_slice=[8164, 8164, 8164],
        num_topics=20,
        chain_variance=0.05,
    )
    # The original misspelled the name here ("keywrods"), so the save raised a
    # NameError after training and the model was never cached.
    ldaseq_chain_keywords.save(ldaseq_chain_keywords_name)
else:
    ldaseq_chain_keywords = ldaseqmodel.LdaSeqModel.load(ldaseq_chain_keywords_name)

### DTM

In [None]:
# Location of the DTM binary handed to the DtmModel wrapper below. The
# DTM_PATH environment variable overrides the developer-machine default, so
# the notebook can run elsewhere without editing this cell.
dtm_path = os.environ.get(
    'DTM_PATH',
    '/Users/hyunyoun/Documents/GitHub/Private_Project/dtm-darwin64',
)

In [None]:
%%time
dtm_model_keywords_name = clusteringModelPath + 'dtm_keywords'
if not os.path.isfile(dtm_model_keywords_name):
    # The wrapper needs the number of documents per time slice whenever a
    # corpus is passed; the original omitted it. Reuse the slices of the
    # LdaSeqModel cells so all three dynamic models see the same partition.
    # NOTE(review): the slices must add up to len(corpus_keywords) — confirm.
    dtm_model_keywords = DtmModel(
        dtm_path,
        corpus=corpus_keywords,
        time_slices=[8164, 8164, 8164],
        num_topics=20,
        id2word=dict_keywords,
        initialize_lda=True,
    )
    dtm_model_keywords.save(dtm_model_keywords_name)
else:
    dtm_model_keywords = DtmModel.load(dtm_model_keywords_name)

In [None]:
%%time
# Extract the pyLDAvis inputs for the first time slice (time = 0) of the DTM
# wrapper model, then prepare the visualisation data from them.
(
    doc_topic_keywords,
    topic_term_keywords,
    doc_lengths_keywords,
    term_freq_keywords,
    vocab_keywords,
) = dtm_model_keywords.dtm_vis(time=0, corpus=corpus_keywords)
vis_wrapper_keywords = pyLDAvis.prepare(
    topic_term_dists=topic_term_keywords,
    doc_topic_dists=doc_topic_keywords,
    doc_lengths=doc_lengths_keywords,
    vocab=vocab_keywords,
    term_frequency=term_freq_keywords,
)

In [None]:
%%time
# Topic word lists of each dynamic model at one time slice.
# NOTE(review): the wrapper is sampled at time = 0 while both LdaSeq models
# are sampled at time = 2 — confirm this is the intended comparison.
topics_wrapper_keywords = dtm_model_keywords.dtm_coherence(time=0)
topics_dtm_keywords = ldaseq_keywords.dtm_coherence(time=2)
topics_dtm2_keywords = ldaseq_chain_keywords.dtm_coherence(time=2)

# One u_mass coherence model per topic set, all on the same corpus and dictionary.
cm_wrapper_keywords = CoherenceModel(
    topics=topics_wrapper_keywords,
    corpus=corpus_keywords,
    dictionary=dict_keywords,
    coherence='u_mass',
)
cm_dtm_keywords = CoherenceModel(
    topics=topics_dtm_keywords,
    corpus=corpus_keywords,
    dictionary=dict_keywords,
    coherence='u_mass',
)
cm_dtm2_keywords = CoherenceModel(
    topics=topics_dtm2_keywords,
    corpus=corpus_keywords,
    dictionary=dict_keywords,
    coherence='u_mass',
)

print('U_mass topic coherence')
for report_format, coherence_model in (
    ('Wrapper coherence is {}', cm_wrapper_keywords),
    ('DTM Python coherence is {}', cm_dtm_keywords),
    ('DTM (chain variance) Python coherence is {}', cm_dtm2_keywords),
):
    print(report_format.format(coherence_model.get_coherence()))
