# 視覺化 LDA模型

### 參考資料
- https://www.machinelearningplus.com/nlp/topic-modeling-visualization-how-to-present-results-lda-models/
- https://datartisan.gitbooks.io/begining-text-mining-with-python/%E7%AC%AC8%E7%AB%A0%20%E6%96%87%E6%9C%AC%E6%95%B0%E6%8D%AE%E5%8F%AF%E8%A7%86%E5%8C%96/8.2%20%E6%96%87%E6%9C%AC%E5%85%B3%E7%B3%BB%E5%8F%AF%E8%A7%86%E5%8C%96.html


In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import pyLDAvis
import pyLDAvis.gensim
from gensim.models.ldamodel import LdaModel

In [2]:
class ContentType(Enum):
    """Kinds of per-document content that ``get_contents`` can be asked for.

    ``get_contents`` picks one line of each dataset file based on the member
    it is given; the line used for each member is noted beside it.
    """

    # Read from the first line of a file (index 0).
    TIT = 'title'
    # Read from the fourth line of a file (index 3).
    ABS = 'abstract'
    # Read from the second line of a file (index 1).
    AUT = 'author'
    # Read from the third line of a file (index 2).
    SEC = 'section'
    
def get_contents(content_type, dataset_path='../dataset'):
    """Collect one line of the requested kind from every file in a directory.

    Each regular file directly inside ``dataset_path`` is read, and a single
    line is taken from it according to ``content_type``:

    * ``ContentType.AUT`` -> second line (index 1)
    * ``ContentType.SEC`` -> third line (index 2)
    * ``ContentType.ABS`` -> fourth line (index 3)
    * anything else (including ``ContentType.TIT``) -> first line (index 0)

    Args:
        content_type: The ``ContentType`` member selecting which line to take.
        dataset_path: Directory to scan. Defaults to ``'../dataset'``.

    Returns:
        A list with the selected line of each file, stripped of surrounding
        whitespace, in the order ``os.listdir`` returned the files.

    Raises:
        IndexError: If a file has fewer lines than the requested kind needs.
    """
    # The line to extract depends only on content_type, so resolve it once
    # instead of re-evaluating the chain for every file.
    if content_type == ContentType.AUT:
        line_index = 1
    elif content_type == ContentType.SEC:
        line_index = 2
    elif content_type == ContentType.ABS:
        line_index = 3
    else:
        line_index = 0

    all_contents = []
    # NOTE(review): os.listdir order is arbitrary. It is deliberately left
    # unsorted because the notebook pairs the i-th result with the i-th corpus
    # document; presumably the corpus was built in the same listing order --
    # confirm before sorting here.
    for file_name in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file_name)
        if not os.path.isfile(file_path):
            # The entry was just listed, so it exists; it is simply not a
            # regular file (e.g. a sub-directory) and is skipped.
            print(file_path + ' is not a regular file, skipped.')
            continue

        # Explicit encoding so the result does not depend on the platform's
        # default locale. Assumes the dataset is UTF-8 -- TODO confirm.
        with open(file_path, encoding='utf-8') as f:
            lines = f.readlines()

        try:
            line = lines[line_index]
        except IndexError:
            # Same exception type as before, but now naming the offending file.
            raise IndexError(
                '{} has {} line(s); line {} is required'.format(
                    file_path, len(lines), line_index + 1
                )
            ) from None
        all_contents.append(line.strip())
    return all_contents


def get_all_titles():
    """Return the title line of every dataset file.

    Convenience wrapper that calls ``get_contents`` with ``ContentType.TIT``.
    """
    titles = get_contents(ContentType.TIT)
    return titles

### 載入字典

In [3]:
# Load the previously saved gensim dictionary and report how many entries
# it contains.
dictionary = corpora.Dictionary.load(
    '../corpus/dict_bigram_filtered.dict'
)
print('共', len(dictionary), '個字\n')

共 1558 個字



  args, varargs, keywords, defaults = inspect.getargspec(kallable)


### 載入 bow corpus

In [4]:
# Open the saved bag-of-words corpus (.mm file) and report how many
# documents it holds.
bow_corpus = corpora.MmCorpus(
    '../corpus/corpus_bigram_filtered.mm'
)
print('共', len(bow_corpus), '筆')

共 1343 筆


  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)


### 載入 TF-IDF Corpus

### 載入 LDA models

In [5]:
# Number of topics of the saved model to load; it is part of the file name.
num_topics = 10
file_name = ''.join([
    '../models/lda_bigram_bow_filtered_topic_',
    str(num_topics),
    '.model',
])
# LdaModel is the same class the notebook imports from gensim.models.ldamodel.
lda_model = LdaModel.load(file_name)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)


In [6]:
# Build a per-document summary table: for every document in the bag-of-words
# corpus, record its most probable topic, that probability, and the topic's
# keywords.
titles = get_all_titles()

# Applying the model to the corpus yields, for each document, a list of
# (topic_id, probability) pairs.
documents = lda_model[bow_corpus]

# Keywords depend only on the topic, not on the document, so each topic's
# keyword string is built once and reused.
topic_keywords_cache = {}

doc_dominant_topic_keywords = []
for i, topics in enumerate(documents):
    # Dominant topic = the pair with the highest probability. max() returns
    # the first such pair on ties, the same one a stable descending sort
    # would put first.
    topic_num, topic_prop = max(topics, key=lambda pair: pair[1])
    topic_num = int(topic_num)

    if topic_num not in topic_keywords_cache:
        topic_keywords_cache[topic_num] = ", ".join(
            word for word, _ in lda_model.show_topic(topic_num)
        )

    # NOTE(review): titles[i] assumes the titles are listed in the same order
    # as the corpus documents -- confirm against how the corpus was built.
    doc_dominant_topic_keywords.append([
        i,
        titles[i],
        topic_num,
        round(topic_prop, 4),
        topic_keywords_cache[topic_num],
    ])

# Passing the column names to the constructor also works when there are no
# rows, whereas assigning .columns afterwards fails on an empty frame.
sent_topics_df = pd.DataFrame(
    doc_dominant_topic_keywords,
    columns=['Doc_No', 'Title', 'Dominant_Topic', 'Topic_Prob', 'Keywords'],
)

sent_topics_df.head(20)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)


Unnamed: 0,Doc_No,Title,Dominant_Topic,Topic_Prob,Keywords
0,0,Active Preference Learning Based on Generalize...,4,0.4607,"function, number, optimization, sample, soluti..."
1,1,Generating Distractors for Reading Comprehensi...,8,0.5886,"sequence, attention, user, text, word, recurre..."
2,2,Acting and Planning Using Operational Models,7,0.5363,"video, agent, action, plan, temporal, frame, p..."
3,3,Lifted Hinge-Loss Markov Random Fields,4,0.9915,"function, number, optimization, sample, soluti..."
4,4,BLOCK: Bilinear Superdiagonal Fusion for Visua...,3,0.4296,"structure, graph, sample, accuracy, machine, d..."
5,5,Meta Learning for Image Captioning,1,0.3855,"image, deep, adversarial, layer, object, convo..."
6,6,Personalized Robot Tutoring Using the Assistiv...,7,0.5111,"video, agent, action, plan, temporal, frame, p..."
7,7,A Pattern-Based Approach to Recognizing Time E...,8,0.404,"sequence, attention, user, text, word, recurre..."
8,8,Towards Optimal Discrete Online Hashing with B...,3,0.6495,"structure, graph, sample, accuracy, machine, d..."
9,9,Temporal Bilinear Networks for Video Action Re...,4,0.2861,"function, number, optimization, sample, soluti..."


### pyLDAvis

- 左側為「主題距離地圖」，展示各個主題之間的差異。圖中帶有數字編號的圓形代表各個主題，圓形的面積與該主題出現的可能性成正比，並且按照面積大小自動進行編號。
- 右側為各個主題前 30 個最為相關的詞彙，用來對各個主題進行解釋說明，以水平柱狀圖的形式展示：藍色表示整體詞頻，紅色表示主題詞頻。

In [7]:
# Prepare the pyLDAvis visualisation data from the loaded model, the
# bag-of-words corpus and the dictionary.
vis_data = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=bow_corpus,
    dictionary=dictionary,
)
# To show the visualisation inside a notebook, either call display() as done
# here, or run "pyLDAvis.enable_notebook()" once so results are rendered
# automatically without calling display().
pyLDAvis.display(vis_data)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### 觀察結果
- 訓練（train）時 passes 不要設太多（模型尚未完全收斂），主題圓圈重疊的狀況反而較少。