# 比較 LDA topic numbers

參考資料
- https://blog.csdn.net/sinat_26917383/article/details/79357700


In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.DEBUG)

2019-11-02 16:42:29,496 : DEBUG : Loaded backend module://ipykernel.pylab.backend_inline version unknown.


## 自定義 data types and functions

In [2]:
class ContentType(Enum):
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'
    
def get_contents(content_type):
    all_contents = []
    dataset_path = '../dataset'
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if os.path.isfile(file_path):
            with open(file_path) as f:  
                line = f.readlines()
                if content_type == ContentType.AUT:
                    line = line[1]
                elif content_type == ContentType.SEC:
                    line = line[2]
                elif content_type == ContentType.ABS:
                    line = line[3]
                else:
                    line = line[0]
                line = line.strip()
                all_contents.append(line)
        else:
            print(file_path + ' does not exist.')
    return all_contents


def get_all_titles():
    return get_contents(ContentType.TIT)

def get_all_authors():        
    return get_contents(ContentType.AUT)

def get_all_sections():
    return get_contents(ContentType.SEC)

def get_all_abstracts():
    return get_contents(ContentType.ABS)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            token = wordnet_lemmatizer.lemmatize(token, pos='v')
            token = wordnet_lemmatizer.lemmatize(token, pos='n')
            result.append(token)
    return result

### 取出所有摘要

In [3]:
contents = get_all_abstracts()
print('共',len(contents),'篇論文\n')

documents = pd.DataFrame(data=contents,columns=['abstract'])
documents['index'] = documents.index
documents[:10]

共 1343 篇論文



Unnamed: 0,abstract,index
0,We consider the problem of actively eliciting ...,0
1,We investigate the task of distractor generati...,1
2,The most common representation formalisms for ...,2
3,Statistical relational learning models are pow...,3
4,Multimodal representation learning is gaining ...,4
5,Reinforcement learning (RL) has shown its adva...,5
6,Selecting appropriate tutoring help actions th...,6
7,Recognizing time expressions is a fundamental ...,7
8,"When facing large-scale image datasets, online...",8
9,Temporal modeling in videos is a fundamental y...,9


### 預處理的全部論文摘要

In [4]:
processed_docs = documents['abstract'].map(preprocess)
processed_docs[:10]

0    [consider, problem, actively, elicit, preferen...
1    [investigate, task, distractor, generation, mu...
2    [common, representation, formalism, plan, desc...
3    [statistical, relational, learn, model, powerf...
4    [multimodal, representation, learn, gain, deep...
5    [reinforcement, learn, show, advantage, image,...
6    [select, appropriate, tutor, help, action, acc...
7    [recognize, time, expression, fundamental, imp...
8    [face, large, scale, image, datasets, online, ...
9    [temporal, model, video, fundamental, challeng...
Name: abstract, dtype: object

## Dataset

### 產生字典

In [5]:
dictionary = gensim.corpora.Dictionary(processed_docs)
print('共',len(dictionary),'個字\n')

2019-11-02 16:42:37,504 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-11-02 16:42:37,614 : INFO : built Dictionary(6927 unique tokens: ['active', 'actively', 'adaptive', 'aggregation', 'algorithm']...) from 1343 documents (total 121739 corpus positions)


共 6927 個字



### 產生 bag of words corpus

In [6]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print('共',len(bow_corpus),'筆')

共 1343 筆


### 產生 TF-IDF Corpus

In [7]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

print('共',len(corpus_tfidf),'筆')

2019-11-02 16:42:39,917 : INFO : collecting document frequencies
2019-11-02 16:42:39,920 : INFO : PROGRESS: processing document #0
2019-11-02 16:42:39,947 : INFO : calculating IDF weights for 1343 documents and 6926 features (84803 matrix non-zeros)


共 1343 筆


## Train LDA models

In [8]:
num_topics = 5

### Running LDA using Bag of Words

In [9]:
#lda_model = gensim.models.LdaMulticore(bow_corpus, 
#                                       num_topics=num_topics, 
#                                       id2word=dictionary, 
#                                       passes=150,
#                                       workers=2,
#                                       eval_every=1)

### Running LDA using TF-IDF

In [10]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                       num_topics=num_topics, 
                                       id2word=dictionary, 
                                       passes=20,
                                       eval_every=1)

2019-11-02 16:42:44,202 : INFO : using symmetric alpha at 0.2
2019-11-02 16:42:44,203 : INFO : using symmetric eta at 0.2
2019-11-02 16:42:44,209 : INFO : using serial LDA version on this node
2019-11-02 16:42:44,222 : INFO : running online LDA training, 5 topics, 20 passes over the supplied corpus of 1343 documents, updating every 6000 documents, evaluating every ~1343 documents, iterating 50x with a convergence threshold of 0.001000
2019-11-02 16:42:44,224 : INFO : training LDA model using 3 processes
2019-11-02 16:42:44,261 : DEBUG : worker process entering E-step loop
2019-11-02 16:42:44,276 : DEBUG : getting a new job
2019-11-02 16:42:44,275 : DEBUG : worker process entering E-step loop
2019-11-02 16:42:44,285 : DEBUG : getting a new job
2019-11-02 16:42:44,285 : DEBUG : worker process entering E-step loop
2019-11-02 16:42:44,292 : DEBUG : getting a new job
2019-11-02 16:42:44,693 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1343/1343, outstanding queue size 1

2019-11-02 16:42:51,415 : INFO : topic diff=0.108032, rho=0.462671
2019-11-02 16:42:51,447 : DEBUG : bound: at document #0
2019-11-02 16:42:52,153 : INFO : -9.917 per-word bound, 966.4 perplexity estimate based on a held-out corpus of 1343 documents with 7958 words
2019-11-02 16:42:52,546 : INFO : PROGRESS: pass 4, dispatched chunk #0 = documents up to #1343/1343, outstanding queue size 1
2019-11-02 16:42:52,858 : DEBUG : processing chunk #0 of 1343 documents
2019-11-02 16:42:52,860 : DEBUG : performing inference on a chunk of 1343 documents
2019-11-02 16:42:53,215 : DEBUG : 1312/1343 documents converged within 50 iterations
2019-11-02 16:42:53,226 : DEBUG : processed chunk, queuing the result
2019-11-02 16:42:53,229 : DEBUG : result put
2019-11-02 16:42:53,232 : DEBUG : updating topics
2019-11-02 16:42:53,233 : DEBUG : getting a new job
2019-11-02 16:42:53,238 : INFO : topic #0 (0.200): 0.002*"plan" + 0.002*"agent" + 0.001*"problem" + 0.001*"algorithm" + 0.001*"content" + 0.001*"docum

2019-11-02 16:42:59,687 : DEBUG : processing chunk #0 of 1343 documents
2019-11-02 16:42:59,689 : DEBUG : performing inference on a chunk of 1343 documents
2019-11-02 16:42:59,922 : DEBUG : 1337/1343 documents converged within 50 iterations
2019-11-02 16:42:59,934 : DEBUG : processed chunk, queuing the result
2019-11-02 16:42:59,940 : DEBUG : updating topics
2019-11-02 16:42:59,939 : DEBUG : result put
2019-11-02 16:42:59,942 : DEBUG : getting a new job
2019-11-02 16:42:59,946 : INFO : topic #0 (0.200): 0.001*"agent" + 0.001*"plan" + 0.001*"dictionary" + 0.001*"vote" + 0.001*"robot" + 0.001*"valuation" + 0.001*"swap" + 0.001*"document" + 0.001*"division" + 0.001*"event"
2019-11-02 16:42:59,947 : INFO : topic #1 (0.200): 0.003*"model" + 0.003*"learn" + 0.003*"network" + 0.003*"feature" + 0.003*"data" + 0.003*"image" + 0.003*"label" + 0.003*"method" + 0.002*"train" + 0.002*"task"
2019-11-02 16:42:59,950 : INFO : topic #2 (0.200): 0.001*"snns" + 0.001*"stance" + 0.001*"ontology" + 0.001*"

2019-11-02 16:43:06,062 : DEBUG : result put
2019-11-02 16:43:06,064 : DEBUG : getting a new job
2019-11-02 16:43:06,068 : INFO : topic #0 (0.200): 0.001*"dictionary" + 0.001*"vote" + 0.001*"valuation" + 0.001*"division" + 0.001*"swap" + 0.001*"agent" + 0.001*"auction" + 0.001*"subtasks" + 0.001*"robot" + 0.001*"fair"
2019-11-02 16:43:06,074 : INFO : topic #1 (0.200): 0.003*"model" + 0.003*"learn" + 0.003*"network" + 0.003*"feature" + 0.003*"data" + 0.003*"image" + 0.003*"label" + 0.003*"method" + 0.002*"task" + 0.002*"train"
2019-11-02 16:43:06,076 : INFO : topic #2 (0.200): 0.001*"snns" + 0.001*"stance" + 0.001*"poverty" + 0.001*"comment" + 0.001*"fisher" + 0.001*"strategic" + 0.001*"centrality" + 0.001*"idiom" + 0.001*"ontology" + 0.001*"board"
2019-11-02 16:43:06,080 : INFO : topic #3 (0.200): 0.001*"satisfiability" + 0.001*"session" + 0.001*"gaze" + 0.001*"solver" + 0.001*"voter" + 0.001*"truthful" + 0.001*"survival" + 0.001*"compilation" + 0.001*"metrical" + 0.001*"circuit"
2019-

2019-11-02 16:43:11,901 : INFO : topic #1 (0.200): 0.003*"model" + 0.003*"learn" + 0.003*"network" + 0.003*"feature" + 0.003*"data" + 0.002*"image" + 0.002*"method" + 0.002*"label" + 0.002*"algorithm" + 0.002*"task"
2019-11-02 16:43:11,904 : INFO : topic #2 (0.200): 0.001*"snns" + 0.001*"stance" + 0.001*"poverty" + 0.001*"comment" + 0.001*"fisher" + 0.001*"idiom" + 0.001*"blame" + 0.001*"equation" + 0.001*"board" + 0.001*"seller"
2019-11-02 16:43:11,905 : INFO : topic #3 (0.200): 0.002*"satisfiability" + 0.001*"session" + 0.001*"gaze" + 0.001*"truthful" + 0.001*"metrical" + 0.001*"survival" + 0.001*"circuit" + 0.001*"aesthetic" + 0.001*"nonverbal" + 0.001*"hyperbolic"
2019-11-02 16:43:11,908 : INFO : topic #4 (0.200): 0.002*"agent" + 0.001*"mapf" + 0.001*"puzzle" + 0.001*"equivalence" + 0.001*"mcmc" + 0.001*"conversation" + 0.001*"reader" + 0.001*"epistemic" + 0.001*"execution" + 0.001*"wcsps"
2019-11-02 16:43:11,911 : INFO : topic diff=0.035602, rho=0.237883
2019-11-02 16:43:11,929 : 

### 結論
- topic num: 5
- bow: 100次尚未全部收斂
- tfidf: 第14次已全部收斂
- 
- topic num: 10
- bow: 100次尚未全部收斂
- tfidf: 第11次已全部收斂
- 
- topic num: 15
- bow: 100次尚未全部收斂
- tfidf: 第7次已全部收斂
- 
- topic num: 20
- bow: 100次尚未全部收斂
- tfidf: 第3次已全部收斂
-  
- topic num: 25
- tfidf: 第1次已全部收斂
- 
- topic num: 30
- tfidf: 第1次已全部收斂

##### Topic num愈多，收斂愈快