# trigram / bow / (no) filtered

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import Phrases

import pyLDAvis
import pyLDAvis.gensim
from gensim.models.ldamodel import LdaModel

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                    level=logging.DEBUG)

2019-11-04 23:29:34,330 : DEBUG : Loaded backend module://ipykernel.pylab.backend_inline version unknown.


## 自定義 data types and functions

In [2]:
class ContentType(Enum):
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'
    
def get_contents(content_type):
    all_contents = []
    dataset_path = '../dataset'
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if os.path.isfile(file_path):
            with open(file_path) as f:  
                line = f.readlines()
                if content_type == ContentType.AUT:
                    line = line[1]
                elif content_type == ContentType.SEC:
                    line = line[2]
                elif content_type == ContentType.ABS:
                    line = line[3]
                else:
                    line = line[0]
                line = line.strip()
                all_contents.append(line)
        else:
            print(file_path + ' does not exist.')
    return all_contents


def get_all_titles():
    return get_contents(ContentType.TIT)

def get_all_authors():        
    return get_contents(ContentType.AUT)

def get_all_sections():
    return get_contents(ContentType.SEC)

def get_all_abstracts():
    return get_contents(ContentType.ABS)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            token = wordnet_lemmatizer.lemmatize(token, pos='v')
            token = wordnet_lemmatizer.lemmatize(token, pos='n')
            result.append(token)
    return result

def get_chart_data(num_topics,num_words,topics):
    buff = 300
    K = num_topics
    topicWordProbMat = topics
    
    #columns = ['1','2','3','4','5']
    columns = range(1,num_topics+1)

    df = pd.DataFrame(columns = columns)
    pd.set_option('display.width', 1000)

    # 40 will be resized later to match number of words in DC
    zz = np.zeros(shape=(buff,K))

    last_number = 0
    DC = {}

    for x in range (num_words): #取每個topic前10個字
        data= pd.DataFrame(columns=columns,index=[0])
        for i in range(num_topics):
            data[columns[i]] = ""
        df = df.append(data,ignore_index=True)  

    for line in topicWordProbMat:
        topic_id,words = line #一個line是一個topic
        probs = words.split("+")
        y = 0 #用來算第幾個word
        for pr in probs:    
            a = pr.split("*")
            df.iloc[y,topic_id] = a[1] #該word

            if a[1] in DC:
                zz[DC[a[1]]][topic_id] = a[0] #該word的機率
            else:
                zz[last_number][topic_id] = a[0]
                DC[a[1]] = last_number
                last_number = last_number+1
            y = y + 1

    return (df,DC,zz)

def show_words_table(df):  
    print(df)
    print('\n')
    
def show_dictionary(DC):
    print(DC)
    print('字典字數：',len(DC))
    print('\n')

def show_probs_table(zz):
    print(zz)
    print(zz.shape)
    
def show_heapmap(DC,zz):
    %matplotlib inline

    zz = np.resize(zz,(len(DC.keys()),zz.shape[1]))

    for val, key in enumerate(DC.keys()):
            plt.text(-3.5, val + 0.1, key,
                     horizontalalignment='right',
                     verticalalignment='center'
                     )

    #plt.figure(figsize=(10,50))
    plt.imshow(zz, cmap='hot', interpolation='nearest',aspect=0.5)#'auto'
    plt.show()

### 取出所有摘要

In [3]:
contents = get_all_abstracts()
print('共',len(contents),'篇論文\n')

documents = pd.DataFrame(data=contents,columns=['abstract'])
documents['index'] = documents.index
documents[:10]

共 1343 篇論文



Unnamed: 0,abstract,index
0,We consider the problem of actively eliciting ...,0
1,We investigate the task of distractor generati...,1
2,The most common representation formalisms for ...,2
3,Statistical relational learning models are pow...,3
4,Multimodal representation learning is gaining ...,4
5,Reinforcement learning (RL) has shown its adva...,5
6,Selecting appropriate tutoring help actions th...,6
7,Recognizing time expressions is a fundamental ...,7
8,"When facing large-scale image datasets, online...",8
9,Temporal modeling in videos is a fundamental y...,9


### 預處理的全部論文摘要

In [4]:
processed_docs = documents['abstract'].map(preprocess)
processed_docs[:10]

bigram = Phrases(processed_docs, min_count=10)
trigram = Phrases(bigram[processed_docs], min_count=10)

for i in range(len(processed_docs)):
    for token in bigram[processed_docs[i]]:
        if token.count('_') == 1:
            processed_docs[i].append(token)    
    for token in trigram[bigram[processed_docs[i]]]:
        if token.count('_') == 2:
            processed_docs[i].append(token)

2019-11-04 23:29:44,078 : INFO : collecting all words and their counts
2019-11-04 23:29:44,079 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-11-04 23:29:44,278 : INFO : collected 89298 word types from a corpus of 121739 words (unigram + bigrams) and 1343 sentences
2019-11-04 23:29:44,279 : INFO : using 89298 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10.0, max_vocab_size=40000000>
2019-11-04 23:29:44,280 : INFO : collecting all words and their counts
2019-11-04 23:29:44,282 : INFO : PROGRESS: at sentence #0, processed 0 words and 0 word types
2019-11-04 23:29:44,873 : INFO : collected 91908 word types from a corpus of 115388 words (unigram + bigrams) and 1343 sentences
2019-11-04 23:29:44,874 : INFO : using 91908 counts as vocab in Phrases<0 vocab, min_count=10, threshold=10.0, max_vocab_size=40000000>


## Dataset

### 產生字典

In [16]:
dictionary = gensim.corpora.Dictionary(processed_docs)
print('共',len(dictionary),'個字\n')

# Filter out words that occur less than 10 documents, or more than 20% of the documents.
dictionary.filter_extremes(no_below=10, no_above=0.2)
print('Number of unique words after removing rare and common words:', len(dictionary))

dictionary.save('../corpus/dict_trigram_filtered.dict')
#loaded_dict = corpora.Dictionary.load('../corpus/mydict.dict')

2019-11-05 00:01:09,947 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-11-05 00:01:10,127 : INFO : built Dictionary(7170 unique tokens: ['active', 'active_learn', 'actively', 'adaptive', 'aggregation']...) from 1343 documents (total 128849 corpus positions)
2019-11-05 00:01:10,134 : INFO : discarding 5584 tokens: [('actively', 9), ('algorithm', 358), ('approach', 483), ('coefficient', 8), ('collective', 7), ('elicitation', 3), ('equity', 2), ('gini', 1), ('interleave', 3), ('knapsack', 5)]...
2019-11-05 00:01:10,135 : INFO : keeping 1586 tokens which were in no less than 10 and no more than 268 (=20.0%) documents
2019-11-05 00:01:10,137 : DEBUG : rebuilding dictionary, shrinking gaps
2019-11-05 00:01:10,139 : INFO : resulting dictionary: Dictionary(1586 unique tokens: ['active', 'active_learn', 'adaptive', 'aggregation', 'allow']...)
2019-11-05 00:01:10,144 : INFO : saving Dictionary object under ../corpus/dict_trigram_filtered.dict, separately None
  args, varargs

共 7170 個字

Number of unique words after removing rare and common words: 1586


### 產生 bag of words corpus

In [17]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print('共',len(bow_corpus),'筆')

corpora.MmCorpus.serialize('../corpus/corpus_trigram_filtered.mm', bow_corpus)
#corpus_test = corpora.MmCorpus('../corpus/bow_corpus.mm')

2019-11-05 00:01:25,305 : INFO : storing corpus in Matrix Market format to ../corpus/corpus_trigram_filtered.mm
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
2019-11-05 00:01:25,306 : DEBUG : {'uri': '../corpus/corpus_trigram_filtered.mm', 'mode': 'wb+', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': {}}
2019-11-05 00:01:25,331 : INFO : saving sparse matrix to ../corpus/corpus_trigram_filtered.mm
2019-11-05 00:01:25,332 : INFO : PROGRESS: saving document #0
2019-11-05 00:01:25,424 : INFO : PROGRESS: saving document #1000
2019-11-05 00:01:25,450 : INFO : saved 1343x1586 matrix, density=2.990% (63690/2129998)
2019-11-05 00:01:25,452 : DEBUG : closing ../corpus/corpus_trigram_filtered.mm
2019-11-05 00:01:25,459 : DEBUG : closing ../corpus/corpus_trigram_filtered.mm
2019-11-05 00:01:25,464 : INFO : saving MmCorpus index to ../corpus/corpus_trigram_filtered.mm.index
  args, va

共 1343 筆


## Train LDA models

In [12]:
num_topics = 10
num_words = 10

### Running LDA using BOW

In [13]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=num_topics, 
                                       id2word=dictionary, 
                                       passes=30,
                                       iterations=300,
                                       eval_every=1)

2019-11-04 23:46:25,341 : INFO : using symmetric alpha at 0.1
2019-11-04 23:46:25,344 : INFO : using symmetric eta at 0.1
2019-11-04 23:46:25,347 : INFO : using serial LDA version on this node
2019-11-04 23:46:25,351 : INFO : running online LDA training, 10 topics, 30 passes over the supplied corpus of 1343 documents, updating every 6000 documents, evaluating every ~1343 documents, iterating 300x with a convergence threshold of 0.001000
2019-11-04 23:46:25,353 : INFO : training LDA model using 3 processes
2019-11-04 23:46:25,374 : DEBUG : worker process entering E-step loop
2019-11-04 23:46:25,367 : DEBUG : worker process entering E-step loop
2019-11-04 23:46:25,398 : DEBUG : getting a new job
2019-11-04 23:46:25,394 : DEBUG : getting a new job
2019-11-04 23:46:25,396 : DEBUG : worker process entering E-step loop
2019-11-04 23:46:25,406 : DEBUG : getting a new job
2019-11-04 23:46:25,411 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1343/1343, outstanding queue size

2019-11-04 23:46:36,263 : INFO : topic #5 (0.100): 0.029*"agent" + 0.011*"action" + 0.011*"plan" + 0.009*"sample" + 0.006*"reinforcement" + 0.005*"word" + 0.005*"social" + 0.005*"consider" + 0.005*"solve" + 0.005*"space"
2019-11-04 23:46:36,265 : INFO : topic diff=0.247590, rho=0.462671
2019-11-04 23:46:36,273 : DEBUG : bound: at document #0
2019-11-04 23:46:37,560 : INFO : -7.112 per-word bound, 138.3 perplexity estimate based on a held-out corpus of 1343 documents with 87763 words
2019-11-04 23:46:37,562 : INFO : PROGRESS: pass 4, dispatched chunk #0 = documents up to #1343/1343, outstanding queue size 1
2019-11-04 23:46:37,581 : DEBUG : processing chunk #0 of 1343 documents
2019-11-04 23:46:37,583 : DEBUG : performing inference on a chunk of 1343 documents
2019-11-04 23:46:38,437 : DEBUG : 1339/1343 documents converged within 300 iterations
2019-11-04 23:46:38,441 : DEBUG : processed chunk, queuing the result
2019-11-04 23:46:38,445 : DEBUG : updating topics
2019-11-04 23:46:38,443 

2019-11-04 23:46:44,780 : DEBUG : bound: at document #0
2019-11-04 23:46:45,788 : INFO : -7.040 per-word bound, 131.6 perplexity estimate based on a held-out corpus of 1343 documents with 87763 words
2019-11-04 23:46:45,788 : INFO : PROGRESS: pass 8, dispatched chunk #0 = documents up to #1343/1343, outstanding queue size 1
2019-11-04 23:46:45,810 : DEBUG : processing chunk #0 of 1343 documents
2019-11-04 23:46:45,812 : DEBUG : performing inference on a chunk of 1343 documents
2019-11-04 23:46:46,642 : DEBUG : 1342/1343 documents converged within 300 iterations
2019-11-04 23:46:46,647 : DEBUG : processed chunk, queuing the result
2019-11-04 23:46:46,649 : DEBUG : result put
2019-11-04 23:46:46,651 : DEBUG : updating topics
2019-11-04 23:46:46,651 : DEBUG : getting a new job
2019-11-04 23:46:46,655 : INFO : topic #0 (0.100): 0.012*"image" + 0.009*"representation" + 0.009*"structure" + 0.008*"face" + 0.008*"sample" + 0.006*"human" + 0.006*"inference" + 0.006*"high" + 0.005*"attribute" + 

2019-11-04 23:46:53,421 : INFO : PROGRESS: pass 12, dispatched chunk #0 = documents up to #1343/1343, outstanding queue size 1
2019-11-04 23:46:53,440 : DEBUG : processing chunk #0 of 1343 documents
2019-11-04 23:46:53,442 : DEBUG : performing inference on a chunk of 1343 documents
2019-11-04 23:46:54,142 : DEBUG : 1342/1343 documents converged within 300 iterations
2019-11-04 23:46:54,146 : DEBUG : processed chunk, queuing the result
2019-11-04 23:46:54,151 : DEBUG : updating topics
2019-11-04 23:46:54,149 : DEBUG : result put
2019-11-04 23:46:54,152 : DEBUG : getting a new job
2019-11-04 23:46:54,154 : INFO : topic #2 (0.100): 0.023*"user" + 0.013*"attention" + 0.011*"text" + 0.009*"relation" + 0.008*"representation" + 0.008*"content" + 0.008*"dataset" + 0.007*"recommendation" + 0.007*"language" + 0.007*"word"
2019-11-04 23:46:54,156 : INFO : topic #6 (0.100): 0.011*"game" + 0.010*"cluster" + 0.009*"solution" + 0.009*"function" + 0.008*"number" + 0.008*"large" + 0.007*"view" + 0.007*

2019-11-04 23:47:00,590 : DEBUG : processing chunk #0 of 1343 documents
2019-11-04 23:47:00,592 : DEBUG : performing inference on a chunk of 1343 documents
2019-11-04 23:47:01,242 : DEBUG : 1342/1343 documents converged within 300 iterations
2019-11-04 23:47:01,246 : DEBUG : processed chunk, queuing the result
2019-11-04 23:47:01,249 : DEBUG : result put
2019-11-04 23:47:01,250 : DEBUG : updating topics
2019-11-04 23:47:01,251 : DEBUG : getting a new job
2019-11-04 23:47:01,254 : INFO : topic #5 (0.100): 0.036*"agent" + 0.015*"action" + 0.014*"plan" + 0.010*"decision" + 0.010*"reinforcement" + 0.009*"policy" + 0.008*"reinforcement_learn" + 0.007*"reward" + 0.006*"sample" + 0.006*"make"
2019-11-04 23:47:01,255 : INFO : topic #6 (0.100): 0.011*"cluster" + 0.011*"game" + 0.010*"function" + 0.010*"solution" + 0.008*"optimization" + 0.008*"number" + 0.008*"large" + 0.008*"view" + 0.007*"search" + 0.007*"solve"
2019-11-04 23:47:01,256 : INFO : topic #7 (0.100): 0.013*"event" + 0.011*"predict

2019-11-04 23:47:08,050 : DEBUG : 1342/1343 documents converged within 300 iterations
2019-11-04 23:47:08,054 : DEBUG : processed chunk, queuing the result
2019-11-04 23:47:08,057 : DEBUG : result put
2019-11-04 23:47:08,059 : DEBUG : updating topics
2019-11-04 23:47:08,060 : DEBUG : getting a new job
2019-11-04 23:47:08,063 : INFO : topic #5 (0.100): 0.036*"agent" + 0.015*"action" + 0.014*"plan" + 0.012*"decision" + 0.010*"reinforcement" + 0.010*"policy" + 0.008*"reinforcement_learn" + 0.007*"reward" + 0.007*"make" + 0.006*"online"
2019-11-04 23:47:08,064 : INFO : topic #8 (0.100): 0.050*"label" + 0.032*"domain" + 0.018*"target" + 0.015*"source" + 0.014*"class" + 0.014*"classification" + 0.012*"transfer" + 0.011*"supervise" + 0.010*"multi" + 0.009*"classifier"
2019-11-04 23:47:08,066 : INFO : topic #7 (0.100): 0.014*"event" + 0.013*"prediction" + 0.011*"mechanism" + 0.009*"dynamic" + 0.008*"world" + 0.008*"policy" + 0.007*"analysis" + 0.007*"process" + 0.007*"traffic" + 0.006*"real_wo

2019-11-04 23:47:14,365 : DEBUG : result put
2019-11-04 23:47:14,367 : DEBUG : updating topics
2019-11-04 23:47:14,370 : INFO : topic #1 (0.100): 0.025*"image" + 0.017*"video" + 0.014*"deep" + 0.014*"object" + 0.012*"temporal" + 0.010*"detection" + 0.010*"convolutional" + 0.008*"spatial" + 0.008*"shape" + 0.008*"layer"
2019-11-04 23:47:14,372 : INFO : topic #7 (0.100): 0.014*"event" + 0.014*"prediction" + 0.011*"mechanism" + 0.009*"dynamic" + 0.008*"world" + 0.008*"process" + 0.008*"policy" + 0.007*"analysis" + 0.007*"traffic" + 0.007*"effect"
2019-11-04 23:47:14,368 : DEBUG : getting a new job
2019-11-04 23:47:14,375 : INFO : topic #0 (0.100): 0.014*"image" + 0.013*"sample" + 0.012*"representation" + 0.010*"face" + 0.009*"structure" + 0.009*"latent" + 0.008*"high" + 0.007*"generate" + 0.007*"rank" + 0.007*"attribute"
2019-11-04 23:47:14,376 : INFO : topic #9 (0.100): 0.020*"word" + 0.016*"sentence" + 0.016*"language" + 0.014*"answer" + 0.013*"question" + 0.013*"semantic" + 0.013*"know

2019-11-04 23:47:20,555 : INFO : topic #7 (0.100): 0.015*"prediction" + 0.014*"event" + 0.011*"mechanism" + 0.009*"dynamic" + 0.008*"world" + 0.008*"process" + 0.008*"policy" + 0.007*"analysis" + 0.007*"traffic" + 0.007*"effect"
2019-11-04 23:47:20,556 : INFO : topic #6 (0.100): 0.013*"function" + 0.011*"optimization" + 0.011*"cluster" + 0.010*"game" + 0.010*"solution" + 0.009*"large" + 0.009*"number" + 0.008*"solve" + 0.008*"view" + 0.008*"search"
2019-11-04 23:47:20,558 : INFO : topic #8 (0.100): 0.051*"label" + 0.033*"domain" + 0.018*"target" + 0.016*"source" + 0.015*"classification" + 0.014*"class" + 0.013*"transfer" + 0.012*"supervise" + 0.011*"multi" + 0.010*"classifier"
2019-11-04 23:47:20,559 : INFO : topic #2 (0.100): 0.026*"user" + 0.013*"text" + 0.013*"attention" + 0.010*"relation" + 0.010*"content" + 0.009*"generate" + 0.008*"recommendation" + 0.008*"representation" + 0.007*"entity" + 0.007*"multi"
2019-11-04 23:47:20,560 : INFO : topic #0 (0.100): 0.015*"image" + 0.013*"sa

In [14]:
vis_data = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
# 在 notebook 中显示可视化结果，需要调用 display 方法，或者执行 “pyLDAvis.enable_notebook()” ，即可在 notebook 中自动展示可视化结果，无需再调用 display
pyLDAvis.display(vis_data)

2019-11-04 23:55:45,382 : DEBUG : performing inference on a chunk of 1343 documents
2019-11-04 23:55:45,955 : DEBUG : 1343/1343 documents converged within 300 iterations


In [15]:
file_name = '../models/lda_trigram_bow_filtered_topic_'+ str(num_topics) +'.model'
print(file_name)
lda_model.save(file_name)

2019-11-05 00:00:52,596 : INFO : saving LdaState object under ../models/lda_trigram_bow_filtered_topic_10.model.state, separately None
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
2019-11-05 00:00:52,604 : DEBUG : {'uri': '../models/lda_trigram_bow_filtered_topic_10.model.state', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': {}}
2019-11-05 00:00:52,612 : INFO : saved ../models/lda_trigram_bow_filtered_topic_10.model.state
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
2019-11-05 00:00:52,618 : DEBUG : {'uri': '../models/lda_trigram_bow_filtered_topic_10.model.id2word', 'mode': 'wb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': {}}
2019-11-05 00:00:52,625 : INFO : saving LdaMulticore object under ../models/lda_trigram_bow_filtered_topic_10.mo

../models/lda_trigram_bow_filtered_topic_10.model
