# 觀察topic用字

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

## 自定義 data types and functions

In [2]:
class ContentType(Enum):
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'
    
def get_contents(content_type):
    all_contents = []
    dataset_path = '../dataset'
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if os.path.isfile(file_path):
            with open(file_path) as f:  
                line = f.readlines()
                if content_type == ContentType.AUT:
                    line = line[1]
                elif content_type == ContentType.SEC:
                    line = line[2]
                elif content_type == ContentType.ABS:
                    line = line[3]
                else:
                    line = line[0]
                line = line.strip()
                all_contents.append(line)
        else:
            print(file_path + ' does not exist.')
    return all_contents


def get_all_titles():
    return get_contents(ContentType.TIT)

def get_all_authors():        
    return get_contents(ContentType.AUT)

def get_all_sections():
    return get_contents(ContentType.SEC)

def get_all_abstracts():
    return get_contents(ContentType.ABS)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            token = wordnet_lemmatizer.lemmatize(token, pos='v')
            token = wordnet_lemmatizer.lemmatize(token, pos='n')
            result.append(token)
    return result

### 取出所有摘要

In [3]:
contents = get_all_abstracts()
print('共',len(contents),'篇論文\n')

documents = pd.DataFrame(data=contents,columns=['abstract'])
documents['index'] = documents.index
documents[:10]

共 1343 篇論文



Unnamed: 0,abstract,index
0,We consider the problem of actively eliciting ...,0
1,We investigate the task of distractor generati...,1
2,The most common representation formalisms for ...,2
3,Statistical relational learning models are pow...,3
4,Multimodal representation learning is gaining ...,4
5,Reinforcement learning (RL) has shown its adva...,5
6,Selecting appropriate tutoring help actions th...,6
7,Recognizing time expressions is a fundamental ...,7
8,"When facing large-scale image datasets, online...",8
9,Temporal modeling in videos is a fundamental y...,9


### 預處理的全部論文摘要

In [4]:
processed_docs = documents['abstract'].map(preprocess)
processed_docs[:10]

0    [consider, problem, actively, elicit, preferen...
1    [investigate, task, distractor, generation, mu...
2    [common, representation, formalism, plan, desc...
3    [statistical, relational, learn, model, powerf...
4    [multimodal, representation, learn, gain, deep...
5    [reinforcement, learn, show, advantage, image,...
6    [select, appropriate, tutor, help, action, acc...
7    [recognize, time, expression, fundamental, imp...
8    [face, large, scale, image, datasets, online, ...
9    [temporal, model, video, fundamental, challeng...
Name: abstract, dtype: object

## Dataset

### 產生字典

In [12]:
dictionary = corpora.Dictionary.load('../corpus/dict_bigram_filtered.dict')
print('共',len(dictionary),'個字\n')


共 1558 個字



### 產生 bag of words corpus

In [13]:
bow_corpus = corpora.MmCorpus('../corpus/corpus_bigram_filtered.mm')
print('共',len(bow_corpus),'筆')

共 1343 筆


### 產生TF-IDF corpus

In [None]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

print('共',len(corpus_tfidf),'筆')

## Load LDA models

In [16]:
num_topics = '10'
num_words = 10

file_name = '../models/lda_bigram_bow_filtered_topic_' + num_topics + '.model'
lda_model = models.ldamodel.LdaModel.load(file_name)

### show topics

In [17]:
for idx, topic in lda_model.print_topics(): #num_topics=3, num_words=3
    print('Topic: {}\nWords: {}\n'.format(idx, topic))

Topic: 0
Words: 0.013*"generate" + 0.012*"knowledge" + 0.010*"generation" + 0.009*"answer" + 0.009*"relation" + 0.008*"entity" + 0.007*"human" + 0.007*"dataset" + 0.007*"program" + 0.007*"question"

Topic: 1
Words: 0.018*"translation" + 0.018*"sequence" + 0.015*"recurrent" + 0.014*"memory" + 0.013*"student" + 0.010*"machine" + 0.010*"recurrent_neural" + 0.008*"inference" + 0.008*"resource" + 0.008*"temporal"

Topic: 2
Words: 0.038*"graph" + 0.024*"structure" + 0.021*"embed" + 0.019*"object" + 0.017*"detection" + 0.010*"nod" + 0.010*"attribute" + 0.010*"relation" + 0.007*"representation" + 0.007*"world"

Topic: 3
Words: 0.067*"label" + 0.025*"class" + 0.023*"classification" + 0.014*"multi" + 0.010*"hash" + 0.010*"supervise" + 0.009*"classifier" + 0.009*"source" + 0.008*"distribution" + 0.008*"instance"

Topic: 4
Words: 0.035*"domain" + 0.020*"video" + 0.019*"target" + 0.016*"temporal" + 0.016*"transfer" + 0.012*"source" + 0.012*"spatial" + 0.010*"event" + 0.010*"action" + 0.010*"represe

In [11]:
K = num_topics

topicWordProbMat = lda_model.print_topics(num_words=num_words)

#columns = ['1','2','3','4','5']
columns = range(1,num_topics+1)

df = pd.DataFrame(columns = columns)
pd.set_option('display.width', 1000)

# 40 will be resized later to match number of words in DC
zz = np.zeros(shape=(80,K))

last_number = 0
DC = {}

for x in range (num_words): #取每個topic前10個字
    data= pd.DataFrame(columns=columns,index=[0])
    for i in range(num_topics):
        data[columns[i]] = ""
    df = df.append(data,ignore_index=True)  
    
for line in topicWordProbMat:
    topic_id,words = line #一個line是一個topic
    probs = words.split("+")
    y = 0 #用來算第幾個word
    for pr in probs:    
        a = pr.split("*")
        df.iloc[y,topic_id] = a[1] #該word
       
        if a[1] in DC:
            zz[DC[a[1]]][topic_id] = a[0] #該word的機率
        else:
            zz[last_number][topic_id] = a[0]
            DC[a[1]] = last_number
            last_number = last_number+1
        y = y + 1

print(df)
print('\n')

print(DC)
print('字典字數：',len(DC))
print('\n')

print(zz)
print(zz.shape)


TypeError: can only concatenate str (not "int") to str

In [None]:
%matplotlib inline
zz = np.resize(zz,(len(DC.keys()),zz.shape[1]))

for val, key in enumerate(DC.keys()):
        plt.text(-3.5, val + 0.1, key,
                 horizontalalignment='right',
                 verticalalignment='center'
                 )

#plt.figure(figsize=(10,50))
plt.imshow(zz, cmap='hot', interpolation='nearest',aspect=0.4)
plt.show()

若主題數制定過少，則會有歌詞不相近的歌詞卻被分到同一個主題底下的情形；
反之，若主題數制定過多，則會有歌詞相近的歌詞被分到不同主題底下。

In [None]:
columns = range(1,5+1)
data = pd.DataFrame({columns[0]:"",
                     columns[1]:"",
                     columns[2]:"",
                     columns[3]:"",
                     columns[4]:"",
                    },index=[0])

data

In [None]:
columns = range(1,5+1)
data = pd.DataFrame(columns=columns,index=[0])
for i in range(len(columns)):
    data[columns[i]] = ""
data