# Exploratory Data Analysis

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint
import matplotlib.pyplot as plt

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models
from gensim.models import Phrases

import pyLDAvis
import pyLDAvis.gensim
from gensim.models.ldamodel import LdaModel

## 自定義 data types and functions

In [2]:
class ContentType(Enum):
    """Fields that can be read from a paper file of the dataset.

    The member passed to ``get_contents`` decides which line of each file is
    returned; the member's value is the field's column name.
    """
    TIT = 'title'     # paper title
    ABS = 'abstract'  # paper abstract
    AUT = 'author'    # author list
    SEC = 'section'   # section / track the paper belongs to
    
def get_contents(content_type, dataset_path='../dataset', encoding='utf-8'):
    """Collect one field from every paper file in the dataset directory.

    Every regular file in ``dataset_path`` is read as one paper with one field
    per line: line 0 = title, line 1 = authors, line 2 = section,
    line 3 = abstract.

    Args:
        content_type: ``ContentType`` member selecting the field. Any value
            other than ``AUT``, ``SEC`` or ``ABS`` selects the title.
        dataset_path: Directory that holds the paper files.
        encoding: Text encoding of the files. Given explicitly so the result
            does not depend on the platform's default encoding.

    Returns:
        list[str]: The whitespace-stripped field of each file, ordered by file
        name so that separate calls (titles, authors, ...) line up index by
        index.

    Raises:
        IndexError: If a file has too few lines to contain the requested field.
    """
    # Line number of each non-title field; anything else falls back to line 0.
    field_lines = (
        (ContentType.AUT, 1),
        (ContentType.SEC, 2),
        (ContentType.ABS, 3),
    )
    line_no = next((no for kind, no in field_lines if content_type == kind), 0)

    all_contents = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # output deterministic and keeps the per-field lists aligned.
    for file in sorted(os.listdir(dataset_path)):
        file_path = os.path.join(dataset_path, file)
        if not os.path.isfile(file_path):
            # The entry exists (listdir returned it) but cannot be read as a
            # paper, e.g. it is a sub-directory.
            print(file_path + ' is not a regular file; skipped.')
            continue
        with open(file_path, encoding=encoding) as f:
            lines = f.readlines()
        if line_no >= len(lines):
            raise IndexError(
                '{} has {} line(s); line index {} is needed'.format(
                    file_path, len(lines), line_no))
        all_contents.append(lines[line_no].strip())
    return all_contents


def get_all_titles():
    """Return the title (line 0 of each file) of every paper, via ``get_contents``."""
    return get_contents(ContentType.TIT)

def get_all_authors():
    """Return the author line (line 1 of each file) of every paper, via ``get_contents``."""
    return get_contents(ContentType.AUT)

def get_all_sections():
    """Return the section line (line 2 of each file) of every paper, via ``get_contents``."""
    return get_contents(ContentType.SEC)

def get_all_abstracts():
    """Return the abstract (line 3 of each file) of every paper, via ``get_contents``."""
    return get_contents(ContentType.ABS)

def preprocess(text):
    """Convert raw text into a list of lemmatized tokens.

    The text is tokenized with gensim's ``simple_preprocess``. Tokens that are
    gensim stop words or have three characters or fewer are dropped. Each
    surviving token is lemmatized as a verb and then as a noun.

    Args:
        text: The raw document text.

    Returns:
        list[str]: The lemmatized tokens, in their original order.
    """
    stop_words = gensim.parsing.preprocessing.STOPWORDS
    lemma = wordnet_lemmatizer.lemmatize

    kept = (
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stop_words
    )
    # Verb pass first, then noun pass on the verb-lemmatized form.
    return [lemma(lemma(word, pos='v'), pos='n') for word in kept]

def get_chart_data(num_topics,num_words,topics):
    """Turn topic description strings into a word table, a word index and a
    probability matrix.

    Args:
        num_topics: Number of topics; the word table gets columns
            ``1..num_topics`` and the matrix gets ``num_topics`` columns.
        num_words: Number of rows of the word table (words listed per topic).
        topics: Iterable of ``(topic_id, description)`` pairs. A description
            is a ``+``-separated list of ``probability*word`` terms such as
            ``'0.020*"learn" + 0.015*"model"'`` (presumably the output of
            gensim's ``show_topics`` -- confirm at the call site).
            ``topic_id`` is used as a 0-based column position.

    Returns:
        tuple: ``(df, DC, zz)`` where

        * ``df`` is a DataFrame of strings; cell ``[rank, topic_id]`` holds
          the ``rank``-th word of that topic (empty string when unused).
        * ``DC`` maps each distinct word to its row in ``zz``, numbered in
          order of first appearance. Words are stored without surrounding
          whitespace or double quotes.
        * ``zz`` is a float array holding the probability of each word in
          each topic. It has at least 300 rows (rows beyond ``len(DC)`` are
          all zero) and grows when there are more distinct words.

    Raises:
        IndexError: If a topic lists more than ``num_words`` terms, if a
            ``topic_id`` is not below ``num_topics``, or if a term has no
            ``*`` separator.
    """
    min_rows = 300  # historical buffer size, kept as the minimum row count
    columns = range(1, num_topics + 1)

    pd.set_option('display.width', 1000)

    # Build the blank table in one call; growing it row by row relied on
    # DataFrame.append, which was removed in pandas 2.0.
    df = pd.DataFrame("", index=range(num_words), columns=columns)

    DC = {}
    cells = []  # (row in zz, topic column, probability) gathered before sizing zz

    for topic_id, words in topics:  # one entry per topic
        for rank, term in enumerate(words.split("+")):
            parts = term.split("*")
            # Terms are separated by " + ", so the raw word carries stray
            # spaces (and quotes); normalise it so the same word always maps
            # to the same dictionary entry.
            word = parts[1].strip().strip('"')
            df.iloc[rank, topic_id] = word

            # New words get the next free row; known words reuse theirs.
            row = DC.setdefault(word, len(DC))
            cells.append((row, topic_id, float(parts[0])))

    # Size the matrix after counting words so that more than `min_rows`
    # distinct words no longer overflow the buffer.
    zz = np.zeros(shape=(max(min_rows, len(DC)), num_topics))
    for row, topic_id, prob in cells:
        zz[row][topic_id] = prob

    return (df,DC,zz)

def show_words_table(df):
    """Print the word table followed by two blank lines."""
    print(df, end='\n\n\n')
    
def show_dictionary(DC):
    """Print the word dictionary and its size, followed by two blank lines."""
    print(DC)
    # The label is user-facing output and is kept verbatim.
    print('字典字數：', len(DC), end='\n\n\n')

def show_probs_table(zz):
    """Print the probability matrix and then its shape."""
    print(zz)
    dimensions = zz.shape
    print(dimensions)
    
def show_heapmap(DC,zz):
    %matplotlib inline

    zz = np.resize(zz,(len(DC.keys()),zz.shape[1]))

    for val, key in enumerate(DC.keys()):
            plt.text(-3.5, val + 0.1, key,
                     horizontalalignment='right',
                     verticalalignment='center'
                     )

    #plt.figure(figsize=(10,50))
    plt.imshow(zz, cmap='hot', interpolation='nearest',aspect=0.5)#'auto'
    plt.show()

### 取出所有摘要

In [5]:
# Each helper scans the dataset directory and returns one entry per paper file.
titles = get_all_titles()
sections = get_all_sections()
authors = get_all_authors()
contents = get_all_abstracts()

print('共',len(contents),'篇論文\n')

# One row per paper; the dict order fixes the column order.
documents = pd.DataFrame({
    'title': titles,
    'author': authors,
    'section': sections,
    'abstract': contents,
})
documents.head(10)

共 1343 篇論文



Unnamed: 0,title,author,section,abstract
0,Active Preference Learning Based on Generalize...,"Nadjet Bourdache@Sorbonne University,Patrice P...",AAAI Technical Track: Reasoning under Uncertainty,We consider the problem of actively eliciting ...
1,Generating Distractors for Reading Comprehensi...,"Yifan Gao@The Chinese University of Hong Kong,...",AAAI Technical Track: Natural Language Processing,We investigate the task of distractor generati...
2,Acting and Planning Using Operational Models,"Sunandita Patra@University of Maryland, Colleg...","AAAI Technical Track: Planning, Routing, and S...",The most common representation formalisms for ...
3,Lifted Hinge-Loss Markov Random Fields,"Sriram Srinivasan@University of California, Sa...",AAAI Technical Track: Reasoning under Uncertainty,Statistical relational learning models are pow...
4,BLOCK: Bilinear Superdiagonal Fusion for Visua...,"Hedi Ben-younes@Sorbonne Université,Remi Caden...",AAAI Technical Track: Vision,Multimodal representation learning is gaining ...
5,Meta Learning for Image Captioning,"Nannan Li@Wuhan University,Zhenzhong Chen@Wuha...",AAAI Technical Track: Vision,Reinforcement learning (RL) has shown its adva...
6,Personalized Robot Tutoring Using the Assistiv...,"Aditi Ramachandran@Yale University,Sarah Stroh...",AAAI Technical Track: Robotics,Selecting appropriate tutoring help actions th...
7,A Pattern-Based Approach to Recognizing Time E...,"Wentao Ding@Nanjing University,Guanji Gao@Nanj...",AAAI Technical Track: Natural Language Processing,Recognizing time expressions is a fundamental ...
8,Towards Optimal Discrete Online Hashing with B...,"Mingbao Lin@Xiamen University,Rongrong Ji@Xiam...",AAAI Technical Track: Vision,"When facing large-scale image datasets, online..."
9,Temporal Bilinear Networks for Video Action Re...,"Yanghao Li@Peking University,Sijie Song@Peking...",AAAI Technical Track: Vision,Temporal modeling in videos is a fundamental y...


### 預處理的全部論文摘要

In [8]:
# Run preprocess() on every abstract; the result holds one token list per paper.
processed_docs = documents['abstract'].map(preprocess)
processed_docs.head(10)

0    [consider, problem, actively, elicit, preferen...
1    [investigate, task, distractor, generation, mu...
2    [common, representation, formalism, plan, desc...
3    [statistical, relational, learn, model, powerf...
4    [multimodal, representation, learn, gain, deep...
5    [reinforcement, learn, show, advantage, image,...
6    [select, appropriate, tutor, help, action, acc...
7    [recognize, time, expression, fundamental, imp...
8    [face, large, scale, image, datasets, online, ...
9    [temporal, model, video, fundamental, challeng...
Name: abstract, dtype: object

## Dataset

### 產生字典

In [7]:
# Build the token dictionary from all preprocessed abstracts.
dictionary = corpora.Dictionary(processed_docs)
print('共',len(dictionary),'個字\n')

# Optional pruning (currently disabled): drop tokens that occur in fewer than
# 10 documents or in more than 20% of the documents.
#dictionary.filter_extremes(no_below=10, no_above=0.2)
#print('Number of unique words after removing rare and common words:', len(dictionary))

共 6927 個字



### 產生 bag of words corpus

In [None]:
# Convert every token list into its bag-of-words representation.
bow_corpus = list(map(dictionary.doc2bow, processed_docs))
print('共',len(bow_corpus),'筆')

## Train LDA models

In [None]:
# Hyperparameters shared by the LDA cells below.
num_topics = 10    # number of topics requested from the model
num_words = 10     # presumably words listed per topic in the chart helpers -- TODO confirm
passes = 30        # value of the `passes` argument given to the LDA trainer
iterations = 150   # value of the `iterations` argument given to the LDA trainer

### Running LDA using BOW

In [None]:
# Train the LDA model on the bag-of-words corpus.
# random_state seeds the model's random initialisation so that a fresh
# "Restart & Run All" yields comparable topics. Per the gensim documentation,
# results may still vary slightly because of how the OS schedules the worker
# processes.
# NOTE(review): gensim documents eval_every=1 as slowing training by roughly
# 2x. The perplexity estimate is only reported through logging, which the
# visible cells do not configure; consider a larger value or None.
lda_model = gensim.models.LdaMulticore(bow_corpus,
                                       num_topics=num_topics,
                                       id2word=dictionary,
                                       passes=passes,
                                       iterations=iterations,
                                       random_state=42,
                                       eval_every=1)

In [None]:
# Prepare the pyLDAvis data from the trained model, the corpus and the dictionary.
vis_data = pyLDAvis.gensim.prepare(lda_model, bow_corpus, dictionary)
# To show the visualization inside the notebook, either call display() (as
# below) or run pyLDAvis.enable_notebook() once, after which results are shown
# automatically without calling display().
pyLDAvis.display(vis_data)

In [None]:
# Persist the trained model; the file name encodes the number of topics.
# NOTE(review): the name says "filtered", but the filter_extremes() call in
# the dictionary cell is commented out -- confirm the name matches the data.
file_name = '../models/lda_unigram_bow_filtered_topic_'+ str(num_topics) +'.model'
print(file_name)
# Create the target directory first so a long training run is not lost to a
# missing folder at save time.
os.makedirs(os.path.dirname(file_name), exist_ok=True)
lda_model.save(file_name)