# 以LDA模型計算文件相似度

In [66]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint

from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

# 初始化
porter_stemmer = PorterStemmer()
snowball_stemmer = SnowballStemmer('english')

## 自定義 data types and functions

In [2]:
class ContentType(Enum):
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'

In [3]:
def get_contents(content_type):
    all_contents = []
    dataset_path = '../dataset'
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if os.path.isfile(file_path):
            with open(file_path) as f:  
                line = f.readlines()
                if content_type == ContentType.AUT:
                    line = line[1]
                elif content_type == ContentType.SEC:
                    line = line[2]
                elif content_type == ContentType.ABS:
                    line = line[3]
                else:
                    line = line[0]
                line = line.strip()
                all_contents.append(line)
        else:
            print(file_path + ' does not exist.')
    return all_contents


def get_all_titles():
    return get_contents(ContentType.TIT)

def get_all_authors():        
    return get_contents(ContentType.AUT)

def get_all_abstracts():
    return get_contents(ContentType.ABS)

### 預處理
- 以gensim的simple_preprocess處理：斷詞，統一小寫，去標點
- 去除stopword
- lemmatize,stemming

In [4]:
def lemmatize_stemming(text):
    #return porter_stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))
    return WordNetLemmatizer().lemmatize(text, pos='v')

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemmatize_stemming(token))
    return result

### 取出所有摘要

In [14]:
contents = get_all_abstracts()
print('共',len(contents),'篇論文\n')

共 1343 篇論文



In [13]:
documents = pd.DataFrame(data=contents,columns=['abstract'])
documents['index'] = documents.index
documents[:10]

Unnamed: 0,abstract,index
0,We consider the problem of actively eliciting ...,0
1,We investigate the task of distractor generati...,1
2,The most common representation formalisms for ...,2
3,Statistical relational learning models are pow...,3
4,Multimodal representation learning is gaining ...,4
5,Reinforcement learning (RL) has shown its adva...,5
6,Selecting appropriate tutoring help actions th...,6
7,Recognizing time expressions is a fundamental ...,7
8,"When facing large-scale image datasets, online...",8
9,Temporal modeling in videos is a fundamental y...,9


### 取其中一篇，比較預處理前後

In [30]:
sample_doc_id = 1001

print('original document: ')
print(contents[sample_doc_id])

sample_doc = documents[documents['index'] == sample_doc_id].values[0][0]
print('\n\noriginal tokens: ')

words = []
for word in sample_doc.split(' '):
    words.append(word)
print(words)

tokens = preprocess(sample_doc)
print('\n\nlemmatized tokens: ')
print(tokens)

print('\n共',len(tokens),'字')

original document: 
In real-world applications of natural language generation, there are often constraints on the target sentences in addition to fluency and naturalness requirements. Existing language generation techniques are usually based on recurrent neural networks (RNNs). However, it is non-trivial to impose constraints on RNNs while maintaining generation quality, since RNNs generate sentences sequentially (or with beam search) from the first word to the last. In this paper, we propose CGMH, a novel approach using Metropolis-Hastings sampling for constrained sentence generation. CGMH allows complicated constraints such as the occurrence of multiple keywords in the target sentences, which cannot be handled in traditional RNN-based approaches. Moreover, CGMH works in the inference stage, and does not require parallel corpora for training. We evaluate our method on a variety of tasks, including keywords-to-sentence generation, unsupervised sentence paraphrasing, and unsupervised se

### 預處理的全部論文摘要

In [20]:
processed_docs = documents['abstract'].map(preprocess)
processed_docs[:10]

0    [consider, problem, actively, elicit, preferen...
1    [investigate, task, distractor, generation, mu...
2    [common, representation, formalisms, plan, des...
3    [statistical, relational, learn, model, powerf...
4    [multimodal, representation, learn, gain, deep...
5    [reinforcement, learn, show, advantage, image,...
6    [select, appropriate, tutor, help, action, acc...
7    [recognize, time, expressions, fundamental, im...
8    [face, large, scale, image, datasets, online, ...
9    [temporal, model, videos, fundamental, challen...
Name: abstract, dtype: object

## Bag of Words Corpus

### 產生bow資料集

In [45]:
# 產生字典
dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 active
1 actively
2 adaptive
3 aggregation
4 algorithm
5 allow
6 approach
7 bind
8 coefficients
9 collective
10 combinatorial


In [46]:
# 濾掉出現於少於10篇的字，或是，出現超過半篇的字
dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=100000)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 active
1 adaptive
2 aggregation
3 algorithm
4 allow
5 approach
6 bind
7 combinatorial
8 compare
9 consider
10 context


In [49]:
# 產生 bag of words corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print('共',len(bow_corpus),'筆')

共 1343 筆


### 查看其中一篇文章(sample document)的bow

In [51]:
c = 0

sample_doc_bow = bow_corpus[sample_doc_id]
for i in range(len(sample_doc_bow)):
    print("Word {} (\"{}\") appears {} time.".format(sample_doc_bow[i][0], 
                                               dictionary[sample_doc_bow[i][0]],
                                                     sample_doc_bow[i][1]))
    c = c + sample_doc_bow[i][1]
    
print('共',c,'字')   

Word 4 ("allow") appears 1 time.
Word 5 ("approach") appears 2 time.
Word 8 ("compare") appears 1 time.
Word 48 ("supervise") appears 1 time.
Word 68 ("generate") appears 1 time.
Word 69 ("generation") appears 6 time.
Word 80 ("multiple") appears 1 time.
Word 84 ("previous") appears 1 time.
Word 87 ("real") appears 1 time.
Word 88 ("recurrent") appears 1 time.
Word 92 ("sentence") appears 8 time.
Word 101 ("task") appears 1 time.
Word 104 ("word") appears 1 time.
Word 105 ("work") appears 1 time.
Word 121 ("require") appears 1 time.
Word 149 ("inference") appears 1 time.
Word 157 ("method") appears 1 time.
Word 161 ("parallel") appears 1 time.
Word 181 ("world") appears 1 time.
Word 187 ("code") appears 1 time.
Word 202 ("github") appears 1 time.
Word 204 ("https") appears 1 time.
Word 209 ("maintain") appears 1 time.
Word 243 ("methods") appears 1 time.
Word 245 ("quality") appears 1 time.
Word 291 ("achieve") appears 1 time.
Word 292 ("applications") appears 1 time.
Word 303 ("exist"

## TF-IDF Corpus

### 產生TF-IDF資料集

In [71]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# sample doc的tf-idf
corpus_tfidf[sample_doc_id]

[(4, 0.06456533739403816),
 (5, 0.05277587385667208),
 (8, 0.04913897524372444),
 (48, 0.062536059578974),
 (68, 0.04598531720453427),
 (69, 0.4254987778412788),
 (80, 0.05200141170584894),
 (84, 0.057795388137199435),
 (87, 0.04036477998522042),
 (88, 0.07410615121494404),
 (92, 0.5721890809552969),
 (101, 0.028386635685164065),
 (104, 0.06410039462378403),
 (105, 0.03621715719381838),
 (121, 0.051572532708930974),
 (149, 0.07032325005205249),
 (157, 0.03144752146410322),
 (161, 0.09809154909321473),
 (181, 0.04862799566264723),
 (187, 0.0888021733214704),
 (202, 0.10494765410829669),
 (204, 0.10494765410829669),
 (209, 0.0849103959395723),
 (243, 0.029994301765549916),
 (245, 0.06341832075443449),
 (291, 0.03606124357370797),
 (292, 0.053181211510352364),
 (303, 0.03278133908202476),
 (312, 0.1213096011195062),
 (316, 0.06319491134480174),
 (319, 0.030932718840479326),
 (329, 0.032105812921030846),
 (333, 0.07121826320485267),
 (358, 0.033195309869213596),
 (384, 0.09267999902656376)

## Train LDA models

In [77]:
num_topics = 20

### Running LDA using Bag of Words

In [78]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=num_topics, 
                                       id2word=dictionary, 
                                       passes=2, 
                                       workers=2)

In [74]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {}\nWords: {}\n'.format(idx, topic))

Topic: 0
Words: 0.018*"word" + 0.012*"network" + 0.011*"train" + 0.011*"data" + 0.010*"approach" + 0.010*"neural" + 0.007*"deep" + 0.007*"sample" + 0.006*"state" + 0.006*"performance"

Topic: 1
Words: 0.026*"network" + 0.015*"embed" + 0.015*"view" + 0.012*"structure" + 0.012*"data" + 0.012*"action" + 0.010*"space" + 0.009*"function" + 0.009*"approach" + 0.008*"graph"

Topic: 2
Words: 0.018*"task" + 0.016*"feature" + 0.013*"network" + 0.012*"text" + 0.010*"word" + 0.010*"level" + 0.009*"neural" + 0.009*"state" + 0.009*"approach" + 0.008*"information"

Topic: 3
Words: 0.020*"data" + 0.009*"feature" + 0.008*"information" + 0.008*"user" + 0.007*"performance" + 0.007*"methods" + 0.007*"recommendation" + 0.006*"task" + 0.006*"work" + 0.006*"algorithms"

Topic: 4
Words: 0.011*"task" + 0.010*"approach" + 0.010*"methods" + 0.010*"framework" + 0.009*"train" + 0.008*"user" + 0.007*"problems" + 0.007*"algorithm" + 0.007*"plan" + 0.007*"problem"

Topic: 5
Words: 0.015*"task" + 0.014*"network" + 0.0

### Running LDA using TF-IDF

In [79]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=num_topics, 
                                             id2word=dictionary, 
                                             passes=2, 
                                             workers=4)

In [80]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {}\nWord: {}\n'.format(idx, topic))

Topic: 0
Word: 0.006*"feature" + 0.005*"temporal" + 0.004*"spatial" + 0.004*"network" + 0.004*"data" + 0.004*"attention" + 0.004*"methods" + 0.004*"point" + 0.004*"human" + 0.003*"class"

Topic: 1
Word: 0.005*"market" + 0.005*"frame" + 0.004*"image" + 0.004*"game" + 0.004*"community" + 0.003*"train" + 0.003*"multi" + 0.003*"data" + 0.003*"condition" + 0.003*"agents"

Topic: 2
Word: 0.005*"causal" + 0.004*"face" + 0.004*"decision" + 0.004*"action" + 0.004*"word" + 0.004*"task" + 0.004*"objective" + 0.004*"agent" + 0.004*"graph" + 0.003*"data"

Topic: 3
Word: 0.004*"word" + 0.004*"network" + 0.004*"satisfiability" + 0.004*"check" + 0.004*"domains" + 0.004*"text" + 0.004*"domain" + 0.004*"change" + 0.004*"task" + 0.004*"post"

Topic: 4
Word: 0.006*"time" + 0.005*"network" + 0.004*"document" + 0.004*"game" + 0.004*"graph" + 0.004*"algorithms" + 0.004*"word" + 0.003*"rat" + 0.003*"image" + 0.003*"search"

Topic: 5
Word: 0.009*"hash" + 0.007*"vote" + 0.005*"network" + 0.005*"rank" + 0.004*"b

### Performance evaluation by classifying sample document

In [84]:
print(processed_docs[sample_doc_id])

['real', 'world', 'applications', 'natural', 'language', 'generation', 'constraints', 'target', 'sentence', 'addition', 'fluency', 'naturalness', 'requirements', 'exist', 'language', 'generation', 'techniques', 'usually', 'base', 'recurrent', 'neural', 'network', 'rnns', 'trivial', 'impose', 'constraints', 'rnns', 'maintain', 'generation', 'quality', 'rnns', 'generate', 'sentence', 'sequentially', 'beam', 'search', 'word', 'paper', 'propose', 'cgmh', 'novel', 'approach', 'metropolis', 'hastings', 'sample', 'constrain', 'sentence', 'generation', 'cgmh', 'allow', 'complicate', 'constraints', 'occurrence', 'multiple', 'keywords', 'target', 'sentence', 'handle', 'traditional', 'base', 'approach', 'cgmh', 'work', 'inference', 'stage', 'require', 'parallel', 'corpora', 'train', 'evaluate', 'method', 'variety', 'task', 'include', 'keywords', 'sentence', 'generation', 'unsupervised', 'sentence', 'paraphrase', 'unsupervised', 'sentence', 'error', 'correction', 'cgmh', 'achieve', 'high', 'perfor

In [89]:
for index, score in sorted(lda_model[bow_corpus[sample_doc_id]], key=lambda tup: -1*tup[1]):
    print('Topic id:',index)
    print("\nScore: {}\t \nTopic: {}\n".format(score, lda_model.print_topic(index, 30)))

Topic id: 18

Score: 0.619809091091156	 
Topic: 0.015*"task" + 0.012*"attention" + 0.012*"word" + 0.011*"train" + 0.009*"approach" + 0.009*"video" + 0.009*"state" + 0.009*"data" + 0.008*"sentence" + 0.008*"information" + 0.006*"network" + 0.006*"sequence" + 0.006*"question" + 0.006*"outperform" + 0.006*"novel" + 0.006*"method" + 0.006*"temporal" + 0.006*"structure" + 0.006*"problem" + 0.006*"generate" + 0.006*"work" + 0.005*"answer" + 0.005*"neural" + 0.005*"datasets" + 0.005*"experiment" + 0.005*"improve" + 0.005*"language" + 0.005*"target" + 0.005*"different" + 0.004*"embed"

Topic id: 13

Score: 0.25408852100372314	 
Topic: 0.013*"data" + 0.012*"approach" + 0.012*"network" + 0.010*"prediction" + 0.009*"performance" + 0.008*"sample" + 0.008*"feature" + 0.008*"state" + 0.007*"generate" + 0.007*"time" + 0.006*"problem" + 0.006*"neural" + 0.006*"process" + 0.005*"structure" + 0.005*"datasets" + 0.005*"methods" + 0.005*"temporal" + 0.005*"high" + 0.005*"demonstrate" + 0.005*"improve" + 0

In [90]:
for index, score in sorted(lda_model_tfidf[bow_corpus[sample_doc_id]], key=lambda tup: -1*tup[1]):
    print('Topic id:',index)
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 30)))

Topic id: 11

Score: 0.5399236083030701	 
Topic: 0.004*"temporal" + 0.004*"feature" + 0.004*"data" + 0.004*"sentence" + 0.004*"adversarial" + 0.004*"label" + 0.004*"instance" + 0.004*"neural" + 0.004*"shape" + 0.003*"network" + 0.003*"framework" + 0.003*"sample" + 0.003*"examples" + 0.003*"algorithm" + 0.003*"work" + 0.003*"match" + 0.003*"time" + 0.003*"scale" + 0.003*"general" + 0.003*"gradient" + 0.003*"problems" + 0.003*"detection" + 0.003*"relations" + 0.003*"train" + 0.003*"task" + 0.003*"distribution" + 0.003*"approach" + 0.003*"perturbations" + 0.003*"view" + 0.003*"different"
Topic id: 17

Score: 0.19564417004585266	 
Topic: 0.008*"plan" + 0.005*"network" + 0.005*"multi" + 0.005*"question" + 0.005*"approach" + 0.004*"reason" + 0.004*"answer" + 0.004*"feature" + 0.004*"problem" + 0.004*"algorithms" + 0.004*"video" + 0.004*"task" + 0.004*"deep" + 0.004*"information" + 0.004*"classification" + 0.003*"train" + 0.003*"knowledge" + 0.003*"method" + 0.003*"agent" + 0.003*"score" + 0.

## 找出相似的文件

### 以sample document為例