# 以LDA模型計算文件相似度

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

## 自定義 data types and functions

In [8]:
class ContentType(Enum):
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'
    
def get_contents(content_type):
    all_contents = []
    dataset_path = '../dataset'
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if os.path.isfile(file_path):
            with open(file_path) as f:  
                line = f.readlines()
                if content_type == ContentType.AUT:
                    line = line[1]
                elif content_type == ContentType.SEC:
                    line = line[2]
                elif content_type == ContentType.ABS:
                    line = line[3]
                else:
                    line = line[0]
                line = line.strip()
                all_contents.append(line)
        else:
            print(file_path + ' does not exist.')
    return all_contents


def get_all_titles():
    return get_contents(ContentType.TIT)

def get_all_authors():        
    return get_contents(ContentType.AUT)

def get_all_sections():
    return get_contents(ContentType.SEC)

def get_all_abstracts():
    return get_contents(ContentType.ABS)

### 預處理
- 以gensim的simple_preprocess處理：斷詞，統一小寫，去標點
- 去除stopword
- lemmatize,stemming

In [9]:
def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            token = wordnet_lemmatizer.lemmatize(token, pos='v')
            token = wordnet_lemmatizer.lemmatize(token, pos='n')
            result.append(token)
    return result

### 取出所有摘要

In [4]:
contents = get_all_abstracts()
print('共',len(contents),'篇論文\n')

共 1343 篇論文



In [5]:
documents = pd.DataFrame(data=contents,columns=['abstract'])
documents['index'] = documents.index
documents[:10]

Unnamed: 0,abstract,index
0,We consider the problem of actively eliciting ...,0
1,We investigate the task of distractor generati...,1
2,The most common representation formalisms for ...,2
3,Statistical relational learning models are pow...,3
4,Multimodal representation learning is gaining ...,4
5,Reinforcement learning (RL) has shown its adva...,5
6,Selecting appropriate tutoring help actions th...,6
7,Recognizing time expressions is a fundamental ...,7
8,"When facing large-scale image datasets, online...",8
9,Temporal modeling in videos is a fundamental y...,9


### 取其中一篇，比較預處理前後

In [10]:
sample_doc_id = 1001

print('original document: ')
print(contents[sample_doc_id])

sample_doc = documents[documents['index'] == sample_doc_id].values[0][0]
print('\n\noriginal tokens: ')

words = []
for word in sample_doc.split(' '):
    words.append(word)
print(words)

tokens = preprocess(sample_doc)
print('\n\nlemmatized tokens: ')
print(tokens)

print('\n共',len(tokens),'字')

original document: 
In real-world applications of natural language generation, there are often constraints on the target sentences in addition to fluency and naturalness requirements. Existing language generation techniques are usually based on recurrent neural networks (RNNs). However, it is non-trivial to impose constraints on RNNs while maintaining generation quality, since RNNs generate sentences sequentially (or with beam search) from the first word to the last. In this paper, we propose CGMH, a novel approach using Metropolis-Hastings sampling for constrained sentence generation. CGMH allows complicated constraints such as the occurrence of multiple keywords in the target sentences, which cannot be handled in traditional RNN-based approaches. Moreover, CGMH works in the inference stage, and does not require parallel corpora for training. We evaluate our method on a variety of tasks, including keywords-to-sentence generation, unsupervised sentence paraphrasing, and unsupervised se

### 預處理的全部論文摘要

In [8]:
processed_docs = documents['abstract'].map(preprocess)
processed_docs[:10]

0    [consider, problem, actively, elicit, preferen...
1    [investigate, task, distractor, generation, mu...
2    [common, representation, formalisms, plan, des...
3    [statistical, relational, learn, model, powerf...
4    [multimodal, representation, learn, gain, deep...
5    [reinforcement, learn, show, advantage, image,...
6    [select, appropriate, tutor, help, action, acc...
7    [recognize, time, expressions, fundamental, im...
8    [face, large, scale, image, datasets, online, ...
9    [temporal, model, videos, fundamental, challen...
Name: abstract, dtype: object

## Bag of Words Corpus

### 產生bow資料集

In [9]:
# 產生字典
dictionary = gensim.corpora.Dictionary(processed_docs)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 active
1 actively
2 adaptive
3 aggregation
4 algorithm
5 allow
6 approach
7 bind
8 coefficients
9 collective
10 combinatorial


In [10]:
# 濾掉出現於少於10篇的字，或是，出現超過半篇的字
#dictionary.filter_extremes(no_below=10, no_above=0.4, keep_n=100000)
dictionary.filter_extremes(no_above=0.1, keep_n=100000)

count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break

0 active
1 actively
2 adaptive
3 aggregation
4 allow
5 bind
6 coefficients
7 collective
8 combinatorial
9 context
10 control


In [11]:
# 產生 bag of words corpus
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

print('共',len(bow_corpus),'筆')

共 1343 筆


### 查看其中一篇文章(sample document)的bow

In [12]:
c = 0

sample_doc_bow = bow_corpus[sample_doc_id]
for i in range(len(sample_doc_bow)):
    print("Word {} (\"{}\") appears {} time.".format(sample_doc_bow[i][0], 
                                               dictionary[sample_doc_bow[i][0]],
                                                     sample_doc_bow[i][1]))
    c = c + sample_doc_bow[i][1]
    
print('共',c,'字')   

Word 4 ("allow") appears 1 time.
Word 42 ("supervise") appears 1 time.
Word 62 ("generation") appears 6 time.
Word 76 ("recurrent") appears 1 time.
Word 80 ("sentence") appears 8 time.
Word 90 ("word") appears 1 time.
Word 133 ("inference") appears 1 time.
Word 142 ("parallel") appears 1 time.
Word 167 ("code") appears 1 time.
Word 179 ("github") appears 1 time.
Word 181 ("https") appears 1 time.
Word 187 ("maintain") appears 1 time.
Word 220 ("quality") appears 1 time.
Word 282 ("language") appears 2 time.
Word 286 ("natural") appears 1 time.
Word 303 ("addition") appears 1 time.
Word 354 ("constrain") appears 1 time.
Word 405 ("target") appears 2 time.
Word 423 ("handle") appears 1 time.
Word 439 ("search") appears 1 time.
Word 472 ("techniques") appears 1 time.
Word 473 ("usually") appears 1 time.
Word 567 ("error") appears 1 time.
Word 590 ("variety") appears 1 time.
Word 809 ("paraphrase") appears 1 time.
Word 815 ("traditional") appears 1 time.
Word 922 ("release") appears 1 time

## TF-IDF Corpus

### 產生TF-IDF資料集

In [13]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

# sample doc的tf-idf
corpus_tfidf[sample_doc_id]

[(4, 0.06068211250153376),
 (42, 0.05877488379893954),
 (62, 0.39990753163188436),
 (76, 0.06964910254609999),
 (80, 0.5377752767057431),
 (90, 0.060245133301391904),
 (133, 0.06609372061495924),
 (142, 0.092191919964687),
 (167, 0.08346124545105761),
 (179, 0.0986356706309067),
 (181, 0.0986356706309067),
 (187, 0.07980353556443551),
 (220, 0.05960408215932745),
 (282, 0.11401354286626544),
 (286, 0.059394109510280506),
 (303, 0.06693490399633174),
 (354, 0.08710584277208924),
 (405, 0.11634215221695465),
 (423, 0.06554871589335923),
 (439, 0.0642374944326745),
 (472, 0.06528073869748165),
 (473, 0.07307049690178523),
 (567, 0.07234651210975725),
 (590, 0.08290371079403613),
 (809, 0.12139029454401536),
 (815, 0.0738167616122096),
 (922, 0.0986356706309067),
 (985, 0.10743674197036336),
 (1028, 0.09760352807811248),
 (1068, 0.08915650228883412),
 (1140, 0.10202513385693786),
 (1249, 0.317899477403733),
 (1276, 0.23517124829678981),
 (1296, 0.14541160593934263),
 (1305, 0.1045803043649

## Train LDA models

In [14]:
num_topics = 20

### Running LDA using Bag of Words

In [15]:
lda_model = gensim.models.LdaMulticore(bow_corpus, 
                                       num_topics=num_topics, 
                                       id2word=dictionary, 
                                       passes=2, 
                                       workers=2)

In [16]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {}\nWords: {}\n'.format(idx, topic))

Topic: 0
Words: 0.008*"graph" + 0.007*"attack" + 0.006*"representations" + 0.006*"optimal" + 0.005*"latent" + 0.005*"sentence" + 0.005*"news" + 0.005*"point" + 0.005*"view" + 0.005*"word"

Topic: 1
Words: 0.011*"plan" + 0.011*"agents" + 0.009*"video" + 0.009*"action" + 0.008*"agent" + 0.008*"noise" + 0.006*"local" + 0.006*"view" + 0.006*"temporal" + 0.005*"representations"

Topic: 2
Words: 0.011*"graph" + 0.008*"temporal" + 0.008*"human" + 0.008*"spatial" + 0.007*"prediction" + 0.006*"sequence" + 0.005*"layer" + 0.005*"code" + 0.005*"search" + 0.005*"traffic"

Topic: 3
Words: 0.016*"action" + 0.014*"game" + 0.007*"agents" + 0.007*"question" + 0.006*"class" + 0.006*"plan" + 0.006*"mechanism" + 0.005*"answer" + 0.005*"value" + 0.005*"text"

Topic: 4
Words: 0.010*"decision" + 0.009*"agents" + 0.007*"agent" + 0.007*"user" + 0.007*"game" + 0.007*"make" + 0.006*"value" + 0.006*"prediction" + 0.005*"sequence" + 0.005*"risk"

Topic: 5
Words: 0.008*"decision" + 0.008*"plan" + 0.007*"agents" + 0

### Running LDA using TF-IDF

In [17]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, 
                                             num_topics=num_topics, 
                                             id2word=dictionary, 
                                             passes=2, 
                                             workers=4)

In [18]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {}\nWord: {}\n'.format(idx, topic))

Topic: 0
Word: 0.005*"domain" + 0.005*"search" + 0.004*"target" + 0.004*"source" + 0.004*"rule" + 0.004*"metric" + 0.004*"review" + 0.003*"cluster" + 0.003*"domains" + 0.003*"spectral"

Topic: 1
Word: 0.006*"graph" + 0.005*"vote" + 0.004*"word" + 0.004*"embed" + 0.004*"spatial" + 0.003*"question" + 0.003*"caption" + 0.003*"cost" + 0.003*"nod" + 0.003*"position"

Topic: 2
Word: 0.005*"sentence" + 0.004*"embed" + 0.004*"program" + 0.003*"answer" + 0.003*"representations" + 0.003*"risk" + 0.003*"word" + 0.003*"bound" + 0.003*"semantic" + 0.003*"spatial"

Topic: 3
Word: 0.005*"face" + 0.005*"translation" + 0.004*"cluster" + 0.004*"hash" + 0.004*"online" + 0.003*"sequence" + 0.003*"workers" + 0.003*"agents" + 0.003*"game" + 0.003*"instance"

Topic: 4
Word: 0.004*"text" + 0.004*"policy" + 0.003*"document" + 0.003*"bayesian" + 0.003*"discount" + 0.003*"gradient" + 0.003*"estimators" + 0.003*"experience" + 0.003*"approximation" + 0.003*"long"

Topic: 5
Word: 0.010*"plan" + 0.004*"graph" + 0.00

### Performance evaluation by classifying sample document

In [19]:
print(processed_docs[sample_doc_id])

['real', 'world', 'applications', 'natural', 'language', 'generation', 'constraints', 'target', 'sentence', 'addition', 'fluency', 'naturalness', 'requirements', 'exist', 'language', 'generation', 'techniques', 'usually', 'base', 'recurrent', 'neural', 'network', 'rnns', 'trivial', 'impose', 'constraints', 'rnns', 'maintain', 'generation', 'quality', 'rnns', 'generate', 'sentence', 'sequentially', 'beam', 'search', 'word', 'paper', 'propose', 'cgmh', 'novel', 'approach', 'metropolis', 'hastings', 'sample', 'constrain', 'sentence', 'generation', 'cgmh', 'allow', 'complicate', 'constraints', 'occurrence', 'multiple', 'keywords', 'target', 'sentence', 'handle', 'traditional', 'base', 'approach', 'cgmh', 'work', 'inference', 'stage', 'require', 'parallel', 'corpora', 'train', 'evaluate', 'method', 'variety', 'task', 'include', 'keywords', 'sentence', 'generation', 'unsupervised', 'sentence', 'paraphrase', 'unsupervised', 'sentence', 'error', 'correction', 'cgmh', 'achieve', 'high', 'perfor

In [20]:
for index, score in sorted(lda_model[bow_corpus[sample_doc_id]], key=lambda tup: -1*tup[1]):
    print('Topic id:',index)
    print("\nScore: {}\t \nTopic: {}\n".format(score, lda_model.print_topic(index, 30)))

Topic id: 16

Score: 0.9846753478050232	 
Topic: 0.013*"sentence" + 0.013*"query" + 0.012*"language" + 0.007*"translation" + 0.006*"document" + 0.006*"domain" + 0.006*"pattern" + 0.006*"sequence" + 0.006*"natural" + 0.005*"generation" + 0.005*"word" + 0.005*"layer" + 0.005*"paraphrase" + 0.005*"quality" + 0.005*"hide" + 0.004*"attack" + 0.004*"target" + 0.004*"action" + 0.004*"human" + 0.004*"adversarial" + 0.004*"systems" + 0.004*"examples" + 0.004*"understand" + 0.004*"decoder" + 0.004*"answer" + 0.004*"saliency" + 0.003*"video" + 0.003*"allow" + 0.003*"domains" + 0.003*"encoder"



In [21]:
for index, score in sorted(lda_model_tfidf[bow_corpus[sample_doc_id]], key=lambda tup: -1*tup[1]):
    print('Topic id:',index)
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 30)))

Topic id: 1

Score: 0.5217251181602478	 
Topic: 0.006*"graph" + 0.005*"vote" + 0.004*"word" + 0.004*"embed" + 0.004*"spatial" + 0.003*"question" + 0.003*"caption" + 0.003*"cost" + 0.003*"nod" + 0.003*"position" + 0.003*"class" + 0.003*"cluster" + 0.003*"bound" + 0.003*"abstraction" + 0.003*"local" + 0.003*"convolutional" + 0.003*"optimization" + 0.003*"video" + 0.003*"convex" + 0.003*"sentence" + 0.003*"underlie" + 0.003*"generation" + 0.003*"weight" + 0.003*"set" + 0.002*"predict" + 0.002*"recurrent" + 0.002*"inference" + 0.002*"dependency" + 0.002*"general" + 0.002*"decision"
Topic id: 17

Score: 0.32334885001182556	 
Topic: 0.006*"word" + 0.005*"query" + 0.005*"sentence" + 0.005*"language" + 0.004*"graph" + 0.003*"expert" + 0.003*"embed" + 0.003*"english" + 0.003*"video" + 0.003*"flow" + 0.003*"fair" + 0.003*"layer" + 0.003*"decision" + 0.003*"decoder" + 0.002*"loss" + 0.002*"margin" + 0.002*"paraphrase" + 0.002*"attack" + 0.002*"attribute" + 0.002*"computational" + 0.002*"post" + 0

## 找出最相似的文件

### 以sample document為例

In [22]:
# 方式一：cos相似
cos_sim = []
vec_lda1 = lda_model[bow_corpus[sample_doc_id]]
for i in range(len(bow_corpus)):
    vec_lda2 = lda_model[bow_corpus[i]]
    sim = gensim.matutils.cossim(vec_lda1, vec_lda2)
    cos_sim.append(sim)

top2_sim_index = sorted(range(len(cos_sim)), key=lambda i: cos_sim[i])[-2:]
cos_most_sim_id = top2_sim_index[0]

In [23]:
# 方式二：Hellinger distance is useful for similarity between probability distributions (such as LDA topics)
hd_sim = []
vec_lda1 = lda_model[bow_corpus[sample_doc_id]]
for i in range(len(bow_corpus)):
    vec_lda2 = lda_model[bow_corpus[i]]
    dense1 = gensim.matutils.sparse2full(vec_lda1, lda_model.num_topics)
    dense2 = gensim.matutils.sparse2full(vec_lda2, lda_model.num_topics)
    sim = np.sqrt(0.5 * ((np.sqrt(dense1) - np.sqrt(dense2))**2).sum())
    hd_sim.append(sim)

top1_sim_index = sorted(range(len(hd_sim)), key=lambda i: hd_sim[i])[-1:]
hd_most_sim_id = top1_sim_index[0]

In [24]:
titles = get_all_titles()
sections = get_all_sections()

In [25]:
def show_doc(head,doc_id):
    print('[',head,':',doc_id,']\n')
    print(titles[doc_id],'\n')
    print(sections[doc_id],'\n')
    print(contents[doc_id],'\n\n')

In [26]:
#documents.loc[sample_doc_id,'abstract']

show_doc('Sample Document',sample_doc_id)
show_doc('Most similar Document(Cos Similarity)',cos_most_sim_id)
show_doc('Most similar Document(Hellinger distance)',hd_most_sim_id)

[ Sample Document : 1001 ]

CGMH: Constrained Sentence Generation by Metropolis-Hastings Sampling 

AAAI Technical Track: Natural Language Processing 

In real-world applications of natural language generation, there are often constraints on the target sentences in addition to fluency and naturalness requirements. Existing language generation techniques are usually based on recurrent neural networks (RNNs). However, it is non-trivial to impose constraints on RNNs while maintaining generation quality, since RNNs generate sentences sequentially (or with beam search) from the first word to the last. In this paper, we propose CGMH, a novel approach using Metropolis-Hastings sampling for constrained sentence generation. CGMH allows complicated constraints such as the occurrence of multiple keywords in the target sentences, which cannot be handled in traditional RNN-based approaches. Moreover, CGMH works in the inference stage, and does not require parallel corpora for training. We evaluate 

### small test
- gensim.utils.simple_preprocess(doc,deacc=True)
- deacc=True 是去除上標，不是去除標點符號，預設就已經會去除標點符號了

In [15]:
s = 'In real-world applications of natural language generation, there are often constraints on the target sentences in addition to fluency and naturalness requirements. Existing language generation techniques are usually based on recurrent neural networks (RNNs). However, it is non-trivial to impose constraints on RNNs while maintaining generation quality, since RNNs generate sentences sequentially (or with beam search) from the first word to the last. In this paper, we propose CGMH, a novel approach using Metropolis-Hastings sampling for constrained sentence generation. CGMH allows complicated constraints such as the occurrence of multiple keywords in the target sentences, which cannot be handled in traditional RNN-based approaches. Moreover, CGMH works in the inference stage, and does not require parallel corpora for training. We evaluate our method on a variety of tasks, including keywords-to-sentence generation, unsupervised sentence paraphrasing, and unsupervised sentence error correction. CGMH achieves high performance compared with previous supervised methods for sentence generation. Our code is released at https://github.com/NingMiao/CGMH'
r = []
for token in gensim.utils.simple_preprocess(s,deacc=True):#,deacc=True
    r.append(token)
print(r)

['in', 'real', 'world', 'applications', 'of', 'natural', 'language', 'generation', 'there', 'are', 'often', 'constraints', 'on', 'the', 'target', 'sentences', 'in', 'addition', 'to', 'fluency', 'and', 'naturalness', 'requirements', 'existing', 'language', 'generation', 'techniques', 'are', 'usually', 'based', 'on', 'recurrent', 'neural', 'networks', 'rnns', 'however', 'it', 'is', 'non', 'trivial', 'to', 'impose', 'constraints', 'on', 'rnns', 'while', 'maintaining', 'generation', 'quality', 'since', 'rnns', 'generate', 'sentences', 'sequentially', 'or', 'with', 'beam', 'search', 'from', 'the', 'first', 'word', 'to', 'the', 'last', 'in', 'this', 'paper', 'we', 'propose', 'cgmh', 'novel', 'approach', 'using', 'metropolis', 'hastings', 'sampling', 'for', 'constrained', 'sentence', 'generation', 'cgmh', 'allows', 'complicated', 'constraints', 'such', 'as', 'the', 'occurrence', 'of', 'multiple', 'keywords', 'in', 'the', 'target', 'sentences', 'which', 'cannot', 'be', 'handled', 'in', 'tradit