# 以LDA Topics分佈 計算文件相似度

### 參考資料
-

In [None]:
#!pip install pyldavis

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import pyLDAvis
import pyLDAvis.gensim
from gensim.models.ldamodel import LdaModel

## 自定義 data types and functions

In [15]:
class ContentType(Enum):
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'
    
def get_contents(content_type):
    all_contents = []
    dataset_path = '../dataset'
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if os.path.isfile(file_path):
            with open(file_path) as f:  
                line = f.readlines()
                if content_type == ContentType.AUT:
                    line = line[1]
                elif content_type == ContentType.SEC:
                    line = line[2]
                elif content_type == ContentType.ABS:
                    line = line[3]
                else:
                    line = line[0]
                line = line.strip()
                all_contents.append(line)
        else:
            print(file_path + ' does not exist.')
    return all_contents


def get_all_titles():
    return get_contents(ContentType.TIT)

def get_all_authors():        
    return get_contents(ContentType.AUT)

def get_all_sections():
    return get_contents(ContentType.SEC)

def get_all_abstracts():
    return get_contents(ContentType.ABS)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            token = wordnet_lemmatizer.lemmatize(token, pos='v')
            token = wordnet_lemmatizer.lemmatize(token, pos='n')
            result.append(token)
    return result

def show_doc(head,doc_id):
    print('[',head,':',doc_id,']\n')
    print(titles[doc_id],'\n')
    print(sections[doc_id],'\n')
    print(contents[doc_id],'\n\n')

### 取出所有摘要

In [3]:
contents = get_all_abstracts()
print('共',len(contents),'篇論文\n')

documents = pd.DataFrame(data=contents,columns=['abstract'])
documents['index'] = documents.index
documents[:10]

共 1343 篇論文



Unnamed: 0,abstract,index
0,We consider the problem of actively eliciting ...,0
1,We investigate the task of distractor generati...,1
2,The most common representation formalisms for ...,2
3,Statistical relational learning models are pow...,3
4,Multimodal representation learning is gaining ...,4
5,Reinforcement learning (RL) has shown its adva...,5
6,Selecting appropriate tutoring help actions th...,6
7,Recognizing time expressions is a fundamental ...,7
8,"When facing large-scale image datasets, online...",8
9,Temporal modeling in videos is a fundamental y...,9


### 預處理的全部論文摘要

In [4]:
processed_docs = documents['abstract'].map(preprocess)
processed_docs[:10]

0    [consider, problem, actively, elicit, preferen...
1    [investigate, task, distractor, generation, mu...
2    [common, representation, formalism, plan, desc...
3    [statistical, relational, learn, model, powerf...
4    [multimodal, representation, learn, gain, deep...
5    [reinforcement, learn, show, advantage, image,...
6    [select, appropriate, tutor, help, action, acc...
7    [recognize, time, expression, fundamental, imp...
8    [face, large, scale, image, datasets, online, ...
9    [temporal, model, video, fundamental, challeng...
Name: abstract, dtype: object

## Dataset

### 產生字典

In [5]:
dictionary = gensim.corpora.Dictionary(processed_docs)
print('共',len(dictionary),'個字\n')

共 6927 個字



### 產生 bag of words corpus

In [6]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
print('共',len(bow_corpus),'筆')

共 1343 筆


### 產生TF-IDF Corpus

In [7]:
tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]
print('共',len(corpus_tfidf),'筆')

共 1343 筆


## Load LDA models

In [121]:
num_topics = '30'
file_name = '../models/lda_tfidf_topic_' + num_topics + '.model'
lda_model = models.ldamodel.LdaModel.load(file_name)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)


### 找出最相似的文件

In [122]:
titles = get_all_titles()
sections = get_all_sections()

In [123]:
sample_doc_id = 137

### Cosine Similarity

In [154]:
cos_sim = []
dd = corpus_tfidf[sample_doc_id]
#lda_vec1 = lda_model[dd]
lda_vec1 = np.array([tup[1] for tup in lda_mode.get_document_topics(bow=dd)])
print(lda_vec1)

for i in range(len(corpus_tfidf)):
    lda_vec2 = lda_model[corpus_tfidf[i]]
    sim = gensim.matutils.cossim(lda_vec1, lda_vec2)
    cos_sim.append(sim)

cos_sim_doc_id = sorted(range(len(cos_sim)), key=lambda i: cos_sim[i])[-2:]
cos_sim_doc_id

NameError: name 'lda_mode' is not defined

### Hellinger distance
- is useful for similarity between probability distributions (such as LDA topics)

In [153]:
hd_sim = []
lda_vec1 = lda_model[corpus_tfidf[sample_doc_id]]
print(lda_vec1)
for i in range(len(corpus_tfidf)):
    lda_vec2 = lda_model[corpus_tfidf[i]]
    dense1 = gensim.matutils.sparse2full(lda_vec1, lda_model.num_topics)
    dense2 = gensim.matutils.sparse2full(lda_vec2, lda_model.num_topics)
    sim = np.sqrt(0.5 * ((np.sqrt(dense1) - np.sqrt(dense2))**2).sum())
    hd_sim.append(sim)

hd_sim_doc_id = sorted(range(len(docs_sim)), key=lambda i: docs_sim[i])[1]
hd_sim_doc_id

[(15, 0.055832896), (27, 0.75282466)]


869

In [126]:
for index, score in sorted(lda_model[corpus_tfidf[sample_doc_id]], key=lambda tup: -1*tup[1]):
    print('Topic id:',index)
    print("\nScore: {}\t \nTopic: {}\n".format(score, lda_model.print_topic(index, 30)))

Topic id: 27

Score: 0.7513797879219055	 
Topic: 0.006*"model" + 0.005*"learn" + 0.005*"network" + 0.005*"data" + 0.005*"feature" + 0.004*"method" + 0.004*"image" + 0.004*"label" + 0.004*"task" + 0.004*"agent" + 0.004*"algorithm" + 0.004*"train" + 0.004*"problem" + 0.004*"approach" + 0.004*"domain" + 0.004*"information" + 0.003*"representation" + 0.003*"neural" + 0.003*"base" + 0.003*"word" + 0.003*"time" + 0.003*"graph" + 0.003*"multi" + 0.003*"text" + 0.003*"performance" + 0.003*"knowledge" + 0.003*"deep" + 0.003*"sample" + 0.003*"video" + 0.003*"structure"

Topic id: 15

Score: 0.05764486640691757	 
Topic: 0.003*"customer" + 0.003*"dialogue" + 0.003*"check" + 0.002*"character" + 0.002*"macro" + 0.002*"provider" + 0.002*"dispersal" + 0.002*"acoustic" + 0.002*"gold" + 0.002*"removal" + 0.002*"speech" + 0.002*"attack" + 0.002*"forbid" + 0.002*"worker" + 0.002*"daml" + 0.002*"tamp" + 0.002*"peer" + 0.002*"reid" + 0.002*"satisfiability" + 0.002*"clip" + 0.002*"market" + 0.001*"disentangl

In [127]:
for index, score in sorted(lda_model[corpus_tfidf[hd_sim_doc_id]], key=lambda tup: -1*tup[1]):
    print('Topic id:',index)
    print("\nScore: {}\t \nTopic: {}\n".format(score, lda_model.print_topic(index, 30)))

Topic id: 13

Score: 0.4843583405017853	 
Topic: 0.004*"reader" + 0.003*"circuit" + 0.003*"generator" + 0.002*"traffic" + 0.002*"count" + 0.002*"sans" + 0.002*"communicate" + 0.002*"criterion" + 0.002*"norm" + 0.002*"pddl" + 0.002*"rgnn" + 0.002*"logistic" + 0.002*"boaf" + 0.002*"bdcmf" + 0.002*"milp" + 0.002*"fluid" + 0.002*"blind" + 0.002*"spot" + 0.002*"mtas" + 0.002*"create" + 0.002*"planner" + 0.002*"kernel" + 0.002*"vmcqs" + 0.002*"scene" + 0.002*"activation" + 0.002*"subgoals" + 0.002*"imvc" + 0.002*"independence" + 0.002*"bone" + 0.002*"molecule"

Topic id: 27

Score: 0.33079037070274353	 
Topic: 0.006*"model" + 0.005*"learn" + 0.005*"network" + 0.005*"data" + 0.005*"feature" + 0.004*"method" + 0.004*"image" + 0.004*"label" + 0.004*"task" + 0.004*"agent" + 0.004*"algorithm" + 0.004*"train" + 0.004*"problem" + 0.004*"approach" + 0.004*"domain" + 0.004*"information" + 0.003*"representation" + 0.003*"neural" + 0.003*"base" + 0.003*"word" + 0.003*"time" + 0.003*"graph" + 0.003*"mul

In [128]:
show_doc('Sample Document',sample_doc_id)
show_doc('Most similar Document - Cos Similarity',cos_sim_doc_id)
show_doc('Most similar Document - Hellinger distance',hd_sim_doc_id)

[ Sample Document : 137 ]

Entity Alignment between Knowledge Graphs Using Attribute Embeddings 

AAAI Technical Track: AI and the Web 

The task of entity alignment between knowledge graphs aims to find entities in two knowledge graphs that represent the same real-world entity. Recently, embedding-based models are proposed for this task. Such models are built on top of a knowledge graph embedding model that learns entity embeddings to capture the semantic similarity between entities in the same knowledge graph. We propose to learn embeddings that can capture the similarity between entities in different knowledge graphs. Our proposed model helps align entities from different knowledge graphs, and hence enables the integration of multiple knowledge graphs. Our model exploits large numbers of attribute triples existing in the knowledge graphs and generates attribute character embeddings. The attribute character embedding shifts the entity embeddings from two knowledge graphs into the sam