# 以LDA Topics分佈 計算文件相似度

- 載入其他程式處理好的語料和模型
- 本程式只計算similarity

In [1]:
import os
import numpy as np
import pandas as pd
from collections import Counter
from enum import Enum
from pprint import pprint

from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
cols = [color for name, color in mcolors.TABLEAU_COLORS.items()]

import nltk
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora, models

import pyLDAvis
import pyLDAvis.gensim
from gensim.models.ldamodel import LdaModel

In [2]:
class ContentType(Enum):
    """Kinds of per-paper text that ``get_contents`` can extract."""
    TIT = 'title'
    ABS = 'abstract'
    AUT = 'author'
    SEC = 'section'


def get_contents(content_type, dataset_path='../dataset'):
    """Read one field from every paper file in ``dataset_path``.

    Each regular file in the directory is treated as one paper whose lines
    are, in order: title (line 0), author (line 1), section (line 2) and
    abstract (line 3).

    Parameters
    ----------
    content_type : ContentType
        Which field to extract. Any value other than ``AUT``, ``SEC`` or
        ``ABS`` falls back to the title line.
    dataset_path : str, optional
        Directory holding one file per paper. Defaults to ``'../dataset'``.

    Returns
    -------
    list of str
        The stripped field of every regular file, in ``os.listdir`` order.

    Raises
    ------
    IndexError
        If a file has too few lines to contain the requested field.
    """
    # Position of the requested field inside each file; title is the fallback.
    line_index = {
        ContentType.AUT: 1,
        ContentType.SEC: 2,
        ContentType.ABS: 3,
    }.get(content_type, 0)

    all_contents = []
    # NOTE(review): os.listdir order is arbitrary, and the resulting position
    # is used as the document id elsewhere in this notebook. Presumably the
    # saved corpus was built with the same ordering - confirm before sorting.
    for file in os.listdir(dataset_path):
        file_path = os.path.join(dataset_path, file)
        if not os.path.isfile(file_path):
            # The entry exists but is a directory or other non-regular file.
            print(file_path + ' is not a regular file, skipped.')
            continue
        with open(file_path) as f:
            lines = f.readlines()
        if len(lines) <= line_index:
            # Fail with the offending file name instead of a bare IndexError.
            raise IndexError(
                '{} has only {} line(s); line {} is required'.format(
                    file_path, len(lines), line_index + 1
                )
            )
        all_contents.append(lines[line_index].strip())
    return all_contents


def get_all_titles():
    """Return the title of every paper in the dataset."""
    return get_contents(content_type=ContentType.TIT)

def get_all_authors():
    """Return the author line of every paper in the dataset."""
    return get_contents(content_type=ContentType.AUT)

def get_all_sections():
    """Return the section line of every paper in the dataset."""
    return get_contents(content_type=ContentType.SEC)

def get_all_abstracts():
    """Return the abstract of every paper in the dataset."""
    return get_contents(content_type=ContentType.ABS)

def preprocess(text):
    """Tokenise ``text`` and return its filtered, lemmatised tokens.

    Tokens come from ``gensim.utils.simple_preprocess``. A token is kept only
    if it is longer than three characters and is not a gensim stopword; each
    kept token is lemmatised first as a verb and then as a noun.
    """
    stopwords = gensim.parsing.preprocessing.STOPWORDS
    kept = (
        word
        for word in gensim.utils.simple_preprocess(text)
        if len(word) > 3 and word not in stopwords
    )
    return [
        wordnet_lemmatizer.lemmatize(
            wordnet_lemmatizer.lemmatize(word, pos='v'), pos='n'
        )
        for word in kept
    ]

def show_doc(head,doc_id):
    """Print a header followed by the title, section and abstract of a paper.

    The texts are read from the notebook-level lists ``titles``, ``sections``
    and ``contents``, all indexed by ``doc_id``.
    """
    print('[', head, ':', doc_id, ']\n')
    # Each field is followed by its own trailing blank-line separator.
    for field, trailer in ((titles, '\n'), (sections, '\n'), (contents, '\n\n')):
        print(field[doc_id], trailer)

### 取出所有摘要

In [3]:
# Read every abstract; the position in this list is used as the document id.
contents = get_all_abstracts()
print('共', len(contents), '篇論文\n')

# Tabular view of the abstracts with the document id as an explicit column.
documents = pd.DataFrame({'abstract': contents})
documents['index'] = documents.index
documents.head(10)

共 1343 篇論文



Unnamed: 0,abstract,index
0,We consider the problem of actively eliciting ...,0
1,We investigate the task of distractor generati...,1
2,The most common representation formalisms for ...,2
3,Statistical relational learning models are pow...,3
4,Multimodal representation learning is gaining ...,4
5,Reinforcement learning (RL) has shown its adva...,5
6,Selecting appropriate tutoring help actions th...,6
7,Recognizing time expressions is a fundamental ...,7
8,"When facing large-scale image datasets, online...",8
9,Temporal modeling in videos is a fundamental y...,9


### 載入字典

In [4]:
# Load the saved gensim dictionary rather than rebuilding it here.
dictionary = gensim.corpora.Dictionary.load('../corpus/dict_trigram_filtered.dict')
print('共', len(dictionary), '個字\n')

共 1586 個字



  args, varargs, keywords, defaults = inspect.getargspec(kallable)


### 載入 bow corpus

In [5]:
# Open the saved bag-of-words corpus (Matrix Market file).
bow_corpus = gensim.corpora.MmCorpus('../corpus/corpus_trigram_filtered.mm')
print('共', len(bow_corpus), '筆')

共 1343 筆


  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)


### 載入 TF-IDF Corpus

In [None]:
# Disabled: TF-IDF weighting is not applied in this notebook; the cells below
# pass the raw bag-of-words corpus to the LDA model.
#tfidf = models.TfidfModel(bow_corpus)
#corpus_tfidf = tfidf[bow_corpus]
#print('共',len(corpus_tfidf),'筆')

### 載入 LDA models

In [6]:
# The topic count is part of the saved model's file name.
num_topics = 10
file_name = '../models/lda_trigram_bow_filtered_topic_{}.model'.format(num_topics)
lda_model = LdaModel.load(file_name)

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)


### 讀入title和section

In [7]:
# Titles and sections are only used for display in show_doc.
titles, sections = get_all_titles(), get_all_sections()

### 測試範例

In [8]:
# Index of the query document whose most similar documents are searched for below.
sample_doc_id = 137

### Cosine Similarity

In [9]:
# Topic distribution of the query document as sparse (topic_id, probability) pairs.
lda_vec1 = lda_model[bow_corpus[sample_doc_id]]
print(lda_vec1)

# Cosine similarity between the query document and every document in the corpus.
cos_sim = []
for i, bow in enumerate(bow_corpus):
    # Inference is not deterministic between calls (the vectors printed for
    # this same document differ from cell to cell), so reuse lda_vec1 for the
    # query itself instead of inferring it a second time.
    lda_vec2 = lda_vec1 if i == sample_doc_id else lda_model[bow]
    cos_sim.append(gensim.matutils.cossim(lda_vec1, lda_vec2))

# The query document is trivially its own best match, so rank only the other
# documents; the last element of the ascending sort is the most similar one.
other_doc_ids = [i for i in range(len(cos_sim)) if i != sample_doc_id]
cos_sim_doc_id = sorted(other_doc_ids, key=lambda i: cos_sim[i])[-1]
cos_sim_doc_id

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

[(2, 0.09054613), (4, 0.901203)]


  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

423

### Hellinger distance
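- The Hellinger distance measures how far apart two probability distributions are (such as LDA topic distributions); 0 means identical, so a smaller distance means more similar documents.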

In [13]:
# Topic distribution of the query document as sparse (topic_id, probability) pairs.
lda_vec1 = lda_model[bow_corpus[sample_doc_id]]
print(lda_vec1)

# Dense form of the query vector; computed once because it never changes in the loop.
dense1 = gensim.matutils.sparse2full(lda_vec1, lda_model.num_topics)

# Hellinger distance between the query document and every document in the corpus.
hd_sim = []
for i, bow in enumerate(bow_corpus):
    if i == sample_doc_id:
        # Inference is not deterministic between calls, so compare the query
        # with its own vector rather than with a second inference.
        dense2 = dense1
    else:
        lda_vec2 = lda_model[bow]
        dense2 = gensim.matutils.sparse2full(lda_vec2, lda_model.num_topics)
    sim = np.sqrt(0.5 * ((np.sqrt(dense1) - np.sqrt(dense2))**2).sum())
    hd_sim.append(sim)

# Smaller distance means more similar. Exclude the query explicitly instead of
# assuming it always sorts to rank 0.
other_doc_ids = [i for i in range(len(hd_sim)) if i != sample_doc_id]
hd_sim_doc_id = min(other_doc_ids, key=lambda i: hd_sim[i])
hd_sim_doc_id

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

[(4, 0.92542166), (8, 0.06632753)]


  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kallable)
  args, varargs, keywords, defaults = inspect.getargspec(kalla

384

In [14]:
# Topics of the query document, strongest first, each with its top 30 terms.
for index, score in sorted(
    lda_model[bow_corpus[sample_doc_id]],
    key=lambda pair: pair[1],
    reverse=True,
):
    print('Topic id:', index)
    summary = "\nScore: {}\t \nTopic: {}\n".format(score, lda_model.print_topic(index, 30))
    print(summary)

Topic id: 4

Score: 0.9012150764465332	 
Topic: 0.047*"graph" + 0.016*"structure" + 0.015*"embed" + 0.014*"knowledge" + 0.010*"nod" + 0.009*"relation" + 0.008*"attribute" + 0.007*"knowledge_graph" + 0.007*"path" + 0.007*"community" + 0.007*"node" + 0.006*"edge" + 0.006*"entity" + 0.006*"deep" + 0.006*"loss" + 0.005*"good" + 0.005*"object" + 0.005*"dynamic" + 0.005*"interaction" + 0.005*"causal" + 0.005*"triple" + 0.005*"embeddings" + 0.005*"term" + 0.005*"flow" + 0.005*"large" + 0.004*"market" + 0.004*"unknown" + 0.004*"application" + 0.004*"population" + 0.004*"transition"

Topic id: 2

Score: 0.09053397178649902	 
Topic: 0.026*"user" + 0.013*"text" + 0.013*"attention" + 0.010*"content" + 0.010*"relation" + 0.009*"generate" + 0.008*"recommendation" + 0.008*"representation" + 0.007*"entity" + 0.007*"multi" + 0.007*"dataset" + 0.007*"interaction" + 0.006*"context" + 0.006*"sequence" + 0.006*"aspect" + 0.006*"word" + 0.006*"level" + 0.006*"sentiment" + 0.005*"item" + 0.005*"memory" + 0.0

  args, varargs, keywords, defaults = inspect.getargspec(kallable)


In [15]:
# Topics of the closest document by Hellinger distance, strongest first,
# each with its top 30 terms.
for index, score in sorted(
    lda_model[bow_corpus[hd_sim_doc_id]],
    key=lambda pair: pair[1],
    reverse=True,
):
    print('Topic id:', index)
    summary = "\nScore: {}\t \nTopic: {}\n".format(score, lda_model.print_topic(index, 30))
    print(summary)

Topic id: 4

Score: 0.9709571599960327	 
Topic: 0.047*"graph" + 0.016*"structure" + 0.015*"embed" + 0.014*"knowledge" + 0.010*"nod" + 0.009*"relation" + 0.008*"attribute" + 0.007*"knowledge_graph" + 0.007*"path" + 0.007*"community" + 0.007*"node" + 0.006*"edge" + 0.006*"entity" + 0.006*"deep" + 0.006*"loss" + 0.005*"good" + 0.005*"object" + 0.005*"dynamic" + 0.005*"interaction" + 0.005*"causal" + 0.005*"triple" + 0.005*"embeddings" + 0.005*"term" + 0.005*"flow" + 0.005*"large" + 0.004*"market" + 0.004*"unknown" + 0.004*"application" + 0.004*"population" + 0.004*"transition"



  args, varargs, keywords, defaults = inspect.getargspec(kallable)


In [16]:
# Print the query document followed by the best match under each measure.
for head, doc_id in (
    ('Sample Document', sample_doc_id),
    ('Most similar Document - Cos Similarity', cos_sim_doc_id),
    ('Most similar Document - Hellinger distance', hd_sim_doc_id),
):
    show_doc(head, doc_id)

[ Sample Document : 137 ]

Entity Alignment between Knowledge Graphs Using Attribute Embeddings 

AAAI Technical Track: AI and the Web 

The task of entity alignment between knowledge graphs aims to find entities in two knowledge graphs that represent the same real-world entity. Recently, embedding-based models are proposed for this task. Such models are built on top of a knowledge graph embedding model that learns entity embeddings to capture the semantic similarity between entities in the same knowledge graph. We propose to learn embeddings that can capture the similarity between entities in different knowledge graphs. Our proposed model helps align entities from different knowledge graphs, and hence enables the integration of multiple knowledge graphs. Our model exploits large numbers of attribute triples existing in the knowledge graphs and generates attribute character embeddings. The attribute character embedding shifts the entity embeddings from two knowledge graphs into the sam

In [None]:
# Scratch check: the same sparse vector with its (id, weight) pairs listed in
# opposite order, passed to cossim.
lda_vec1 = [(1, 0.1), (2, 0.8)]
lda_vec2 = list(reversed(lda_vec1))
sim = gensim.matutils.cossim(lda_vec1, lda_vec2)
sim