# Ontology Mapping by ensembling several mapping strategies
1. concept label mapping
2. label synonyms mapping
3. Jaccard distances between related words of concepts
4. Wasserstein distances between embeddings of related words of concepts
5. Wasserstein distances between persistent homology graphs of embeddings of related words of concepts
6. Wasserstein distances between the average embeddings of the rest concepts

In [2]:
# import libraries
import pandas as pd
import numpy as np
import scipy as sp
import json

from rdflib import Graph, URIRef, RDFS
from gensim.models.fasttext import FastText
from gensim.models.fasttext import load_facebook_model
from gensim.models.fasttext import load_facebook_vectors

import corpus_build_utils
from corpus_build_utils import clean_document_lower
from AlignmentFormat import serialize_mapping_to_tmp_file

import sys
import logging
from collections import defaultdict

import jellyfish
import ot

from xml.dom import minidom
from nltk.corpus import wordnet

import importlib

import matplotlib.pyplot as plt
%matplotlib inline

import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

from gtda.homology import VietorisRipsPersistence

from tqdm import tqdm

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.info("logging info test")

INFO:root:logging info test
INFO:root:logging info test


# Load pre-trained Fasttext embeddings

In [9]:
importlib.reload(maponto)

<module 'OTMapOnto' from '/Users/yan/Google Drive/ontology-mapping/src/OTMapOnto.py'>

In [39]:
importlib.reload(mapneighbor)

INFO:root:logging info test


<module 'OTNeighborhood_TDA' from '/Users/yan/Google Drive/ontology-mapping/src/OTNeighborhood_TDA.py'>

In [None]:
%%time
model_path="../model/crawl-300d-2M-subword.bin"
embs_model = maponto.load_embeddings(model_path, None)

INFO:root:Load pre-trained embeddings for about 10 mins if the model hasn't been load yet.
INFO:root:Loading the pre-trained Fasttext model...Please be patient...It may take about 10 mins to load...
INFO:gensim.models._fasttext_bin:loading 2000000 words for fastText model from ../model/crawl-300d-2M-subword.bin


Loading the pre-trained Fasttext model...Please be patient...It may take a while to load...


ERROR:gensim.models._fasttext_bin:failed to decode invalid unicode bytes b'DeutschHrvatskiEnglishDanskNederlandssuomiFran\xc3\xa7ais\xce\x95\xce\xbb\xce\xbb\xce'; replacing invalid characters, using 'DeutschHrvatskiEnglishDanskNederlandssuomiFrançaisΕλλ\\xce'
ERROR:gensim.models._fasttext_bin:failed to decode invalid unicode bytes b'\xe3\x81\x99\xe3\x81\xb9\xe3\x81\xa6\xe3\x81\xae\xe5\x9b\x9e\xe7\xad\x94\xe3\x82\x92\xe9\x9d\x9e\xe8\xa1\xa8\xe7\xa4\xba\xe3\x81\xab\xe3\x81\x99\xe3\x82\x8b\xe8\xb3\xaa\xe5\x95\x8f\xe3\x82\x92\xe5\x89\x8a\xe9\x99\xa4\xe3\x81\x97\xe3'; replacing invalid characters, using 'すべての回答を非表示にする質問を削除し\\xe3'
ERROR:gensim.models._fasttext_bin:failed to decode invalid unicode bytes b'00Z\xe9\x83\xa8\xe5\xb1\x8b\xe3\x82\xbf\xe3\x82\xa4\xe3\x83\x97\xe3\x81\xbe\xe3\x82\x8b\xe3\x81\xbe\xe3\x82\x8b\xe8\xb2\xb8\xe5\x88\x87\xe5\xbb\xba\xe7\x89\xa9\xe3\x82\xbf\xe3\x82\xa4\xe3\x83\x97\xe4\xb8\x80\xe8\xbb\x92\xe5'; replacing invalid characters, using '00Z部屋タイプまるまる貸切建物タイプ一軒\\xe5'
E

# Load com.owl and Conference.owl RDF Graphs

In [18]:
cmt_url = "../data/conference/cmt.owl"
conference_url = "../data/conference/Conference.owl"
cmt_graph = Graph().parse(cmt_url)
conference_graph = Graph().parse(conference_url)

In [19]:
cmt_conference_url = "../data/conference/reference-alignment/cmt-conference.rdf"

In [20]:
cmtlabel_uris = maponto.extract_label_uris(cmt_graph)
conferencelabel_uris = maponto.extract_label_uris(conference_graph)

In [21]:
cmtlabel_clnd_uris = maponto.clean_labels(cmtlabel_uris, rmStopWords=True)
conferencelabel_clnd_uris = maponto.clean_labels(conferencelabel_uris, rmStopWords=True)
cmtlabel_clnd_uris.columns, conferencelabel_clnd_uris.columns

(Index(['label', 'uri', 'clndLabel'], dtype='object'),
 Index(['label', 'uri', 'clndLabel'], dtype='object'))

# Mapping Functions with Various Mapping Methods
1. concept label mapping
2. label synonyms mapping
3. Jaccard distances between related words of concepts
4. Wasserstein distances between embeddings of related words of concepts
5. Wasserstein distances between persistent homology graphs of embeddings of related words of concepts
6. Wasserstein distances between the average embeddings of the rest concepts

In [54]:
importlib.reload(mapneighbor)

INFO:root:logging info test


<module 'OTNeighborhood_TDA' from '/Users/yan/Google Drive/ontology-mapping/src/OTNeighborhood_TDA.py'>

In [23]:
# extract the concepts that are not matched so far
def extract_rest_concepts(label_clnd_uris, current_align, where):
    """
        input: label_clnd_uris: DataFrame with the concepts for mapping, {'label', 'uri', 'clndLabel'}
               current_align: DataFrame with the current mapping created by previous step
               where: a string indicating whether this is for source or target concept
        output: a DataFrame with rest of the concepts for mapping, {'label', 'uri', 'clndLabel'}
    """
    if where == 'source':
        return label_clnd_uris[~label_clnd_uris.uri.isin(current_align.source)].reset_index(drop=True)
    elif where == 'target':
        return label_clnd_uris[~label_clnd_uris.uri.isin(current_align.target)].reset_index(drop=True)
    else:
        return None

In [38]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_jac_was_ph_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               LEVEL_4. WASSERSTEIN DISTANCE, LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    # mapping using Jaccard distance between sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords)
    if len(jds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, jds, None) 
        logging.info("The number of level_3 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    # mapping using wasserstein distance between the embeddings of sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords, embs_model)
    if len(wds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds, None) 
        logging.info("The number of level_4 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [125]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_jac_was_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               LEVEL_4. WASSERSTEIN DISTANCE, LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    # mapping using Jaccard distance between sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords)
    if len(jds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, jds, None) 
        logging.info("The number of level_3 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    # mapping using wasserstein distance between the embeddings of sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords, embs_model)
    if len(wds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds, None) 
        logging.info("The number of level_4 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    logging.info("Skip level_5 mapping.")

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [84]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_jac_was(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               LEVEL_4. WASSERSTEIN DISTANCE, LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    # mapping using Jaccard distance between sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords)
    if len(jds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, jds, None) 
        logging.info("The number of level_3 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    # mapping using wasserstein distance between the embeddings of sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords, embs_model)
    if len(wds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds, None) 
        logging.info("The number of level_4 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    logging.info("Skip Level_5 and Level_6 mapping.")
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [90]:
ensemble_align = mapping_label_syn_jac_was_ph_rest(cmtlabel_clnd_uris, conferencelabel_clnd_uris, cmt_graph, conference_graph, embs_model)
maponto.evaluate(ensemble_align, cmt_conference_url)

INFO:root:The number of level_1 predicted mapping is 6.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 1
INFO:root:Compute the related words for each source concept....
22it [00:00, 38.56it/s]
INFO:root:Total number of source concepts computed with relatd words is 22
INFO:root:Compute the related words for each target concept....
52it [00:01, 37.87it/s]
INFO:root:Total number of target concepts computed with related words is 52
INFO:root:The number of level_3 predicted mapping is 7
INFO:root:The number of level_4 predicted mapping is 1
INFO:root:Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...
INFO:root:The number of level_5 predicted mapping is 4
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (10, 40)
INFO:root:Computing Optimal Transpo

Total number of references is 15
Total correctly predicted alignments is 9
Total number of predicted is 19
Precision is 0.47368421052631576
Recall is 0.6
F1-Measure is 0.5294117647058824


In [91]:
ensemble_align

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#Conference,Conference,http://conference#Conference,Conference
1,http://cmt#Paper,Paper,http://conference#Paper,Paper
2,http://cmt#Person,Person,http://conference#Person,Person
3,http://cmt#ProgramCommittee,ProgramCommittee,http://conference#Program_committee,Program_committee
4,http://cmt#Review,Review,http://conference#Review,Review
5,http://cmt#Reviewer,Reviewer,http://conference#Reviewer,Reviewer
6,http://cmt#Chairman,Chairman,http://conference#Chair,Chair
7,http://cmt#Author,Author,http://conference#Regular_author,Regular_author
8,http://cmt#Co-author,Co-author,http://conference#Contribution_co-author,Contribution_co-author
9,http://cmt#ConferenceChair,ConferenceChair,http://conference#Co-chair,Co-chair


In [39]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_was_ph_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, (SKIP LEVEL_3) 
               LEVEL_4. WASSERSTEIN DISTANCE, LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    
    logging.info("Skip the level_3 mapping")
        

    # mapping using wasserstein distance between the embeddings of sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords, embs_model)
    if len(wds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds, None) 
        logging.info("The number of level_4 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [108]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_was_ph(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, (SKIP LEVEL_3) 
               LEVEL_4. WASSERSTEIN DISTANCE, LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    
    logging.info("Skip the level_3 mapping")
        

    # mapping using wasserstein distance between the embeddings of sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords, embs_model)
    if len(wds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds, None) 
        logging.info("The number of level_4 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    logging.info("Skip level_6 mapping.")
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [104]:
importlib.reload(mapneighbor)

INFO:root:logging info test


<module 'OTNeighborhood_TDA' from '/Users/yan/Google Drive/ontology-mapping/src/OTNeighborhood_TDA.py'>

In [105]:
ensemble_align = mapping_label_syn_was_ph_rest(cmtlabel_clnd_uris, conferencelabel_clnd_uris, cmt_graph, conference_graph, embs_model)
maponto.evaluate(ensemble_align, cmt_conference_url)

INFO:root:The number of level_1 predicted mapping is 6.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 1
INFO:root:Compute the related words for each source concept....
22it [00:00, 27.58it/s]
INFO:root:Total number of source concepts computed with relatd words is 22
INFO:root:Compute the related words for each target concept....
52it [00:01, 36.74it/s]
INFO:root:Total number of target concepts computed with related words is 52
INFO:root:Skip the level_3 mapping
INFO:root:The number of level_4 predicted mapping is 6
INFO:root:Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...
INFO:root:The number of level_5 predicted mapping is 5
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (11, 41)
INFO:root:Computing Optimal Transport Plan...
INFO:root

Total number of references is 15
Total correctly predicted alignments is 10
Total number of predicted is 28
Precision is 0.35714285714285715
Recall is 0.6666666666666666
F1-Measure is 0.46511627906976744


In [40]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_was_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, (SKIP LEVEL_3) 
               LEVEL_4. WASSERSTEIN DISTANCE, (SKIP LEVEL_5), AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    
    logging.info("Skip the level_3 mapping")
        

    # mapping using wasserstein distance between the embeddings of sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords, embs_model)
    if len(wds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds, None) 
        logging.info("The number of level_4 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    
    logging.info("Skip level_5 mapping.")

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [99]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_was(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, (SKIP LEVEL_3) 
               LEVEL_4. WASSERSTEIN DISTANCE, (SKIP LEVEL_5), AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    
    logging.info("Skip the level_3 mapping")
        

    # mapping using wasserstein distance between the embeddings of sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords, embs_model)
    if len(wds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds, None) 
        logging.info("The number of level_4 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    
    logging.info("Skip level_5 mapping.")
    logging.info("Skip level_6 mapping.")
    
    return ensemble_align.reset_index(drop=True)

In [106]:
ensemble_align = mapping_label_syn_was_rest(cmtlabel_clnd_uris, conferencelabel_clnd_uris, cmt_graph, conference_graph, embs_model)
maponto.evaluate(ensemble_align, cmt_conference_url)

INFO:root:The number of level_1 predicted mapping is 6.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 1
INFO:root:Compute the related words for each source concept....
22it [00:00, 37.63it/s]
INFO:root:Total number of source concepts computed with relatd words is 22
INFO:root:Compute the related words for each target concept....
52it [00:01, 37.71it/s]
INFO:root:Total number of target concepts computed with related words is 52
INFO:root:Skip the level_3 mapping
INFO:root:The number of level_4 predicted mapping is 6
INFO:root:Skip level_5 mapping.
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (16, 46)
INFO:root:Computing Optimal Transport Plan...
INFO:root:Computing Wasserstein distance by the Sinkhorn algorithm...
INFO:root:Making Mappings from a Pairwise OT Plan Matrix by Mutual NN...
INFO:root:

Total number of references is 15
Total correctly predicted alignments is 11
Total number of predicted is 26
Precision is 0.4230769230769231
Recall is 0.7333333333333333
F1-Measure is 0.5365853658536585


In [None]:
maponto.evaluate(ensemble_align, mouse_human_url)

In [41]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_jac_ph_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               (SKIP LEVEL_4), LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    # mapping using Jaccard distance between sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords)
    if len(jds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, jds, None) 
        logging.info("The number of level_3 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    

    logging.info("Skip the level_4 mapping.")

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [114]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_jac_ph(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               (SKIP LEVEL_4), LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    # mapping using Jaccard distance between sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords)
    if len(jds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, jds, None) 
        logging.info("The number of level_3 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    

    logging.info("Skip the level_4 mapping.")

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    logging.info("Skip level_6 mapping.")

    return ensemble_align.reset_index(drop=True)

In [107]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_ph_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               (SKIP LEVEL_4), LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    
    logging.info("Skip the level_3 mapping.")

    logging.info("Skip the level_4 mapping.")

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [42]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_jac_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               (SKIP LEVEL_4), (SKIP LEVEL_5), AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    # mapping using Jaccard distance between sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords)
    if len(jds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, jds, None) 
        logging.info("The number of level_3 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    

    
    logging.info("Skip the level_4 mapping.")
        
    logging.info("Skip the level_5 mapping.")
        
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [43]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_jac(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, LEVEL_3. JACCARD, 
               AND (SKIP LEVEL_4, LEVEL_5, LEVEL_6)
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    # mapping using Jaccard distance between sets of related words
    suris = list(slabel_clnd_uris_rest.uri)
    turis = list(tlabel_clnd_uris_rest.uri)
    jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, src_uris_relatedWords, turis, tgt_uris_relatedWords)
    if len(jds) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, jds, None) 
        logging.info("The number of level_3 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    

    
    logging.info("Skip the level_4 mapping.")
        
    logging.info("Skip the level_5 mapping.")
        
    logging.info("Skip the level_6 mappping.")
    
    return ensemble_align.reset_index(drop=True)

In [44]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_ph_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, (SKIP LEVEL_3), 
               (SKIP LEVEL_4), LEVEL_5. PH, AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    logging.info("Skip the level_3 mapping.")
    logging.info("Skip the level_4 mapping.")

    
    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

In [45]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_ph(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, (SKIP LEVEL_3), 
               (SKIP LEVEL_4), LEVEL_5. PH, AND (SKIP LEVEL_6)
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')
   

    src_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each source concept....")
    for row in tqdm(slabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, src_graph, slabel_clnd_uris)
        src_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of source concepts computed with relatd words is {}".format(len(src_uris_relatedWords)))

    tgt_uris_relatedWords = defaultdict(dict)
    logging.info("Compute the related words for each target concept....")
    for row in tqdm(tlabel_clnd_uris_rest.itertuples()):
        word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, tgt_graph, tlabel_clnd_uris)
        tgt_uris_relatedWords[row.uri] = word_count
    logging.info("Total number of target concepts computed with related words is {}".format(len(tgt_uris_relatedWords)))

    logging.info("Skip the level_3 mapping.")
    logging.info("Skip the level_4 mapping.")

    
    logging.info("Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...")
    src_diagrams = mapneighbor.compute_phDiagrams(slabel_clnd_uris_rest, src_uris_relatedWords, embs_model)
    tgt_diagrams = mapneighbor.compute_phDiagrams(tlabel_clnd_uris_rest, tgt_uris_relatedWords, embs_model)

    # mapping using WD between PH diagrams of source and target concepts
    wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(src_diagrams, tgt_diagrams)
    if len(wds_phDiagrams_arr) > 0:
        current_align = mapneighbor.make_mappings_distance_nn(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, wds_phDiagrams_arr, None)
        logging.info("The number of level_5 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    logging.info("Skip the level_6 mapping.")

    return ensemble_align.reset_index(drop=True)

In [46]:
import OTMapOnto as maponto
import OTNeighborhood_TDA as mapneighbor

# create ensemble mappings from all methods
def mapping_label_syn_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model):
    """
    MAPPING BY LEVEL_1. LABEL STRING, LEVEL_2. LABLE SYNONYMS, (SKIP LEVEL_3, LEVEL_4, LEVEL_5), 
            AND LEVEL_6. OT ON THE REST
        input: slabel_clnd_uris: DataFrame containing source concepts and uris with columns {'label', 'uri', 'clndLabel'}
               tlabel_clnd_uris: DataFrame containing target concepts and uris with columns {'label', 'uri', 'clndLabel'}
               src_graph: source RDF graph
               tgt_graph: target RDF graph
               embs_model: pre-trained embedding model
        output: DataFrame containing mappings with columns {'source', 'source_label', 'target', 'target_label'}
    """

    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    # start with mapping label strings
    current_align = maponto.match_concept_labels(slabel_clnd_uris, tlabel_clnd_uris, None)
    if len(current_align) > 0:
        logging.info("The number of level_1 predicted mapping is {}.".format(current_align.shape[0]))
        # concatenate found mappings
        ensemble_align = pd.concat([ensemble_align, current_align], 0)

    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris, current_align, 'target')
    # mapping label synonyms
    current_align = maponto.match_label_synonyms(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, None)
    if len(current_align) > 0:
        logging.info("The number of level_2 predicted mapping is {}".format(current_align.shape[0]))
        ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    
    logging.info("Skip the level_3 mapping.")
    logging.info("Skip the level_4 mapping.")
    logging.info("Skip the level_5 mapping.")

    
    # extract the concepts that are not matched so far
    slabel_clnd_uris_rest = extract_rest_concepts(slabel_clnd_uris_rest, current_align, 'source')
    tlabel_clnd_uris_rest = extract_rest_concepts(tlabel_clnd_uris_rest, current_align, 'target')

    current_align = maponto.match_label_embeddings_OT(slabel_clnd_uris_rest, tlabel_clnd_uris_rest, 
                                                      embs_model, maponto.make_mappings_nn, None, None)
    logging.info("The number of level_6 predicted mapping is {}".format(current_align.shape[0]))
    
    ensemble_align = pd.concat([ensemble_align, current_align], 0)
    
    #maponto.evaluate(ensemble_align, refs_url)

    return ensemble_align.reset_index(drop=True)

# Load human.owl and mouse.owl

In [12]:
mouse_url = "../data/mouse.owl"
human_url = "../data/human.owl"
mouse_graph = Graph().parse(mouse_url)
human_graph = Graph().parse(human_url)

In [13]:
mouse_human_url = "../data/reference.rdf"

In [14]:
mouselabel_uris = maponto.extract_label_uris(mouse_graph)
mouselabel_clnd_uris = maponto.clean_labels(mouselabel_uris)

In [15]:
humanlabel_uris = maponto.extract_label_uris(human_graph)
humanlabel_clnd_uris = maponto.clean_labels(humanlabel_uris)

In [16]:
importlib.reload(mapneighbor)

INFO:root:logging info test


<module 'OTNeighborhood_TDA' from '/Users/yan/Google Drive/ontology-mapping/src/OTNeighborhood_TDA.py'>

In [27]:
ensemble_align = mapping_label_syn_rest(mouselabel_clnd_uris, humanlabel_clnd_uris, mouse_graph, human_graph, embs_model)
maponto.evaluate(ensemble_align, mouse_human_url)

INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Skip the level_3 mapping.
INFO:root:Skip the level_4 mapping.
INFO:root:Skip the level_5 mapping.
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (1757, 2315)
INFO:root:Computing Optimal Transport Plan...
INFO:root:Computing Wasserstein distance by the Sinkhorn algorithm...
INFO:root:Making Mappings from a Pairwise OT Plan Matrix by Mutual NN...
INFO:root:The number of level_6 predicted mapping is 920


Total number of references is 1516
Total correctly predicted alignments is 1230
Total number of predicted is 1903
Precision is 0.6463478717813977
Recall is 0.8113456464379947
F1-Measure is 0.7195086282538753


In [30]:
ensemble_align = mapping_label_syn_jac_ph_rest(mouselabel_clnd_uris, humanlabel_clnd_uris, mouse_graph, human_graph, embs_model)
maponto.evaluate(ensemble_align, mouse_human_url)

INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Compute the related words for each source concept....
1757it [00:50, 34.66it/s]
INFO:root:Total number of source concepts computed with relatd words is 1757
INFO:root:Compute the related words for each target concept....
2315it [01:22, 28.13it/s]
INFO:root:Total number of target concepts computed with related words is 2315
100%|██████████| 1757/1757 [00:16<00:00, 107.16it/s]
INFO:root:The number of level_3 predicted mapping is 267
INFO:root:Skip the level_4 mapping.
INFO:root:Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...
100%|██████████| 1490/1490 [11:25<00:00,  2.17it/s]
INFO:root:The number of level_5 predicted mapping is 104
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Ta

Total number of references is 1516
Total correctly predicted alignments is 1220
Total number of predicted is 2103
Precision is 0.5801236329053733
Recall is 0.8047493403693932
F1-Measure is 0.6742193976236529


In [181]:
ensemble_align = mapping_label_syn_jac_rest(mouselabel_clnd_uris, humanlabel_clnd_uris, mouse_graph, human_graph, embs_model)
maponto.evaluate(ensemble_align, mouse_human_url)

INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Compute the related words for each source concept....
1757it [00:52, 33.51it/s]
INFO:root:Total number of source concepts computed with relatd words is 1757
INFO:root:Compute the related words for each target concept....
2315it [01:20, 28.89it/s]
INFO:root:Total number of target concepts computed with related words is 2315
100%|██████████| 1757/1757 [00:20<00:00, 86.39it/s] 
INFO:root:The number of level_3 predicted mapping is 267
INFO:root:Skip the level_4 mapping.
INFO:root:Skip the level_5 mapping.
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (1490, 2048)
INFO:root:Computing Optimal Transport Plan...
INFO:root:Computing Wasserstein distance by the Sinkhorn algorithm...
INFO:roo

Total number of references is 1516
Total correctly predicted alignments is 1234
Total number of predicted is 2052
Precision is 0.601364522417154
Recall is 0.8139841688654353
F1-Measure is 0.6917040358744395


In [34]:
ensemble_align = mapping_label_syn_ph_rest(mouselabel_clnd_uris, humanlabel_clnd_uris, mouse_graph, human_graph, embs_model)
maponto.evaluate(ensemble_align, mouse_human_url)

INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Compute the related words for each source concept....
1757it [00:55, 31.40it/s]
INFO:root:Total number of source concepts computed with relatd words is 1757
INFO:root:Compute the related words for each target concept....
2315it [01:32, 25.14it/s]
INFO:root:Total number of target concepts computed with related words is 2315
INFO:root:Skip the level_3 mapping.
INFO:root:Skip the level_4 mapping.
INFO:root:Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...
100%|██████████| 1757/1757 [15:34<00:00,  1.88it/s]
INFO:root:The number of level_5 predicted mapping is 146
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (1611, 2169)
INFO

Total number of references is 1516
Total correctly predicted alignments is 1203
Total number of predicted is 1990
Precision is 0.6045226130653266
Recall is 0.7935356200527705
F1-Measure is 0.6862521391899601


In [35]:
ensemble_align = mapping_label_syn_jac(mouselabel_clnd_uris, humanlabel_clnd_uris, mouse_graph, human_graph, embs_model)
maponto.evaluate(ensemble_align, mouse_human_url)

INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Compute the related words for each source concept....
1757it [00:54, 32.20it/s]
INFO:root:Total number of source concepts computed with relatd words is 1757
INFO:root:Compute the related words for each target concept....
2315it [01:25, 27.00it/s]
INFO:root:Total number of target concepts computed with related words is 2315
100%|██████████| 1757/1757 [00:15<00:00, 110.50it/s]
INFO:root:The number of level_3 predicted mapping is 267
INFO:root:Skip the level_4 mapping.
INFO:root:Skip the level_5 mapping.
INFO:root:Skip the level_6 mappping.


Total number of references is 1516
Total correctly predicted alignments is 1091
Total number of predicted is 1250
Precision is 0.8728
Recall is 0.7196569920844327
F1-Measure is 0.7888647866955892


In [37]:
ensemble_align = mapping_label_syn_ph(mouselabel_clnd_uris, humanlabel_clnd_uris, mouse_graph, human_graph, embs_model)
maponto.evaluate(ensemble_align, mouse_human_url)

INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Compute the related words for each source concept....
1757it [00:56, 31.06it/s]
INFO:root:Total number of source concepts computed with relatd words is 1757
INFO:root:Compute the related words for each target concept....
2315it [01:19, 28.98it/s]
INFO:root:Total number of target concepts computed with related words is 2315
INFO:root:Skip the level_3 mapping.
INFO:root:Skip the level_4 mapping.
INFO:root:Compute Persistent Homology Diagrams for embeddings of sets of related words for source and target concepts...
100%|██████████| 1757/1757 [15:20<00:00,  1.91it/s]
INFO:root:The number of level_5 predicted mapping is 146
INFO:root:Skip the level_6 mapping.


Total number of references is 1516
Total correctly predicted alignments is 984
Total number of predicted is 1129
Precision is 0.8715677590788308
Recall is 0.6490765171503958
F1-Measure is 0.7440453686200378


In [29]:
importlib.reload(mapneighbor)

INFO:root:logging info test


<module 'OTNeighborhood_TDA' from '/Users/yan/Google Drive/ontology-mapping/src/OTNeighborhood_TDA.py'>

In [None]:
ensemble_align = mapping_label_syn_was_rest(mouselabel_clnd_uris, humanlabel_clnd_uris, mouse_graph, human_graph, embs_model)

INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Compute the related words for each source concept....
1757it [00:54, 32.27it/s]
INFO:root:Total number of source concepts computed with relatd words is 1757
INFO:root:Compute the related words for each target concept....
2315it [01:30, 25.52it/s]
INFO:root:Total number of target concepts computed with related words is 2315
INFO:root:Skip the level_3 mapping
INFO:root:Compute Pairwise Wasserstein Distances...
  check_result(result_code)
  8%|▊         | 144/1757 [11:51<4:07:08,  9.19s/it]

In [None]:
maponto.evaluate(ensemble_align, mouse_human_url)

# =========================

# Test Individual Methods

In [22]:
conf_label_align = maponto.match_concept_labels(cmtlabel_clnd_uris, conferencelabel_clnd_uris, None)
conf_label_align.shape

(6, 4)

In [16]:
conf_label_align

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#Conference,Conference,http://conference#Conference,Conference
1,http://cmt#Paper,Paper,http://conference#Paper,Paper
2,http://cmt#Person,Person,http://conference#Person,Person
3,http://cmt#ProgramCommittee,ProgramCommittee,http://conference#Program_committee,Program_committee
4,http://cmt#Review,Review,http://conference#Review,Review
5,http://cmt#Reviewer,Reviewer,http://conference#Reviewer,Reviewer


In [17]:
maponto.evaluate(conf_label_align, cmt_conference_url)

Total number of references is 15
Total correctly predicted alignments is 4
Total number of predicted is 6
Precision is 0.6666666666666666
Recall is 0.26666666666666666
F1-Measure is 0.38095238095238093


In [18]:
# get concepts that are not matched by labels
# extract the concepts that are not matched by labels
cmtlabel_clnd_uris_rest = cmtlabel_clnd_uris[~cmtlabel_clnd_uris.uri.isin(conf_label_align.source)].\
    reset_index(drop=True)
conferencelabel_clnd_uris_rest = conferencelabel_clnd_uris[~conferencelabel_clnd_uris.uri.isin(conf_label_align.target)].\
    reset_index(drop=True)

In [19]:
cmtlabel_clnd_uris_rest.shape, conferencelabel_clnd_uris_rest.shape

((23, 3), (53, 3))

In [20]:
conf_label_syn_align = maponto.match_label_synonyms(cmtlabel_clnd_uris_rest, conferencelabel_clnd_uris_rest, None)
conf_label_syn_align.shape

INFO:root:Retrieving Synsets by WordNet...


(1, 4)

In [21]:
conf_label_syn_align

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#Chairman,Chairman,http://conference#Chair,Chair


In [22]:
maponto.evaluate(conf_label_syn_align, cmt_conference_url)

Total number of references is 15
Total correctly predicted alignments is 1
Total number of predicted is 1
Precision is 1.0
Recall is 0.06666666666666667
F1-Measure is 0.125


In [23]:
# extract the concepts that are not matched so far
cmtlabel_clnd_uris_rest = cmtlabel_clnd_uris_rest[~cmtlabel_clnd_uris_rest.uri.isin(conf_label_syn_align.source)].\
    reset_index(drop=True)
conferencelabel_clnd_uris_rest = conferencelabel_clnd_uris_rest[~conferencelabel_clnd_uris_rest.uri.isin(conf_label_syn_align.target)].\
    reset_index(drop=True)
cmtlabel_clnd_uris_rest.shape, conferencelabel_clnd_uris_rest.shape

((22, 3), (52, 3))

In [25]:
cmt_uris_relatedWords = defaultdict(dict)
for row in tqdm(cmtlabel_clnd_uris_rest.itertuples()):
    word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, cmt_graph, cmtlabel_clnd_uris)
    cmt_uris_relatedWords[row.uri] = word_count
len(cmt_uris_relatedWords)

22it [00:00, 36.20it/s]


22

In [26]:
conference_uris_relatedWords = defaultdict(dict)
for row in tqdm(conferencelabel_clnd_uris_rest.itertuples()):
    word_count = mapneighbor.get_relatedWords_counts(None, row.uri, row.label, row.clndLabel, conference_graph, conferencelabel_clnd_uris)
    conference_uris_relatedWords[row.uri] = word_count
len(conference_uris_relatedWords)

52it [00:01, 36.60it/s]


52

In [27]:
suris = list(cmtlabel_clnd_uris_rest.uri)
turis = list(conferencelabel_clnd_uris_rest.uri)
jds = mapneighbor.compute_pairwise_Jaccard_distances(suris, cmt_uris_relatedWords, turis, conference_uris_relatedWords)

In [29]:
jds_align_relatedWords = mapneighbor.make_mappings_distance_nn(cmtlabel_clnd_uris_rest, conferencelabel_clnd_uris_rest, jds, None) 
jds_align_relatedWords

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#Author,Author,http://conference#Regular_author,Regular_author
1,http://cmt#Co-author,Co-author,http://conference#Contribution_co-author,Contribution_co-author
2,http://cmt#ConferenceChair,ConferenceChair,http://conference#Co-chair,Co-chair
3,http://cmt#ConferenceMember,ConferenceMember,http://conference#Committee_member,Committee_member
4,http://cmt#Document,Document,http://conference#Conference_document,Conference_document
5,http://cmt#Meta-Review,Meta-Review,http://conference#Review_expertise,Review_expertise
6,http://cmt#PaperAbstract,PaperAbstract,http://conference#Abstract,Abstract


In [30]:
maponto.evaluate(jds_align_relatedWords, cmt_conference_url)

Total number of references is 15
Total correctly predicted alignments is 4
Total number of predicted is 7
Precision is 0.5714285714285714
Recall is 0.26666666666666666
F1-Measure is 0.36363636363636365


In [31]:
# extract the concepts that are not matched so far
cmtlabel_clnd_uris_rest = cmtlabel_clnd_uris_rest[~cmtlabel_clnd_uris_rest.uri.isin(jds_align_relatedWords.source)].\
    reset_index(drop=True)
conferencelabel_clnd_uris_rest = conferencelabel_clnd_uris_rest[~conferencelabel_clnd_uris_rest.uri.isin(jds_align_relatedWords.target)].\
    reset_index(drop=True)
cmtlabel_clnd_uris_rest.shape, conferencelabel_clnd_uris_rest.shape

((15, 3), (45, 3))

In [33]:
suris = list(cmtlabel_clnd_uris_rest.uri)
turis = list(conferencelabel_clnd_uris_rest.uri)
wds = mapneighbor.compute_pairwise_Wasserstein_distances(suris, cmt_uris_relatedWords, turis, conference_uris_relatedWords, embs_model)

In [35]:
wds_align_relatedWords = mapneighbor.make_mappings_distance_nn(cmtlabel_clnd_uris_rest, conferencelabel_clnd_uris_rest, wds, None) 
wds_align_relatedWords

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#AuthorNotReviewer,AuthorNotReviewer,http://conference#Contribution_1th-author,Contribution_1th-author


In [36]:
maponto.evaluate(wds_align_relatedWords, cmt_conference_url)

Total number of references is 15
Total correctly predicted alignments is 0


ZeroDivisionError: float division by zero

In [37]:
# extract the concepts that are not matched so far
cmtlabel_clnd_uris_rest = cmtlabel_clnd_uris_rest[~cmtlabel_clnd_uris_rest.uri.isin(wds_align_relatedWords.source)].\
    reset_index(drop=True)
conferencelabel_clnd_uris_rest = conferencelabel_clnd_uris_rest[~conferencelabel_clnd_uris_rest.uri.isin(wds_align_relatedWords.target)].\
    reset_index(drop=True)
cmtlabel_clnd_uris_rest.shape, conferencelabel_clnd_uris_rest.shape

((14, 3), (44, 3))

In [40]:
cmt_diagrams = mapneighbor.compute_phDiagrams(cmtlabel_clnd_uris_rest, cmt_uris_relatedWords, embs_model)
len(cmt_diagrams)

14

In [42]:
conference_diagrams = mapneighbor.compute_phDiagrams(conferencelabel_clnd_uris_rest, conference_uris_relatedWords, embs_model)
len(conference_diagrams)

44

In [43]:
wds_phDiagrams_arr = mapneighbor.compute_pairwise_wd_phDiagrams(cmt_diagrams, conference_diagrams)

In [45]:
wds_align_ph = mapneighbor.make_mappings_distance_nn(cmtlabel_clnd_uris_rest, conferencelabel_clnd_uris_rest, wds_phDiagrams_arr, None)

In [46]:
wds_align_ph

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#Administrator,Administrator,http://conference#Registeered_applicant,Registeered_applicant
1,http://cmt#PaperFullVersion,PaperFullVersion,http://conference#Call_for_participation,Call_for_participation
2,http://cmt#ProgramCommitteeChair,ProgramCommitteeChair,http://conference#Organization,Organization
3,http://cmt#ProgramCommitteeMember,ProgramCommitteeMember,http://conference#Active_conference_participant,Active_conference_participant


In [47]:
maponto.evaluate(wds_align_ph, cmt_conference_url)

Total number of references is 15
Total correctly predicted alignments is 0


ZeroDivisionError: float division by zero

In [48]:
# extract the concepts that are not matched so far
cmtlabel_clnd_uris_rest = cmtlabel_clnd_uris_rest[~cmtlabel_clnd_uris_rest.uri.isin(wds_align_ph.source)].\
    reset_index(drop=True)
conferencelabel_clnd_uris_rest = conferencelabel_clnd_uris_rest[~conferencelabel_clnd_uris_rest.uri.isin(wds_align_ph.target)].\
    reset_index(drop=True)
cmtlabel_clnd_uris_rest.shape, conferencelabel_clnd_uris_rest.shape

((10, 3), (40, 3))

In [49]:
rest_emb_align = maponto.match_label_embeddings_OT(cmtlabel_clnd_uris_rest, conferencelabel_clnd_uris_rest, \
                                                      embs_model, maponto.make_mappings_nn, None, None)
rest_emb_align.shape

INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (10, 40)
INFO:root:Computing Optimal Transport Plan...
INFO:root:Computing Wasserstein distance by the Sinkhorn algorithm...
INFO:root:Making Mappings from a Pairwise OT Plan Matrix by Mutual NN...




(0, 4)

In [50]:
maponto.evaluate(rest_emb_align, cmt_conference_url)

Total number of references is 15
Total correctly predicted alignments is 0


ZeroDivisionError: division by zero

# Test Other Conference OWLs

In [177]:
src_url = "../data/conference/cmt.owl"
tgt_url = "../data/conference/sigkdd.owl"
src_graph = Graph().parse(src_url)
tgt_graph = Graph().parse(tgt_url)
slabel_uris = maponto.extract_label_uris(src_graph)
tlabel_uris = maponto.extract_label_uris(tgt_graph)
slabel_clnd_uris = maponto.clean_labels(slabel_uris, rmStopWords=True)
tlabel_clnd_uris = maponto.clean_labels(tlabel_uris, rmStopWords=True)
slabel_clnd_uris.shape, tlabel_clnd_uris.shape

((29, 3), (49, 3))

In [178]:
refs_url = "../data/conference/reference-alignment/cmt-sigkdd.rdf"

In [180]:
ensemble_align = mapping_label_syn_jac_was_rest(slabel_clnd_uris, tlabel_clnd_uris, src_graph, tgt_graph, embs_model)
maponto.evaluate(ensemble_align, refs_url)

INFO:root:The number of level_1 predicted mapping is 8.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:Compute the related words for each source concept....
21it [00:00, 36.37it/s]
INFO:root:Total number of source concepts computed with relatd words is 21
INFO:root:Compute the related words for each target concept....
41it [00:01, 38.20it/s]
INFO:root:Total number of target concepts computed with related words is 41
100%|██████████| 21/21 [00:00<00:00, 8070.40it/s]
INFO:root:The number of level_3 predicted mapping is 5
INFO:root:Compute Pairwise Wasserstein Distances...
100%|██████████| 16/16 [00:00<00:00, 34.61it/s]
INFO:root:The number of level_4 predicted mapping is 1
INFO:root:Skip level_5 mapping.
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (15, 35)
INFO:root:Computing Optimal Transport Plan...
INFO:root:Computing Wasserstein distan

Total number of references is 12
Total correctly predicted alignments is 10
Total number of predicted is 14
Precision is 0.7142857142857143
Recall is 0.8333333333333334
F1-Measure is 0.7692307692307694


In [73]:
ensemble_align

Unnamed: 0,source,source_label,target,target_label
0,http://conference#Abstract,Abstract,http://sigkdd#Abstract,Abstract
1,http://conference#Committee,Committee,http://sigkdd#Committee,Committee
2,http://conference#Conference,Conference,http://sigkdd#Conference,Conference
3,http://conference#Invited_speaker,Invited_speaker,http://sigkdd#Invited_Speaker,Invited_Speaker
4,http://conference#Organizing_committee,Organizing_committee,http://sigkdd#Organizing_Committee,Organizing_Committee
5,http://conference#Paper,Paper,http://sigkdd#Paper,Paper
6,http://conference#Person,Person,http://sigkdd#Person,Person
7,http://conference#Program_committee,Program_committee,http://sigkdd#Program_Committee,Program_Committee
8,http://conference#Review,Review,http://sigkdd#Review,Review
9,http://conference#Co-chair,Co-chair,http://sigkdd#General_Chair,General_Chair


In [115]:
tlabel_clnd_uris

Unnamed: 0,label,uri,clndLabel
0,ACM_SIGKDD,http://sigkdd#ACM_SIGKDD,acm sigkdd
1,Abstract,http://sigkdd#Abstract,abstract
2,Author,http://sigkdd#Author,author
3,Author_of_paper,http://sigkdd#Author_of_paper,author paper
4,Author_of_paper_student,http://sigkdd#Author_of_paper_student,author paper student
5,Award,http://sigkdd#Award,award
6,Best_Applications_Paper_Award,http://sigkdd#Best_Applications_Paper_Award,best applications paper award
7,Best_Paper_Awards_Committee,http://sigkdd#Best_Paper_Awards_Committee,best paper awards committee
8,Best_Research_Paper_Award,http://sigkdd#Best_Research_Paper_Award,best research paper award
9,Best_Student_Paper_Award,http://sigkdd#Best_Student_Paper_Award,best student paper award


# ================================

# Mapping on Object Properties

In [182]:
cmtobject_uris = mapneighbor.extract_objectProperty_uris(cmt_graph)
cmtobject_clnd_uris = maponto.clean_labels(cmtobject_uris)
conferenceobject_uris = mapneighbor.extract_objectProperty_uris(conference_graph)
conferenceobject_clnd_uris = maponto.clean_labels(conferenceobject_uris)

In [194]:
ensemble_align = mapping_label_syn_was(cmtobject_clnd_uris, conferenceobject_clnd_uris, src_graph, tgt_graph, embs_model)
maponto.evaluate(ensemble_align, cmt_conference_url)

INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 1
INFO:root:Compute the related words for each source concept....
48it [00:01, 37.72it/s]
INFO:root:Total number of source concepts computed with relatd words is 48
INFO:root:Compute the related words for each target concept....
45it [00:01, 40.46it/s]
INFO:root:Total number of target concepts computed with related words is 45
INFO:root:Skip the level_3 mapping
INFO:root:Compute Pairwise Wasserstein Distances...
100%|██████████| 48/48 [00:03<00:00, 15.07it/s]
INFO:root:The number of level_4 predicted mapping is 5
INFO:root:Skip level_5 mapping.
INFO:root:Skip level_6 mapping.


Total number of references is 15
Total correctly predicted alignments is 0


ZeroDivisionError: float division by zero

In [190]:
ensemble_align

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#hasAuthor,hasAuthor,http://conference#has_authors,has_authors
1,http://cmt#hasConflictOfInterest,hasConflictOfInterest,http://conference#is_the_1th_part_of,is_the_1th_part_of
2,http://cmt#memberOfProgramCommittee,memberOfProgramCommittee,http://conference#has_a_program_committee,has_a_program_committee
3,http://cmt#reviewCriteriaEnteredBy,reviewCriteriaEnteredBy,http://conference#belongs_to_a_review_reference,belongs_to_a_review_reference
4,http://cmt#submitPaper,submitPaper,http://conference#has_a_submitted_contribution,has_a_submitted_contribution
5,http://cmt#writeReview,writeReview,http://conference#has_a_review,has_a_review


In [924]:
cmt_object_uris_relatedWords = defaultdict(dict)
for row in tqdm(cmtobject_clnd_uris.itertuples()):
    word_count = get_relatedWords_counts("Conference", row.uri, row.label, row.clndLabel, cmt_graph, cmtobject_clnd_uris)
    cmt_object_uris_relatedWords[row.uri] = word_count
len(cmt_object_uris_relatedWords)

49it [00:01, 33.82it/s]


49

In [925]:
conference_object_uris_relatedWords = defaultdict(dict)
for row in tqdm(conferenceobject_clnd_uris.itertuples()):
    word_count = get_relatedWords_counts("Conference", row.uri, row.label, row.clndLabel, conference_graph, conferenceobject_clnd_uris)
    conference_object_uris_relatedWords[row.uri] = word_count
len(conference_object_uris_relatedWords)

46it [00:01, 38.14it/s]


46

In [931]:
%%time
sobject_uris = list(cmtobject_clnd_uris.uri)
tobject_uris = list(conferenceobject_clnd_uris.uri)
object_wds = compute_pairwise_Wasserstein_distances(sobject_uris, cmt_object_uris_relatedWords, \
                                                    tobject_uris, conference_object_uris_relatedWords, embs_model)

CPU times: user 3.11 s, sys: 209 ms, total: 3.32 s
Wall time: 3.33 s


In [932]:
object_wds_align_relatedWords = make_mappings_distance_nn(cmtobject_clnd_uris, conferenceobject_clnd_uris, object_wds, 999) 
object_wds_align_relatedWords

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#hasAuthor,hasAuthor,http://conference#has_authors,has_authors
1,http://cmt#hasConflictOfInterest,hasConflictOfInterest,http://conference#is_the_1th_part_of,is_the_1th_part_of
2,http://cmt#memberOfConference,memberOfConference,http://conference#was_a_member_of,was_a_member_of
3,http://cmt#memberOfProgramCommittee,memberOfProgramCommittee,http://conference#was_a_program_committee_of,was_a_program_committee_of
4,http://cmt#reviewCriteriaEnteredBy,reviewCriteriaEnteredBy,http://conference#belongs_to_a_review_reference,belongs_to_a_review_reference
5,http://cmt#submitPaper,submitPaper,http://conference#has_a_submitted_contribution,has_a_submitted_contribution
6,http://cmt#writeReview,writeReview,http://conference#has_a_review,has_a_review


# Mapping Datatype Properties

In [191]:
cmtdatatype_uris = mapneighbor.extract_datatypeProperty_uris(cmt_graph)
cmtdatatype_clnd_uris = maponto.clean_labels(cmtdatatype_uris)
conferencedatatype_uris = mapneighbor.extract_datatypeProperty_uris(conference_graph)
conferencedatatype_clnd_uris = maponto.clean_labels(conferencedatatype_uris)

In [196]:
ensemble_align = mapping_label_syn_jac(cmtdatatype_clnd_uris, conferencedatatype_clnd_uris, src_graph, tgt_graph, embs_model)
maponto.evaluate(ensemble_align, cmt_conference_url)

INFO:root:The number of level_1 predicted mapping is 2.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:Compute the related words for each source concept....
8it [00:00, 36.10it/s]
INFO:root:Total number of source concepts computed with relatd words is 8
INFO:root:Compute the related words for each target concept....
16it [00:00, 39.32it/s]
INFO:root:Total number of target concepts computed with related words is 16
100%|██████████| 8/8 [00:00<00:00, 17503.62it/s]
INFO:root:The number of level_3 predicted mapping is 2
INFO:root:Skip the level_4 mapping.
INFO:root:Skip the level_5 mapping.
INFO:root:Skip the level_6 mappping.


Total number of references is 15
Total correctly predicted alignments is 1
Total number of predicted is 4
Precision is 0.25
Recall is 0.06666666666666667
F1-Measure is 0.10526315789473684


In [197]:
ensemble_align

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#email,email,http://conference#has_an_email,has_an_email
1,http://cmt#name,name,http://conference#has_a_name,has_a_name
2,http://cmt#date,date,http://conference#is_an_ending_date,is_an_ending_date
3,http://cmt#logoURL,logoURL,http://conference#has_a_URL,has_a_URL


In [934]:
cmt_datatype_uris_relatedWords = defaultdict(dict)
for row in tqdm(cmtdatatype_clnd_uris.itertuples()):
    word_count = get_relatedWords_counts("Conference", row.uri, row.label, row.clndLabel, cmt_graph, cmtdatatype_clnd_uris)
    cmt_datatype_uris_relatedWords[row.uri] = word_count
len(cmt_datatype_uris_relatedWords)

10it [00:00, 31.07it/s]


10

In [935]:
conference_datatype_uris_relatedWords = defaultdict(dict)
for row in tqdm(conferencedatatype_clnd_uris.itertuples()):
    word_count = get_relatedWords_counts("Conference", row.uri, row.label, row.clndLabel, conference_graph, conferencedatatype_clnd_uris)
    conference_datatype_uris_relatedWords[row.uri] = word_count
len(conference_datatype_uris_relatedWords)

18it [00:00, 33.93it/s]


18

In [938]:
%%time
sdatatype_uris = list(cmtdatatype_clnd_uris.uri)
tdatatype_uris = list(conferencedatatype_clnd_uris.uri)
datatype_wds = compute_pairwise_Wasserstein_distances(sdatatype_uris, cmt_datatype_uris_relatedWords, \
                                                    tdatatype_uris, conference_datatype_uris_relatedWords, embs_model)

CPU times: user 216 ms, sys: 122 ms, total: 339 ms
Wall time: 998 ms


In [940]:
datatype_wds_align_relatedWords = make_mappings_distance_nn(cmtdatatype_clnd_uris, conferencedatatype_clnd_uris, datatype_wds, 999) 
datatype_wds_align_relatedWords

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#date,date,http://conference#is_a_date_of_acceptance_anno...,is_a_date_of_acceptance_announcement
1,http://cmt#email,email,http://conference#has_an_email,has_an_email
2,http://cmt#logoURL,logoURL,http://conference#has_a_URL,has_a_URL
3,http://cmt#name,name,http://conference#has_a_name,has_a_name
4,http://cmt#reviewsPerPaper,reviewsPerPaper,http://conference#is_a_full_paper_submission_date,is_a_full_paper_submission_date
5,http://cmt#siteURL,siteURL,http://conference#has_a_location,has_a_location


In [941]:
cmt_object_diagrams = compute_phDiagrams(cmtobject_clnd_uris, cmt_object_uris_relatedWords)
len(cmt_object_diagrams)

49

In [943]:
conference_object_diagrams = compute_phDiagrams(conferenceobject_clnd_uris, conference_object_uris_relatedWords)
len(conference_object_diagrams)

46

In [944]:
object_wds_arr = compute_pairwise_wd_phDiagrams(cmt_object_diagrams, conference_object_diagrams)

In [945]:
object_wds_arr[9]

array([0.11347955, 0.14686791, 0.08847126, 0.15089048, 0.08704813,
       0.18638405, 0.14754526, 0.22760828, 0.24423718, 0.34780705,
       0.16500626, 0.16505767, 0.19350362, 0.10226089, 0.06952032,
       0.08644143, 0.13999385, 0.12363588, 0.15132383, 0.12080384,
       0.13855008, 0.15022476, 0.15392936, 0.13766553, 0.08128668,
       0.09899963, 0.13933857, 0.17724023, 0.14934499, 0.08355571,
       0.09879113, 0.10437255, 0.07200296, 0.08770755, 0.06983825,
       0.06039581, 0.07488232, 0.15604215, 0.13842965, 0.18690343,
       0.24200752, 0.07541296, 0.245167  , 0.10419271, 0.1434967 ,
       0.12185793])

In [952]:
object_wds_align_ph = make_mappings_distance_nn(cmtobject_clnd_uris, conferenceobject_clnd_uris, object_wds_arr, None)

In [953]:
object_wds_align_ph

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#addProgramCommitteeMember,addProgramCommitteeMember,http://conference#has_a_track-workshop-tutoria...,has_a_track-workshop-tutorial_chair
1,http://cmt#detailsEnteredBy,detailsEnteredBy,http://conference#issues,issues
2,http://cmt#hasCo-author,hasCo-author,http://conference#invites_co-reviewers,invites_co-reviewers
3,http://cmt#markConflictOfInterest,markConflictOfInterest,http://conference#has_tracks,has_tracks
4,http://cmt#memberOfProgramCommittee,memberOfProgramCommittee,http://conference#belongs_to_a_review_reference,belongs_to_a_review_reference
5,http://cmt#reviewCriteriaEnteredBy,reviewCriteriaEnteredBy,http://conference#has_been_assigned_a_review_r...,has_been_assigned_a_review_reference
6,http://cmt#submitPaper,submitPaper,http://conference#has_a_submitted_contribution,has_a_submitted_contribution


# Combine object_align_relatedWords and datatype_align_relatedWords

In [998]:
ensemble_align = pd.concat([ensemble_align, object_wds_align_relatedWords, datatype_wds_align_relatedWords])

In [1000]:
maponto.evaluate(ensemble_align, cmt_conference_url)

Total number of references is 15
Total correctly predicted alignments is 11
Total number of predicted is 41
Precision is 0.2682926829268293
Recall is 0.7333333333333333
F1-Measure is 0.39285714285714285


In [1001]:
ensemble_align

Unnamed: 0,source,source_label,target,target_label
0,http://cmt#Co-author,Co-author,http://conference#Contribution_co-author,Contribution_co-author
1,http://cmt#Conference,Conference,http://conference#Conference,Conference
2,http://cmt#ConferenceChair,ConferenceChair,http://conference#Chair,Chair
3,http://cmt#ConferenceMember,ConferenceMember,http://conference#Committee_member,Committee_member
4,http://cmt#Paper,Paper,http://conference#Paper,Paper
5,http://cmt#Person,Person,http://conference#Person,Person
6,http://cmt#ProgramCommittee,ProgramCommittee,http://conference#Program_committee,Program_committee
7,http://cmt#Review,Review,http://conference#Review,Review
8,http://cmt#Reviewer,Reviewer,http://conference#Reviewer,Reviewer
0,http://cmt#AssociatedChair,AssociatedChair,http://conference#Invited_speaker,Invited_speaker


# Ensemble Mapper

In [240]:
# combine mappings for concepts, object properties, and datatype properties
def ensemble_map(source_url, target_url, embs_model):
    
    logging.info("Python ensemble mapper info: map " + source_url + " to " + target_url)

    source_graph = Graph()
    source_graph.parse(source_url)
    logging.info("Read source with %s triples.", len(source_graph))

    target_graph = Graph()
    target_graph.parse(target_url)
    logging.info("Read target with %s triples.", len(target_graph))
    
    # initialize the final ensemble mappings
    column_names = ["source", "source_label", "target", "target_label"]
    ensemble_align = pd.DataFrame(columns = column_names)
    
    
    # map concepts
    logging.info("MAP CONCEPTS")
    slabel_uris = maponto.extract_label_uris(source_graph)
    tlabel_uris = maponto.extract_label_uris(target_graph)
    slabel_clnd_uris = maponto.clean_labels(slabel_uris)
    tlabel_clnd_uris = maponto.clean_labels(tlabel_uris)
    
    if 0 < len(slabel_clnd_uris) < 500 and 0 < len(tlabel_clnd_uris) < 500:
        concept_align = mapping_label_syn_jac_was_rest(slabel_clnd_uris, \
                                    tlabel_clnd_uris, source_graph, target_graph, embs_model)
        logging.info("TOTAL NUMBER OF MAPPINGS BETWEEN CONCPETS IS {}".format(len(concept_align)))
        ensemble_align = pd.concat([ensemble_align, concept_align], 0)
    elif len(slabel_clnd_uris) >=500 and len(tlabel_clnd_uris) >= 500:
        concept_align = mapping_label_syn_rest(slabel_clnd_uris, \
                                    tlabel_clnd_uris, source_graph, target_graph, embs_model)
        ensemble_align = pd.concat([ensemble_align, concept_align], 0)
        logging.info("TOTAL NUMBER OF MAPPINGS BETWEEN CONCPETS IS {}".format(len(concept_align)))
    
    logging.info("=================================================")
    
    # map object properties
    logging.info("MAP OBJECT PROPERTIES")
    sobject_uris = mapneighbor.extract_objectProperty_uris(source_graph)
    sobject_clnd_uris = maponto.clean_labels(sobject_uris)
    tobject_uris = mapneighbor.extract_objectProperty_uris(target_graph)
    tobject_clnd_uris = maponto.clean_labels(tobject_uris)
    
    if 0 < len(sobject_clnd_uris) < 500 and 0 < len(tobject_clnd_uris) < 500:
        object_align = mapping_label_syn_jac(sobject_clnd_uris, tobject_clnd_uris, source_graph, 
                                          target_graph, embs_model)
        logging.info("TOTAL NUMBER OF MAPPINGS BETWEEN OBJECT PROPERTIES IS {}".format(len(object_align)))
        ensemble_align = pd.concat([ensemble_align, object_align], 0)
    elif len(sobject_clnd_uris) >=500 and len(tobject_clnd_uris) >= 500:
        object_align = mapping_label_syn_rest(sobject_clnd_uris, tobject_clnd_uris, source_graph, 
                                          target_graph, embs_model)
        ensemble_align = pd.concat([ensemble_align, object_align], 0)
        logging.info("TOTAL NUMBER OF MAPPINGS BETWEEN OBJECT PROPERTIES IS {}".format(len(object_align)))
    
    logging.info("========================================================")
    
    # map datatype properties
    logging.info("MAP DATATYPE PROPERTIES")
    sdatatype_uris = mapneighbor.extract_datatypeProperty_uris(source_graph)
    sdatatype_clnd_uris = maponto.clean_labels(sdatatype_uris)
    tdatatype_uris = mapneighbor.extract_datatypeProperty_uris(target_graph)
    tdatatype_clnd_uris = maponto.clean_labels(tdatatype_uris)
    
    if 0 < len(sdatatype_clnd_uris) < 500 and 0 < len(tdatatype_clnd_uris) < 500:
        datatype_align = mapping_label_syn_jac(sdatatype_clnd_uris, tdatatype_clnd_uris, source_graph, 
                                          target_graph, embs_model)
        logging.info("TOTAL NUMBER OF MAPPINGS BETWEEN DATATYPE PROPERTIES IS {}".format(len(datatype_align)))
        ensemble_align = pd.concat([ensemble_align, datatype_align], 0)
    elif len(sdatatype_clnd_uris) >=500 and len(tdatatype_clnd_uris) >= 500:
        datatype_align = mapping_label_syn_rest(sdatatype_clnd_uris, tdatatype_clnd_uris, source_graph, 
                                          target_graph, embs_model)
        ensemble_align = pd.concat([ensemble_align, datatype_align], 0)
        logging.info("TOTAL NUMBER OF MAPPINGS BETWEEN DATATYPE PROPERTIES IS {}".format(len(datatype_align)))
    
    logging.info("===========================================================")
    
   
    return ensemble_align.reset_index(drop=True)

In [243]:
src_url = "../data/conference/edas.owl"
tgt_url = "../data/conference/iasted.owl"
refs_url = "../data/conference/reference-alignment/edas-iasted.rdf"
result_align = ensemble_map(src_url, tgt_url, embs_model)
maponto.evaluate(result_align, refs_url)

INFO:root:Python ensemble mapper info: map ../data/conference/edas.owl to ../data/conference/iasted.owl
INFO:root:Read source with 1720 triples.
INFO:root:Read target with 1173 triples.
INFO:root:MAP CONCEPTS
INFO:root:The number of level_1 predicted mapping is 8.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 4
INFO:root:Compute the related words for each source concept....
91it [00:02, 37.57it/s]
INFO:root:Total number of source concepts computed with relatd words is 91
INFO:root:Compute the related words for each target concept....
128it [00:04, 31.30it/s]
INFO:root:Total number of target concepts computed with related words is 128
100%|██████████| 91/91 [00:00<00:00, 2696.60it/s]
INFO:root:The number of level_3 predicted mapping is 14
INFO:root:Compute Pairwise Wasserstein Distances...
100%|██████████| 77/77 [00:12<00:00,  6.38it/s]
INFO:root:The number of level_4 predicted mapping is 1
INFO:root:Skip level_5 mapping.
INFO:root:Match



30it [00:00, 37.23it/s]
INFO:root:Total number of source concepts computed with relatd words is 30
INFO:root:Compute the related words for each target concept....
38it [00:01, 37.98it/s]
INFO:root:Total number of target concepts computed with related words is 38
100%|██████████| 30/30 [00:00<00:00, 9894.56it/s]
INFO:root:The number of level_3 predicted mapping is 2
INFO:root:Skip the level_4 mapping.
INFO:root:Skip the level_5 mapping.
INFO:root:Skip the level_6 mappping.
INFO:root:TOTAL NUMBER OF MAPPINGS BETWEEN OBJECT PROPERTIES IS 2
INFO:root:MAP DATATYPE PROPERTIES
INFO:root:Retrieving Synsets by WordNet...
INFO:root:Compute the related words for each source concept....
20it [00:00, 34.88it/s]
INFO:root:Total number of source concepts computed with relatd words is 20
INFO:root:Compute the related words for each target concept....
3it [00:00, 37.27it/s]
INFO:root:Total number of target concepts computed with related words is 3
100%|██████████| 20/20 [00:00<00:00, 45990.18it/s]
INFO

Total number of references is 19
Total correctly predicted alignments is 11
Total number of predicted is 92
Precision is 0.11956521739130435
Recall is 0.5789473684210527
F1-Measure is 0.19819819819819823


In [213]:
result_align

Unnamed: 0,source,source_label,target,target_label
0,http://conference#Person,Person,http://iasted#Person,Person
1,http://conference#Presentation,Presentation,http://iasted#Presentation,Presentation
2,http://conference#Review,Review,http://iasted#Review,Review
3,http://conference#Reviewer,Reviewer,http://iasted#Reviewer,Reviewer
4,http://conference#Tutorial,Tutorial,http://iasted#Tutorial,Tutorial
5,http://conference#Poster,Poster,http://iasted#Card,Card
6,http://conference#Topic,Topic,http://iasted#Publication,Publication
7,http://conference#Track,Track,http://iasted#Form,Form
8,http://conference#Track,Track,http://iasted#Tip,Tip
9,http://conference#Accepted_contribution,Accepted_contribution,http://iasted#Accepting_manuscript,Accepting_manuscript


In [221]:
importlib.reload(mapneighbor)

INFO:root:logging info test


<module 'OTNeighborhood_TDA' from '/Users/yan/Google Drive/ontology-mapping/src/OTNeighborhood_TDA.py'>

In [242]:
src_url = "../data/mouse.owl"
tgt_url = "../data/human.owl"
refs_url = "../data/reference.rdf"
result_align = ensemble_map(src_url, tgt_url, embs_model)
maponto.evaluate(result_align, refs_url)

INFO:root:Python ensemble mapper info: map ../data/mouse.owl to ../data/human.owl
INFO:root:Read source with 15958 triples.
INFO:root:Read target with 35354 triples.
INFO:root:MAP CONCEPTS
INFO:root:The number of level_1 predicted mapping is 951.
INFO:root:Retrieving Synsets by WordNet...
INFO:root:The number of level_2 predicted mapping is 32
INFO:root:Skip the level_3 mapping.
INFO:root:Skip the level_4 mapping.
INFO:root:Skip the level_5 mapping.
INFO:root:Matching Label Embeddings by Optimal Transport...
INFO:root:Computing the Ground Embedding Costs between the Source and Target Points...
INFO:root:The shape of the cost matrix is (1757, 2315)
INFO:root:Computing Optimal Transport Plan...
INFO:root:Computing Wasserstein distance by the Sinkhorn algorithm...
INFO:root:Making Mappings from a Pairwise OT Plan Matrix by Mutual NN...
INFO:root:The number of level_6 predicted mapping is 920
INFO:root:TOTAL NUMBER OF MAPPINGS BETWEEN CONCPETS IS 1903
INFO:root:MAP OBJECT PROPERTIES
INFO:r

Total number of references is 1516
Total correctly predicted alignments is 1230
Total number of predicted is 1905
Precision is 0.6456692913385826
Recall is 0.8113456464379947
F1-Measure is 0.7190879859690149
