In [1]:
%matplotlib inline

import spacy
from spacy import displacy
import networkx as nx

%run ../script/webnlg.py

# Load the 'train' and 'dev' parts through WebNLGCorpus.load.
# NOTE(review): WebNLGCorpus is not imported anywhere in this notebook; presumably
# it is defined by the `%run ../script/webnlg.py` line above — confirm.
train_dev = WebNLGCorpus.load(['train', 'dev'])

# Make graphs from dependency trees

In [2]:
def make_graph(doc):
    """Build a directed dependency graph from a parsed document.

    Every token of ``doc`` becomes a node whose ``token`` attribute holds the
    token itself. Every token whose dependency label is not ``'ROOT'``
    contributes an edge ``head -> token`` carrying the label as ``dep``.

    Parameters
    ----------
    doc : iterable of tokens
        Each token must expose ``dep_`` and ``head`` (a spaCy ``Doc`` in this
        notebook).

    Returns
    -------
    networkx.DiGraph
        The dependency graph. A root token without dependents (e.g. a
        one-token document) is present as an isolated node.
    """
    g = nx.DiGraph()

    for token in doc:

        # Root tokens have no incoming arc, so they only ever appear as a head.
        if token.dep_ != 'ROOT':

            g.add_edge(token.head, token, dep=token.dep_)
            g.nodes[token.head]['token'] = token.head
            g.nodes[token]['token'] = token

    # add_edge only creates nodes that take part in an edge, so a root with no
    # dependents would otherwise be missing from the graph. Adding the missing
    # tokens afterwards keeps the order of the nodes created above unchanged.
    for token in doc:
        if token not in g:
            g.add_node(token, token=token)

    return g

# Calculate graph distances between dependency trees, using word embeddings

In [3]:
from itertools import islice
import numpy as np
from networkx.algorithms import similarity

def p(x, y):
    """Node substitution cost for graph edit distance: ``1 - similarity``.

    Parameters
    ----------
    x, y : dict
        Node attribute dictionaries. Each must hold, under ``'token'``, an
        object exposing ``similarity(other)`` (spaCy tokens in this notebook).

    Returns
    -------
    float
        ``1 - x['token'].similarity(y['token'])``, never below ``0.0``.
    """
    first, second = x['token'], y['token']

    cost = 1 - first.similarity(second)

    # networkx expects non-negative edit costs. A similarity marginally above 1
    # (e.g. from floating-point round-off) would otherwise give a negative cost.
    # Written as `cost < 0` so that a NaN cost is passed through, not hidden.
    return 0.0 if cost < 0 else cost

def calculate_distance(hypothesis_text, g, doc=None, n=10):
    """Print and return an approximate graph edit distance to ``g``.

    ``hypothesis_text`` is parsed with the module-level ``nlp`` pipeline and
    converted with ``make_graph``. The result is the last of the first ``n``
    values yielded by ``optimize_graph_edit_distance``, using ``p`` as the
    node substitution cost.

    Parameters
    ----------
    hypothesis_text : str
        Text to parse and compare.
    g : graph
        Graph the hypothesis graph is compared against.
    doc : optional
        If given, it is only printed as "doc 2" for context.
    n : int
        Upper bound on how many approximations are consumed.

    Returns
    -------
    The last approximation consumed.
    """
    hypothesis_doc = nlp(hypothesis_text)
    candidate_graph = make_graph(hypothesis_doc)

    print(f'doc 1: "{hypothesis_doc}"')
    if doc is not None:
        print(f'doc 2: "{doc}"')

    # The generator is lazy: nothing is computed until islice pulls from it.
    approximations = similarity.optimize_graph_edit_distance(
        candidate_graph,
        g,
        node_subst_cost=p,
    )
    consumed = list(islice(approximations, n))
    sim = consumed[-1]

    print(f'Edit distance = {sim}')
    print("\n")

    return sim
    
def calculate_distances(hypothesis_text, gs, docs, n=10):
    """Compare one hypothesis against several graphs and print summary stats.

    ``gs`` and ``docs`` are walked in parallel with ``zip``; each pair is
    passed to ``calculate_distance`` together with ``hypothesis_text`` and
    ``n``. The mean, minimum and standard deviation of the resulting
    distances are then printed. Nothing is returned.
    """
    sims = []
    for reference_graph, reference_doc in zip(gs, docs):
        distance = calculate_distance(hypothesis_text, reference_graph, reference_doc, n)
        sims.append(distance)

    print(f'Mean: {np.mean(sims)}')
    print(f'Min: {min(sims)}')
    print(f'Std: {np.std(sims)}')

# Example

In [4]:
# Module-level spaCy pipeline. calculate_distance() reads this global `nlp` to
# parse the hypothesis text, so this cell must run before any distance is computed.
nlp = spacy.load('en_core_web_lg')

In [5]:
# Pick the corpus entry with idx '5_40' to use as the running example.
e = train_dev.sample(idx='5_40')

# Bare last expression so the notebook displays the entry.
e

Triple info: {'category': 'Food', 'eid': 'Id41', 'idx': '5_40', 'ntriples': 1}

	Modified triples:

Asam_pedas | alternativeName | "Asam padeh"


	Lexicalizations:

The alternative name for asam pedas is asam padeh.
Asam padeh is also known as Asam pedas.
An alternative name for Asam pedas is Asam padeh.

In [6]:
# Parse every lexicalization of the sampled entry with the spaCy pipeline, then
# turn each parse into a dependency graph; docs[i] and gs[i] describe the same sentence.

docs = [nlp(lex) for lex in e.lexes()]
gs = [make_graph(doc) for doc in docs]

# Calculating graph edit distances between dependency trees using word embeddings

In [7]:
# Passed as `n` to calculate_distances in the cells below: at most this many
# successive approximations are taken from optimize_graph_edit_distance, and the
# last one taken is reported as the distance.
N = 2

In [8]:
# Hypothesis that restates the sampled triple in different words.
text = "I call Asam pedas as Asam padeh."
calculate_distances(text, gs, docs, n=N)

doc 1: "I call Asam pedas as Asam padeh."
doc 2: "The alternative name for asam pedas is asam padeh."
Edit distance = 12.318304419517517


doc 1: "I call Asam pedas as Asam padeh."
doc 2: "Asam padeh is also known as Asam pedas."
Edit distance = 16.000961780548096


doc 1: "I call Asam pedas as Asam padeh."
doc 2: "An alternative name for Asam pedas is Asam padeh."
Edit distance = 12.329225301742554


Mean: 13.549497167269388
Min: 12.318304419517517
Std: 1.7334529854283007


In [9]:
# Hypothesis on a topic unrelated to the sampled entry, for comparison.
text = "The New York Times is a good online newspaper."
calculate_distances(text, gs, docs, n=N)

doc 1: "The New York Times is a good online newspaper."
doc 2: "The alternative name for asam pedas is asam padeh."
Edit distance = 18.146992914378643


doc 1: "The New York Times is a good online newspaper."
doc 2: "Asam padeh is also known as Asam pedas."
Edit distance = 20.262665562331676


doc 1: "The New York Times is a good online newspaper."
doc 2: "An alternative name for Asam pedas is Asam padeh."
Edit distance = 19.872984491288662


Mean: 19.42754765599966
Min: 18.146992914378643
Std: 0.9193578081096829


In [10]:
# Use the first lexicalization itself as the hypothesis; the same sentence is
# also the first reference in docs/gs.
text = e.lexes()[0]
calculate_distances(text, gs, docs, n=N)

doc 1: "The alternative name for asam pedas is asam padeh."
doc 2: "The alternative name for asam pedas is asam padeh."
Edit distance = 0.0


doc 1: "The alternative name for asam pedas is asam padeh."
doc 2: "Asam padeh is also known as Asam pedas."
Edit distance = 7.370896875858307


doc 1: "The alternative name for asam pedas is asam padeh."
doc 2: "An alternative name for Asam pedas is Asam padeh."
Edit distance = 0.4030686020851135


Mean: 2.59132182598114
Min: 0.0
Std: 3.3836734788838547


In [11]:
# Use the second lexicalization itself as the hypothesis; the same sentence is
# also the second reference in docs/gs.
text = e.lexes()[1]
calculate_distances(text, gs, docs, n=N)

doc 1: "Asam padeh is also known as Asam pedas."
doc 2: "The alternative name for asam pedas is asam padeh."
Edit distance = 10.155589371919632


doc 1: "Asam padeh is also known as Asam pedas."
doc 2: "Asam padeh is also known as Asam pedas."
Edit distance = 0.0


doc 1: "Asam padeh is also known as Asam pedas."
doc 2: "An alternative name for Asam pedas is Asam padeh."
Edit distance = 10.201723039150238


Mean: 6.785770803689957
Min: 0.0
Std: 4.798301514006202


In [12]:
# Use the third lexicalization itself as the hypothesis; the same sentence is
# also the third reference in docs/gs.
text = e.lexes()[2]
calculate_distances(text, gs, docs, n=N)

doc 1: "An alternative name for Asam pedas is Asam padeh."
doc 2: "The alternative name for asam pedas is asam padeh."
Edit distance = 0.4030686020851135


doc 1: "An alternative name for Asam pedas is Asam padeh."
doc 2: "Asam padeh is also known as Asam pedas."
Edit distance = 7.417030543088913


doc 1: "An alternative name for Asam pedas is Asam padeh."
doc 2: "An alternative name for Asam pedas is Asam padeh."
Edit distance = 0.0


Mean: 2.6066997150580087
Min: 0.0
Std: 3.4053955307056554
