In [1]:
## This script computes the Smatch, WWLK, and AMR similarity between the AMR parses of two sentences 
# Run using spring environment with Python 3.8
# James Fodor 2023
#
# This code requires the input sentences have been parsed using the 'Calc_parse_AMRs.ipynb' file.
# Three types of similarities are computed:
# 1. Smatch similarity: a standard graph similarity measure
# 2. AMR-sim: this is a custom measure developed for this project, see Fodor 2023 for details.
# 3. WLK-Wasser: incorporates both graph distance and graded word similarity, 
#                see https://github.com/flipz357/weisfeiler-leman-amr-metrics/blob/main/README.md


# load libraries
import json
import numpy as np
import glob
import itertools
import sentence_embeds_processing as sep

from amrlib.evaluate.smatch_enhanced import match_pair # for smatch score

# file containing path directories
with open("file_paths.json", "r") as file:
    file_paths_dict = json.load(file)

# load nltk lemmatizer
from nltk.data import path
path.append(file_paths_dict['path_root']+"\Frames and Structured Data\\FrameNet\\nltk_data")
from nltk.stem import WordNetLemmatizer

# numpy print options
np.set_printoptions(precision=2, threshold=2000, linewidth=200, suppress=True, floatmode='fixed')

### Load sentence datasets, word embeddings, and AMR parses

In [2]:
## Show available datasets, as specified in the sep module
pairs = True # specify if we are using paired data or list of sentences
if pairs==True:
    datasets = sep.available_pair_datasets
else:
    datasets = sep.available_nonpaired_datasets
print('Available datasets:')
for dataset in datasets.keys():
    print(dataset,datasets[dataset])

Available datasets:
0 GS2011_processed
1 KS2013_processed
2 Fodor_pilot_2022
3 STS131_processed
4 SICK_relatedness
5 STR_processed
6 STSb_captions_test
7 STSb_forums_test
8 STSb_headlines_test
9 STSb_test
10 STS3k_all


In [6]:
## Load sentence set 

# choose number from those printed above
dataset_name = datasets[5]

# load sentence set into dictionary depending on type
if pairs == True:
    sentences_dict = sep.load_set_of_sentences(dataset_name, file_paths_dict['data_pairs_path'], pairs)
else:
    sentences_dict = sep.load_set_of_sentences(dataset_name, file_paths_dict['neuro_root'], pairs)
n = len(sentences_dict.keys()) # num sentences
print('\nloaded',dataset_name,'with',n,'sentences')

# store in list
sentences = []
if pairs==True: # use this for sentence similarity pair data
    sentences.append(list(np.array(list(sentences_dict.values()))[:,0].flatten()))
    sentences.append(list(np.array(list(sentences_dict.values()))[:,1].flatten()))
else: # use this for neuroimaging data/list of sentences
    sentences.append(list(sentences_dict.values()))


loaded STR_processed with 5500 sentences


In [41]:
## Load word embeddings and AMR parses

# Load ConceptNet embeddings
model_address = file_paths_dict['path_root']+'\Word Embeddings\ConceptNet Embeddings\\numberbatch-en.txt'
conceptnet_embeds = sep.import_word_model(model_address)

# Load AMR parsed sentence data
with open(file_paths_dict['parses_path']+'\AMR Parsing\\'+dataset_name+"_AMR_parse.json",'r') as file:
    AMR_graph_storage = json.load(file)

### Compute and save AMR and Smatch similarities

In [8]:
## Functions to perform AMR parsing and calculate AMR similarity

# Function to convert a string parse tree of a sentence to a dictionary of roles and values
def parse_tree_to_dict(parse_tree): 
    # extract parse tree only
    parse = parse_tree.split('.')[1] # uses the period in the sentence to separate it from the parse tree
    parse_list = parse.split('\n')
    parse_list[1]=':MAIN '+parse_list[1]

    # construct a list of parse tree terms
    new_parse_list = []
    tab_length = 6 # length of indentation in parse tree
    last_element_of_level_n = {0:'MAIN'}
    for line in parse_list:
        # skip any blank lines
        if line=='': 
            continue
        
        # extract terms
        leading_ws = line.split(':')[0] # get leading white space
        line_no_ws = line.split(':')[1] # get terms following ws
        
        # calculate current level in parse tree
        if len(leading_ws)%tab_length != 0:
            list_level = 0
        else:
            list_level = int(len(leading_ws)/6) 
        
        # extract list item at current parse tree level
        header_string = ''
        for i in range(0,list_level):
            header_string = header_string+'.'+last_element_of_level_n[i]
        header_string = header_string+'.'
        new_parse_list.append(str(header_string)+line_no_ws)
        
        # append parse tree item to list
        last_element_of_level_n[list_level] = line_no_ws.split(' ')[0]
        
    # turn parse tree list into dictionary
    parse_tree_dict = {}
    for item in new_parse_list:
        item_list = item.split(' ')
        parse_tree_dict[item_list[0]] = item_list[-1]
        
    return parse_tree_dict


# Function to calculate AMR parse similarity
def AMR_graph_sim(graphs_pair):
    semantic_filler_sims = {}
    graph_dict_1 = parse_tree_to_dict(graphs_pair[0])
    graph_dict_2 = parse_tree_to_dict(graphs_pair[1])
    total_num_roles = len(graph_dict_1.keys())+len(graph_dict_2.keys())
    
    lemmatizer = WordNetLemmatizer()
    for role in graph_dict_1.keys():
        if role in graph_dict_2.keys():
            filler_1 = graph_dict_1[role].strip('\')') # get filler for role
            filler_2 = graph_dict_2[role].strip('\')')
            
            if filler_1.find('-'):
                filler_1 = filler_1.split('-')[0] # remove sense number for verbs and lemmatise
                filler_1 = lemmatizer.lemmatize(filler_1,'v')
            if filler_2.find('-'):
                filler_2 = filler_2.split('-')[0]
                filler_2 = lemmatizer.lemmatize(filler_2,'v')
            
            try: # when embeddings are available
                word_embedding_1 = conceptnet_embeds[filler_1] # get embeddings for filler
                word_embedding_2 = conceptnet_embeds[filler_2]
                semantic_filler_sims[(role,filler_1,filler_2)] = sep.cosine_sim(word_embedding_1, word_embedding_2) # store in dict
            except KeyError: # if the word doesn't have embeddings
                if filler_1==filler_2:
                    semantic_filler_sims[(role,filler_1,filler_2)] = 1
                else:
                    semantic_filler_sims[(role,filler_1,filler_2)] = 0
        
    overall_sim = 2*np.sum(list(semantic_filler_sims.values()))/total_num_roles # average rolewise similarity
    return (overall_sim,semantic_filler_sims)


# Calculate smatch similarity
def smatch_sim(graph_pair):
    out = match_pair((graph_pair[0].split('.')[1],graph_pair[1].split('.')[1]))
    return out[0]/12

In [100]:
## Compute AMR and Smatch similarities.

# Load AMR parsed sentence data
with open(file_paths_dict['parses_path']+'\AMR Parsing\\'+dataset_name+"_AMR_parse.json",'r') as file:
    AMR_graph_storage = json.load(file)
    
# Note: If this code causes errors its almost certainly because one or more sentences don't end with a full stop.

# Prepare storage array for sentence similarities
sim_storage = {}
sim_funcs = ['AMR','smatch']
for function in sim_funcs:
    sim_storage[function] = np.array([])
    
# Compute similarities for all sentence pairs in dataset
if pairs==True:
    for sentence_id in AMR_graph_storage.keys():
        AMR_parse_trees = AMR_graph_storage[sentence_id][2:]
        
        # add periods after a sentence if needed
        if AMR_parse_trees[0].find('.')==-1: 
            AMR_parse_trees[0] = AMR_parse_trees[0].replace('\n','.\n',1)
        if AMR_parse_trees[1].find('.')==-1:
            AMR_parse_trees[1] = AMR_parse_trees[1].replace('\n','.\n',1)
        print(sentence_id, AMR_parse_trees)
        # compute and store similarities
        sim_storage['AMR'] = np.append(sim_storage['AMR'],AMR_graph_sim(AMR_parse_trees)[0])
        sim_storage['smatch'] = np.append(sim_storage['smatch'],smatch_sim(AMR_parse_trees))

# Compute similarities for all pairwise comparisons for lists of sentences (neuro)
elif pairs==False:
    sent_id_pairs = list(itertools.combinations(AMR_graph_storage.keys(), 2))
    for sentence_id_pair in sent_id_pairs:
        sent_id_1 = sentence_id_pair[0]
        sent_id_2 = sentence_id_pair[1]
        AMR_parse_tree_1 = AMR_graph_storage[sent_id_1][1][0]
        AMR_parse_tree_2 = AMR_graph_storage[sent_id_2][1][0]
        AMR_parse_trees = [AMR_parse_tree_1,AMR_parse_tree_2]
        sim_storage['AMR'] = np.append(sim_storage['AMR'],AMR_graph_sim(AMR_parse_trees)[0])
        sim_storage['smatch'] = np.append(sim_storage['smatch'],smatch_sim(AMR_parse_trees))
    
# Save results
for sim_type in sim_funcs:
    np.savetxt(dataset_name+'_'+sim_type+'_similarities.txt', sim_storage[sim_type], fmt='%f')
print(dataset_name+'_'+sim_type+'_similarities.txt'+' saved')

1 ['# ::snt It that happens, just pull the plug.\n(z1 / pull-06\n      :mode imperative\n      :ARG0 (z2 / you)\n      :ARG1 (z3 / plug)\n      :mod (z4 / just)\n      :condition (z5 / that))', '# ::snt if that ever happens, just pull the plug.\n(z1 / pull-06\n      :ARG0 (z2 / you)\n      :ARG1 (z3 / plug)\n      :mod (z4 / just)\n      :condition (z5 / event\n            :mod (z6 / that)\n            :time (z7 / ever)))']
2 ['# ::snt A black dog running through water.\n(z1 / dog\n      :ARG1-of (z2 / black-04)\n      :ARG0-of (z3 / run-02\n            :path (z4 / water)))', '# ::snt A black dog is running through some water.\n(z1 / run-02\n      :ARG0 (z2 / dog\n            :ARG1-of (z3 / black-04))\n      :path (z4 / water\n            :quant (z5 / some)))']
3 ["# ::snt I've been searchingthe entire abbey for you.\n(z1 / search-01\n      :ARG0 (z2 / i)\n      :ARG1 (z3 / abbey\n            :mod (z4 / entire))\n      :ARG2 (z5 / you))", "# ::snt I'm looking for you all over the abbey

Unmatched parenthesis at position 1 in processing 
(


4240 ['# ::snt indicate the method [18] of implementation.\n(z1 / indicate-01\n      :ARG1 (z2 / method\n            :li 18\n            :manner-of (z3 / implement-01)))', '# ::snt Indicate the groups that the attendee belongs to.\n(z1 / indicate-01\n      :mode imperative\n      :ARG0 (z2 / you)\n      :ARG1 (z3 / group\n            :ARG1-of (z4 / belong-01\n                  :ARG0 (z5 / person\n                        :ARG0-of (z6 / attend-01)))))']
4241 ['# ::snt go to dis site from ebay.\n(z1 / go-02\n      :mode imperative\n      :ARG0 (z2 / you)\n      :ARG4 (z3 / site\n            :mod (z4 / company\n                  :name (z5 / name\n                        :op1 "eBay"))))', '# ::snt Try wwwallmusiccom, it\'s my favorite site.\n(z1 / try-01\n      :mode imperative\n      :ARG0 (z2 / you)\n      :ARG1 (z3 / url-entity\n            :value "wwwallmusiccom"\n            :domain (z4 / site\n                  :mod (z5 / favorite)\n                  :poss (z6 / i))))']
4242 ['# ::snt

Unmatched parenthesis at position 1 in processing 
(


4681 ['# ::snt The child is well bundled as he walks in the snow.\n(z1 / bundle-01\n      :ARG1 (z2 / child)\n      :time (z3 / walk-01\n            :ARG0 z2\n            :ARG2 (z4 / snow-01))\n      :ARG1-of (z5 / well-09))', '# ::snt A small child getting a taste of dessert mix.\n(z1 / get-01\n      :ARG0 (z2 / child\n            :mod (z3 / small))\n      :ARG1 (z4 / taste\n            :mod (z5 / mix-01\n                  :ARG1 (z6 / dessert))))']
4682 ['# ::snt Remember the mother, avenge the father.\n(z1 / and\n      :op1 (z2 / remember-01\n            :mode imperative\n            :ARG0 (z3 / you)\n            :ARG1 (z4 / person\n                  :ARG0-of (z5 / have-rel-role-91\n                        :ARG2 (z6 / mother))))\n      :op2 (z7 / avenge-01\n            :mode imperative\n            :ARG0 z3\n            :ARG1 (z8 / person\n                  :ARG0-of (z9 / have-rel-role-91\n                        :ARG2 (z10 / father)))))', '# ::snt Leaving behind a huge debt for my m

### Compute and save WLK Wasser similarity

In [101]:
# Prepare set of sentence pairs AMR graphs (only run when using neuro data)
if pairs==False:
    graph_pair_list = []
    sent_id_pairs = list(itertools.combinations(AMR_graph_storage.keys(), 2))
    for index,id_pair in enumerate(sent_id_pairs):
        graph_1 = AMR_graph_storage[id_pair[0]][1][0].replace('::snt', '::snt-'+str(index+1))+'\n' # need to adjust naming
        graph_2 = AMR_graph_storage[id_pair[1]][1][0].replace('::snt', '::snt-'+str(index+1))+'\n' 
        graph_pair_list.append([graph_1,graph_2])
        
    WLK_sent_a_set = np.array(graph_pair_list)[:,0]
    WLK_sent_b_set = np.array(graph_pair_list)[:,1]

    np.savetxt(dataset_name+"_WLK_sent_a_AMR_parse.txt", WLK_sent_a_set, fmt='%s')
    np.savetxt(dataset_name+"_WLK_sent_b_AMR_parse.txt", WLK_sent_b_set, fmt='%s')

In [102]:
# Get filenames for the code in the cell below (first two .txt files)
glob.glob('*_AMR_parse.txt')

[]

In [10]:
# Compute WLK similarity of graphs
# Use the filenames printed above to adjust the call below.
# Note: the two .txt files containing AMR parses need to have a sent id for each (e.g. ::snt-1)

print('WLK sim:')
!python "AMR_wlk_wasser\main_wlk_wasser.py" -a "STR_processed_a_AMR_parse.txt" -b "STR_processed_b_AMR_parse.txt" > "STR_processed_WLK_Wasser_similarities.txt"

WLK sim:


