In [11]:
## This script computes the Smatch, WWLK, and AMR similarity between the AMR parses of two sentences 
# Run using spring environment with Python 3.8
# James Fodor 2023
#
# This code requires that the input sentences have already been parsed using the 'Calc_parse_AMRs.ipynb' notebook.
# Three types of similarities are computed:
# 1. Smatch similarity: a standard graph similarity measure
# 2. AMR-sim: this is a custom measure developed for this project, see Fodor 2023 for details.
# 3. WLK-Wasser: incorporates both graph distance and graded word similarity, 
#                see https://github.com/flipz357/weisfeiler-leman-amr-metrics/blob/main/README.md


# load libraries
import json
import numpy as np
import glob
import itertools
import sentence_embeds_processing as sep

from amrlib.evaluate.smatch_enhanced import match_pair # for smatch score


# Base path for all data files.
# Backslashes are written as explicit '\\' escapes: sequences such as '\S' or '\Y' are not
# valid Python escapes and only work by accident (they raise a Deprecation/SyntaxWarning).
# The resulting string values are unchanged.
path_root = "D:\\Study and Projects\\School Work\\Year 25 - PhD\\Data\\"
# input data locations (built on top of path_root)
data_pairs_path = path_root+'\\Sentence Similarity Data\\Sentence Similarities Final\\'
data_nonpaired_path = path_root+'\\Neuroimaging Data\\'
# result locations, relative to path_root
sims_path = 'Analysis Results\\Sentence Similarities\\'
parses_path = 'Analysis Results\\Sentence Parses\\'

# load nltk lemmatizer
# The local nltk_data directory is added to nltk's search path before the lemmatizer is imported.
# (leading backslash written as '\\' instead of the invalid escape '\F'; the path is unchanged)
from nltk.data import path
path.append(path_root+"\\Frames and Structured Data\\FrameNet\\nltk_data")
from nltk.stem import WordNetLemmatizer

# numpy print options: fixed two-decimal output without scientific notation, wide lines
np.set_printoptions(precision=2, threshold=2000, linewidth=200, suppress=True, floatmode='fixed')

### Load sentence datasets, word embeddings, and AMR parses

In [3]:
## List the datasets registered in the sep module for the selected mode
pairs = False # True: paired-sentence datasets; False: plain lists of sentences
datasets = sep.available_pair_datasets if pairs==True else sep.available_nonpaired_datasets
print('Available datasets:')
# one line per dataset: its key followed by its location
for dataset, dataset_location in datasets.items():
    print(dataset,dataset_location)

Available datasets:
0 2014 Wehbe\Stimuli\Chapter_9_sentences_final
1 2017 Anderson\Stimuli\stimuli_final
2 2018 Pereira\Stimuli\stimuli_243sentences
3 2018 Pereira\Stimuli\stimuli_384sentences
4 2020 Alice Dataset\Stimuli\stimuli_sentences_final
5 2020 Zhang\Stimuli\test_sentences_final
6 2023 Fodor Dataset\Fodor2023-final240
7 2023 Fodor Dataset\Fodor2023-final192
8 2023 Fodor Dataset\Fodor2023-prelim


In [4]:
## Load sentence set (choose number from those printed above)
# the index is one of the keys of `datasets` listed by the previous cell
dataset = datasets[5]
# the sep helper receives both data directories together with the `pairs` flag
sentences_dict = sep.load_set_of_sentences(dataset, data_pairs_path, data_nonpaired_path, pairs)
# name used further down as the prefix of the parse file read and the result files written
full_dataset_name = sep.fix_sentence_dataset_name(dataset)
n = len(sentences_dict.keys()) # num sentences
print('\nloaded',dataset,'with',n,'sentences')


loaded 2020 Zhang\Stimuli\test_sentences_final with 95 sentences


In [5]:
## Load word embeddings and AMR parses
# Backslashes in the path fragments are written as explicit '\\' escapes (the originals relied on
# invalid escapes such as '\W' and '\A'); the resulting paths are unchanged.

# Load ConceptNet embeddings
model_address = path_root+'\\Word Embeddings\\ConceptNet Embeddings\\numberbatch-en.txt'
conceptnet_embeds = sep.import_word_model(model_address)

# Load AMR parsed sentence data (JSON file named after the selected dataset)
with open(path_root+parses_path+'\\AMR Parsing\\'+full_dataset_name+"_AMR_parse.json",'r') as file:
    AMR_graph_storage = json.load(file)

### Compute and save AMR and Smatch similarities

In [7]:
## Functions to perform AMR parsing and calculate AMR similarity

# Function to convert a string parse tree of a sentence to a dictionary of roles and values
def parse_tree_to_dict(parse_tree, tab_length=6):
    """Flatten an indented AMR parse string into a {role path: filler} dictionary.

    The graph is taken to start at the first line whose first non-blank character
    is '('; everything before it (e.g. the sentence line) is ignored. This replaces
    splitting on the first full stop, which broke for sentences with several
    periods, with no period, or for graphs that themselves contain a period.

    Every graph line of the form '<indent>:<role> ...' yields one entry. Its key is
    the dot-joined chain of enclosing roles, with the root stored under 'MAIN'
    (e.g. '.MAIN.ARG0'); its value is the last space-separated token of the line.
    Trailing brackets/quotes are NOT removed from the value.

    Args:
        parse_tree: string holding the sentence line(s) followed by the indented graph.
        tab_length: number of spaces per nesting level in the graph (default 6).

    Returns:
        dict mapping role-path strings to filler strings.

    Raises:
        ValueError: if no line of parse_tree starts with '('.
        KeyError: if a line is nested deeper than any role seen so far allows
            (i.e. the indentation skips a level).
    """
    lines = parse_tree.split('\n')

    # find the root line of the graph rather than relying on sentence punctuation
    graph_start = None
    for position, line in enumerate(lines):
        if line.lstrip().startswith('('):
            graph_start = position
            break
    if graph_start is None:
        raise ValueError("no AMR graph (a line starting with '(') found in parse tree")

    # keep only non-blank, non-metadata lines of the graph
    graph_lines = [line for line in lines[graph_start:]
                   if line.strip() and not line.lstrip().startswith('#')]
    graph_lines[0] = ':MAIN '+graph_lines[0] # give the root concept an explicit role name

    parse_tree_dict = {}
    last_element_of_level_n = {0:'MAIN'} # most recent role seen at each nesting level
    for line in graph_lines:
        # split at the FIRST colon only, so colons inside the filler (e.g. "10:30") survive
        leading_ws, colon, line_no_ws = line.partition(':')
        if not colon:
            continue # no role on this line, nothing to record

        # calculate current level in parse tree (irregular indentation is treated as level 0)
        if len(leading_ws)%tab_length != 0:
            list_level = 0
        else:
            list_level = len(leading_ws)//tab_length

        # prefix the line with the roles of all enclosing levels
        header_string = ''.join('.'+last_element_of_level_n[i] for i in range(list_level))+'.'
        item_list = (header_string+line_no_ws).split(' ')
        parse_tree_dict[item_list[0]] = item_list[-1]

        # remember this line's role as the latest element of its level
        last_element_of_level_n[list_level] = line_no_ws.split(' ')[0]

    return parse_tree_dict


# Helper: clean one role filler taken from a flattened AMR parse
def _normalise_amr_filler(raw_filler, lemmatizer):
    """Strip trailing parse punctuation and, for hyphenated concepts, the sense suffix.

    Quotes/closing brackets are stripped from both ends. If the filler contains a
    hyphen after its first character (e.g. 'want-01'), only the part before the first
    hyphen is kept and lemmatised as a verb. Fillers without a hyphen, and fillers that
    start with one (such as the bare '-' constant), are returned as they are.
    """
    filler = raw_filler.strip('\')')
    # str.find returns -1 (truthy) when no hyphen exists, so it must be compared explicitly;
    # the previous bare `if filler.find('-'):` was true for every filler not starting with '-'.
    if filler.find('-') > 0:
        filler = filler.split('-')[0] # remove sense number for verbs and lemmatise
        filler = lemmatizer.lemmatize(filler,'v')
    return filler


# Function to calculate AMR parse similarity
def AMR_graph_sim(graphs_pair):
    """Compute the role-wise AMR similarity of two parsed sentences.

    Both parse strings are flattened with parse_tree_to_dict. For every role path
    present in both graphs the two fillers are compared: by cosine similarity of
    their ConceptNet embeddings when both are available, otherwise by exact match
    (1 if equal, 0 if not). The overall score is twice the sum of these
    similarities divided by the total number of roles in the two graphs.

    Args:
        graphs_pair: sequence whose first two items are the AMR parse strings.

    Returns:
        tuple (overall_sim, semantic_filler_sims), where semantic_filler_sims maps
        (role, filler_1, filler_2) to the similarity of that shared role. overall_sim
        is 0.0 when neither graph has any role.
    """
    semantic_filler_sims = {}
    graph_dict_1 = parse_tree_to_dict(graphs_pair[0])
    graph_dict_2 = parse_tree_to_dict(graphs_pair[1])
    total_num_roles = len(graph_dict_1)+len(graph_dict_2)

    lemmatizer = WordNetLemmatizer()
    for role, raw_filler_1 in graph_dict_1.items():
        if role not in graph_dict_2:
            continue # only roles shared by both graphs contribute

        filler_1 = _normalise_amr_filler(raw_filler_1, lemmatizer)
        filler_2 = _normalise_amr_filler(graph_dict_2[role], lemmatizer)

        try: # when embeddings are available
            word_embedding_1 = conceptnet_embeds[filler_1] # get embeddings for filler
            word_embedding_2 = conceptnet_embeds[filler_2]
        except KeyError: # if the word doesn't have embeddings, fall back to exact match
            similarity = 1 if filler_1==filler_2 else 0
        else:
            # kept outside the try so a KeyError raised inside cosine_sim is not silently masked
            similarity = sep.cosine_sim(word_embedding_1, word_embedding_2)
        semantic_filler_sims[(role,filler_1,filler_2)] = similarity

    if total_num_roles == 0:
        return (0.0,semantic_filler_sims) # avoid dividing by zero for two empty graphs

    overall_sim = 2*np.sum(list(semantic_filler_sims.values()))/total_num_roles # average rolewise similarity
    return (overall_sim,semantic_filler_sims)


# Helper: cut the graph part out of a 'sentence + AMR graph' string
def _smatch_graph_text(parse_tree):
    """Return the AMR graph portion of parse_tree, preceded by a newline.

    The graph is taken to start at the first line beginning with '('. This avoids
    depending on the sentence containing exactly one full stop. If no such line
    exists, the text after the first full stop is returned as before.
    """
    lines = parse_tree.split('\n')
    for position, line in enumerate(lines):
        if line.lstrip().startswith('('):
            # leading newline kept so the string equals what split('.')[1] produced
            return '\n'+'\n'.join(lines[position:])
    return parse_tree.split('.')[1]


# Calculate smatch similarity
def smatch_sim(graph_pair, normaliser=12):
    """Score two AMR parses with amrlib's match_pair.

    Args:
        graph_pair: sequence whose first two items are 'sentence + AMR graph' strings.
        normaliser: constant the first value returned by match_pair is divided by
            (default 12, the value previously hard-coded).

    Returns:
        The first element of match_pair's result divided by normaliser.

    NOTE(review): the first element is presumably the number of matched triples;
    dividing it by a fixed constant is not the usual Smatch F-score - confirm that
    this normalisation is intended.
    """
    out = match_pair((_smatch_graph_text(graph_pair[0]),_smatch_graph_text(graph_pair[1])))
    return out[0]/normaliser

In [12]:
## Compute AMR and Smatch similarities.
# Note: parse strings that contain no full stop get one appended to the sentence line first,
# because the similarity functions may rely on it to separate the sentence from its parse tree.
# This is now done for paired data AND for lists of sentences (previously paired data only).

def _ensure_full_stop(parse_tree):
    """Return parse_tree with '.' inserted before its first newline if it contains no '.' at all."""
    if parse_tree.find('.')==-1:
        return parse_tree.replace('\n','.\n',1)
    return parse_tree

# Prepare storage for sentence similarities (plain lists while looping; appending to a
# numpy array copies the whole array on every iteration)
sim_funcs = ['AMR','smatch']
sim_lists = {function: [] for function in sim_funcs}

# Collect the pairs of parse trees to compare
if pairs:
    # paired data: both parses are stored with each sentence id
    parse_tree_pairs = [AMR_graph_storage[sentence_id][2:] for sentence_id in AMR_graph_storage.keys()]
else:
    # lists of sentences (neuro): all pairwise comparisons
    sent_id_pairs = list(itertools.combinations(AMR_graph_storage.keys(), 2))
    parse_tree_pairs = [[AMR_graph_storage[sent_id_1][1][0],AMR_graph_storage[sent_id_2][1][0]]
                        for sent_id_1,sent_id_2 in sent_id_pairs]

# Compute and store similarities
for parse_tree_pair in parse_tree_pairs:
    AMR_parse_trees = list(parse_tree_pair)
    for position in (0,1): # add periods after a sentence if needed
        AMR_parse_trees[position] = _ensure_full_stop(AMR_parse_trees[position])
    sim_lists['AMR'].append(AMR_graph_sim(AMR_parse_trees)[0])
    sim_lists['smatch'].append(smatch_sim(AMR_parse_trees))

# Convert to arrays (same float arrays as before, built in one step)
sim_storage = {function: np.array(sim_lists[function], dtype=float) for function in sim_funcs}

# Save results (one text file per similarity type, written to the working directory)
for sim_type in sim_funcs:
    np.savetxt(full_dataset_name+'_'+sim_type+'_similarities.txt', sim_storage[sim_type], fmt='%f')
print('saved')

saved


### Compute and save WLK Wasser similarity

In [13]:
# Build the two aligned files of AMR graphs, one graph per sentence pair (only run when using neuro data)
if pairs==False:
    sent_id_pairs = list(itertools.combinations(AMR_graph_storage.keys(), 2))

    # Each graph's '::snt' tag is renamed to '::snt-<k>', where k is the 1-based number of
    # the pair it belongs to, and a newline is appended.
    graph_pair_list = [
        [AMR_graph_storage[sentence_id][1][0].replace('::snt', '::snt-'+str(pair_number))+'\n'
         for sentence_id in id_pair]
        for pair_number,id_pair in enumerate(sent_id_pairs, start=1)
    ]

    # column 0 holds the first sentence of every pair, column 1 the second
    graph_pair_array = np.array(graph_pair_list)
    WLK_sent_a_set = graph_pair_array[:,0]
    WLK_sent_b_set = graph_pair_array[:,1]

    np.savetxt(full_dataset_name+"_WLK_sent_a_AMR_parse.txt", WLK_sent_a_set, fmt='%s')
    np.savetxt(full_dataset_name+"_WLK_sent_b_AMR_parse.txt", WLK_sent_b_set, fmt='%s')

In [14]:
# List the '*_AMR_parse.txt' files in the working directory. The '_WLK_sent_a_' and
# '_WLK_sent_b_' files written by the cell above are the inputs of the WLK call below.
glob.glob('*_AMR_parse.txt')

['Zhang_neuro_WLK_sent_a_AMR_parse.txt',
 'Zhang_neuro_WLK_sent_b_AMR_parse.txt']

In [16]:
# Compute WLK similarity of graphs
# Use the filenames printed above to adjust the call below.
# Note: the two .txt files containing AMR parses need to have a sent id for each (e.g. ::snt-1)

print('WLK sim:')
!python "AMR_wlk_wasser\main_wlk_wasser.py" -a "Zhang_neuro_WLK_sent_a_AMR_parse.txt" -b "Zhang_neuro_WLK_sent_b_AMR_parse.txt" > "Zhang_neuro_WLK_Wasser_similarities.txt"

WLK sim:
