In [2]:
## This script computes AMR parses for a list of sentences (or sentence pairs)
# Run using 'Spring' environment with Python 3.8
# James Fodor 2023
# 
# Requires the amrlib package, see docs here: https://amrlib.readthedocs.io/en/latest/
# The specific parsing model is here https://github.com/SapienzaNLP/spring
#
# The code takes in a list of sentences (or sentence pairs) and writes .txt and .json files with the AMR parses of each sentence.
# See the AMRlib documentation linked above for the formatting of these files.

# load libraries
import json
import amrlib
import numpy as np
import sentence_embeds_processing as sep

from scipy.stats import spearmanr
from amrlib.evaluate.smatch_enhanced import match_pair

# Read the project's path configuration; later cells look up
# 'path_root', 'data_pairs_path' and 'data_nonpaired_path' in this dict.
with open("file_paths.json", "r") as file:
    file_paths_dict = json.load(file)

# Compact, fixed-width numpy printing for any arrays shown below.
np.set_printoptions(
    precision=2,
    threshold=2000,
    linewidth=200,
    suppress=True,
    floatmode='fixed',
)

### Define key functions and load libraries

In [3]:
## Load embeddings and parsing model

# NOTE: the sub-paths are written as raw strings. The previous literals relied
# on invalid escape sequences ('\W', '\C', '\S') being passed through verbatim,
# which Python flags with a DeprecationWarning/SyntaxWarning. The raw strings
# produce exactly the same runtime values.
# NOTE(review): the backslash separators make these paths Windows-specific.

# load ConceptNet embeddings
model_address = file_paths_dict['path_root'] + r'\Word Embeddings\ConceptNet Embeddings\numberbatch-en.txt'
conceptnet_embeds = sep.import_word_model(model_address)

# load AMR parse model (sentence-to-graph); used by AMR_parse_sent_pair below
model_address = file_paths_dict['path_root'] + r'\Sentence Encoders\amrlib-parsing'
stog = amrlib.load_stog_model(model_address)

In [7]:
## Functions to perform AMR parsing and calculate AMR similarity

# Function to parse sentences (sentences must end with a full stop!!)
def AMR_parse_sent_pair(sentence_pair):
    """Parse a list of sentences with the globally loaded `stog` model.

    Args:
        sentence_pair: list of sentence strings handed straight to
            `stog.parse_sents`. Despite the name, callers in this notebook
            also pass a single-element list.

    Returns:
        Whatever `stog.parse_sents` returns for the given sentences
        (one parse per input sentence).
    """
    return stog.parse_sents(sentence_pair)


# Calculate smatch similarity
def smatch_sim(graph_pair):
    """Score the overlap between two AMR parses using `match_pair`.

    Each parse is expected to be a string made of optional '#' metadata lines
    (e.g. '# ::snt <sentence>') followed by the graph itself. The metadata is
    removed line-by-line before matching. The previous approach of taking
    `split('.')[1]` only worked when the sentence contained exactly one full
    stop and the graph contained none; abbreviations, decimals or quoted
    values silently truncated the graph.

    Args:
        graph_pair: indexable with two AMR parse strings at positions 0 and 1.

    Returns:
        The first element of the `match_pair` result divided by 12.

    Raises:
        ValueError: if either entry has no graph text after the metadata
            lines are removed (for example a 'NULL' placeholder).
    """
    def _graph_only(amr_string):
        # Keep every non-blank line that is not a '#' metadata line.
        kept = [ln for ln in amr_string.splitlines()
                if ln.strip() and not ln.lstrip().startswith('#')]
        graph_text = '\n'.join(kept)
        if not graph_text.lstrip().startswith('('):
            raise ValueError('no AMR graph found in: ' + repr(amr_string))
        return graph_text

    out = match_pair((_graph_only(graph_pair[0]), _graph_only(graph_pair[1])))
    # NOTE(review): the divisor 12 is a hard-coded constant of unclear origin;
    # presumably out[0] is the matched-triple count, in which case a standard
    # Smatch score would normalise by the graphs' triple counts — confirm.
    return out[0]/12

### Load sentence datasets

In [6]:
## Show available datasets, as specified in the sep module
# `pairs` selects which dataset catalogue is used here and which branch the
# parsing cell further down takes.
pairs = False # specify if we are using paired data or list of sentences
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets

# List each dataset key next to its entry so one can be picked in the next cell
print('Available datasets:')
for dataset in datasets.keys():
    print(dataset, datasets[dataset])

Available datasets:
0 2014 Wehbe\Stimuli\Chapter_9_sentences_final
1 2017 Anderson\Stimuli\stimuli_final
2 2018 Pereira\Stimuli\stimuli_243sentences
3 2018 Pereira\Stimuli\stimuli_384sentences
4 2020 Alice Dataset\Stimuli\stimuli_sentences_final
5 2020 Zhang\Stimuli\test_sentences_final
6 2023 Fodor Dataset\Fodor2023-final240
7 2023 Fodor Dataset\Fodor2023-final192
8 2023 Fodor Dataset\Fodor2023-prelim


In [8]:
## Load sentence set (choose number from those printed above)
dataset = datasets[5]

# Both data roots are passed in; `pairs` is forwarded to the loader as well.
sentences_dict = sep.load_set_of_sentences(
    dataset,
    file_paths_dict['data_pairs_path'],
    file_paths_dict['data_nonpaired_path'],
    pairs,
)

# Name used as the prefix of every output file written by the parsing cell
full_dataset_name = sep.fix_sentence_dataset_name(dataset)

n = len(sentences_dict.keys()) # num sentences
print('\nloaded', dataset, 'with', n, 'sentences')


loaded 2020 Zhang\Stimuli\test_sentences_final with 95 sentences


### Compute and save AMR parsing

In [9]:
## Parse and save AMR parses using Spring parser
# SENTENCES MUST END WITH A FULL STOP!

def _parse_or_null(sentences, item_id):
    """Parse `sentences`, substituting 'NULL' for anything that cannot be parsed.

    Returns a list with one entry per input sentence. A failure of the whole
    call is reported and yields all-'NULL' placeholders instead of aborting
    the run. Only `Exception` is caught, so Ctrl-C still interrupts the loop.
    """
    try:
        parses = AMR_parse_sent_pair(sentences)
    except Exception as err:
        print('AMR parsing failed for id', item_id, '-', repr(err))
        return ['NULL'] * len(sentences)
    # Guard in case the parser hands back None for an individual sentence;
    # the reformatting step below needs strings.
    return ['NULL' if parse is None else parse for parse in parses]


def _tag_with_id(tree, idx):
    """Rename the '::snt' metadata field to '::snt-<idx>' inside a parse string."""
    return tree.replace('::snt', '::snt-'+str(idx))


def _save_parses(out_path, trees):
    """Write each parse string to `out_path`, followed by a blank line."""
    # 'with' closes the file even if a write fails part-way through
    with open(out_path, "w", encoding='utf-8') as save_file:
        for tree in trees:
            save_file.write(tree)
            save_file.write('\n\n')


AMR_graph_storage = {}

# Parse sentence pairs
if pairs:
    for pair_id in sentences_dict.keys():
        sent_1 = sentences_dict[pair_id][0]
        sent_2 = sentences_dict[pair_id][1]
        sent_parses = _parse_or_null(sentences_dict[pair_id][0:2], pair_id) # parses for both sentences in pair
        AMR_graph_storage[pair_id] = [sent_1, sent_2, sent_parses[0], sent_parses[1]]
        if pair_id%20==0: # progress indicator (ids are used as numbers here)
            print(pair_id)

    ## Reformat AMR parse dict for saving the two sets of sentences separately
    AMR_parse_sent_1 = [_tag_with_id(AMR_graph_storage[idx][2], idx) for idx in AMR_graph_storage]
    AMR_parse_sent_2 = [_tag_with_id(AMR_graph_storage[idx][3], idx) for idx in AMR_graph_storage]

    # save first and second set of sentences
    _save_parses(full_dataset_name+"_a_AMR_parse.txt", AMR_parse_sent_1)
    _save_parses(full_dataset_name+"_b_AMR_parse.txt", AMR_parse_sent_2)

# Parse single list of sentences (neuro data)
else:
    for sent_id in sentences_dict.keys():
        sent = sentences_dict[sent_id]
        sent_parse = _parse_or_null([sent], sent_id) # input needs to be a list
        AMR_graph_storage[sent_id] = [sent, sent_parse]
        if sent_id%20==0: # progress indicator (ids are used as numbers here)
            print(sent_id)

    ## Reformat AMR parse dict into a flat list of parse strings for saving
    AMR_parse_sent = [_tag_with_id(AMR_graph_storage[idx][1][0], idx) for idx in AMR_graph_storage]

    # save the parses
    _save_parses(full_dataset_name+"_AMR_parse.txt", AMR_parse_sent)

# Save full set of parses of sentences to json file
# (json.dump writes non-string dict keys as strings)
with open(full_dataset_name+"_AMR_parse.json", "w") as json_file:
    json.dump(AMR_graph_storage, json_file)


20
40
60
80
