In [2]:
## This script constructs the AMR parsing for a list of sentences
# Run using 'Spring' environment with Python 3.8
# James Fodor 2023
# 
# Requires the amrlib package, see docs here: https://amrlib.readthedocs.io/en/latest/
# The specific parsing model is here https://github.com/SapienzaNLP/spring
#
# The code takes in a list of sentences and returns .txt and .json files with the AMR parses of each sentence.
# See the AMRlib documentation linked above for the formatting of these files.

# load libraries
import json
import amrlib
import numpy as np
import sentence_embeds_processing as sep

from scipy.stats import spearmanr
from amrlib.evaluate.smatch_enhanced import match_pair

# Read the JSON file that maps path names to directories on this machine
# NOTE(review): absolute, machine-specific path -- must be edited before running elsewhere
with open("D:\\My Code\\Python\\2023_02 fMRI RSA Analysis\\file_paths.json", "r") as paths_handle:
    file_paths_dict = json.loads(paths_handle.read())

# Print numpy arrays with two fixed decimals, no scientific notation, and wide rows
np.set_printoptions(
    precision=2,
    threshold=2000,
    linewidth=200,
    suppress=True,
    floatmode='fixed',
)

### Define key functions and load libraries

In [3]:
## Load embeddings and parsing model
# Both paths are built by appending to file_paths_dict['path_root'], so that entry
# is expected to end with a path separator -- TODO confirm in file_paths.json.

# load ConceptNet embeddings
# (every backslash is escaped; the previous literal contained the invalid escape '\C',
#  which only worked by accident and triggers a warning on current Python versions)
model_address = file_paths_dict['path_root']+'Word Embeddings\\ConceptNet Embeddings\\numberbatch-en.txt'
conceptnet_embeds = sep.import_word_model(model_address)

# load AMR parse model (sentence-to-graph model used by AMR_parse_sent_pair)
model_address = file_paths_dict['path_root']+'Sentence Encoders\\amrlib-parsing'
stog = amrlib.load_stog_model(model_address)

In [4]:
## Functions to perform AMR parsing and calculate AMR similarity

def AMR_parse_sent_pair(sentence_pair):
    """Parse a list of sentences with the module-level `stog` model.

    Sentences must end with a full stop.

    Args:
        sentence_pair: list of sentences to parse (a pair, or a one-element list).

    Returns:
        Whatever `stog.parse_sents` returns for the input -- presumably one AMR
        graph string per sentence, in input order (confirm against amrlib docs).
    """
    return stog.parse_sents(sentence_pair)


def amr_graph_body(graph):
    """Return an AMR graph string with its '#'-prefixed metadata lines removed.

    The remaining lines keep their original order and internal line breaks;
    leading/trailing whitespace of the result is stripped.
    """
    body_lines = [line for line in graph.splitlines() if not line.lstrip().startswith('#')]
    return '\n'.join(body_lines).strip()


# Calculate smatch similarity
def smatch_sim(graph_pair):
    """Score the similarity of two AMR graphs using amrlib's `match_pair`.

    Args:
        graph_pair: sequence whose first two items are AMR graph strings,
            optionally preceded by '# ::snt ...' metadata lines.

    Returns:
        The first element of the `match_pair` result divided by 12.

    Metadata is dropped line-wise rather than by splitting on '.', so sentences
    or graphs that contain extra full stops (abbreviations, decimals) are no
    longer truncated, and a sentence without a full stop no longer raises IndexError.
    """
    graph_bodies = (amr_graph_body(graph_pair[0]), amr_graph_body(graph_pair[1]))
    out = match_pair(graph_bodies)
    # NOTE(review): the divisor 12 is unexplained in this file; out[0] is presumably
    # the matched-triple count, which would make this a fixed-scale score rather than
    # a smatch F-score -- confirm the intended normalisation.
    return out[0]/12

### Load sentence datasets

In [8]:
## List the datasets registered in the sep module for the chosen data type
pairs = True # True: sentence-pair datasets; False: plain lists of sentences
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets

# one line per dataset: its index followed by its name
print('Available datasets:')
for dataset, dataset_label in datasets.items():
    print(dataset, dataset_label)

Available datasets:
0 GS2011_processed
1 KS2013_processed
2 Fodor_pilot_2022
3 STS131_processed
4 SICK_relatedness
5 STR_processed
6 STSb_captions_test
7 STSb_forums_test
8 STSb_headlines_test
9 STSb_test
10 STS3k_all


In [9]:
## Load sentence set

# index of the dataset to use, taken from the listing printed above
dataset_name = datasets[5]

# paired and non-paired datasets are read from different root directories
data_root_key = 'data_pairs_path' if pairs == True else 'neuro_root'
sentences_dict = sep.load_set_of_sentences(dataset_name, file_paths_dict[data_root_key], pairs)
n = len(sentences_dict) # number of entries loaded
print('\nloaded',dataset_name,'with',n,'sentences')

# collect the sentences into a list of lists
if pairs == True:
    # pair data: one list holding column 0 of every entry, one holding column 1
    pair_table = np.array(list(sentences_dict.values()))
    sentences = [list(pair_table[:, column].flatten()) for column in (0, 1)]
else:
    # neuroimaging data / plain sentence list: a single inner list
    sentences = [list(sentences_dict.values())]


loaded STR_processed with 5500 sentences


### Compute and save AMR parsing

In [10]:
## Parse and save AMR parses using Spring parser
# SENTENCES MUST END WITH A FULL STOP!
#
# Outputs (written to the current working directory):
#   <dataset>_a_AMR_parse.txt / <dataset>_b_AMR_parse.txt  (pair data), or
#   <dataset>_AMR_parse.txt                                (single sentence list),
#   plus <dataset>_AMR_parse.json holding everything in AMR_graph_storage.

FAILED_PARSE = 'NULL' # placeholder stored in place of a graph when parsing fails


def _parse_or_placeholder(sentence_list):
    """Parse sentences via AMR_parse_sent_pair, substituting FAILED_PARSE on failure.

    Returns a list of graphs. If the parser raises, every sentence in the batch
    gets the placeholder; if an individual graph comes back as None it is
    replaced by the placeholder so later string handling cannot crash.
    """
    try:
        parses = list(AMR_parse_sent_pair(sentence_list))
    except Exception as err: # not a bare except: Ctrl-C must still stop a long run
        print('parsing failed:', repr(err))
        return [FAILED_PARSE] * len(sentence_list)
    return [FAILED_PARSE if parse is None else parse for parse in parses]


def _tag_with_id(graph, idx):
    """Rename the '::snt' metadata tag of a graph to '::snt-<idx>'."""
    return graph.replace('::snt', '::snt-'+str(idx))


def _write_parses(path, parses):
    """Write each parse to `path` (UTF-8), each followed by a blank line."""
    with open(path, "w", encoding='utf-8') as save_file:
        for parse in parses:
            save_file.write(parse)
            save_file.write('\n\n')


# Parse sentence pairs
if pairs==True:
    AMR_graph_storage = {}
    for pair_id in sentences_dict.keys():
        sent_1 = sentences_dict[pair_id][0]
        sent_2 = sentences_dict[pair_id][1]
        sent_parses = _parse_or_placeholder(sentences_dict[pair_id][0:2]) # parses for both sentences in pair
        AMR_graph_storage[pair_id] = [sent_1,sent_2,sent_parses[0],sent_parses[1]]
        if pair_id%20==0: # progress indicator
            print(pair_id)

    ## Reformat AMR parse dict for saving two sets of sentences separately
    AMR_parse_sent_1 = [_tag_with_id(entry[2], idx) for idx, entry in AMR_graph_storage.items()]
    AMR_parse_sent_2 = [_tag_with_id(entry[3], idx) for idx, entry in AMR_graph_storage.items()]

    # save first and second set of sentences
    _write_parses(dataset_name+"_a_AMR_parse.txt", AMR_parse_sent_1)
    _write_parses(dataset_name+"_b_AMR_parse.txt", AMR_parse_sent_2)

# Parse single list of sentences (neuro data)
elif pairs==False:
    AMR_graph_storage = {}
    for sent_id in sentences_dict.keys():
        sent = sentences_dict[sent_id]
        sent_parse = _parse_or_placeholder([sent]) # input needs to be a list
        AMR_graph_storage[sent_id] = [sent,sent_parse]
        if sent_id%20==0: # progress indicator
            print(sent_id)

    ## Reformat AMR parse dict into a flat list of graphs for saving
    AMR_parse_sent = [_tag_with_id(entry[1][0], idx) for idx, entry in AMR_graph_storage.items()]

    # save the single set of sentences
    _write_parses(dataset_name+"_AMR_parse.txt", AMR_parse_sent)

else:
    # fail fast instead of dumping a stale or undefined AMR_graph_storage below
    raise ValueError("pairs must be True or False")

# Save full set of parses of sentences to json file
with open(dataset_name+"_AMR_parse.json", "w", encoding='utf-8') as json_file:
    json.dump(AMR_graph_storage, json_file, indent=0)


20
40
60
80
100
120
140
160
180
200
220
240
260


ignoring epigraph data for duplicate triple: ('z2', ':location', 'z1')
ignoring epigraph data for duplicate triple: ('z2', ':location', 'z1')
ignoring secondary node contexts for 'z2'
ignoring secondary node contexts for 'z2'


280
300
320
340
360
380
400
420
440
460
480
500
520
540
560
580
600
620
640
660
680
700
720
740
760
780
800
820
840
860
880
900
920
940


ignoring epigraph data for duplicate triple: ('z1', ':ARG2', 'z6')
ignoring epigraph data for duplicate triple: ('z1', ':ARG2', 'z6')
ignoring secondary node contexts for 'z6'
ignoring secondary node contexts for 'z6'


960
980
1000
1020
1040


Missing concept: ( z1 / and :op1 ( z2 / order :mod ( z3 / sell-01 ) ) :op2 ( z4 / order :mod z3 ) :op3 ( z5 / refund :mod z3 ) :op4 ( z6 / refund :mod z3 ) :op5 ( z7 / order :mod z3 ) :op6 ( z8 / order :mod ( z9 / purchase-01 ) ) :op7 ( z10 / refund :mod z9 ) :op8 ( z11 / refund :mod z9 ) :op9 ( z12 / ) ) 


1060
1080
1100
1120
1140
1160
1180
1200
1220
1240
1260
1280
1300
1320
1340
1360
1380
1400
1420
1440
1460


ignoring epigraph data for duplicate triple: ('z14', ':mod', 'z11')
ignoring epigraph data for duplicate triple: ('z14', ':mod', 'z11')


1480
1500
1520
1540
1560
1580
1600
1620
1640
1660
1680
1700
1720
1740
1760
1780
1800
1820
1840
1860
1880
1900
1920
1940
1960
1980
2000
2020
2040
2060
2080
2100
2120
2140
2160
2180
2200
2220
2240
2260
2280
2300
2320
2340
2360
2380
2400
2420
2440
2460
2480
2500
2520
2540
2560
2580
2600
2620
2640
2660
2680
2700
2720
2740
2760
2780
2800
2820
2840
2860
2880
2900
2920
2940
2960
2980
3000
3020
3040
3060
3080
3100
3120
3140
3160
3180
3200
3220
3240
3260
3280
3300
3320
3340
3360
3380
3400
3420
3440
3460
3480
3500
3520
3540
3560
3580
3600
3620
3640
3660
3680
3700
3720
3740
3760
3780
3800
3820
3840
3860
3880
3900
3920
3940
3960
3980
4000
4020
4040
4060
4080
4100
4120
4140
4160
4180
4200
4220
4240
4260
4280
4300
4320
4340
4360
4380
4400
4420
4440
4460
4480
4500
4520


ignoring epigraph data for duplicate triple: ('z1', ':ARG6', '5223')
ignoring epigraph data for duplicate triple: ('z1', ':ARG6', '5223')


4540
4560
4580
4600
4620
4640
4660
4680
4700
4720
4740
4760
4780
4800
4820
4840
4860
4880
4900
4920
4940
4960
4980
5000
5020
5040
5060
5080
5100
5120
5140
5160
5180
5200
5220
5240
5260
5280
5300
5320
5340
5360
5380
5400
5420
5440
5460
5480
5500
