In [2]:
## This script computes VerbNet pairwise sentence similarities for Behavioural Data (paired)
# Requires that sentences have already been parsed using VerbNet semantic roles, as performed by 'Calc_parse_GPT4.ipynb'.
# Run using base python 3.9
# James Fodor 2023

# load libraries
import copy
import itertools
import json
import pickle
import random

import numpy as np
import seaborn as sns
from scipy.ndimage import convolve
from scipy.stats import spearmanr
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor

import sentence_embeds_processing as sep

# base paths for data files
# Backslashes are escaped explicitly: sequences such as "\S", "\W" or "\F" are not valid
# escapes and only worked by accident (newer Python versions warn about them).
# The resulting path strings are unchanged.
path_root = "D:\\Study and Projects\\School Work\\Year 25 - PhD\\Data\\"
data_pairs_path = path_root+'\\Sentence Similarity Data\\Sentence Similarities Final\\'
data_nonpaired_path = path_root+'\\Neuroimaging Data\\'
embeddings_path = "Analysis Results\\Sentence Embeddings\\"
parse_path = "Analysis Results\\Sentence Parses\\"
conceptnet_path = '\\Word Embeddings\\ConceptNet Embeddings\\numberbatch-en.txt'

# set fixed params
# relative weight given to each semantic role when role-wise similarities are averaged
role_weights_dict = {'Agent':2, 'Verb':3, 'Patient':2, 'Theme':2, 'Time':0.5, 'Manner':0.5, 'Location':0.5, 'Trajectory':0.5}
role_weights_list = np.array(list(role_weights_dict.values()))/11 # 11 is the sum of the role weights above
# NOTE(review): the role weights plus the extra weight of 3 sum to 14, not 12 - confirm the divisor is intended
role_weights_list_augment = np.array(list(role_weights_dict.values())+[3])/12

# numpy print options
np.set_printoptions(precision=2, threshold=2000, linewidth=200, suppress=True, floatmode='fixed')
sns.set()

# define stop words
stop_words = np.loadtxt(path_root+'\\Frames and Structured Data\\FrameNet\\nltk_data\\corpora\\stopwords\\english.txt', dtype='str') # list of stop words

### Step 0: Load sentence dataset and parse

Load the dataset of sentences we want to compute VerbNet similarities for.

In [11]:
## List the datasets registered in the sentence_embeds_processing module
# NOTE(review): later cells only build sim_storage when pairs is True
pairs = False # True: dataset of sentence pairs; False: plain list of sentences
datasets = sep.available_pair_datasets if pairs==True else sep.available_nonpaired_datasets
for dataset in datasets:
    print(dataset,datasets[dataset])

{0: 'GS2011_processed', 1: 'KS2013_processed', 2: 'Fodor_pilot_2022', 3: 'STS131_processed', 4: 'SICK_relatedness', 5: 'STSb_captions_test', 6: 'STSb_forums_test', 7: 'STSb_headlines_test', 8: 'STSb_test', 9: 'STS3k_all'}


In [None]:
## Load sentence set (choose number from those printed above)
dataset = datasets[5] # dataset name registered under key 5 of the table printed above
# sentences are loaded by the sep helper; which of the two data paths is read presumably depends on `pairs` - TODO confirm
sentences_dict = sep.load_set_of_sentences(dataset, data_pairs_path, data_nonpaired_path, pairs)
full_dataset_name = sep.fix_sentence_dataset_name(dataset)
# the pattern captures the leading run of non-underscore characters, i.e. presumably the name up to the first '_'
full_dataset_name = sep.extract_text_between('^([^_]+)',full_dataset_name)[0] # remove 'neuro' from name
n = len(sentences_dict.keys()) # number of entries in the loaded sentence set
print('\nloaded',dataset,'with',n,'sentences')

In [None]:
## load parse data for the dataset
# The parse file maps each pair id to a pair of role->phrase dictionaries (this is how the
# Step 3 cells index it).
parse_type = 'GPT4' # choose parse type ('verbnet' or 'GPT4')
json_path = path_root+parse_path+parse_type+' Parsing\\full_'+parse_type+'_pass_'+full_dataset_name+'_mod.json'
with open(json_path) as parse_file: # context manager closes the file handle once parsing is done
    sentence_pair_parsed_dict = json.load(parse_file) # load verbnet parse data from json
print('parse type:',parse_type)
print('sentences parsed:',n) # n is the size of the loaded sentence set, not a count of the parsed entries

### Step 1: Load experimental pairwise sentence similarities

Load the experimental (human) pairwise similarity judgements.

In [20]:
## Load experimental similarities from file

# Define key user-specified options
# NOTE(review): dataset_index_to_load and start_index are not read anywhere in the visible cells
dataset_index_to_load = 9 # choose experimental dataset to analyse
# overrides the parse_type set when the parses were loaded; later cells use it to build file names
parse_type = 'VerbNet' # choose parse type ('verbnet' or 'GPT4')
start_index = 1 # slightly different way of storing data

# get experimental similarities for the dataset
# NOTE(review): sim_storage is only created when pairs is True; later cells index it
# unconditionally, so they fail with a NameError when pairs is False.
if pairs==True:
    sim_storage = {}
    # element 2 of every sentences_dict value, converted to float
    # (presumably the human similarity rating - confirm against sep.load_set_of_sentences)
    sim_storage['expr'] = np.array(list(sentences_dict.values()))[:,2].astype(float)

loaded: STS3k_all
parse type: VerbNet


### Step 2: Load embeddings model

In [6]:
## Load all the needed model embeddings for rolewise sims

# Load ConceptNet embeddings
# conceptnet_embeds is later indexed by lower-cased word (see phrase_embed)
model_address = path_root+conceptnet_path
conceptnet_embeds = sep.import_word_model(model_address)

# Load SentBERT model
# install here: https://www.sbert.net/
# NOTE(review): sentbert_sim(), which the 'sentbert' and hybrid similarity types call,
# is not defined anywhere in the visible part of this notebook.
# from sentence_transformers import SentenceTransformer, util
# sentbert_model = SentenceTransformer(path_root+'\Sentence Embeddings\sentence-transformers-MiniLM-L6-v2')

# # Load AMRBART model
# NOTE(review): the commented-out code below uses AutoConfig and AutoTokenizer, which the
# commented-out import line does not bring in.
# from transformers import BartForConditionalGeneration
# config_state = AutoConfig.from_pretrained(path_root+'\Sentence Embeddings\\amrbart-large', output_hidden_states=True) # get hidden states
# amrbart_tokenizer = AutoTokenizer.from_pretrained(path_root+'\Sentence Embeddings\\amrbart-large', collapse_name_ops=False, use_pointer_tokens=True, raw_graph=False)
# amrbart_model = BartForConditionalGeneration.from_pretrained(path_root+'\Sentence Embeddings\\amrbart-large', config=config_state)

### Step 3: Compute rolewise similarities (only do once)

Compute the rolewise similarities of two sentences using the VerbNet semantic role parses. This is the key step of computation and is very slow - maybe an hour per 1000 pairs. Only do this once and save the results.

In [9]:
## Functions to compute VerbNet rolewise similarities 

# Define special dictionaries and lists of words
# Pairs of non-identical role tags that may be matched with each other when one sentence
# only has the first tag and the other sentence only has the second (see align_tags).
# Despite its name this is a list of tuples; the compute cell tries the pairs in the order listed.
tag_alignment_dict = [('Location','Trajectory'), ('Theme','Patient'), ('Theme','Manner'), ('Manner','Trajectory'), ('Patient','Location'), ('Patient','Manner')]

# Randomly split a list into n interleaved sublists of near-equal size
def partition_list(list_in, n):
    """Shuffle the items of ``list_in`` and deal them out into ``n`` sublists.

    The shuffle is performed on a copy, so the caller's list keeps its original order
    (the previous version shuffled the argument in place).

    Args:
        list_in: sequence of items to partition.
        n: number of sublists to produce.

    Returns:
        A list of ``n`` lists. Sublist ``i`` holds every n-th shuffled item starting at
        position ``i``, so sublist lengths differ by at most one.
    """
    shuffled = list(list_in) # copy so the caller's data is not reordered
    random.shuffle(shuffled)
    return [shuffled[i::n] for i in range(n)]


# Look up one semantic role in a parsed sentence, falling back to 'NONE'
def get_semant_ele(sentence,element):
    """Return ``sentence[element]``, or the string 'NONE' when that lookup raises KeyError."""
    try:
        role_value = sentence[element]
    except KeyError:
        role_value = 'NONE'
    return role_value
    

# Load word weights to adjust similarities for word frequency
def load_word_weights():
    """Read the vocabulary count file and turn the counts into per-word weights.

    Each line of the file is split on tabs; the first field is used as the word and the
    second is converted to an integer count. The weight of a word is
    ``-log2(count / total) / 10``, where ``total`` is the count stored for the '<s>' entry
    (taken to be the total number of sentences in the corpus).

    Returns:
        dict mapping each word in the file to its weight.
    """
    vocab_path = path_root+'\Corpus Data\Lambda Corpus\lambada-vocab.txt'

    # word -> raw count, in file order
    word_count_dict = {}
    with open(vocab_path, encoding='utf-8') as vocab_file:
        for raw_line in vocab_file:
            fields = raw_line.rstrip('\n').split('\t')
            word_count_dict[fields[0]] = int(fields[1])

    # rarer words get larger weights
    total = word_count_dict['<s>']
    return {word: -np.log2(count/total)/10 for word, count in word_count_dict.items()}


# Compute embedding for phrases from the verbnet parse using conceptnet
def phrase_embed(pair_id, phrase):
    """Embed a phrase as the mean of the ConceptNet vectors of its tokens.

    The phrase is tokenised with the sep helpers (using the module-level stop_words), each
    token is lower-cased and looked up in conceptnet_embeds. Tokens without an embedding
    are reported with a printed message and skipped.

    Args:
        pair_id: identifier of the sentence pair, used only in the "couldn't find" message.
        phrase: text of the phrase to embed.

    Returns:
        1-D array with the length of a ConceptNet vector: the mean over all found token
        vectors, the single vector if only one was found, or all zeros if none was found.
    """
    tokens = sep.tokenise_sentence(phrase, stop_words)
    tokens = sep.replace_tricky_tokens(tokens)
    embed_length = len(conceptnet_embeds['man']) # length of a single embedding vector

    # Collect the vectors of all tokens that have an embedding
    found_vectors = []
    for token in tokens:
        try:
            found_vectors.append(conceptnet_embeds[token.lower()])
        except KeyError:
            print(pair_id,'couldn\'t find',token)

    # Stack onto an empty float array so the result is always 2-D with embed_length columns
    stacked = np.vstack([np.empty((0,embed_length), float)]+found_vectors)
    n_rows = stacked.shape[0]
    if n_rows > 1:
        return np.mean(stacked, axis=0) # average over all saved embeddings
    elif n_rows == 1:
        return stacked[0] # use embedding of the single word
    else:
        return np.zeros(embed_length) # no word of the phrase was found


# Cache for the word-frequency weights, filled on first use
_word_weights_cache = None


# Return the word weights, reading the vocabulary file only on the first call
def _get_word_weights():
    global _word_weights_cache
    if _word_weights_cache is None:
        _word_weights_cache = load_word_weights()
    return _word_weights_cache


# Compute the weighted phrase similarity for matching pair of phrases
def calc_weighted_phrase_sim(pair_id, phrase_1, phrase_2, weighting):
    """Cosine similarity of two phrase embeddings, optionally scaled by word rarity.

    The word weights are loaded once and cached; previously the vocabulary file was
    re-read and re-parsed on every call, i.e. once per compared phrase pair.

    Args:
        pair_id: identifier of the sentence pair (passed on to phrase_embed for its messages).
        phrase_1: first phrase.
        phrase_2: second phrase.
        weighting: if True, multiply the cosine similarity by the larger of the two phrase
            weights, where a phrase weight is the maximum weight of its space-separated words.

    Returns:
        The (optionally weighted) similarity, or 0 if a KeyError is raised, which happens
        when a word of either phrase has no entry in the word weights.
    """
    phrase_embed_1 = phrase_embed(pair_id, phrase_1)
    phrase_embed_2 = phrase_embed(pair_id, phrase_2)
    try:
        word_weights = _get_word_weights()
        # NOTE(review): the weights are looked up even when weighting is False, so a word
        # missing from the vocabulary yields 0 in the unweighted version too - confirm intended
        phrase_1_weight = max(word_weights[word] for word in phrase_1.split(' '))
        phrase_2_weight = max(word_weights[word] for word in phrase_2.split(' '))
        sim = sep.cosine_sim(phrase_embed_1,phrase_embed_2) # compute phrase cosine similarity
        if weighting==True:
            return max(phrase_1_weight,phrase_2_weight)*sim # weight sim by phrase weights to account for word rarity
        else:
            return sim # version without word frequency weighting
    except KeyError:
        sim = 0 # return zero if can't find word weights
        return sim


# Compute the hybrid conceptnet-sentbert phrase similarity for matching pair of phrases
def calc_hybrid_phrase_sim(pair_id, phrase_1, phrase_2, weighting=True, cutoff=2):
    """Use SentBERT for long phrase pairs and weighted ConceptNet similarity otherwise.

    Fixes two defects of the previous version: the length test inspected phrase_1 twice
    instead of both phrases, and it called a bare tokenise_sentence() that is not defined
    in this notebook. Tokenisation now matches phrase_embed
    (sep.tokenise_sentence with the module-level stop_words).

    Args:
        pair_id: identifier of the sentence pair (passed to calc_weighted_phrase_sim).
        phrase_1: first phrase.
        phrase_2: second phrase.
        weighting: forwarded to calc_weighted_phrase_sim for short phrases.
        cutoff: a phrase is "long" when it has more than this many tokens.

    Returns:
        The first element of sentbert_sim([phrase_1, phrase_2]) when both phrases are
        long, otherwise the result of calc_weighted_phrase_sim.
    """
    # NOTE(review): sentbert_sim must be defined elsewhere before a hybrid sim type is used
    n_tokens_1 = len(sep.tokenise_sentence(phrase_1, stop_words))
    n_tokens_2 = len(sep.tokenise_sentence(phrase_2, stop_words))
    # in the case of long phrases, long being greater than cutoff
    if (n_tokens_1 > cutoff) and (n_tokens_2 > cutoff):
        final_sim = sentbert_sim([phrase_1, phrase_2])[0]
    # otherwise use weighted conceptnet embeds
    else:
        final_sim = calc_weighted_phrase_sim(pair_id, phrase_1, phrase_2, weighting)
    return final_sim


# Compute the similarity of two phrases with the requested similarity type
def _role_phrase_sim(pair_id, phrase_1, phrase_2, sim_type, cutoff):
    if sim_type=='sentbert':
        return float(sentbert_sim([phrase_1, phrase_2])[0])
    if sim_type=='conceptnet_wtd':
        return float(calc_weighted_phrase_sim(pair_id, phrase_1, phrase_2, weighting=True))
    if sim_type=='conceptnet':
        return float(calc_weighted_phrase_sim(pair_id, phrase_1, phrase_2, weighting=False))
    if sim_type=='hybrid_1' or sim_type=='hybrid_2':
        return float(calc_hybrid_phrase_sim(pair_id, phrase_1, phrase_2, weighting=True, cutoff=cutoff))
    raise ValueError('unknown sim_type: '+str(sim_type))


# Compute similarity of non-identical but matching tags, as determined by the tag_alignment_dict
def align_tags(sim_storage_dict, sentence_pair_parsed_dict, pair_id, tag_1, tag_2, sent_1_tags, sent_2_tags, sim_type, cutoff):
    """Match tag_1 in one sentence with tag_2 in the other and store their similarity.

    The alignment only happens when one sentence has tag_1 but not tag_2 and the other
    sentence has tag_2 but not tag_1. In that case the two phrases are compared, the result
    is stored under tag_1, and the sentence that held tag_2 has that role renamed to tag_1.
    Otherwise nothing is changed.

    Changes from the previous version: the two mirrored branches share one code path,
    an unknown sim_type raises ValueError before anything is modified (it used to surface
    as an UnboundLocalError), and a tag_2 entry missing from sim_storage_dict no longer
    raises KeyError.

    Args:
        sim_storage_dict: role -> similarity for this pair; modified in place.
        sentence_pair_parsed_dict: pair id -> pair of role->phrase dicts; the entry for
            pair_id is modified in place when an alignment is made.
        pair_id: key of the sentence pair.
        tag_1: role tag the aligned similarity is stored under.
        tag_2: role tag that is matched against tag_1 and then removed.
        sent_1_tags: list of roles present in sentence 1; tag_2 is removed from it when
            sentence 1 is the one being renamed.
        sent_2_tags: same for sentence 2.
        sim_type: 'sentbert', 'conceptnet', 'conceptnet_wtd', 'hybrid_1' or 'hybrid_2'.
        cutoff: token cutoff forwarded to the hybrid similarity.

    Returns:
        sim_storage_dict (the same object that was passed in).

    Raises:
        ValueError: if an alignment is needed and sim_type is not one of the known types.
    """
    if ((tag_1 in sent_1_tags) and (tag_1 not in sent_2_tags) and (tag_2 in sent_2_tags) and (tag_2 not in sent_1_tags)):
        # sentence 1 holds tag_1, sentence 2 holds tag_2 -> sentence 2 gets renamed
        renamed_side, renamed_tags = 1, sent_2_tags
        tag_in_sent_1, tag_in_sent_2 = tag_1, tag_2
    elif ((tag_1 in sent_2_tags) and (tag_1 not in sent_1_tags) and (tag_2 in sent_1_tags) and (tag_2 not in sent_2_tags)):
        # sentence 2 holds tag_1, sentence 1 holds tag_2 -> sentence 1 gets renamed
        renamed_side, renamed_tags = 0, sent_1_tags
        tag_in_sent_1, tag_in_sent_2 = tag_2, tag_1
    else:
        return(sim_storage_dict) # nothing to align for this tag pair

    # compute similarity of newly matched tags
    parsed_pair = sentence_pair_parsed_dict[pair_id]
    phrase_1 = parsed_pair[0][tag_in_sent_1].lower()
    phrase_2 = parsed_pair[1][tag_in_sent_2].lower()
    phrase_sim = _role_phrase_sim(pair_id, phrase_1, phrase_2, sim_type, cutoff)

    # update storage of tags
    sim_storage_dict[tag_1] = phrase_sim
    sim_storage_dict.pop(tag_2, None) # tag_2 may have no entry if it is not a key of sentence 1
    renamed_sentence = parsed_pair[renamed_side]
    renamed_sentence[tag_1] = renamed_sentence[tag_2] # update name
    renamed_sentence.pop(tag_2)
    # NOTE(review): tag_1 is not added to the tag list of the renamed sentence, so a later
    # tag pair starting with tag_1 can align (and overwrite) it again - confirm intended
    renamed_tags.remove(tag_2)

    return(sim_storage_dict)


# Compute similarity of non-identical but matching tags, as determined by the tag_alignment_dict
# (version that compares the stored role values directly with cosine similarity)
def align_tags_amrbart(sim_storage_dict, sentence_pair_parsed_dict, pair_id, tag_1, tag_2, sent_1_tags, sent_2_tags):
    """Match tag_1 in one sentence with tag_2 in the other and store their cosine similarity.

    The alignment only happens when one sentence has tag_1 but not tag_2 and the other
    sentence has tag_2 but not tag_1. The role values are passed straight to
    sep.cosine_sim (presumably they are embedding vectors here - confirm against the caller).
    The result is stored under tag_1 and the sentence that held tag_2 has that role
    renamed to tag_1. Otherwise nothing is changed.

    Changes from the previous version: the two mirrored branches share one code path and
    a tag_2 entry missing from sim_storage_dict no longer raises KeyError.

    Args:
        sim_storage_dict: role -> similarity for this pair; modified in place.
        sentence_pair_parsed_dict: pair id -> pair of role->value dicts; the entry for
            pair_id is modified in place when an alignment is made.
        pair_id: key of the sentence pair.
        tag_1: role tag the aligned similarity is stored under.
        tag_2: role tag that is matched against tag_1 and then removed.
        sent_1_tags: list of roles present in sentence 1; tag_2 is removed from it when
            sentence 1 is the one being renamed.
        sent_2_tags: same for sentence 2.

    Returns:
        sim_storage_dict (the same object that was passed in).
    """
    if ((tag_1 in sent_1_tags) and (tag_1 not in sent_2_tags) and (tag_2 in sent_2_tags) and (tag_2 not in sent_1_tags)):
        # sentence 1 holds tag_1, sentence 2 holds tag_2 -> sentence 2 gets renamed
        renamed_side, renamed_tags = 1, sent_2_tags
        tag_in_sent_1, tag_in_sent_2 = tag_1, tag_2
    elif ((tag_1 in sent_2_tags) and (tag_1 not in sent_1_tags) and (tag_2 in sent_1_tags) and (tag_2 not in sent_2_tags)):
        # sentence 2 holds tag_1, sentence 1 holds tag_2 -> sentence 1 gets renamed
        renamed_side, renamed_tags = 0, sent_1_tags
        tag_in_sent_1, tag_in_sent_2 = tag_2, tag_1
    else:
        return(sim_storage_dict) # nothing to align for this tag pair

    # compute similarity of newly matched tags
    parsed_pair = sentence_pair_parsed_dict[pair_id]
    sim = sep.cosine_sim(parsed_pair[0][tag_in_sent_1], parsed_pair[1][tag_in_sent_2]) # plain (unweighted) cosine similarity

    # update storage of tags
    sim_storage_dict[tag_1] = sim
    sim_storage_dict.pop(tag_2, None) # tag_2 may have no entry in the storage
    renamed_sentence = parsed_pair[renamed_side]
    renamed_sentence[tag_1] = renamed_sentence[tag_2] # update name
    renamed_sentence.pop(tag_2)
    renamed_tags.remove(tag_2)

    return(sim_storage_dict)


# Count, over the whole dictionary, how many sentences list each role
def get_role_counts(sentence_pair_parsed_dict, sent_id):
    """Tally the role keys of one sentence position across all parsed pairs.

    Args:
        sentence_pair_parsed_dict: pair id -> sequence of role->phrase dicts.
        sent_id: index of the sentence within each pair to inspect.

    Returns:
        dict mapping each role key to the number of pairs whose selected sentence has
        that key (keys are counted whatever their value is).
    """
    role_counts = {}
    for parsed_pair in sentence_pair_parsed_dict.values():
        for role in parsed_pair[sent_id]:
            role_counts[role] = role_counts.get(role, 0) + 1
    return role_counts

# Convert role-wise dictionary sim storage into a list ordered like role_weights_dict
def dict_sim_to_list(role_weights_dict,sim_storage_dict):
    """Return the stored similarities in the role order of ``role_weights_dict``.

    Args:
        role_weights_dict: dict whose keys define which roles are output and in what order.
        sim_storage_dict: role -> similarity for one sentence pair.

    Returns:
        list with one entry per role of ``role_weights_dict``; roles missing from
        ``sim_storage_dict`` get 0, and roles only present in ``sim_storage_dict`` are dropped.
    """
    ordered_sims = []
    for role_name in role_weights_dict:
        try:
            ordered_sims.append(sim_storage_dict[role_name])
        except KeyError:
            ordered_sims.append(0) # role has no stored similarity
    return ordered_sims

In [7]:
## Compute role-wise sentence similarities using VerbNet parses
word_weights = load_word_weights() # these are weights for weighted version of conceptnet sim

# initialise storage dictionaries
sim_storage_dict = {} # sim_type -> array with one overall similarity per sentence pair
all_role_sim_storage_dict = {} # sim_type -> pair id -> role -> similarity
list_sim_funcs = ['conceptnet']
for sim_type in list_sim_funcs:
    sim_storage_dict[sim_type] = np.array([]) 

# compute all types of similarities over all sentence pairs
for sim_type in list_sim_funcs:
    # set the cutoff for hybrid sim functions from the suffix of the name ('hybrid_1' -> 1, 'hybrid_2' -> 2)
    # (previously this value was computed but a cutoff of 1 was hard-coded in the calls below)
    if sim_type=='hybrid_1' or sim_type=='hybrid_2':
        hybrid_cutoff = int(sim_type[-1])
    else:
        hybrid_cutoff = 1 # not used by the non-hybrid sim types
        
    # role sims storage dict
    role_sim_storage = {}
    
    # use a fresh copy of the parsed sentences for each sim_type; a deep copy is needed
    # because align_tags renames roles inside the per-sentence dictionaries, and a shallow
    # .copy() would let those edits leak into the loaded parses (and into re-runs of this cell)
    sentence_pair_parsed_dict_mod = copy.deepcopy(sentence_pair_parsed_dict)
    for pair_id in sentence_pair_parsed_dict_mod.keys():
        
        # Get the sentence pair to compute similarity for
        sentence_pair = sentence_pair_parsed_dict_mod[pair_id]
        sent_1_parsed = sentence_pair[0]
        sent_2_parsed = sentence_pair[1]
        
        # Define data storage structures
        sent_1_tags = [role for role in sent_1_parsed.keys() if sent_1_parsed[role] != 'NONE'] # ignore roles with NONE value
        sent_2_tags = [role for role in sent_2_parsed.keys() if sent_2_parsed[role] != 'NONE']
        all_sent_tags = set(list(sent_1_tags)+list(sent_2_tags))
        role_sim_storage[pair_id] = {}
        
        # Loop over role tags and store similarities for paired phrases
        # NOTE(review): only the keys of sentence 1 are visited, so a role that is a key of
        # sentence 2 only gets no NaN entry and is later scored as 0 by dict_sim_to_list
        # instead of being masked out - confirm whether all_sent_tags was meant to be used here
        for role_tag in sent_1_parsed.keys():
            if (role_tag in sent_1_tags) and (role_tag in sent_2_tags): # only if both sentences have the same tag
                
                # get phrases for given role tag
                phrase_1 = sent_1_parsed[role_tag].lower()
                phrase_2 = sent_2_parsed[role_tag].lower()
                
                # compute different types of similarities for phrase pairs of given role tag
                if sim_type=='sentbert':
                    role_sim_storage[pair_id][role_tag] = float(sentbert_sim([phrase_1, phrase_2])[0]) # conversion to float needed for saving to .json, as it doesn't like numpy format
                elif sim_type=='conceptnet':
                    role_sim_storage[pair_id][role_tag] = float(calc_weighted_phrase_sim(pair_id, phrase_1, phrase_2, weighting=False))
                elif sim_type=='conceptnet_wtd':
                    role_sim_storage[pair_id][role_tag] = float(calc_weighted_phrase_sim(pair_id, phrase_1, phrase_2, weighting=True))
                elif sim_type=='hybrid_1' or sim_type=='hybrid_2':
                    role_sim_storage[pair_id][role_tag] = float(calc_hybrid_phrase_sim(pair_id, phrase_1, phrase_2, weighting=True, cutoff=hybrid_cutoff))
                
            else:
                role_sim_storage[pair_id][role_tag] = np.nan # if the tag doesn't have a match (np.NaN alias was removed in NumPy 2.0)

        # Make adjustments for non-identical tags needing to be compared
        for tag_pair in tag_alignment_dict: # loop over list of tag pairs to combine
            role_sim_storage[pair_id] = align_tags(role_sim_storage[pair_id], sentence_pair_parsed_dict_mod, pair_id, tag_pair[0], tag_pair[1], sent_1_tags, sent_2_tags, sim_type, cutoff=hybrid_cutoff)
    
        # Convert dictionary storage of rolewise sims into list
        sim_storage_list = dict_sim_to_list(role_weights_dict,role_sim_storage[pair_id])
        
        # Compute overall similarity for given sentence: weighted average over roles, ignoring NaN entries
        try:
            masked_sim_list = np.ma.MaskedArray(sim_storage_list, mask=np.isnan(sim_storage_list)) # get masked array to remove nans
            # print(pair_id, masked_sim_list)
            overall_verbnet_sim = np.ma.average(masked_sim_list, weights=role_weights_list)
        except ZeroDivisionError:
            overall_verbnet_sim = 0
            
        # Store VerbNet similarity for given sentence
        sim_storage_dict[sim_type] = np.append(sim_storage_dict[sim_type],overall_verbnet_sim)
    
    # Store role sims dictionary
    all_role_sim_storage_dict[sim_type] = role_sim_storage

82 couldn't find seventhlargest
101 couldn't find vehicle_related
283 couldn't find two_bladed
464 couldn't find seventhlargest
627 couldn't find redbrown
630 couldn't find felttipped
665 couldn't find two_bladed
845 couldn't find seventhlargest
864 couldn't find vehicle_related
978 couldn't find highpitched
1008 couldn't find redbrown
1011 couldn't find felttipped
1046 couldn't find two_bladed
1225 couldn't find seventhlargest
1388 couldn't find redbrown
1391 couldn't find felttipped
1426 couldn't find two_bladed
1551 couldn't find airbeds
1604 couldn't find seventhlargest
1623 couldn't find vehicle_related
1770 couldn't find felttipped
1805 couldn't find two_bladed
1982 couldn't find seventhlargest
2145 couldn't find redbrown
2148 couldn't find felttipped
2183 couldn't find two_bladed
2359 couldn't find seventhlargest
2378 couldn't find vehicle_related
2560 couldn't find two_bladed
2735 couldn't find seventhlargest
2898 couldn't find redbrown
2901 couldn't find felttipped
2936 couldn

In [8]:
## Save role-wise sims to json file
# NOTE(review): the file is written to the current working directory, whereas the loading
# cell in Step 4 reads the same file name from path_root+parse_path - confirm the file is
# moved there before loading.
# Unmatched roles hold NaN, which json.dump writes as the (non-standard JSON) token NaN.
with open("full_"+parse_type+"_pass_"+full_dataset_name+"_rolewise_similarities.json", "w") as file:
    json.dump(all_role_sim_storage_dict, file, indent=3)

### Step 4: Load Rolewise Similarities and Sim data

Load VerbNet rolewise similarities as computed in step 3.

In [170]:
## Load datasets and similarity functions for training VerbNet

# Load role-wise similarities from json (structure: sim_type -> pair id -> role -> similarity)
rolewise_sims_path = path_root+parse_path+"full_"+parse_type+"_pass_"+full_dataset_name+"_rolewise_similarities.json"
with open(rolewise_sims_path, "r") as file:
    all_role_sim_storage_dict_new = json.load(file)
list_of_sem_roles = list(all_role_sim_storage_dict_new['conceptnet']['1'].keys())

# Compositional functions to examine
sims_path = '\\Analysis Results\\Sentence Similarities\\' # backslashes escaped explicitly; same string as before
sim_funcs = ['count','mean','mult','conv','ernie_0','ernie_5_norml','ernie_12','ernie_12_norml','infersent',
             'universal','universal_norml','sentbert','sentbert_mpnet_norml','sentbert_norml',
             'dictBERT_norml','openai','openai_norml','defsent_mean','defsent_mean_norml','defsent_cls','defsent_cls_norml',
             'amrbart','amrbart_norml','AMR','smatch','AMR_WLK_Wasser','S3BERT','S3BERT_norml']

# Load similarity data for all comp functions (one float per line); missing files are reported and skipped
# NOTE(review): sim_storage must already exist here; it is only created when pairs is True
for comp_func in sim_funcs:
    filename = path_root+sims_path+full_dataset_name+'_'+comp_func+'_similarities.txt'
    try:
        with open(filename, encoding='utf-8') as file:
            sim_storage[comp_func] = np.array([float(line.rstrip('\n')) for line in file])
    except FileNotFoundError:
        print(filename,'not found')
        continue

# Load full expr STS3k dataset to get dev and train split (for STS3k only)
all_expr_data = np.loadtxt(path_root+'\\Sentence Similarity Data\\Fodor2023 - STS3k Large Dataset\\3 - Experimental data\\STS3k_all_full_expr_501.txt',  delimiter=';', dtype='str')
col_dict = dict(zip(all_expr_data[0], np.arange(len(all_expr_data[0])))) # header name -> column index
# one index per sentence pair; the pair count is taken from the role-wise similarities
# (the parse dictionary is keyed by pair id and has no 'conceptnet' entry, so indexing it
# with 'conceptnet' as before raises a KeyError)
all_indices = list(np.arange(0,len(all_role_sim_storage_dict_new['conceptnet'].keys())))

# A few last things
n_expr = sim_storage['expr'].shape[0]
print(sim_storage.keys()) # key storage dict

D:\Study and Projects\School Work\Year 25 - PhD\Data\\Analysis Results\Sentence Similarities\STS3k_all_count_similarities.txt not found
dict_keys(['expr', 'mean', 'mult', 'conv', 'ernie_0', 'ernie_5_norml', 'ernie_12', 'ernie_12_norml', 'infersent', 'universal', 'universal_norml', 'sentbert', 'sentbert_mpnet_norml', 'sentbert_norml', 'dictBERT_norml', 'openai', 'openai_norml', 'defsent_mean', 'defsent_mean_norml', 'defsent_cls', 'defsent_cls_norml', 'amrbart', 'amrbart_norml', 'AMR', 'smatch', 'AMR_WLK_Wasser', 'S3BERT', 'S3BERT_norml'])


In [173]:
## Define sets of sentence types and modifiers for STS3k
# All index lists below refer to rows of all_expr_data with the header row removed.

# Data-row indices whose entry in the named column equals `value`
def _rows_equal(column, value):
    return list(np.where(all_expr_data[1:,col_dict[column]] == value)[0])

# Data-row indices whose entry in the named column differs from `value`
def _rows_differ(column, value):
    return list(np.where(all_expr_data[1:,col_dict[column]] != value)[0])

sent_types = ['CONST_VERB','CONST_SUBJ','CONST_DOBJ','AB_VERB','AB_SUBJ','AB_DOBJ','ADJECTIVE','OTHER','ZERO','SWAP_DOBJ_VERB','SWAP_SUBJ_DOBJ','SWAP_SUBJ_VERB','TRIPLE','QUAD','PARAPHRASE','NEGATIVE','ONLY_MOD']
sent_types_indices = [_rows_equal('Sent Type', sent_type) for sent_type in sent_types]
sent_types_dict = dict(zip(sent_types, sent_types_indices))
sent_types_dict['ONLY_MOD'] = sent_types_dict['ONLY_MOD']+[5,6,7,8,9] # add on 'CHECK' data with identical sentences

# Modifier levels: level k = 'Mod k' is filled in and 'Mod k+1' is empty (level 0 = no modifiers)
modifier_lvl_dict = {0: _rows_equal('Mod 1', ''),
                     1: np.intersect1d(_rows_differ('Mod 1', ''), _rows_equal('Mod 2', '')),
                     2: np.intersect1d(_rows_differ('Mod 2', ''), _rows_equal('Mod 3', '')),
                     3: np.intersect1d(_rows_differ('Mod 3', ''), _rows_equal('Mod 4', '')),
                     4: np.intersect1d(_rows_differ('Mod 4', ''), _rows_equal('Mod 5', '')),
                     5: _rows_differ('Mod 5', '')}
average_score_by_type = np.zeros((len(sent_types),6))

# Sentence types (para, swaps, triples, quads)
para_indices = _rows_equal('Sent Type', 'PARAPHRASE')
ab_verb_indices = _rows_equal('Sent Type', 'AB_VERB')
ab_subj_indices = _rows_equal('Sent Type', 'AB_SUBJ')
ab_dobj_indices = _rows_equal('Sent Type', 'AB_DOBJ')
single_indices = ab_verb_indices+ab_subj_indices+ab_dobj_indices
const_verb_indices = _rows_equal('Sent Type', 'CONST_VERB')
const_subj_indices = _rows_equal('Sent Type', 'CONST_SUBJ')
const_dobj_indices = _rows_equal('Sent Type', 'CONST_DOBJ')
swap_indices = (_rows_equal('Sent Type', 'SWAP_DOBJ_VERB')
                + _rows_equal('Sent Type', 'SWAP_SUBJ_DOBJ')
                + _rows_equal('Sent Type', 'SWAP_SUBJ_VERB'))
triple_indices = _rows_equal('Sent Type', 'TRIPLE')
quad_indices = _rows_equal('Sent Type', 'QUAD')
neg_indices = _rows_equal('Sent Type', 'NEGATIVE')
zero_indices = _rows_equal('Sent Type', 'ZERO')
other_indices = _rows_equal('Sent Type', 'OTHER')
onlymod_indices = _rows_equal('Sent Type', 'ONLY_MOD') # without the extra 'CHECK' rows added to sent_types_dict
onlymod_1_indices = [x for x in modifier_lvl_dict[1] if x in onlymod_indices]
onlymod_2_indices = [x for x in modifier_lvl_dict[2] if x in onlymod_indices]
onlymod_3_indices = [x for x in modifier_lvl_dict[3] if x in onlymod_indices]
onlymod_4_indices = [x for x in modifier_lvl_dict[4] if x in onlymod_indices]
onlymod_6_indices = [x for x in modifier_lvl_dict[5] if x in onlymod_indices] # name says 6 but this is built from modifier level 5
adjective_indices = _rows_equal('Sent Type', 'ADJECTIVE')
# "simple" = pairs of the given type that have no modifiers
simple_quads = np.intersect1d(_rows_equal('Mod 1', ''), quad_indices)
simple_triples = np.intersect1d(_rows_equal('Mod 1', ''), triple_indices)
simple_swaps = np.intersect1d(_rows_equal('Mod 1', ''), swap_indices)

# Types of modifiers (time, manner, place), selected on the first modifier column
place_indices = _rows_equal('Mod 1', 'PLACE')
time_indices = _rows_equal('Mod 1', 'TIME')
manner_indices = _rows_equal('Mod 1', 'MANNER')
addiobj_indices = _rows_equal('Mod 1', 'add IOBJ')
subja_indices = _rows_equal('Mod 1', 'SUBJ adj')
dobja_indices = _rows_equal('Mod 1', 'DOBJ adj')
iobja_indices = _rows_equal('Mod 1', 'IOBJ adj')
passive_indices = _rows_equal('Mod 1', 'PASSIVE')
only_mods_lvl_1 = np.intersect1d(modifier_lvl_dict[1], sent_types_dict['ONLY_MOD']) # mod_only sentence pairs

# Word sets (test, train, dev, adversarial)
set_test = _rows_equal('Set', 'TEST')
set_dev = _rows_equal('Set', 'DEV')
set_train = _rows_equal('Set', 'TRAIN')
all_adversarial = _rows_equal('Adversarial', 'ADV')
non_adversarial = _rows_equal('Adversarial', 'NADV')
all_adversarial_no_neg = swap_indices+triple_indices+quad_indices+para_indices

# Revised adversarial sets: built from sentence types instead of the 'Adversarial' column
revised_adversarial = swap_indices+triple_indices+quad_indices+onlymod_indices+para_indices+neg_indices
revised_adversarial_no_neg = swap_indices+triple_indices+quad_indices+onlymod_indices+para_indices
revised_adversarial_no_neg_no_para = swap_indices+triple_indices+quad_indices+onlymod_indices
revised_non_adversarial = [x for x in all_indices if x not in revised_adversarial]

### Step 5: Evaluate VerbNet Similarities (PCA analysis)

This analysis of the VerbNet similarities is described in the paper.

In [132]:
## Function to train a model to predict sentence similarities
def train_model(sim_storage, rolewise_sims, embed_model, comb_alg, train_indices, fixed_params, data_type, save_model=False, pca_comps=8, full_sent_augment=True, start_indx=1):
    """Fit a model that predicts the experimental similarities in sim_storage['expr'].

    Besides its arguments, this function reads the notebook-level names
    `role_weights_dict`, `dict_sim_to_list`, `sep`, `path_root`,
    `embeddings_path` and `dataset_name`, so those must exist in the kernel.

    Parameters
    ----------
    sim_storage : mapping
        Must contain 'expr' (the regression targets) and, when
        full_sent_augment is true, an entry keyed by `embed_model` that is
        appended as extra feature column(s).
    rolewise_sims : mapping
        rolewise_sims['conceptnet'][str(pair_id)] is passed to
        dict_sim_to_list to build one feature row per pair.
    embed_model : str
        Name of the full-sentence similarity / embedding model. A trailing
        '_norml' is stripped before loading embeddings from disk.
    comb_alg : str
        'reg' (linear regression), 'none' (linear model whose coefficients
        are replaced by `fixed_params`) or 'nn1'..'nn5' (MLP regressors).
    train_indices : sequence of int
        Row positions used for fitting.
    fixed_params : array-like
        Coefficients used when comb_alg == 'none'; ignored otherwise.
    data_type : str
        'verbnet', 'embeddings' or 'verbnet_&_embeds'.
    save_model : bool
        If true, pickle the fitted model into the working directory.
    pca_comps : int
        Requested number of PCA components for the embeddings; clamped to
        what the embedding matrix can support.
    full_sent_augment : bool
        Whether to append sim_storage[embed_model] to the features.
    start_indx : int
        First pair id used as key into rolewise_sims (0 or 1 depending on dataset).

    Returns
    -------
    trained_model, xs_final_nonan, params, all_para_count
        The fitted estimator, the feature matrix for ALL rows (not only the
        training rows) with NaNs replaced by 0, the coefficient array or
        list of weight matrices, and the number of entries in `params`
        (0 for 'none'; for MLPs this counts `coefs_` only).

    Raises
    ------
    ValueError
        For an unknown data_type or comb_alg, or when fixed_params does not
        match the number of feature columns for comb_alg == 'none'.
    """
    # Hidden-layer layouts for the neural-network options
    nn_layer_sizes = {
        'nn1': (10,),
        'nn2': (60,10),
        'nn3': (100,100,10),
        'nn4': (1000,100,100,10),
        'nn5': (100,100,100,100,10),
    }

    # Validate the options first so a typo fails immediately instead of after feature building
    uses_verbnet = data_type in ('verbnet', 'verbnet_&_embeds')
    uses_embeds = data_type in ('embeddings', 'verbnet_&_embeds')
    if not (uses_verbnet or uses_embeds):
        raise ValueError("data_type must be 'verbnet', 'embeddings' or 'verbnet_&_embeds', got {!r}".format(data_type))
    if comb_alg not in ('reg', 'none') and comb_alg not in nn_layer_sizes:
        raise ValueError("comb_alg must be 'reg', 'none' or one of {}, got {!r}".format(sorted(nn_layer_sizes), comb_alg))

    # Basic regression variables
    ys = sim_storage['expr']
    n_expr = ys.shape[0] # number of experimental observations in dataset

    # Get xs data for Verbnet roles (one row per pair id)
    if uses_verbnet:
        xs_verbnet = np.array([
            dict_sim_to_list(role_weights_dict, rolewise_sims['conceptnet'][str(pair_id)])
            for pair_id in np.arange(start_indx, n_expr+start_indx) # start at 0 or 1 depending on dataset
        ])

    # Get xs data for embeddings PCA components
    if uses_embeds:
        # remove 'norml' portion for these models as embeddings aren't saved as _norml
        embed_model_mod = embed_model[0:-6] if embed_model.split('_')[-1] == 'norml' else embed_model
        all_sentence_embeds = np.array(sep.load_embeds(path_root+embeddings_path, dataset_name, embed_model_mod))
        # PCA cannot return more components than min(n_samples, n_features)
        pca_comps = min(pca_comps, all_sentence_embeds.shape[0], all_sentence_embeds.shape[1])
        # NOTE: the PCA is fitted on every row, not only on train_indices
        # NOTE(review): assumes load_embeds returns one row per observation in ys -- confirm
        xs_embeds = PCA(n_components=pca_comps).fit_transform(all_sentence_embeds)

    # Assemble feature columns. Column order is kept exactly as before:
    #   embeddings: [pca, full-sentence sim]; verbnet: [roles, full-sentence sim];
    #   verbnet_&_embeds: [roles, full-sentence sim, pca]
    feature_blocks = [xs_verbnet] if uses_verbnet else [xs_embeds]
    if full_sent_augment:
        feature_blocks.append(sim_storage[embed_model])
    if uses_verbnet and uses_embeds:
        feature_blocks.append(xs_embeds)
    xs_final = np.column_stack(feature_blocks)

    # Deal with NaNs
    xs_final_nonan = np.nan_to_num(xs_final, copy=True, nan=0.0, posinf=None, neginf=None)
    train_xs = xs_final_nonan[train_indices]
    train_ys = ys[train_indices]

    # Choose between Linear regression or Neural network
    if comb_alg in nn_layer_sizes:
        trained_model = MLPRegressor(hidden_layer_sizes=nn_layer_sizes[comb_alg], random_state=1, max_iter=500).fit(train_xs, train_ys)
        params = trained_model.coefs_
    else:
        trained_model = LinearRegression().fit(train_xs, train_ys)
        if comb_alg=='none':
            fixed_coefs = np.asarray(fixed_params)
            if fixed_coefs.shape[-1:] != (train_xs.shape[1],):
                raise ValueError("fixed_params has shape {} but the feature matrix has {} columns".format(fixed_coefs.shape, train_xs.shape[1]))
            # Use pre-defined parameters except for intercept: the intercept kept here is the one
            # fitted together with the unconstrained coefficients; it is not re-estimated.
            trained_model.coef_ = fixed_coefs
        params = trained_model.coef_

    # Save model to file
    if save_model:
        saved_model_name = 'verbnet_conceptnet_'+embed_model+'_'+comb_alg+'_model.pkl'
        with open(saved_model_name, 'wb') as file:
            pickle.dump(trained_model, file)

    # Get parameter count (fixed weights count as zero trained parameters)
    if comb_alg=='none':
        all_para_count = 0
    else:
        all_para_count = sum(np.size(layer) for layer in params)

    return trained_model, xs_final_nonan, params, all_para_count

In [187]:
## Multiple models: Train multiple models using adversarial or test/train split

# Which rows to fit on and which to score on.
# For the adversarial split use these two lines instead:
# train_indices = revised_non_adversarial
# test_indices = revised_adversarial_no_neg
train_indices, test_indices = set_train, set_test

# Models to investigate
datatype = 'verbnet' # feature source handed to train_model: 'verbnet', 'embeddings' or 'verbnet_&_embeds'
embed_model_list = [ # embedding models to examine
    'mean',
    'sentbert_mpnet_norml',
    'openai_norml',
    'defsent_cls_norml',
    'ernie_12_norml',
]
comb_algorithms = ['none', 'reg', 'nn1', 'nn2', 'nn3','nn4'] # how the features are combined into a prediction

# Training parameters (Note: this leads to some variation run-to-run)
num_pca = 8 # number of PCA components for embeddings models (irrelevant for 'verbnet')
augment = True # append the full-sentence similarity to the verbnet rolewise sims?

# Fit every (embedding model, combination algorithm) pair and report its test correlation
print(datatype)
for embed_model in embed_model_list:
    print('', embed_model, sep='\n')
    if datatype=='embeddings' and embed_model=='none': # this combination doesn't make sense so ignore
        continue

    # The fixed weights need one extra entry when the augmentation column is present
    fixed_params = role_weights_list_augment if augment==True else role_weights_list

    for comb_alg in comb_algorithms:

        # Fit on the training rows, then predict for every row
        trained_model, final_xs, params, param_count = train_model(
            sim_storage, all_role_sim_storage_dict_new, embed_model, comb_alg, train_indices, fixed_params,
            data_type=datatype, pca_comps=num_pca, full_sent_augment=augment, start_indx=start_index)
        test_preds = trained_model.predict(final_xs)

        # Write the predictions for all rows to a text file
        save_file_name = '_'.join([dataset_name, datatype, str(num_pca), embed_model, comb_alg, 'adversarial_similarities.txt'])
        np.savetxt(save_file_name, test_preds, fmt='%f')

        # Report rank correlation on the test rows together with the model size
        correlation = spearmanr(sim_storage['expr'][test_indices], test_preds[test_indices])[0]
        print(comb_alg, f'{correlation:.3f}', param_count)

verbnet

mean
none 0.682 0
reg 0.719 9
nn1 0.688 100
nn2 0.713 1150
nn3 0.787 11910
nn4 0.789 120010

sentbert_mpnet_norml
none 0.726 0
reg 0.778 9
nn1 0.756 100
nn2 0.780 1150
nn3 0.851 11910
nn4 0.851 120010

openai_norml
none 0.721 0
reg 0.774 9
nn1 0.755 100
nn2 0.772 1150
nn3 0.858 11910
nn4 0.847 120010

defsent_cls_norml
none 0.728 0
reg 0.787 9
nn1 0.780 100
nn2 0.792 1150
nn3 0.833 11910
nn4 0.854 120010

ernie_12_norml
none 0.704 0
reg 0.749 9
nn1 0.733 100
nn2 0.740 1150
nn3 0.818 11910
nn4 0.820 120010


### Code Archive (Feel free to ignore!)

In [None]:
## [DEPRECATED] Multiple models: Train set of models using 10-fold cross-validation

# Define models to investigate
datatype = 'embeddings' # the type of data to pass to the classifier; can be 'verbnet', 'embeddings', 'verbnet_&_embeds'
embed_model_list = ['none', 'mean', 'sentbert_mpnet_norml', 'openai_norml', 'defsent_cls_norml', 'ernie_12_norml']
comb_algorithms = ['reg', 'nn1', 'nn2', 'nn3']

# Define parameters for training
num_pca = 3000 # PCA to extract for embeddings models (irrelevant for 'verbnet')
n_fold = 10 # number of test/train folds
n_expr = sim_storage['expr'].shape[0] # number of experimental observations; defined here so the cell does not rely on leftover kernel state
# NOTE: `augment`, `start_index`, `all_indices` and `partition_list` are not defined in this cell and must already exist in the kernel.

# Loop to compute similarity predictions
print(datatype, '{} fold cross-validation'.format(n_fold))
for embed_model in embed_model_list: # sentence similarity embeddings to use
    if datatype=='embeddings' and embed_model=='none': # this combination doesn't make sense so ignore
        continue

    # Define parameters for training
    if augment==True: # determine which set of fixed params to use depending on whether augment is true or false
        fixed_params = role_weights_list_augment
    else:
        fixed_params = role_weights_list

    for comb_alg in comb_algorithms:  # combination method, either nn for neural network or reg for simple regression

        # Key storage array: one column of out-of-fold predictions per repetition
        full_storage_by_subset = np.empty((n_expr,0), float)

        # Perform n-fold regression multiple times (currently a single repetition)
        for iteration in np.arange(0,1):

            # Define storage vectors and lists
            indices = np.arange(0,n_expr)
            indices_subsets = partition_list(indices,n_fold)
            preds_by_subset = []
            indices_by_subset = []

            # Fold-specific names are used so the notebook-level train_indices/test_indices are not overwritten
            for fold_test_indices in indices_subsets:
                # Get indices to train network with (everything not in the held-out fold)
                fold_train_indices = np.setdiff1d(indices, fold_test_indices)

                # Train model and compute predictions
                trained_model, final_xs, params, param_count = train_model(sim_storage, all_role_sim_storage_dict_new, embed_model, comb_alg, fold_train_indices, fixed_params,
                                                               data_type=datatype, pca_comps=num_pca, full_sent_augment=augment, start_indx=start_index)
                test_preds = trained_model.predict(final_xs[fold_test_indices])

                # Store results by subset
                preds_by_subset.append(test_preds)
                indices_by_subset.append(fold_test_indices)

            # Flatten with concatenate (works when folds differ in size) and put predictions back in index order
            indices_by_subset_flat = np.concatenate([np.asarray(fold).ravel() for fold in indices_by_subset])
            preds_by_subset_flat = np.concatenate([np.asarray(fold).ravel() for fold in preds_by_subset])
            fold_order = np.argsort(indices_by_subset_flat)
            full_storage_by_subset = np.column_stack((full_storage_by_subset, preds_by_subset_flat[fold_order]))

        # Save average verbnet similarities to file
        save_file_name = dataset_name+'_'+datatype+'_'+str(num_pca)+'_'+embed_model+'_'+comb_alg+'_'+str(n_fold)+'fold_similarities.txt'
        np.savetxt(save_file_name, full_storage_by_subset.mean(axis=1), fmt='%f')

        # Compute and print correlation and network size (param_count comes from the last fold's model)
        correlation = spearmanr(sim_storage['expr'][all_indices], full_storage_by_subset.mean(axis=1)[all_indices])[0]
        print(comb_alg, '{:.3f}'.format(correlation), param_count)

In [36]:
## Load Pre-Trained VerbNet Similarity Model

# Role weights handed to dict_sim_to_list when building the feature rows
role_weights_dict = {
    'Agent':2, 'Verb':3, 'Patient':2, 'Theme':2,
    'Time':0.5, 'Manner':0.5, 'Location':0.5, 'Trajectory':0.5,
}

# Pick the pickled model by the names of its similarity functions and architecture
verbnet_sim_func = 'conceptnet'
full_sent_sim_func = 'sentbert_mpnet_norml'
model_type = 'nn4'
saved_model_name = '_'.join(['verbnet', verbnet_sim_func, full_sent_sim_func, model_type, 'model.pkl'])
with open(saved_model_name, 'rb') as file:
    loaded_model = pickle.load(file)

# Build one feature row per sentence pair (pair ids run from 1 to n_expr)
n_expr = sim_storage['expr'].shape[0]
sim_storage_full_list = [
    dict_sim_to_list(role_weights_dict, all_role_sim_storage_dict_new[verbnet_sim_func+'_uwtd'][str(pair_id)])
    for pair_id in np.arange(1,n_expr+1)
]
xs = np.array(sim_storage_full_list)

# Optionally append the full-sentence similarity from another model as extra column(s)
xs_augmented = xs if full_sent_sim_func=='none' else np.column_stack((xs,sim_storage[full_sent_sim_func]))

# Replace NaNs with zero before predicting
xs_augmented_nonan = np.nan_to_num(xs_augmented, copy=True, nan=0.0, posinf=None, neginf=None)

# Predict similarities for every pair with the loaded model
predictions = loaded_model.predict(xs_augmented_nonan)

# Save predicted similarities to file
# save_file_name = dataset_name+'_'+saved_model_name+'_similarities.txt'
# np.savetxt(save_file_name, predictions, fmt='%f')

# Rank correlation with the experimental similarities; shown over all pairs, the test-set variant is kept commented
# spearmanr(sim_storage['expr'][test_indices], predictions[test_indices])[0]
spearmanr(sim_storage['expr'], predictions)[0]

0.8771659738958207