In [2]:
## This code computes S3BERT embeddings and saves similarities between sentence pairs
# Run using base python 3.9
# James Fodor 2023
# 
# Requires S3BERT code from https://github.com/flipz357/S3BERT

# load libraries
import sys
import numpy as np
import sentence_embeds_processing as sep
import itertools

from sentence_transformers import SentenceTransformer

# base path for all data files
# Backslashes are escaped explicitly: sequences such as "\S" or "\P" are not
# valid string escapes and only worked because Python currently passes them
# through unchanged (with a deprecation warning). The resulting strings are
# identical to the previous values.
path_root = "D:\\Study and Projects\\School Work\\Year 25 - PhD\\Data\\"
# NOTE: path_root ends with a backslash and the sub-paths below start with one,
# so the joined paths contain a doubled separator; kept as-is so the values
# passed on to the sep module do not change.
data_pairs_path = path_root+'\\Sentence Similarity Data\\Sentence Similarities Final\\'
data_nonpaired_path = path_root+'\\Neuroimaging Data\\'
# relative output locations for embeddings and similarity files
embeddings_path = "Analysis Results\\Sentence Embeddings\\"
sims_path = 'Analysis Results\\Sentence Similarities\\'

# load S3BERT code
sys.path.append("./S3BERT_main/")
import config
import prediction_helpers as ph

### Define functions and load datasets

In [3]:
## Show available datasets, as specified in the sep module
pairs = False # specify if we are using paired data or list of sentences
# pick the dataset registry that matches the chosen mode
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets
print('Available datasets:')
# The loop variables are deliberately not called `dataset`: that global name
# holds the selected dataset string used by the later cells, and re-running
# this cell would otherwise overwrite it with the last dictionary key.
for dataset_id, dataset_label in datasets.items():
    print(dataset_id, dataset_label)

Available datasets:
0 2014 Wehbe\Stimuli\Chapter_9_sentences_final
1 2017 Anderson\Stimuli\stimuli_final
2 2018 Pereira\Stimuli\stimuli_243sentences
3 2018 Pereira\Stimuli\stimuli_384sentences
4 2020 Alice Dataset\Stimuli\stimuli_sentences_final
5 2020 Zhang\Stimuli\test_sentences_final
6 2023 Fodor Dataset\Fodor2023-final240
7 2023 Fodor Dataset\Fodor2023-final192
8 2023 Fodor Dataset\Fodor2023-prelim


In [4]:
## Load sentence set (choose number from those printed above)
# the number indexes the `datasets` registry listed by the previous cell
dataset = datasets[5]
sentences_dict = sep.load_set_of_sentences(
    dataset,
    data_pairs_path,
    data_nonpaired_path,
    pairs,
)
full_dataset_name = sep.fix_sentence_dataset_name(dataset)
n = len(sentences_dict) # num sentences (one entry per key)
print(f'\nloaded {dataset} with {n} sentences')


loaded 2020 Zhang\Stimuli\test_sentences_final with 95 sentences


### Analysis for paired (behavioural) datasets

This code is used for experimental sentence datasets, which consist of lists of sentence pairs. Run this section only when `pairs = True` was set before loading the dataset.

In [None]:
## Compute similarities for experimental datasets

# load S3BERT model
# (backslash escaped explicitly; "\s" is not a valid string escape. The path
# string itself is unchanged.)
model = SentenceTransformer("S3BERT_main\\s3bert_all-mpnet-base-v2", device="cpu")

# sentences: convert the dictionary values once, then slice out both columns
# assumes every value in sentences_dict holds a (sentence_a, sentence_b) pair,
# i.e. a paired dataset was loaded -- TODO confirm against the sep loader
sentence_pairs = np.array(list(sentences_dict.values()))
sentences_a = sentence_pairs[:,0]
sentences_b = sentence_pairs[:,1]

# encode with s3bert
sentences_a_encoded = model.encode(sentences_a)
sentences_b_encoded = model.encode(sentences_b)

# normalise
sentences_a_norml = sep.normalise_embeddings(sentences_a_encoded)
sentences_b_norml = sep.normalise_embeddings(sentences_b_encoded)

# get similarity scores of different features, once for the raw embeddings and
# once for the normalised ones (feature layout is taken from the S3BERT config)
similarities = ph.get_preds(sentences_a_encoded, sentences_b_encoded, biases=None, n=config.N, dim=config.FEATURE_DIM)
similarities_norml = ph.get_preds(sentences_a_norml, sentences_b_norml, biases=None, n=config.N, dim=config.FEATURE_DIM)

In [31]:
# Save global S3BERT similarities and embeddings
# column 0 of the prediction matrices is the one saved as the global similarity
global_sims = similarities[:,0]
global_sims_norml = similarities_norml[:,0]

# (filename suffix, array) for every file written by this cell
# NOTE(review): files are written relative to the working directory using
# `dataset` as the prefix; sims_path / embeddings_path are not used here.
outputs_to_save = (
    ('_S3BERT_similarities.txt', global_sims),
    ('_S3BERT_norml_similarities.txt', global_sims_norml),
    ('_a_S3BERT_embeddings.txt', sentences_a_encoded),
    ('_b_S3BERT_embeddings.txt', sentences_b_encoded),
)
for file_suffix, values in outputs_to_save:
    np.savetxt(dataset+file_suffix, values, fmt='%f')

In [23]:
# Print similarity scores of different features (optional)
# feature labels: "global" first, then the config features from index 2 on, then "residual"
features = ["global"] + config.FEATURES[2:] + ["residual"]
for i, (sent_a, sent_b) in enumerate(zip(sentences_a, sentences_b)):
    # label each score in row i, then attach the two sentences it belongs to
    features_text = dict(zip(features, similarities[i]))
    features_text["sent_a"] = sent_a
    features_text["sent_b"] = sent_b
    print(features_text)

{'global': 0.8568320870399475, 'Concepts ': 0.8519608974456787, 'Frames ': 0.7636246085166931, 'Named Ent. ': 0.8424161076545715, 'Negations ': 0.9437075853347778, 'Reentrancies ': 0.9733306169509888, 'SRL ': 0.6547356843948364, 'Smatch ': 0.978515088558197, 'Unlabeled ': 0.8995557427406311, 'max_indegree_sim': 0.8576575517654419, 'max_outdegree_sim': 0.9412575960159302, 'max_degree_sim': 0.6852121949195862, 'root_sim': 0.8155668377876282, 'quant_sim': 0.9001861214637756, 'score_wlk': 0.840286910533905, 'score_wwlk': 0.8543912172317505, 'residual': 0.8567565083503723, 'sent_a': 'I remained under the banyan tree, exhausted by my daily ritual of dragooning the men every two hours.', 'sent_b': 'I remained under the banyan tree, exhausted by my daily ritual of herding the cats every two hours.'}
{'global': 0.8202927112579346, 'Concepts ': 0.7236304879188538, 'Frames ': 0.6475898027420044, 'Named Ent. ': 0.7386634349822998, 'Negations ': 0.8733257055282593, 'Reentrancies ': 0.29368606209754

### Analysis for unpaired (neuro) datasets

This code is used for neuroimaging sentence datasets, which consist of a list of single sentences. The code therefore computes the pairwise similarity for every unique pair of sentences. Run this section only when `pairs = False` was set before loading the dataset.

In [6]:
## Compute similarities for neuro data

# load S3BERT model
# (backslash escaped explicitly; "\s" is not a valid string escape. The path
# string itself is unchanged.)
model = SentenceTransformer("S3BERT_main\\s3bert_all-mpnet-base-v2", device="cpu")

# define set of sentence pairs (every unordered pair of sentence ids, in key order)
sent_id_pairs = list(itertools.combinations(sentences_dict.keys(), 2))

# encode every sentence exactly once; previously each sentence was re-encoded
# for every pair it appears in, which repeats the expensive model call
# presumably each value in sentences_dict is a single sentence -- verify against the sep loader
sentence_embeddings = {
    sent_id: model.encode(sentence)
    for sent_id, sentence in sentences_dict.items()
}

# cosine similarity for each pair, in the same order as sent_id_pairs
pair_sims = [
    sep.cosine_sim(sentence_embeddings[id_a], sentence_embeddings[id_b])
    for id_a, id_b in sent_id_pairs
]

# build the flat float array in one step instead of growing it with np.append
# (which copies the whole array on every iteration)
sim_storage = np.asarray(pair_sims, dtype=float).ravel()

In [8]:
# Save global S3BERT similarities
# short name = second space-separated token of the dataset string, cut at the
# first backslash
# NOTE(review): dataset strings that share this token map to the same output
# filename, so saving one overwrites the other -- confirm this is intended.
second_token = dataset.split(' ')[1]
dataset_name = second_token.partition('\\')[0]
output_file = f'{dataset_name}_neuro_S3BERT_similarities.txt'
np.savetxt(output_file, sim_storage, fmt='%f')