In [1]:
## This code computes S3BERT embeddings and saves feature-wise similarities between sentence pairs
# Run using base python 3.9
# James Fodor 2023
# 
# Requires S3BERT code from https://github.com/flipz357/S3BERT

# load libraries
import itertools
import json
import os
import sys

import numpy as np
from sentence_transformers import SentenceTransformer

import sentence_embeds_processing as sep

# file containing path directories
# The location can be overridden with the FILE_PATHS_JSON environment variable so the
# notebook can run on machines that do not share this directory layout; the
# original location remains the default when the variable is unset.
file_paths_json = os.environ.get(
    "FILE_PATHS_JSON",
    "D:\\My Code\\Python\\2023_02 fMRI RSA Analysis\\file_paths.json",
)
# Decode explicitly as UTF-8 instead of relying on the platform's default encoding.
# Later cells read the 'data_pairs_path' and 'neuro_root' entries from this dictionary.
with open(file_paths_json, "r", encoding="utf-8") as file:
    file_paths_dict = json.load(file)

# load S3BERT code
# the S3BERT checkout is put on the module search path first, so that the two
# imports below can be resolved from it
s3bert_code_dir = "./S3BERT_main/"
sys.path.append(s3bert_code_dir)
import config
import prediction_helpers as ph

### Define functions and load datasets

In [2]:
## Show available datasets, as specified in the sep module
pairs = True # specify if we are using paired data or list of sentences

# pick the dataset table that matches the chosen data type
datasets = sep.available_pair_datasets if pairs==True else sep.available_nonpaired_datasets

# print one "<key> <dataset name>" line per entry; the key is what the next cell indexes with
print('Available datasets:')
for dataset_key, dataset_label in datasets.items():
    print(dataset_key,dataset_label)

Available datasets:
0 GS2011_processed
1 KS2013_processed
2 Fodor_pilot_2022
3 STS131_processed
4 SICK_relatedness
5 STR_processed
6 STSb_captions_test
7 STSb_forums_test
8 STSb_headlines_test
9 STSb_test
10 STS3k_all


In [3]:
## Load sentence set 

# choose number from those printed above
dataset_name = datasets[5]

# load sentence set into dictionary depending on type:
# paired data is read from 'data_pairs_path', plain sentence lists from 'neuro_root'
sentences_root_key = 'data_pairs_path' if pairs == True else 'neuro_root'
sentences_dict = sep.load_set_of_sentences(dataset_name, file_paths_dict[sentences_root_key], pairs)
n = len(sentences_dict) # number of dictionary entries (one entry per pair for paired data)
print('\nloaded',dataset_name,'with',n,'sentences')

# store in list
if pairs == True: # sentence similarity pair data: one list per side of the pair
    pair_array = np.array(list(sentences_dict.values()))
    sentences = [list(pair_array[:,side].flatten()) for side in (0, 1)]
else: # neuroimaging data/list of sentences: a single list
    sentences = [list(sentences_dict.values())]


loaded STR_processed with 5500 sentences


### Compute S3BERT feature-wise similarities

In [4]:
## Prepare sets of sentences for computing S3BERT feature-wise similarities

# load S3BERT model
# NOTE: the path is assembled with os.path.join instead of the literal
# "S3BERT_main\s3bert_all-mpnet-base-v2": "\s" is an invalid escape sequence
# (it triggers a warning and is slated to become an error) and the backslash
# separator only resolves on Windows. On Windows the joined string is unchanged.
s3bert_model_path = os.path.join("S3BERT_main", "s3bert_all-mpnet-base-v2")
model = SentenceTransformer(s3bert_model_path, device="cpu")

# Get two sets of sentences for paired (behavioural) data
if pairs==True:

    # columns 0 and 1 of every dictionary value are the two sides of a pair
    sentence_pairs = np.array(list(sentences_dict.values()))

# Get two sets of sentences for unpaired (neuroimaging) data
# (plain `else`, as in the other cells, so sentences_a/sentences_b are always
# defined by this cell instead of being left undefined or stale)
else:

    # define set of sentence pairs: every unordered combination of two sentence ids
    sent_id_pairs = list(itertools.combinations(sentences_dict.keys(), 2))

    # look up both sentences of each id pair
    sentences_storage = [
        [sentences_dict[id_a], sentences_dict[id_b]]
        for id_a, id_b in sent_id_pairs
    ]
    sentence_pairs = np.array(sentences_storage)

# sentences
sentences_a = sentence_pairs[:,0]
sentences_b = sentence_pairs[:,1]

# encode with s3bert
sentences_a_encoded = model.encode(sentences_a)
sentences_b_encoded = model.encode(sentences_b)

# get similarity scores of different features
# (feature count and per-feature dimension are taken from the S3BERT config module)
similarities = ph.get_preds(sentences_a_encoded, sentences_b_encoded, biases=None, n=config.N, dim=config.FEATURE_DIM)

In [8]:
## Save embeddings
# one text file per side of the pairs, named after the dataset;
# fmt='%f' writes fixed-point values with six decimal places
embedding_outputs = {
    dataset_name+'_a_S3BERT_embeddings.txt': sentences_a_encoded,
    dataset_name+'_b_S3BERT_embeddings.txt': sentences_b_encoded,
}
for embeddings_path, encoded in embedding_outputs.items():
    np.savetxt(embeddings_path, encoded, fmt='%f')

In [7]:
## Extract feature-wise similarities for all sentence pairs
# feature labels: "global", then config.FEATURES from index 2 onwards, then "residual"
features = ["global"] + config.FEATURES[2:] + ["residual"]

# one record per sentence pair: each feature label mapped to its similarity score,
# plus the two sentences themselves under "sent_a" / "sent_b"
feature_sim_storage = [
    {
        **dict(zip(features, similarities[pair_idx])),
        "sent_a": sentences_a[pair_idx],
        "sent_b": sentences_b[pair_idx],
    }
    for pair_idx in range(len(sentences_a))
]

In [13]:
## Store feature-wise similarities in a dictionary
# keys are the stringified dataset ids (paired data) or the stringified
# id combinations built for the unpaired data
id_source = sentences_dict.keys() if pairs==True else sent_id_pairs
sent_id_pairs_str = list(map(str, id_source))

# map every key to the feature record at the same position
feature_storage_dict = {
    pair_key: pair_features
    for pair_key, pair_features in zip(sent_id_pairs_str, feature_sim_storage)
}

# write the result as JSON (indent=0: one item per line, no leading spaces)
output_json_path = dataset_name+'_S3BERT_feature_similarities.json'
with open(output_json_path, "w") as out_file:
    json.dump(feature_storage_dict, out_file, indent=0)