In [1]:
# load libraries
import json
import os
import sys

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# load custom functions from library
import sentence_embeds_processing as sep

# Read the JSON file that maps logical names to directory paths used below
with open("file_paths.json") as file:
    file_paths_dict = json.load(file)

# Display options: fixed 5-decimal floats without scientific notation, wide
# lines, and seaborn's default plot styling
np.set_printoptions(
    precision=5,
    threshold=5000,
    linewidth=220,
    suppress=True,
    floatmode='fixed',
)
sns.set()

### Load dataset

In [2]:
## Show available datasets, as specified in the sep module
pairs = True # specify if we are using paired data or list of sentences

# pick the dataset registry that matches the chosen data type
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets

# list every selectable index together with its dataset name
print('Available datasets:')
for dataset, dataset_label in datasets.items():
    print(dataset, dataset_label)

Available datasets:
0 GS2011_processed
1 KS2013_processed
2 Fodor_pilot_2022
3 STS131_processed
4 SICK_relatedness
5 STR_processed
6 STSb_captions_test
7 STSb_forums_test
8 STSb_headlines_test
9 STSb_test
10 STS3k_all


In [3]:
## Load sentence set

# choose number from those printed above
dataset_name = datasets[5]

# paired datasets and plain sentence lists are read from different configured paths
source_root = file_paths_dict['data_pairs_path'] if pairs else file_paths_dict['neuro_root']
sentences_dict = sep.load_set_of_sentences(dataset_name, source_root, pairs)
n = len(sentences_dict)  # number of loaded entries (each entry is a pair when pairs is True)
print('\nloaded',dataset_name,'with',n,'sentences')

# store in list
if pairs:
    # sentence similarity pair data: one list per pair member (columns 0 and 1)
    pair_array = np.array(list(sentences_dict.values()))
    sentences = [list(pair_array[:, column].flatten()) for column in (0, 1)]
else:
    # neuroimaging data / list of sentences: a single list
    sentences = [list(sentences_dict.values())]


loaded STR_processed with 5500 sentences


In [6]:
## Open json file with S3BERT similarities
# Join the path components with os.path.join instead of a hard-coded '\\':
# on non-Windows systems a backslash is not a path separator, so the original
# concatenation would look for a file whose name contains the backslash.
S3BERT_path = os.path.join(
    file_paths_dict['parses_path'],
    'S3BERT Parsing',
    dataset_name + "_S3BERT_feature_similarities.json",
)
with open(S3BERT_path, "r") as file:
    S3BERT_data = json.load(file)

### Compute feature-specific similarities

In [7]:
# Extract desired feature-specific similarities
# NOTE: some feature keys carry a trailing space; they are used verbatim for
# the lookup in each entry and stripped only for the storage keys.
sim_types = ['global', 'Concepts ', 'Frames ', 'SRL ', 'Smatch ', 'score_wwlk']
sim_types_storage = {}
for sim_type in sim_types:
    # one value per entry of S3BERT_data, in the order the entries are stored
    sim_types_storage[sim_type.strip()] = [
        sent_pair[sim_type] for sent_pair in S3BERT_data.values()
    ]

In [8]:
# Save feature-specific similarities
# Writes one text file per feature into the current working directory, named
# <dataset>_S3BERT-<feature in lower case>_similarities.txt
for sim_type in sim_types:
    feature_key = sim_type.strip()
    output_name = f"{dataset_name}_S3BERT-{feature_key.lower()}_similarities.txt"
    np.savetxt(output_name, sim_types_storage[feature_key], fmt='%f')