In [4]:
## This script computes pairwise sentence similarities based on transformer embeddings
# Run using base python 3.9
# James Fodor 2023
#
# Requires first generating the sentence embeddings using the 'Calc_embeds_transformers.ipynb' file.
# This code generates files with a similarity score between each pair of sentences, one file per model.
# The file contains a single similarity score (-1 to 1) on each line.

# load libraries
import numpy as np
import sentence_embeds_processing as sep
import itertools
import seaborn as sns
import matplotlib.pyplot as plt

# base path for all data files
# Every backslash is written as '\\': sequences such as '\S', '\Y' and '\D' are not
# valid escapes, and Python only passes them through unchanged with a warning.
path_root = "D:\\Study and Projects\\School Work\\Year 25 - PhD\\Data\\"
# sub-folders below path_root, each ending in a path separator so that file names
# can be appended by plain string concatenation
data_pairs_path = 'Sentence Similarity Data\\Sentence Similarities Final\\'
data_nonpaired_path = 'Neuroimaging Data\\'
embeddings_path = "Analysis Results\\Sentence Embeddings\\"
sims_path = 'Analysis Results\\Sentence Similarities\\'

# display settings: fixed two-decimal numpy output without scientific notation,
# wide lines and a high summarisation threshold; seaborn default plot styling
np.set_printoptions(
    precision=2,
    threshold=2000,
    linewidth=200,
    suppress=True,
    floatmode='fixed',
)
sns.set()

### Load sentence pairs

In [5]:
## List the datasets that the sep module makes available
pairs = False  # True: datasets of sentence pairs; False: datasets that are a list of sentences
datasets = sep.available_pair_datasets if pairs else sep.available_nonpaired_datasets
print('Available datasets:')
for dataset, dataset_location in datasets.items():
    print(dataset, dataset_location)

Available datasets:
0 2014 Wehbe\Stimuli\Chapter_9_sentences_final
1 2017 Anderson\Stimuli\stimuli_final
2 2018 Pereira\Stimuli\stimuli_243sentences
3 2018 Pereira\Stimuli\stimuli_384sentences
4 2020 Alice Dataset\Stimuli\stimuli_sentences_final
5 2020 Zhang\Stimuli\test_sentences_final
6 2023 Fodor Dataset\Fodor2023-final240
7 2023 Fodor Dataset\Fodor2023-final192
8 2023 Fodor Dataset\Fodor2023-prelim


In [6]:
## Load the chosen sentence set (the index refers to the listing printed above)
dataset = datasets[5]
sentences_dict = sep.load_set_of_sentences(
    dataset,
    path_root + data_pairs_path,
    path_root + data_nonpaired_path,
    pairs,
)
# dataset name in the form used for the embeddings and similarities file names
full_dataset_name = sep.fix_sentence_dataset_name(dataset)
n = len(sentences_dict)  # num sentences
print('\nloaded', dataset, 'with', n, 'sentences')


loaded 2020 Zhang\Stimuli\test_sentences_final with 95 sentences


### Compute model similarities (for paired datasets, run once)

This code is used for experimental sentence datasets, which have lists of sentence pairs.

In [7]:
## Compute sentence similarities using pre-stored embeddings
#
# For each model this loads the stored embeddings of the first ('_a_') and second ('_b_')
# sentence of every pair, computes the cosine similarity of each pair from both the raw
# and the normalised embeddings, and writes the two sets of scores to text files.

# Specify compositional functions to be examined.
# Each model is listed once: the list used to repeat the transformer models, which
# made the loop load, compare and save those models a second time.
sim_funcs = ['mean','mult','conv','ernie_0','ernie_5','ernie_12','infersent','universal',
             'sentbert','sentbert_mpnet','openai','defsent_mean','defsent_cls','amrbart']

# Compute similarities for all sentence pairs in dataset
sim_storage = {}
for comp_func in sim_funcs:

    # a model whose embeddings cannot be loaded keeps these empty entries
    comp_func_norml = comp_func+'_norml'
    sim_storage[comp_func] = np.array([])
    sim_storage[comp_func_norml] = np.array([])

    # load pre-computed sentence embeddings for relevant dataset
    try:
        sentences_a = np.loadtxt(path_root+embeddings_path+dataset+'_a_'+comp_func+'_embeddings.txt',  delimiter=' ', dtype='float', encoding='utf-8')
        sentences_b = np.loadtxt(path_root+embeddings_path+dataset+'_b_'+comp_func+'_embeddings.txt',  delimiter=' ', dtype='float', encoding='utf-8')
    except OSError:
        # best-effort: skip this model, but say so instead of skipping silently
        print('skipping', comp_func, '- could not load stored embeddings')
        continue

    # normalise embeddings
    sentences_a_norml = sep.normalise_embeddings(sentences_a)
    sentences_b_norml = sep.normalise_embeddings(sentences_b)

    # compute and store similarities
    # sentence ids are used as 1-based row numbers into the embedding arrays
    # (assumes the ids match the row order of the embedding files - TODO confirm)
    sent_rows = [sent_id-1 for sent_id in sentences_dict.keys()]
    # build each result array in one step; np.append inside the loop re-copied the
    # whole array for every pair
    sim_storage[comp_func] = np.array(
        [sep.cosine_sim(sentences_a[row], sentences_b[row]) for row in sent_rows], dtype=float)
    sim_storage[comp_func_norml] = np.array(
        [sep.cosine_sim(sentences_a_norml[row], sentences_b_norml[row]) for row in sent_rows], dtype=float)

    # save similarities (one score per line)
    # NOTE(review): the file names carry no directory prefix and sims_path is not
    # used here - confirm the intended output location
    np.savetxt(dataset+'_'+comp_func+'_norml_similarities.txt', sim_storage[comp_func_norml], fmt='%f')
    np.savetxt(dataset+'_'+comp_func+'_similarities.txt', sim_storage[comp_func], fmt='%f')

### Compute model similarities (neuro data, run once)

This code is used for neuroimaging sentence datasets, which have a list of single sentences. The code therefore computes the pairwise similarity between each unique pairing of sentences.

In [7]:
## Compute and save model similarities for every unique pairing of sentences
#
# For each model this loads the stored embeddings of the sentence list, computes the
# cosine similarity of every unordered pair of sentences from both the raw and the
# normalised embeddings, and writes the two sets of scores to text files.
sim_storage = {}
sent_id_pairs = list(itertools.combinations(sentences_dict.keys(), 2)) # pairs of sentences

# Add storage elements for functions to be used.
# Each model is listed once: the list used to repeat the transformer models, which
# made the loop load, compare and save those models a second time.
sim_funcs = ['mean','mult','conv','ernie_0','ernie_5','ernie_12','infersent','universal',
             'sentbert','sentbert_mpnet','openai','defsent_mean','defsent_cls','amrbart']
# sim_funcs = ['sentbert_mpnet']  # uncomment to run a single model
sentence_embeds_dict = {}       # raw embeddings of each model that was loaded
sentence_embeds_norm_dict = {}  # normalised embeddings of each model that was loaded

# Compute similarities for all sentence pairs in dataset
for comp_func in sim_funcs:

    # a model whose embeddings cannot be loaded keeps these empty entries
    comp_func_norml = comp_func+'_norml'
    sim_storage[comp_func] = np.array([])
    sim_storage[comp_func_norml] = np.array([])

    # load pre-computed sentence embeddings for relevant dataset
    try:
        sentence_embeds = np.loadtxt(path_root+embeddings_path+full_dataset_name+'_'+comp_func+'_embeddings.txt',  delimiter=' ', dtype='float', encoding='utf-8')
    except OSError:
        # best-effort: skip this model, but say so instead of skipping silently
        print('skipping', comp_func, '- could not load stored embeddings')
        continue
    sentence_embeds_dict[comp_func] = sentence_embeds

    # normalise embeddings
    sentence_embeds_norml = sep.normalise_embeddings(sentence_embeds)
    sentence_embeds_norm_dict[comp_func] = sentence_embeds_norml

    # compute and store similarities
    # sentence ids are used as 1-based row numbers into the embedding array
    # (assumes the ids match the row order of the embedding file - TODO confirm)
    # build each result array in one step; np.append inside the loop re-copied the
    # whole array for every pair, which is costly as the number of pairs grows
    sim_storage[comp_func] = np.array(
        [sep.cosine_sim(sentence_embeds[id_1-1], sentence_embeds[id_2-1])
         for id_1, id_2 in sent_id_pairs], dtype=float)
    sim_storage[comp_func_norml] = np.array(
        [sep.cosine_sim(sentence_embeds_norml[id_1-1], sentence_embeds_norml[id_2-1])
         for id_1, id_2 in sent_id_pairs], dtype=float)

    # save similarities (one score per line, in the order of sent_id_pairs)
    # NOTE(review): the file names carry no directory prefix and sims_path is not
    # used here - confirm the intended output location
    np.savetxt(full_dataset_name+'_'+comp_func+'_similarities.txt', sim_storage[comp_func], fmt='%f')
    np.savetxt(full_dataset_name+'_'+comp_func+'_norml_similarities.txt', sim_storage[comp_func_norml], fmt='%f')

  new_embeddings = np.transpose(new_mean_np/std_np)
  similarity = np.dot(embed_1,embed_2)/(np.linalg.norm(embed_1)*np.linalg.norm(embed_2))
