In [1]:
## This script computes pairwise sentence similarities based on transformer embeddings
# Run using base python 3.9
# James Fodor 2023
#
# Requires first generating the sentence embeddings using the 'Calc_embeds_transformers.ipynb' file.
# This code generates files with a similarity score between each pair of sentences, one file per model.
# The file contains a single similarity score (-1 to 1) on each line.

# load libraries
import numpy as np
import itertools
import seaborn as sns
import json
import sys
import random

# Configure how numpy arrays are printed and apply seaborn's default plot styling
np.set_printoptions(
    precision=2,
    threshold=2000,
    linewidth=200,
    suppress=True,
    floatmode='fixed',
)
sns.set()

# Make the project-local helper modules importable. Each directory is pushed onto
# the front of sys.path in turn, so the directory listed last is searched first.
for module_dir in (
    'D:\\My Code\\Python\\2022_06 Compositional Semantics Paper',
    'D:\\My Code\\Python\\2023_02 fMRI RSA Analysis',
):
    sys.path.insert(0, module_dir)
import sentence_embeds_processing as sep
import voxels_processing as vp

# Read the JSON file holding the path entries looked up by the cells below
with open("D:\\My Code\\Python\\2023_02 fMRI RSA Analysis\\file_paths.json", "r") as paths_file:
    file_paths_dict = json.load(paths_file)

In [29]:
# Reload the already-imported sentence_embeds_processing module (bound to `sep`) so that
# the module object reflects the current contents of its source file.
# NOTE(review): looks like a development convenience; the import sits outside the
# notebook's main import cell -- confirm whether this cell is still needed.
import importlib
importlib.reload(sep)

<module 'sentence_embeds_processing' from 'D:\\My Code\\Python\\2022_06 Compositional Semantics Paper\\sentence_embeds_processing.py'>

### Load sentence pairs

In [2]:
## List the datasets that the sep module makes available
pairs = False # True when working with paired data, False for a plain list of sentences
datasets = sep.available_pair_datasets if pairs==True else sep.available_nonpaired_datasets
print('Available datasets:')
for dataset, dataset_label in datasets.items():
    print(dataset,dataset_label)

Available datasets:
0 Wehbe_neuro
1 Anderson_neuro
2 Pereira243_neuro
3 Pereira384_neuro
4 Alice_neuro
5 Zhang_neuro
6 Zhang_neuro_rev
7 Zhang_neuro_8s
8 Fodor2024-final108_neuro


In [3]:
## Load the chosen sentence set

# pick an entry by the number shown in the dataset listing
dataset_name = datasets[6]

# paired data is read from the shared 'data_pairs_path' entry, whereas each
# non-paired dataset has its own '<dataset_name>-stim' entry in the paths file
stim_path_key = 'data_pairs_path' if pairs == True else dataset_name+'-stim'
sentences_dict = sep.load_set_of_sentences(dataset_name, file_paths_dict[stim_path_key], pairs)
n = len(sentences_dict) # num sentences
print('\nloaded',dataset_name,'with',n,'sentences')

# collect the sentences into a list
if pairs==True: # sentence similarity pair data: one list per column of the pair array
    pair_array = np.array(list(sentences_dict.values()))
    sentences = [list(pair_array[:,column].flatten()) for column in (0, 1)]
else: # neuroimaging data/list of sentences: keep the dictionary values in order
    sentences = list(sentences_dict.values())


loaded Zhang_neuro_rev with 95 sentences


### Compute model similarities (for paired datasets)

This code is used for experimental sentence datasets, which have lists of sentence pairs.

In [32]:
## Compute sentence similarities using pre-computed embeddings
# For every model named in vp.model_name_dict this cell loads the '_a_' and '_b_'
# embedding files of the current dataset, computes one cosine similarity per sentence
# pair (on the raw and on the normalised embeddings) and writes each set of scores to
# a text file in the working directory.

# Compute similarities for all sentence pairs in dataset
sim_storage = {}
for comp_model in vp.model_name_dict.keys():

    # load pre-computed sentence embeddings for relevant dataset
    comp_model_norml = comp_model+'_norml'
    sim_storage[comp_model] = np.array([])
    sim_storage[comp_model_norml] = np.array([])
    embeds_prefix = file_paths_dict['embeddings_path']+dataset_name
    try:
        sentences_a = np.loadtxt(embeds_prefix+'_a_'+comp_model+'_embeddings.txt',  delimiter=' ', dtype='float', encoding='utf-8')
        sentences_b = np.loadtxt(embeds_prefix+'_b_'+comp_model+'_embeddings.txt',  delimiter=' ', dtype='float', encoding='utf-8')
    except OSError:
        # report the skipped model rather than ignoring it silently; its two
        # sim_storage entries stay as empty arrays and no files are written
        print('Error loading',comp_model)
        continue
    
    # normalise embeddings
    sentences_a_norml = sep.normalise_embeddings(sentences_a)
    sentences_b_norml = sep.normalise_embeddings(sentences_b)
    
    # compute similarities, gathering them in plain lists so that each result array
    # is built once instead of being re-copied by np.append on every iteration
    pair_sims = []
    pair_sims_norml = []
    for sent_id in sentences_dict.keys():
        row = sent_id-1 # the -1 offset assumes sentence ids are 1-based integers -- TODO confirm
        pair_sims.append(sep.cosine_sim(sentences_a[row],sentences_b[row]))
        pair_sims_norml.append(sep.cosine_sim(sentences_a_norml[row],sentences_b_norml[row]))
    sim_storage[comp_model] = np.append(sim_storage[comp_model],pair_sims)
    sim_storage[comp_model_norml] = np.append(sim_storage[comp_model_norml],pair_sims_norml)
        
    # save similarities
    np.savetxt(dataset_name+'_'+comp_model+'_norml_similarities.txt', sim_storage[comp_model_norml], fmt='%f')
    np.savetxt(dataset_name+'_'+comp_model+'_similarities.txt', sim_storage[comp_model], fmt='%f')

In [None]:
## Compute 'vocab similarity' for paired datasets: the number of distinct tokens shared
## by the two sentences divided by the number of distinct tokens found in either sentence

# get stop words
stop_words = np.loadtxt(file_paths_dict['stop_words_path'], dtype='str') # list of stop words

# loop over all sentence pairs
word_overlaps = []
for sent_id in sentences_dict.keys():
    sentence_1 = sentences_dict[sent_id][0]
    sentence_2 = sentences_dict[sent_id][1]
    token_list_1 = sep.tokenise_sentence(sentence_1, stop_words)
    token_list_2 = sep.tokenise_sentence(sentence_2, stop_words)
    
    tokens_1 = set(token_list_1)
    tokens_2 = set(token_list_2)
    shared_words = tokens_1 & tokens_2
    all_words = tokens_1 | tokens_2
    # when neither sentence yields any token the ratio is undefined; score the pair
    # as 0.0 (no shared vocabulary) instead of raising ZeroDivisionError
    word_overlap = len(shared_words)/len(all_words) if all_words else 0.0
    word_overlaps.append(word_overlap)

# save similarities to file
np.savetxt(dataset_name+'_vocab_similarities.txt', np.array(word_overlaps), fmt='%f')
print('saved vocab similarities')

### Compute model similarities (for neuro datasets)

This code is used for neuroimaging sentence datasets, which have a list of single sentences. The code therefore computes the pairwise similarity between each unique pairing of sentences.

In [4]:
## Compute sentence similarities using pre-computed embeddings
# For each model in comp_models this cell loads the dataset's embedding file, saves a
# normalised copy of the embeddings, computes the cosine similarity of every unique
# pair of sentences (raw and normalised) and writes both sets of scores to text files.
sim_storage = {}
sent_id_pairs = list(itertools.combinations(np.arange(n), 2)) # pairs of sentences

# Add storage elements for functions to be used
# comp_models = ['mean','random','glove6b','mean_inverted','mult','conv','ernie_0','ernie_5','ernie_12','infersent','universal','sentbert','sentbert_mpnet','openai','defsent_mean','defsent_cls','amrbart',
#              'ernie_0','ernie_5','ernie_12','infersent','universal','sentbert','sentbert_mpnet','openai','defsent_mean','defsent_cls','amrbart','dictbert','S3BERT','UAE']
comp_models = ['mean','sentbert_mpnet','defsent_cls','amrbart','UAE']
sentence_embeds_dict = {}
sentence_embeds_norm_dict = {}

# Compute similarities for all sentence pairs in dataset
for comp_model in comp_models:
    
    # load pre-computed sentence embeddings for relevant dataset
    comp_model_norml = comp_model+'_norml'
    sim_storage[comp_model] = np.array([])
    sim_storage[comp_model_norml] = np.array([])
    try:
        sentence_embeds = np.loadtxt(file_paths_dict['embeddings_path']+dataset_name+'_'+comp_model+'_embeddings.txt',  delimiter=' ', dtype='float', encoding='utf-8')
        sentence_embeds_dict[comp_model] = sentence_embeds
    except OSError:
        # a model whose file cannot be read keeps its empty sim_storage entries
        print('Error loading',comp_model)
        continue
    
    # normalise embeddings
    # NOTE: sentence_embeds and sentence_embeds_norml remain bound after the loop to
    # the last model that loaded successfully; per-model copies are in the two dicts
    sentence_embeds_norml = sep.normalise_embeddings(sentence_embeds)
    sentence_embeds_norm_dict[comp_model] = sentence_embeds_norml
    np.savetxt(dataset_name+'_'+comp_model+'_norml_embeddings.txt', sentence_embeds_norm_dict[comp_model], fmt='%f')
    
    # compute similarities, gathering them in plain lists so that each result array
    # is built once instead of being re-copied by np.append for every sentence pair
    pair_sims = []
    pair_sims_norml = []
    for sent_id_pair in sent_id_pairs:
        first_id, second_id = sent_id_pair
        pair_sims.append(sep.cosine_sim(sentence_embeds[first_id],sentence_embeds[second_id]))
        pair_sims_norml.append(sep.cosine_sim(sentence_embeds_norml[first_id],sentence_embeds_norml[second_id]))
    sim_storage[comp_model] = np.append(sim_storage[comp_model],pair_sims)
    sim_storage[comp_model_norml] = np.append(sim_storage[comp_model_norml],pair_sims_norml)
        
    # save similarities
    np.savetxt(dataset_name+'_'+comp_model+'_similarities.txt', sim_storage[comp_model], fmt='%f')
    np.savetxt(dataset_name+'_'+comp_model+'_norml_similarities.txt', sim_storage[comp_model_norml], fmt='%f')

In [5]:
# Compute and save random embeddings and sims
# The baseline is made by shuffling all values of one model's normalised embedding
# matrix (across rows and columns) and then computing the cosine similarity of every
# sentence pair on the shuffled matrix.

# The output files are named '<dataset>_mean_random_...', so take the 'mean' model's
# normalised embeddings explicitly instead of whichever model the previous cell's loop
# happened to process last. A missing 'mean' entry raises KeyError here.
random_source_model = 'mean'
random_seed = 0 # fixed seed so the shuffled baseline is reproducible across runs
source_embeds_norml = sentence_embeds_norm_dict[random_source_model]

# permutation() returns a shuffled copy, leaving the stored embeddings untouched
rng = np.random.default_rng(random_seed)
sentence_embeds_flat = source_embeds_norml.flatten()
sentence_embeds_shuffle = rng.permutation(sentence_embeds_flat).reshape(source_embeds_norml.shape)
np.savetxt(dataset_name+'_'+random_source_model+'_random_embeddings.txt', sentence_embeds_shuffle, fmt='%f')

# gather the similarities in a list so the result array is built only once
random_sims_list = []
for sent_id_pair in sent_id_pairs:
    pair_sim = sep.cosine_sim(sentence_embeds_shuffle[sent_id_pair[0]],sentence_embeds_shuffle[sent_id_pair[1]])
    random_sims_list.append(pair_sim)
random_sims = np.append(np.array([]),random_sims_list)
np.savetxt(dataset_name+'_'+random_source_model+'_random_similarities.txt', random_sims, fmt='%f')