In [1]:
## This script computes sentence embeddings for a list of sentences, using a wide range of models
# Run using base python 3.9
# James Fodor 2023
#
# This script requires the following models to be installed in order to work:
#  1. ConceptNet: see here https://www.worldlink.com.cn/en/osdir/conceptnet-numberbatch.html
#  2. ERNIE-base: see here https://github.com/nghuyong/ERNIE-Pytorch
#  3. ERNIE-large: see here https://huggingface.co/nghuyong/ernie-2.0-large-en/tree/main
#  4. SentBERT: see here https://www.sbert.net/
#  5. InferSent: see here https://github.com/facebookresearch/InferSent
#  6. DictBERT: see here https://huggingface.co/wyu1/DictBERT/tree/main
#  7. Universal Sentence Encoder: install with 'conda install tensorflow'
#  8. AMRBART: see here https://huggingface.co/xfbai/AMRBART-large
#  9. DefSent: see here https://github.com/hppRC/defsent
# 10. OpenAI Embeddings: see docs here https://beta.openai.com/docs/guides/embeddings
#
# The basic workflow for running this script is as follows:
# 1. Download all the required models and update paths
# 2. Prepare text file of sentences to get embeddings for
# 3. Run the script to compute and save the embeddings
#
# The script generates one file per model or model variant, each line of which contains the sentence
# embedding for one sentence from the input list of sentences.
# 
# OpenAI Embeddings require a subscription to the openai api. 
# The key and organisation id should be stored, one value per line, in a file called 'openai_key.txt' in the same directory as this notebook.


# load libraries
import numpy as np
import sentence_embeds_processing as sep
import torch
import sys
import seaborn as sns
import json

from transformers import AutoTokenizer, AutoConfig, AutoModel
from scipy.stats import spearmanr
from scipy.ndimage import convolve

# load file paths
# 'file_paths.json' is opened relative to the current working directory; the keys read from it
# in this notebook are 'stop_words_path', 'path_root', 'data_pairs_path' and 'data_nonpaired_path'
with open("file_paths.json", "r") as file:
    file_paths_dict = json.load(file)

# numpy print options
# (fixed 2-decimal output without scientific notation, wide lines, long arrays printed in full up to 2000 items)
np.set_printoptions(precision=2, threshold=2000, linewidth=200, suppress=True, floatmode='fixed')
sns.set()

# stop words
# loaded as an array of strings; used as the default stop-word list of get_arithmetic_embedding
stop_words = np.loadtxt(file_paths_dict['stop_words_path'], dtype='str') # list of stop words

### Define key functions

In [2]:
## Functions to compute sentence embeddings

# Calculate sentence embeddings by arithmetic operation over word token embeddings
def get_arithmetic_embedding(word_embeddings, sentences, function_name='mean', stopwords=stop_words, weights=(1,1,1)):
    """Compose one sentence embedding per sentence from its word embeddings.

    Each sentence is tokenised with `sep.tokenise_sentence`, its token embeddings are
    looked up with `sep.get_token_embeds`, and the rows are combined with the operation
    selected by `function_name`. A sentence with exactly one embedded token always uses
    that token's embedding unchanged, whatever `function_name` is.

    NOTE: `WordNetLemmatizer` is imported in the model-loading cell, which therefore has
    to be run before this function is called.
    NOTE(review): assumes `sep.get_token_embeds` returns at least one row per sentence;
    'conv' indexes row 0 and would fail on an empty matrix - confirm against the sep module.

    Args:
        word_embeddings: word -> vector lookup; must contain the key 'man', whose vector
            length sets the width of the (empty) starting array.
        sentences: iterable of sentence strings.
        function_name: one of
            'mean'      - average of the token embeddings,
            'wgtd_mean' - `np.average` using `weights` (one weight per token),
            'mult'      - elementwise product of the token embeddings, scaled by 1000,
            'conv'      - successive `scipy.ndimage.convolve(..., mode='wrap')` over the tokens.
        stopwords: stop-word list handed to `sep.tokenise_sentence`.
        weights: per-token weights, only used by 'wgtd_mean'.

    Returns:
        2-D float array with one row per sentence (zero rows if `sentences` is empty).

    Raises:
        ValueError: if a sentence has several embedded tokens and `function_name` is unknown.
    """
    # empty float block: fixes the output width and dtype promotion, and is the result for no sentences
    empty_block = np.empty((0, len(word_embeddings['man'])), float)
    per_sentence = []

    for sentence in sentences:
        # get tokens for each word (honour the caller-supplied stop-word list)
        token_list = sep.tokenise_sentence(sentence, stopwords)
        token_embeds_matrix = sep.get_token_embeds(word_embeddings, WordNetLemmatizer, token_list)
        embed_dim = token_embeds_matrix.shape[1]

        # if there is only one word with embeddings
        if token_embeds_matrix.shape[0]==1:
            sentence_embedding = token_embeds_matrix[0]

        # elementwise mean
        elif function_name=='mean':
            sentence_embedding = token_embeds_matrix.mean(axis=0)

        # weighted mean; with the default weights this only works for three word sentences
        elif function_name=='wgtd_mean':
            sentence_embedding = np.average(token_embeds_matrix,axis=0,weights=weights)

        # elementwise multiplication
        elif function_name=='mult':
            sentence_embedding = np.ones(embed_dim) # start with all ones
            for word_embedding in token_embeds_matrix:
                sentence_embedding = np.multiply(sentence_embedding, word_embedding)
            sentence_embedding=1000*sentence_embedding

        # circular convolution
        elif function_name=='conv':
            sentence_embedding = convolve(token_embeds_matrix[0], token_embeds_matrix[1], mode='wrap') # convolve first two embeds
            for word_embedding in token_embeds_matrix[2:]: # convolve the rest of the embeds
                sentence_embedding = convolve(sentence_embedding, word_embedding, mode='wrap')

        # unknown operation: fail loudly instead of reusing the previous sentence's embedding
        else:
            raise ValueError("unknown function_name %r; expected 'mean', 'wgtd_mean', 'mult' or 'conv'" % (function_name,))

        per_sentence.append(sentence_embedding)

    # stack once at the end rather than re-copying the whole array for every sentence
    return np.vstack([empty_block] + per_sentence)


# Calculate sentence embedding using sentbert
def get_sentbert_embedding(sentbert_model, sentences):
    """Encode sentences with a SentenceTransformer-style model and return a numpy array.

    Args:
        sentbert_model: object exposing `encode(sentences, convert_to_tensor=True)`.
        sentences: list of sentence strings.

    Returns:
        numpy array built from the value returned by `encode`.
    """
    sentence_embeddings = sentbert_model.encode(sentences, convert_to_tensor=True)
    if isinstance(sentence_embeddings, torch.Tensor):
        # numpy cannot convert a tensor that requires grad or lives on an accelerator,
        # so detach it and bring it to host memory before converting
        sentence_embeddings = sentence_embeddings.detach().cpu().numpy()
    return np.array(sentence_embeddings)


# Calculate sentence embedding using ERNIE-base
def get_ernie_embedding(ernie_tokenizer, ernie_model, layer, sentences):
    """Embed each sentence as the token-averaged hidden state of one ERNIE-base layer.

    Args:
        ernie_tokenizer: callable invoked as `ernie_tokenizer(sentence, return_tensors='pt')`
            whose result is unpacked as keyword arguments for the model.
        ernie_model: model whose output exposes `hidden_states`
            (the notebook loads it with output_hidden_states=True).
        layer: index into `hidden_states` selecting the layer to average.
        sentences: iterable of sentence strings.

    Returns:
        float64 array with one row per sentence; shape (0, 768) if `sentences` is empty.
    """
    rows = []
    with torch.no_grad():  # inference only: do not build an autograd graph per sentence
        for sentence in sentences:
            encoded_input = ernie_tokenizer(sentence, return_tensors='pt')
            model_output = ernie_model(**encoded_input)
            # assumes hidden_states[layer] is (batch, tokens, hidden): [0] picks the single
            # sentence in the batch, the mean then runs over its tokens
            token_states = model_output.hidden_states[layer].detach().numpy()[0]
            rows.append(token_states.mean(axis=0))
    if not rows:
        return np.empty((0, 768), float)
    # stack once (instead of growing the array per sentence) and keep the float64 result dtype
    return np.vstack(rows).astype(float)


# Calculate sentence embedding using ERNIE-large
def get_ernie_large_embedding(ernie_large_tokenizer, ernie_large_model, layer, sentences):
    """Embed each sentence as the token-averaged hidden state of one ERNIE-large layer.

    Args:
        ernie_large_tokenizer: callable invoked as `ernie_large_tokenizer(sentence, return_tensors='pt')`
            whose result is unpacked as keyword arguments for the model.
        ernie_large_model: model whose output exposes `hidden_states`
            (the notebook loads it with output_hidden_states=True).
        layer: index into `hidden_states` selecting the layer to average.
        sentences: iterable of sentence strings.

    Returns:
        float64 array with one row per sentence; shape (0, 1024) if `sentences` is empty.
    """
    rows = []
    with torch.no_grad():  # inference only: do not build an autograd graph per sentence
        for sentence in sentences:
            encoded_input = ernie_large_tokenizer(sentence, return_tensors='pt')
            model_output = ernie_large_model(**encoded_input)
            # assumes hidden_states[layer] is (batch, tokens, hidden): [0] picks the single
            # sentence in the batch, the mean then runs over its tokens
            token_states = model_output.hidden_states[layer].detach().numpy()[0]
            rows.append(token_states.mean(axis=0))
    if not rows:
        return np.empty((0, 1024), float)
    # stack once (instead of growing the array per sentence) and keep the float64 result dtype
    return np.vstack(rows).astype(float)


# Calculate sentence embedding using universal sentence embeddings
def get_universal_embedding(universal_model, sentences):
    """Run the Universal Sentence Encoder on `sentences` and return the result as a numpy array.

    The model is called directly on the sentence list; its output is converted with
    `.numpy()` and copied into a fresh array.
    """
    encoded = universal_model(sentences)
    as_numpy = encoded.numpy()
    return np.array(as_numpy)


# Calculate sentence embedding using amrbart
def get_amrbart_embedding(amrbart_tokenizer, amrbart_model, sentences):
    """Embed each sentence as the token-averaged last encoder hidden state of AMRBART.

    Args:
        amrbart_tokenizer: callable invoked as `amrbart_tokenizer(sentence, return_tensors='pt')`
            whose result is unpacked as keyword arguments for the model.
        amrbart_model: model whose output exposes `encoder_last_hidden_state`.
        sentences: iterable of sentence strings.

    Returns:
        float64 array with one row per sentence; shape (0, 1024) if `sentences` is empty.
    """
    rows = []
    with torch.no_grad():  # inference only: do not build an autograd graph per sentence
        for sentence in sentences:
            encoded_input = amrbart_tokenizer(sentence, return_tensors='pt')
            model_output = amrbart_model(**encoded_input)
            # assumes encoder_last_hidden_state is (batch, tokens, hidden): [0] picks the single
            # sentence in the batch, the mean then runs over its tokens
            token_states = model_output.encoder_last_hidden_state.detach().numpy()[0]
            rows.append(np.mean(token_states, axis=0))
    if not rows:
        return np.empty((0, 1024), float)
    # stack once (instead of growing the array per sentence) and keep the float64 result dtype
    return np.vstack(rows).astype(float)


# Calculate sentence embedding using infersent
def get_infersent_embedding(infersent_embeds, infersent_path, sentences):
    """Encode `sentences` with an InferSent model.

    On every call the word-vector path is (re)set to `infersent_path` and the vocabulary
    is rebuilt from `sentences` before encoding; both steps use tokenize=True.
    Returns whatever `infersent_embeds.encode` returns.
    """
    encoder = infersent_embeds
    # point the encoder at the word vectors, then restrict the vocab to the words actually used
    encoder.set_w2v_path(infersent_path)
    encoder.build_vocab(sentences, tokenize=True)
    return encoder.encode(sentences, tokenize=True)


# Calculate sentence embedding using defsent_mean
def get_defsent_mean_embedding(defsent_mean_model, sentence):
    """Encode `sentence` with the DefSent (mean pooling) model and return a numpy array.

    `sentence` is passed straight to `defsent_mean_model.encode`; the notebook calls this
    with a whole list of sentences.
    """
    encode = defsent_mean_model.encode
    return np.array(encode(sentence))


# Calculate sentence embedding using dictbert
def get_dictbert_embedding(dictbert_tokenizer, dictbert_model, layer, sentences):
    """Embed each sentence as the token-averaged hidden state of one DictBERT layer.

    Args:
        dictbert_tokenizer: callable invoked as `dictbert_tokenizer(sentence, return_tensors='pt')`
            whose result is unpacked as keyword arguments for the model.
        dictbert_model: model whose output exposes `hidden_states`
            (the notebook loads it with output_hidden_states=True).
        layer: index into `hidden_states` selecting the layer to average.
        sentences: iterable of sentence strings.

    Returns:
        float64 array with one row per sentence; shape (0, 768) if `sentences` is empty.
    """
    rows = []
    with torch.no_grad():  # inference only: do not build an autograd graph per sentence
        for sentence in sentences:
            encoded_input = dictbert_tokenizer(sentence, return_tensors='pt')
            model_output = dictbert_model(**encoded_input)
            # assumes hidden_states[layer] is (batch, tokens, hidden): [0] picks the single
            # sentence in the batch, the mean then runs over its tokens
            token_states = model_output.hidden_states[layer].detach().numpy()[0]
            rows.append(token_states.mean(axis=0))
    if not rows:
        return np.empty((0, 768), float)
    # stack once (instead of growing the array per sentence) and keep the float64 result dtype
    return np.vstack(rows).astype(float)


# Calculate sentence embedding using defsent_cls
def get_defsent_cls_embedding(defsent_cls_model, sentence):
    """Encode `sentence` with the DefSent (CLS pooling) model and return a numpy array.

    `sentence` is passed straight to `defsent_cls_model.encode`; the notebook calls this
    with a whole list of sentences.
    """
    sentence_embeddings = defsent_cls_model.encode(sentence)
    return np.array(sentence_embeddings)


# Calculate sentence embedding using open_ai embeddings
def get_open_ai_embedding(sentences, model="text-embedding-ada-002"):
    """Fetch one OpenAI embedding per sentence through `openai.Embedding.create`.

    One API request is made per sentence. `openai` is imported (and its key configured)
    in the model-loading cell, which has to be run before calling this function.

    Args:
        sentences: iterable of sentence strings.
        model: name of the OpenAI embedding model to request.

    Returns:
        float array with one row per sentence; shape (0, 1536) if `sentences` is empty.
    """
    rows = []
    for sentence in sentences:
        raw_sentence_embeddings = openai.Embedding.create(input=sentence, model=model) # get using API
        rows.append(np.array(raw_sentence_embeddings['data'][0]['embedding']))
    if not rows:
        return np.empty((0, 1536), float)
    # stack once at the end rather than re-copying the whole array for every sentence
    return np.vstack(rows).astype(float)


In [3]:
## Load all the needed transformer models
# Every model is loaded from a sub-folder of file_paths_dict['path_root'].
# The Windows path fragments are written as raw strings so that backslashes are never
# parsed as (invalid) escape sequences; the resulting path strings are unchanged.

# Load ConceptNet embeddings and nltk lemmatizer
from nltk.data import path # need to specify the location of the nltk data
path.append(file_paths_dict['path_root']+r"\Frames and Structured Data\FrameNet\nltk_data")
from nltk.stem import WordNetLemmatizer
model_address = file_paths_dict['path_root']+r'\Word Embeddings\ConceptNet Embeddings\numberbatch-en.txt'
conceptnet_embeds = sep.import_word_model(model_address)

# Load ERNIE-base model
ernie_address = file_paths_dict['path_root']+r'\Sentence Encoders\ernie-2.0-base-en'
config_state = AutoConfig.from_pretrained(ernie_address, output_hidden_states=True) # get hidden states
ernie_tokenizer = AutoTokenizer.from_pretrained(ernie_address)
ernie_model = AutoModel.from_pretrained(ernie_address, config=config_state)

# Load ERNIE-large model
ernie_large_address = file_paths_dict['path_root']+r'\Sentence Encoders\ernie-2.0-large-en'
config_state = AutoConfig.from_pretrained(ernie_large_address, output_hidden_states=True) # get hidden states
ernie_large_tokenizer = AutoTokenizer.from_pretrained(ernie_large_address)
ernie_large_model = AutoModel.from_pretrained(ernie_large_address, config=config_state)

# Load SentBERT model
from sentence_transformers import SentenceTransformer, util
sentbert_model_mpnet = SentenceTransformer(file_paths_dict['path_root']+r'\Sentence Encoders\sentence-transformers-mpnet-base-v2')
sentbert_model = SentenceTransformer(file_paths_dict['path_root']+r'\Sentence Encoders\sentence-transformers-MiniLM-L6-v2')

# Load InferSent embeddings
module_path = file_paths_dict['path_root']+r'\Sentence Encoders\infersent'
sys.path.insert(0, module_path) # add model location to path
from models import InferSent 
infersent_embeds = InferSent({'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048, 'pool_type': 'max', 'dpout_model': 0.0, 'version': 1})
infersent_embeds.load_state_dict(torch.load(module_path+r'\infersent1.pkl'))
infersent_path = file_paths_dict['path_root']+r'\Word Embeddings\Glove Word Embeddings\glove.840B.300d.txt'

# Load DictBERT model
dictbert_address = file_paths_dict['path_root']+r'\Sentence Encoders\dictbert'
config_state = AutoConfig.from_pretrained(dictbert_address, output_hidden_states=True) # get hidden states
dictbert_tokenizer = AutoTokenizer.from_pretrained(dictbert_address)
dictbert_model = AutoModel.from_pretrained(dictbert_address, config=config_state)

# Load Universal Sentence Encoder embeddings
import tensorflow as tf
import tensorflow_hub as hub
from absl import logging # (use pip install absl-py)
logging.set_verbosity(logging.ERROR)
universal_model = hub.load(file_paths_dict['path_root']+r'\Sentence Encoders\universal-sentence-encoder')

# Load AMRBART model
from transformers import BartForConditionalGeneration
amrbart_address = file_paths_dict['path_root']+r'\Sentence Encoders\amrbart-large' # same folder for config, tokenizer and model
config_state = AutoConfig.from_pretrained(amrbart_address, output_hidden_states=True) # get hidden states
amrbart_tokenizer = AutoTokenizer.from_pretrained(amrbart_address, collapse_name_ops=False, use_pointer_tokens=True, raw_graph=False)
amrbart_model = BartForConditionalGeneration.from_pretrained(amrbart_address, config=config_state)

# Load DefSent models
from defsent import DefSent
defsent_mean_model = DefSent(file_paths_dict['path_root']+r'\Sentence Encoders\defsent-roberta-large-mean')
defsent_cls_model = DefSent(file_paths_dict['path_root']+r'\Sentence Encoders\defsent-roberta-large-cls')

# Prepare API for openai embeddings (needs a key to work)
# the key and organisation id are read from 'openai_key.txt' rather than hard-coded here
import openai
api_key, api_org = sep.load_openai_key('openai_key.txt')
openai.organization = api_org
openai.api_key = api_key

Some weights of the model checkpoint at D:\Study and Projects\School Work\Year 25 - PhD\Data\\Sentence Encoders\ernie-2.0-base-en were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at D:\Study and Projects\School Work\Year 25 - PhD\Data\\Sentence Encoders

### Load sentence datasets

In [4]:
## Show available datasets, as specified in the sep module
# `pairs` selects which catalogue is listed: True for sentence-pair datasets,
# False for plain lists of sentences. It is read again by the cells below.
pairs = False # specify if we are using paired data or list of sentences
datasets = sep.available_pair_datasets if pairs==True else sep.available_nonpaired_datasets

# one line per dataset: its selection number followed by its name
print('Available datasets:')
for dataset in datasets.keys():
    print(dataset,datasets[dataset])

Available datasets:
0 2014 Wehbe\Stimuli\Chapter_9_sentences_final
1 2017 Anderson\Stimuli\stimuli_final
2 2018 Pereira\Stimuli\stimuli_243sentences
3 2018 Pereira\Stimuli\stimuli_384sentences
4 2020 Alice Dataset\Stimuli\stimuli_sentences_final
5 2020 Zhang\Stimuli\test_sentences_final
6 2023 Fodor Dataset\Stimuli\Fodor2023-final240
7 2023 Fodor Dataset\Stimuli\Fodor2023-final192
8 2023 Fodor Dataset\Stimuli\Fodor2023-final96


In [5]:
## Load sentence set (choose number from those printed above)
dataset = datasets[8]

# both data roots are passed; `pairs` tells the loader which kind of dataset this is
sentences_dict = sep.load_set_of_sentences(
    dataset,
    file_paths_dict['data_pairs_path'],
    file_paths_dict['data_nonpaired_path'],
    pairs,
)
full_dataset_name = sep.fix_sentence_dataset_name(dataset)

n = len(sentences_dict) # num sentences
print('\nloaded',dataset,'with',n,'sentences')


loaded 2023 Fodor Dataset\Stimuli\Fodor2023-final96 with 96 sentences


### Compute arithmetic and transformer sentence embeddings

In [7]:
## Compute embeddings for all sentences in set
# Builds `sentences` (a list holding one or two lists of sentence strings) and then, for each
# of those lists, fills the dict `embeds` with one embedding array per model/variant.
# Requires the function-definition, model-loading and dataset-loading cells to have been run.

# select format of sentences needed depending on the dataset type
sentences = []
if pairs==True: # use this for sentence similarity pair data
    # column 0 / column 1 of the values array = first / second sentence of every pair
    sentences.append(list(np.array(list(sentences_dict.values()))[:,0].flatten()))
    sentences.append(list(np.array(list(sentences_dict.values()))[:,1].flatten()))
else: # use this for neuroimaging data/list of sentences
    sentences.append(list(sentences_dict.values()))

# loop over all types of sentence embeddings
# NOTE(review): set_id is assigned here and at the end of the loop body but never read in this
# cell, and the (commented-out) save below does not put it in the file name - confirm intent.
set_id = 'a' # set_id specifies whether we are doing the first or second sentence in the pair
for sentence_set in sentences:
    
    # specify the embeddings we want to compute
    # NOTE(review): `embeds` is re-created on every iteration, so with paired data only the
    # embeddings of the second sentence set remain once the loop has finished.
    embeds = {}
    embeds['mean'] = get_arithmetic_embedding(conceptnet_embeds, sentence_set, 'mean', stop_words)
    embeds['mult'] = get_arithmetic_embedding(conceptnet_embeds, sentence_set, 'mult', stop_words)
    embeds['conv'] = get_arithmetic_embedding(conceptnet_embeds, sentence_set, 'conv', stop_words)
    embeds['ernie_0'] = get_ernie_embedding(ernie_tokenizer, ernie_model, 0, sentence_set)
    embeds['ernie_5'] = get_ernie_embedding(ernie_tokenizer, ernie_model, 5, sentence_set)
    embeds['ernie_12'] = get_ernie_embedding(ernie_tokenizer, ernie_model, 12, sentence_set)
    # NOTE(review): the key says '12' but layer 24 is requested - confirm which one is intended
    embeds['ernie_large_12'] = get_ernie_large_embedding(ernie_large_tokenizer, ernie_large_model, 24, sentence_set)
    embeds['sentbert'] = get_sentbert_embedding(sentbert_model, sentence_set)
    embeds['sentbert_mpnet'] = get_sentbert_embedding(sentbert_model_mpnet, sentence_set)
    embeds['universal'] = get_universal_embedding(universal_model, sentence_set)
    embeds['amrbart'] = get_amrbart_embedding(amrbart_tokenizer, amrbart_model, sentence_set)
    embeds['infersent'] = get_infersent_embedding(infersent_embeds, infersent_path, sentence_set)
    embeds['defsent_mean'] = get_defsent_mean_embedding(defsent_mean_model, sentence_set)
    embeds['defsent_cls'] = get_defsent_cls_embedding(defsent_cls_model, sentence_set)
    embeds['dictBERT'] = get_dictbert_embedding(dictbert_tokenizer, dictbert_model, 12, sentence_set)
    # one OpenAI API request is made per sentence (needs the key configured in the loading cell)
    embeds['openai'] = get_open_ai_embedding(sentence_set)

    # print embedding sizes as a check
    for model in embeds.keys():
        print(model,embeds[model].shape)

    # save file
    # (currently disabled, so this cell only prints the shapes and writes nothing to disk)
    # for model in embeds.keys():
    #     np.savetxt(full_dataset_name+'_'+model+'_embeddings.txt', embeds[model], fmt='%f') # string formatting

    # second set of sentences in pairwise sentence sets
    set_id = 'b'

Found 275(/275) words with w2v vectors
Vocab size : 275


  sentences = np.array(sentences)[idx_sort]


mean (96, 299)
mult (96, 299)
conv (96, 299)
ernie_0 (96, 768)
ernie_5 (96, 768)
ernie_12 (96, 768)
ernie_large_12 (96, 1024)
sentbert (96, 384)
sentbert_mpnet (96, 768)
universal (96, 512)
amrbart (96, 1024)
infersent (96, 4096)
defsent_mean (96, 1024)
defsent_cls (96, 1024)
dictBERT (96, 768)
openai (96, 1536)


### Legacy code (please ignore!)

In [22]:
# Legacy: for every model, collect the cosine similarity of each sentence pair, then
# correlate the similarity profiles of two models ('mean' vs 'defsent_mean').
# NOTE(review): `itertools`, `models` and `norm_embeds` are not imported/defined in any cell
# visible in this notebook, so this cell presumably fails with NameError on a fresh kernel.
# all_pairs = []
# for i in np.arange(0,34):
#     all_pairs.append((0,i))
# set_a = np.arange(0,18)
# set_b = np.arange(18,35)
# all_pairs = list(itertools.product(set_a, set_b))

# all unordered pairs of the sentence indices 0..33
all_pairs = list(itertools.combinations(np.arange(34), 2))
    
pair_corrs = {}
for model in models:
    pair_corrs[model] = []
    for pair in all_pairs:
        sent_1 = norm_embeds[model][pair[0]]
        sent_2 = norm_embeds[model][pair[1]]
        pair_corrs[model].append(sep.cosine_sim(sent_1,sent_2))
        
# Pearson correlation between the two models' pairwise-similarity lists (shown as the cell output)
np.corrcoef(pair_corrs['mean'],pair_corrs['defsent_mean'])[0,1]

0.7845945320670508

In [None]:
# Print correlation coefficient matrix between sentences
# xs = np.arange(norm_embeds[model].shape[0])
# num_string = ['{:3d}'.format(x+1) for x in xs]
# print('      ',num_string)
# np.corrcoef(embeds[model])

In [10]:
# Spot check: cosine similarity between the 'mean' ConceptNet embeddings of two one-word phrases
phrase_1 = ['bless']
phrase_2 = ['care']
embed_1, embed_2 = [
    get_arithmetic_embedding(conceptnet_embeds, phrase, 'mean', stop_words)
    for phrase in (phrase_1, phrase_2)
]
sep.cosine_sim(embed_1[0],embed_2[0])

0.08528654188993806