In [1]:
## Fine-tuning of sentence transformer with STS3k dataset
"""
This notebook loads a pre-trained SentenceTransformer model (a local copy of 'sentence-transformers-mpnet-base-v2')
and fine-tunes it for some epochs on the STS3k dataset.
It is adapted from an upstream example that loads 'nli-distilroberta-base-v2' from the server and fine-tunes it on the STS benchmark dataset.
Note: In this example, you must specify a SentenceTransformer model.
Sourced from https://github.com/UKPLab/sentence-transformers/blob/master/examples/training/sts/training_stsbenchmark_continue_training.py
"""

# Extra source: https://github.com/huggingface/notebooks/blob/main/examples/text_classification.ipynb
# https://joecummings.me/tutorials/bert

# Load libraries
import math
import os
import gzip
import csv
import numpy as np

from torch.utils.data import DataLoader
from sentence_transformers import SentenceTransformer, LoggingHandler, losses, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator
from sentence_embeds_processing import load_sentences

# Specify paths
# Every backslash is written as "\\": sequences such as "\S", "\Y" or "\F" are not valid
# string escapes and trigger a SyntaxWarning on recent Python versions. The resulting
# string values are unchanged.
path_root = "D:\\Study and Projects\\School Work\\Year 25 - PhD 1\\Data\\"
embeddings_path = 'Analysis Results\\Sentence Embeddings\\'
sts3k_dataset_path = 'Sentence Similarity Data\\Fodor2023 - STS3k Large Dataset\\3 - Experimental data\\'
data_pairs_path = 'Sentence Similarity Data\\Sentence Similarities Final\\'

# Calculate sentence embedding using sentbert
def get_sentbert_embedding(sentbert_model, sentences):
    """Encode sentences with a SentenceTransformer-style model and return a NumPy array.

    Args:
        sentbert_model: object exposing an ``encode(sentences, ...)`` method.
        sentences: the text(s) to embed; passed to ``encode`` unchanged.

    Returns:
        numpy.ndarray built from whatever ``encode`` returns.
    """
    # Request NumPy output directly instead of a torch tensor: a tensor that lives
    # on a GPU cannot be handed to np.array without first being moved to the CPU.
    sentence_embeddings = sentbert_model.encode(sentences, convert_to_numpy=True)
    return np.asarray(sentence_embeddings)

In [3]:
## Load pre-trained sentence transformer model
# The backslash is escaped so that "\s" is not parsed as an (invalid) string escape;
# the resulting path string is unchanged.
model_name = path_root+'Sentence Embeddings\\sentence-transformers-mpnet-base-v2'
model = SentenceTransformer(model_name)

In [10]:
## Load experimental dataset and prepare for fine-tuning

# Dataset used for fine-tuning; uncomment a different path to switch split files
dataset_name = 'STS3k'
# full_dataset_path = path_root+'\Sentence Similarity Data\STSb Dataset\stsbenchmark.tsv'
# full_dataset_path = path_root+sts3k_dataset_path+'STS3k_sentbert_ft_format_10waysplit.tsv'
full_dataset_path = path_root+sts3k_dataset_path+'STS3k_sentbert_ft_format_traintestsplit.tsv'
# full_dataset_path = path_root+sts3k_dataset_path+'STS3k_sentbert_ft_format_adversarialsplit.tsv'

# Number of training epochs
num_epochs = 4

# Labels expected in the 'split' column of the dataset file
test_set = 'test'
dev_set = 'dev'
train_set = 'train'

# One list of InputExample objects per split, addressable by split label
train_samples, dev_samples, test_samples = [], [], []
samples_by_split = {
    test_set: test_samples,
    dev_set: dev_samples,
    train_set: train_samples,
}

# Read the tab-separated file and route every sentence pair to its split
with open(full_dataset_path, 'rt', encoding='utf8') as file:
    for row in csv.DictReader(file, delimiter='\t', quoting=csv.QUOTE_NONE):
        inp_example = InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['score']))
        target_samples = samples_by_split.get(row['split'])
        # Rows whose split label matches none of the three names are skipped
        if target_samples is not None:
            target_samples.append(inp_example)

# Shuffled training batches and the loss attached to the loaded model
train_dataloader = DataLoader(train_samples, shuffle=True, batch_size=16)
train_loss = losses.CosineSimilarityLoss(model=model)

# Evaluator built from the dev samples (correlation between cosine score and gold labels)
evaluator = EmbeddingSimilarityEvaluator.from_input_examples(dev_samples, name='sts-dev')

# Warm-up covers 10% of all training steps (batches per epoch x epochs)
total_training_steps = len(train_dataloader) * num_epochs
warmup_steps = math.ceil(total_training_steps * 0.1)

In [11]:
## Perform fine-tuning of model on specified dataset

# Name of saved ft file: the last '_'-separated piece of the dataset path, cut at its first '.'
name = full_dataset_path.split('_')[-1].split('.')[0]
# Build the epoch tag from num_epochs so the folder name cannot drift from the number
# of epochs actually passed to model.fit below
model_save_path = dataset_name+'_ft_'+name+'_'+str(num_epochs)+'_epochs'

# Train the model
model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluator,
          epochs=num_epochs,
          evaluation_steps=1000,
          warmup_steps=warmup_steps,
          output_path=model_save_path)

# Load the saved model and evaluate its performance on test dataset
model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(test_samples, name='STS3k_test')
# Left as the final expression so the returned score is displayed as the cell output
test_evaluator(model, output_path=model_save_path)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Iteration:   0%|          | 0/123 [00:00<?, ?it/s]

Iteration:   0%|          | 0/123 [00:00<?, ?it/s]

Iteration:   0%|          | 0/123 [00:00<?, ?it/s]

Iteration:   0%|          | 0/123 [00:00<?, ?it/s]

0.9176653704079322

In [84]:
## Extract sentence pair data by 10-fold set and compute embeddings by trained model

# NOTE(review): the column positions used below (0, 1, 7, 8) and the fold digit parsed from
# column 1 presumably match the 10-way-split TSV rather than the train/test split file;
# confirm that full_dataset_path points at the intended file before running this cell.

def _stack_embeddings_by_pair_id(data_by_set, embeddings_by_set):
    """Merge per-fold embeddings and order them by ascending sentence-pair id.

    Args:
        data_by_set: dict mapping fold -> list of [sentence_pair_id, sentence].
        embeddings_by_set: dict mapping fold -> embeddings, one per entry of
            data_by_set[fold] and in the same order.

    Returns:
        Tuple (embeddings_by_id, sorted_ids, stacked): the id -> embedding dict
        (keyed by the string form of each id), the ids as a sorted list of ints,
        and a NumPy array holding the embeddings in that sorted order.
    """
    embeddings_by_id = {}
    for fold_set in data_by_set.keys():
        # np.array over mixed [int, str] rows yields a string array, hence string ids
        fold_set_sentence_ids = np.array(data_by_set[fold_set])[:,0]
        embeddings_by_id.update(zip(fold_set_sentence_ids, embeddings_by_set[fold_set]))
    sorted_ids = sorted(int(x) for x in embeddings_by_id)
    stacked = np.array([embeddings_by_id[str(i)] for i in sorted_ids])
    return embeddings_by_id, sorted_ids, stacked

# Prepare sentences data by 10-fold sets
data_raw_array = np.loadtxt(full_dataset_path,  delimiter='\t', dtype='str', encoding='utf-8', skiprows=1)
index_values = np.arange(0,10)
data_by_set_dict_a = {fold: [] for fold in index_values}
data_by_set_dict_b = {fold: [] for fold in index_values}

for row in data_raw_array:
    sentence_a = row[7]
    sentence_b = row[8]
    # Named fold_idx rather than test_set so the 'test' split label used when loading
    # the fine-tuning dataset is not overwritten by an integer
    fold_idx = int(row[1][-1]) # 10 -> 0
    sentence_pair_id = int(row[0])
    data_by_set_dict_a[fold_idx].append([sentence_pair_id,sentence_a])
    data_by_set_dict_b[fold_idx].append([sentence_pair_id,sentence_b])

# Store fine-tuned sentence embeddings by 10-fold sets
# Backslashes are escaped so "\S" / "\s" are not parsed as (invalid) string escapes
directory = path_root+'\\Sentence Embeddings\\sentence-transformers-mpnet-base-v2-ft-STS3k-10fold'
sentbert_models_ft_a = {}
sentbert_models_ft_b = {}
# os.listdir returns entries in arbitrary order; sorting makes the fold-index -> model-folder
# pairing deterministic.
# NOTE(review): assumes the sorted folder names line up with fold indices 0-9 — confirm.
for idx,model_folder in enumerate(sorted(os.listdir(directory))):
    sentbert_model = SentenceTransformer(directory+'\\'+model_folder)
    sentences_a = np.array(data_by_set_dict_a[idx])[:,1]
    sentences_b = np.array(data_by_set_dict_b[idx])[:,1]
    sentbert_models_ft_a[idx] = get_sentbert_embedding(sentbert_model, list(sentences_a))
    sentbert_models_ft_b[idx] = get_sentbert_embedding(sentbert_model, list(sentences_b))

# Arrange all 10-fold embeddings into a single dictionary per set and sort by sentence-pair id
full_set_embeddings_a, keys_a, full_set_sorted_embeddings_a = _stack_embeddings_by_pair_id(
    data_by_set_dict_a, sentbert_models_ft_a)
full_set_embeddings_b, keys_b, full_set_sorted_embeddings_b = _stack_embeddings_by_pair_id(
    data_by_set_dict_b, sentbert_models_ft_b)

# Save results
np.savetxt(dataset_name+'_a_sentbert_ft_embeddings.txt', full_set_sorted_embeddings_a, fmt='%f')
np.savetxt(dataset_name+'_b_sentbert_ft_embeddings.txt', full_set_sorted_embeddings_b, fmt='%f')

In [12]:
## Extract and store embeddings from a single fine-tuned model

# Reload the fine-tuned model from its save folder
sentbert_model = SentenceTransformer(model_save_path)

# Read the sentence pairs to embed
dataset_name = 'STS3k_all'
pairs_file = path_root+data_pairs_path+dataset_name+'.txt'
dataset_dict = load_sentences(pairs_file, pairs=True)
dataset_np = np.array(list(dataset_dict.values()))

# Embed column 0 and column 1 of the pairs array separately
first_sentences = list(dataset_np[:,0])
second_sentences = list(dataset_np[:,1])
sentbert_models_ft_a = get_sentbert_embedding(sentbert_model, first_sentences)
sentbert_models_ft_b = get_sentbert_embedding(sentbert_model, second_sentences)

# Write one text file per side of the pair
np.savetxt(f'{dataset_name}_a_sentbert_mpnet_ft_{name}_{num_epochs}_epochs_embeddings.txt', sentbert_models_ft_a, fmt='%f')
np.savetxt(f'{dataset_name}_b_sentbert_mpnet_ft_{name}_{num_epochs}_epochs_embeddings.txt', sentbert_models_ft_b, fmt='%f')