In [2]:
!pip install sentence_transformers 

Collecting sentence_transformers
  Downloading sentence_transformers-3.3.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.3.1-py3-none-any.whl (268 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.3.1


**The concept is to transform the sentence (i.e. sequence of text) into a numerical vector and then come up with a linear layer to do the downstream task (classification or regression)**
![](https://thepythoncode.com/media/articles/finetune-bert-for-semantic-textual-similarity-in-python/image1-min.png)

**In this notebook we will calculate the similarity of different sentences using BERT**


## Import the Libraries

In [3]:
from datasets import load_dataset
from sentence_transformers import SentenceTransformer, models
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from tqdm import tqdm
import time
import datetime
import random
import numpy as np
import pandas as pd

## Device configuration
**This code checks whether a GPU is available using PyTorch's `torch.cuda.is_available()` function. If a GPU is detected, it sets the device to the GPU (`cuda`) and prints the GPU details. Otherwise, it defaults to the CPU.**

In [4]:
# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('We will use the GPU:', torch.cuda.get_device_name(0))
else:
    print('No GPU available, using the CPU instead.')

There are 1 GPU(s) available.
We will use the GPU: Tesla P100-PCIE-16GB


## Load dataset

**The code uses the Hugging Face datasets library to load the "stsb_multi_mt" dataset with the "en" (English) configuration.**  
**The STSB Multi-MT dataset is a multilingual version of the Semantic Textual Similarity Benchmark. It consists of sentence pairs with similarity scores ranging from 0 (no similarity) to 5 (identical meaning).**

In [5]:
# Load the English ("en") configuration of the STSB Multi-MT dataset.
# The result is a DatasetDict holding the 'train', 'test' and 'dev' splits.
dataset = load_dataset("stsb_multi_mt", "en")

README.md:   0%|          | 0.00/11.4k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/470k [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/108k [00:00<?, ?B/s]

dev-00000-of-00001.parquet:   0%|          | 0.00/142k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5749 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1379 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [6]:
dataset 

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 5749
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1379
    })
    dev: Dataset({
        features: ['sentence1', 'sentence2', 'similarity_score'],
        num_rows: 1500
    })
})

In [7]:
dataset["train"][0]

{'sentence1': 'A plane is taking off.',
 'sentence2': 'An air plane is taking off.',
 'similarity_score': 5.0}

## Import BERT Tokenizer and Model

**In this notebook we will use the bert-base-uncased model. To use it we will need to import both the tokenizer and the model: The tokenizer will enable us to transform strings into tensors that can be then sent to the model, which in turn will give us the embeddings.**

In [8]:
# Load the tokenizer that matches the 'bert-base-uncased' checkpoint from Hugging Face.
# Larger variants of the model can be used instead; this notebook uses the base model.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
class BertForSTS(torch.nn.Module):
    """Sentence encoder: a 'bert-base-uncased' transformer followed by a pooling layer.

    Both sentence-transformers modules are chained inside a `SentenceTransformer`,
    whose 'sentence_embedding' output is what `forward` returns.
    """

    def __init__(self):
        super().__init__()
        # Token-level encoder; sequences are limited to 128 tokens.
        self.bert = models.Transformer('bert-base-uncased', max_seq_length=128)
        # The pooling layer is sized to the encoder's word-embedding dimension.
        embedding_dim = self.bert.get_word_embedding_dimension()
        self.pooling_layer = models.Pooling(embedding_dim)
        self.sts_bert = SentenceTransformer(modules=[self.bert, self.pooling_layer])

    def forward(self, input_data):
        """Run `input_data` through the encoder and return its 'sentence_embedding' entry."""
        features = self.sts_bert(input_data)
        return features['sentence_embedding']



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [9]:
# Instantiate the model and move it to the selected device (the GPU when one is
# available, otherwise the CPU). As the cell's last expression, the model
# structure is also displayed.
model = BertForSTS()
model.to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BertForSTS(
  (bert): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
  (pooling_layer): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (sts_bert): SentenceTransformer(
    (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
    (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  )
)

## Preparing the Data
**In order to provide our model with properly prepared data, we’ll define a custom dataset class named STSBDataset. A dataset class is necessary to efficiently load and prepare your data for training or inference in a Machine Learning setting. It provides functionality for loading data from a dataset and applying the necessary transformations or preprocessing steps (here, tokenizing each sentence pair and normalizing its similarity score); a DataLoader then batches the items for efficient processing.**

In [11]:
class STSBDataset(torch.utils.data.Dataset):
    """Sentence pairs and normalized similarity labels for STS fine-tuning.

    Each item is one sentence pair tokenized with the module-level `tokenizer`,
    together with the pair's similarity score divided by 5.0.
    """

    def __init__(self, dataset):
        """Collect sentences and labels from `dataset`.

        Args:
            dataset: iterable of mappings with the keys 'sentence1',
                'sentence2' and 'similarity_score'.
        """
        self.first_sentences = []
        self.second_sentences = []
        self.normalized_similarity_scores = []
        # A single pass reads every row once (instead of three times) and keeps
        # the three lists aligned even when `dataset` is a one-shot iterable.
        for row in dataset:
            self.first_sentences.append(row['sentence1'])
            self.second_sentences.append(row['sentence2'])
            # Normalize the similarity score by dividing it by 5.0.
            self.normalized_similarity_scores.append(row['similarity_score'] / 5.0)
        self.concatenated_sentences = [
            [str(first), str(second)]
            for first, second in zip(self.first_sentences, self.second_sentences)
        ]

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.concatenated_sentences)

    def get_batch_labels(self, idx):
        """Return the normalized similarity score of pair `idx` as a tensor."""
        return torch.tensor(self.normalized_similarity_scores[idx])

    def get_batch_texts(self, idx):
        """Tokenize pair `idx` as a batch of two sequences.

        Both sentences are padded/truncated to 128 tokens and returned as
        PyTorch tensors by the module-level `tokenizer`.
        """
        return tokenizer(self.concatenated_sentences[idx], padding='max_length', max_length=128, truncation=True, return_tensors="pt")

    def __getitem__(self, idx):
        """Return `(tokenized_pair, label)` for pair `idx`."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)
        return batch_texts, batch_y


def collate_fn(texts):
    """Split a batched tokenizer output into one feature dict per sentence pair.

    `texts` must provide 'input_ids' and 'attention_mask' entries that can be
    iterated pair by pair; any other entry is ignored. Returns a list of
    `{'input_ids': ..., 'attention_mask': ...}` dicts, one per pair.
    """
    features = []
    for pair_ids, pair_mask in zip(texts['input_ids'], texts['attention_mask']):
        features.append({'input_ids': pair_ids, 'attention_mask': pair_mask})
    return features

## Defining the Loss Function
**Our objective is to train a model that effectively differentiates between pairs of texts based on their semantic meaning. The desired outcome is for the model to assign dissimilar text pairs a low similarity score (a large distance between their embeddings) while keeping similar text pairs close together with a high similarity score. To do so, the loss below computes the cosine similarity of the two sentence embeddings and penalizes its squared difference from the normalized gold score.**

In [12]:
class CosineSimilarityLoss(torch.nn.Module):
    """Loss between the cosine similarity of embedding pairs and target scores.

    Args:
        loss_fn: criterion applied to (predicted similarity, label); MSE by default.
        transform_fn: applied to the cosine similarity before the loss;
            identity by default.
    """

    def __init__(self,  loss_fn=torch.nn.MSELoss(), transform_fn=torch.nn.Identity()):
        super().__init__()
        self.loss_fn = loss_fn
        self.transform_fn = transform_fn
        self.cos_similarity = torch.nn.CosineSimilarity(dim=1)

    def forward(self, inputs, labels):
        """Compute the loss for a batch of embedding pairs.

        Args:
            inputs: sequence where each element holds the two embeddings of
                one pair at index 0 and index 1.
            labels: tensor with one target similarity per pair.

        Returns:
            The value of `loss_fn` for the batch.
        """
        emb_1 = torch.stack([inp[0] for inp in inputs])
        emb_2 = torch.stack([inp[1] for inp in inputs])
        outputs = self.transform_fn(self.cos_similarity(emb_1, emb_2))
        # Flatten instead of squeeze(): squeeze() would turn a batch of one
        # label into a 0-d tensor whose shape no longer matches `outputs`,
        # making the loss fall back on broadcasting (with a warning).
        return self.loss_fn(outputs, labels.reshape(-1))

## Preparing the Training and Validation Data Splits

In [13]:
# Wrap the dataset's own 'train' and 'dev' splits; 'dev' serves as the validation set.
train_ds = STSBDataset(dataset['train'])
val_ds = STSBDataset(dataset['dev'])

# Report how many sentence pairs each split contains.
train_size = len(train_ds)
val_size = len(val_ds)

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

5,749 training samples
1,500 validation samples


In [14]:
batch_size = 8

# Training batches are drawn in shuffled order; validation batches keep the
# dataset order. Both loaders use the same batch size and 4 worker processes.
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4)
validation_dataloader = DataLoader(val_ds, batch_size=batch_size, num_workers=4)

In [15]:
epochs = 8
optimizer = AdamW(model.parameters(), lr=1e-6)

# The scheduler is stepped once per training batch, so the schedule spans
# [number of batches] x [number of epochs] steps, with no warmup.
total_steps = epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)


In [38]:
def _batch_loss(batch, labels, criterion):
    """Embed every sentence pair of one DataLoader batch and return its loss.

    Uses the module-level `model`, `device` and `collate_fn`.
    """
    batch['input_ids'] = batch['input_ids'].to(device)
    batch['attention_mask'] = batch['attention_mask'].to(device)
    # One feature dict per sentence pair; the model is run once per pair.
    pair_features = collate_fn(batch)
    pair_embeddings = [model(feature) for feature in pair_features]
    return criterion(pair_embeddings, labels.to(device))


def train():
    """Fine-tune the module-level `model` and collect per-epoch statistics.

    Runs `epochs` passes over `train_dataloader`, stepping `optimizer` and
    `scheduler` after every batch, then measures the loss on
    `validation_dataloader` without gradients. Statistics are printed after
    each epoch.

    The optimizer and scheduler are module-level objects, so calling this
    function again continues from their current state rather than restarting
    the learning-rate schedule.

    Returns:
        (model, training_stats): the trained model and a list with one dict
        per epoch holding 'epoch', 'Training Loss', 'Validation Loss' and
        'Training Time' (seconds).
    """
    seed_val = 42
    random.seed(seed_val)
    torch.manual_seed(seed_val)

    # Follow the configured device instead of calling .cuda(), which fails on
    # a CPU-only machine.
    criterion = CosineSimilarityLoss().to(device)

    # Per-epoch training loss, validation loss and timing.
    training_stats = []

    for epoch_i in range(epochs):
        t0 = time.time()
        total_train_loss = 0
        model.train()  # Set model to training mode

        # For each batch of training data...
        for train_data, train_label in tqdm(train_dataloader):
            # Clear previous gradients
            model.zero_grad()

            # Forward pass and loss (MSE between cosine similarity and the
            # normalized gold score)
            loss = _batch_loss(train_data, train_label, criterion)
            total_train_loss += loss.item()

            # Backward pass to calculate gradients
            loss.backward()

            # Gradient clipping
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Optimizer and learning-rate schedule step
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = time.time() - t0

        # Start validation phase
        model.eval()  # Set model to evaluation mode
        total_eval_loss = 0

        # Disable gradient calculations during evaluation
        with torch.no_grad():
            for val_data, val_label in tqdm(validation_dataloader):
                loss = _batch_loss(val_data, val_label, criterion)
                total_eval_loss += loss.item()

        # Calculate average validation loss
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Validation Loss': avg_val_loss,
            'Training Time': training_time,
        })

        # Print out the statistics for this epoch
        print(f"Epoch {epoch_i+1}/{epochs}")
        print(f"  Training Loss: {avg_train_loss:.4f}")
        print(f"  Validation Loss: {avg_val_loss:.4f}")
        print(f"  Training Time: {training_time:.2f}s")

    # Return the trained model and the training stats
    return model, training_stats


In [39]:
model ,training_stats = train()

100%|██████████| 719/719 [03:43<00:00,  3.22it/s]
100%|██████████| 188/188 [00:18<00:00, 10.19it/s]


Epoch 1/8
  Training Loss: 0.0343
  Validation Loss: 0.0397
  Training Time: 223.61s


100%|██████████| 719/719 [03:42<00:00,  3.23it/s]
100%|██████████| 188/188 [00:18<00:00, 10.30it/s]


Epoch 2/8
  Training Loss: 0.0334
  Validation Loss: 0.0372
  Training Time: 222.51s


100%|██████████| 719/719 [03:42<00:00,  3.23it/s]
100%|██████████| 188/188 [00:18<00:00, 10.32it/s]


Epoch 3/8
  Training Loss: 0.0305
  Validation Loss: 0.0354
  Training Time: 222.61s


100%|██████████| 719/719 [03:41<00:00,  3.24it/s]
100%|██████████| 188/188 [00:18<00:00, 10.33it/s]


Epoch 4/8
  Training Loss: 0.0285
  Validation Loss: 0.0346
  Training Time: 221.90s


100%|██████████| 719/719 [03:42<00:00,  3.23it/s]
100%|██████████| 188/188 [00:18<00:00, 10.28it/s]


Epoch 5/8
  Training Loss: 0.0272
  Validation Loss: 0.0338
  Training Time: 222.77s


100%|██████████| 719/719 [03:42<00:00,  3.23it/s]
100%|██████████| 188/188 [00:18<00:00, 10.23it/s]


Epoch 6/8
  Training Loss: 0.0265
  Validation Loss: 0.0336
  Training Time: 222.82s


100%|██████████| 719/719 [03:43<00:00,  3.22it/s]
100%|██████████| 188/188 [00:18<00:00, 10.19it/s]


Epoch 7/8
  Training Loss: 0.0262
  Validation Loss: 0.0335
  Training Time: 223.28s


100%|██████████| 719/719 [03:43<00:00,  3.22it/s]
100%|██████████| 188/188 [00:18<00:00, 10.24it/s]

Epoch 8/8
  Training Loss: 0.0258
  Validation Loss: 0.0335
  Training Time: 223.04s





## Results 

In [40]:
# Tabulate the per-epoch statistics returned by train(), indexed by epoch number.
df_stats = pd.DataFrame(training_stats).set_index('epoch')

# Show the table as the cell output.
df_stats

Unnamed: 0_level_0,Training Loss,Validation Loss,Training Time
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.034256,0.039693,223.606177
2,0.033385,0.037189,222.512085
3,0.030503,0.035444,222.613162
4,0.028469,0.034555,221.903625
5,0.027189,0.033816,222.772918
6,0.026522,0.033556,222.815157
7,0.026193,0.03349,223.2811
8,0.025836,0.03349,223.037639


## Setting the Model for Inference

In [43]:
print(dataset["test"][0])

{'sentence1': 'A girl is styling her hair.', 'sentence2': 'A girl is brushing her hair.', 'similarity_score': 2.5}


In [45]:
# Take the held-out test split and collect its sentence pairs.
test_dataset = dataset["test"]
first_sent = [row['sentence1'] for row in test_dataset]
second_sent = [row['sentence2'] for row in test_dataset]
full_text = [[str(first), str(second)] for first, second in zip(first_sent, second_sent)]

# Switch the model to evaluation mode before running inference.
model.eval()

def predict_similarity(sentence_pair):
    """Return the cosine similarity of the embeddings of two sentences.

    Args:
        sentence_pair: list holding the two sentences to compare.

    Returns:
        The cosine similarity between the two sentence embeddings, as a float.

    Uses the module-level `tokenizer`, `model` and `device`; the model is
    expected to be in evaluation mode already.
    """
    test_input = tokenizer(sentence_pair, padding='max_length', max_length=128,
                           truncation=True, return_tensors="pt").to(device)
    # Training fed the model only input ids and attention masks (see
    # collate_fn), so the segment ids are dropped here as well.
    test_input.pop('token_type_ids', None)
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        output = model(test_input)
    return torch.nn.functional.cosine_similarity(output[0], output[1], dim=0).item()

In [49]:
# Spot-check test pair 1: print both sentences and the predicted cosine
# similarity, rounded to two decimals.
example_1 = full_text[1]
print(f"Sentence 1: {first_sent[1]}")
print(f"Sentence 2: {second_sent[1]}")
print(f"Predicted similarity score: {round(predict_similarity(example_1), 2)}")

Sentence 1: A group of men play soccer on the beach.
Sentence 2: A group of boys are playing soccer on the beach.
Predicted similarity score: 0.84


In [51]:
# Spot-check test pair 300 the same way (note: this reuses the name `example_1`).
example_1 = full_text[300]
print(f"Sentence 1: {first_sent[300]}")
print(f"Sentence 2: {second_sent[300]}")
print(f"Predicted similarity score: {round(predict_similarity(example_1), 2)}")

Sentence 1: A red and white bus drives down an England street.
Sentence 2: A red and white England bus drives down the street.
Predicted similarity score: 0.95


## Save the model

In [52]:
# Save only the model's state dict (its weights), not the model object itself.
# To reuse the weights, build a new BertForSTS() and load this file into it
# with load_state_dict.
PATH = 'bert-sts.pt'
torch.save(model.state_dict(), PATH)