Reference Code: https://www.pinecone.io/learn/series/nlp/train-sentence-transformers-softmax/

In [1]:
import os
import math
import re
from   random import *
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import datasets
import numpy as np

# Set GPU device
os.environ["CUDA_VISIBLE_DEVICES"] = "3"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [2]:
torch.cuda.empty_cache()

# 1.Data 

## Train, Test, Validation


In [3]:
snli = datasets.load_dataset('snli')
mnli = datasets.load_dataset('glue', 'mnli')
mnli['train'].features, snli['train'].features

({'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None),
  'idx': Value(dtype='int32', id=None)},
 {'premise': Value(dtype='string', id=None),
  'hypothesis': Value(dtype='string', id=None),
  'label': ClassLabel(names=['entailment', 'neutral', 'contradiction'], id=None)})

In [4]:
# List of datasets to remove 'idx' column from
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [5]:
mnli['train']

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 392702
})

In [6]:
# Remove 'idx' column from each dataset
for column_names in mnli.column_names.keys():
    mnli[column_names] = mnli[column_names].remove_columns('idx')

In [7]:
mnli.column_names.keys()

dict_keys(['train', 'validation_matched', 'validation_mismatched', 'test_matched', 'test_mismatched'])

In [8]:
snli

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
})

In [9]:
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])

(array([0, 1, 2]), array([-1,  0,  1,  2]))

In [10]:
# there are -1 values in the label feature, these are where no class could be decided so we remove
snli = snli.filter(
    lambda x: 0 if x['label'] == -1 else 1
)

In [11]:
np.unique(mnli['train']['label']), np.unique(snli['train']['label'])

(array([0, 1, 2]), array([0, 1, 2]))

In [12]:
from datasets import DatasetDict
# Merge the two DatasetDict objects
raw_dataset = DatasetDict({
    'train': datasets.concatenate_datasets([snli['train'], mnli['train']]).shuffle(seed=55).select(list(range(1000))),
    'test': datasets.concatenate_datasets([snli['test'], mnli['test_mismatched']]).shuffle(seed=55).select(list(range(100))),
    'validation': datasets.concatenate_datasets([snli['validation'], mnli['validation_mismatched']]).shuffle(seed=55).select(list(range(1000)))
})
#remove .select(list(range(1000))) in order to use full dataset
# Now, merged_dataset_dict contains the combined datasets from snli and mnli
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
})

# 2.Preprocessing

In [13]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



In [14]:
def preprocess_function(examples):
    max_seq_length = 128
    padding = 'max_length'
    # Tokenize the premise
    premise_result = tokenizer(
        examples['premise'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Tokenize the hypothesis
    hypothesis_result = tokenizer(
        examples['hypothesis'], padding=padding, max_length=max_seq_length, truncation=True)
    #num_rows, max_seq_length
    # Extract labels
    labels = examples["label"]
    #num_rows
    return {
        "premise_input_ids": premise_result["input_ids"],
        "premise_attention_mask": premise_result["attention_mask"],
        "hypothesis_input_ids": hypothesis_result["input_ids"],
        "hypothesis_attention_mask": hypothesis_result["attention_mask"],
        "labels" : labels
    }

tokenized_datasets = raw_dataset.map(
    preprocess_function,
    batched=True,
)

tokenized_datasets = tokenized_datasets.remove_columns(['premise','hypothesis','label'])
tokenized_datasets.set_format("torch")

In [15]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['premise_input_ids', 'premise_attention_mask', 'hypothesis_input_ids', 'hypothesis_attention_mask', 'labels'],
        num_rows: 1000
    })
})

# 3. Data Loader

In [16]:
from torch.utils.data import DataLoader

# initialize the dataloader
batch_size = 8
train_dataloader = DataLoader(
    tokenized_datasets['train'], 
    batch_size=batch_size, 
    shuffle=True
)
eval_dataloader = DataLoader(
    tokenized_datasets['validation'], 
    batch_size=batch_size
)
test_dataloader = DataLoader(
    tokenized_datasets['test'], 
    batch_size=batch_size
)

In [17]:
for batch in train_dataloader:
    print(batch['premise_input_ids'].shape)
    print(batch['premise_attention_mask'].shape)
    print(batch['hypothesis_input_ids'].shape)
    print(batch['hypothesis_attention_mask'].shape)
    print(batch['labels'].shape)
    break

torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8, 128])
torch.Size([8])


# 4. Model

In [18]:
from bert import *

In [19]:
# MODEL FROM TASK 1
load_path = 'model/model_bert.pth'
params, state = torch.load(load_path)
model = BERT(**params, device=device).to(device)
model.load_state_dict(state)

<All keys matched successfully>

## Pooling

In [20]:
# define mean pooling function
def mean_pool(token_embeds, attention_mask):
    # reshape attention_mask to cover 768-dimension embeddings
    in_mask = attention_mask.unsqueeze(-1).expand(
        token_embeds.size()
    ).float()
    # perform mean-pooling but exclude padding tokens (specified by in_mask)
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

# 5.Loss Function

## LATER 
Classification Objective Function
We concatenate the sentence embeddings 
 and 
 with the element-wise difference 
 and multiply the result with the trainable weight 
:


where 
 is the dimension of the sentence embeddings and k the number of labels. We optimize cross-entropy loss. This structure is depicted in Figure 1.

Regression Objective Function.
The cosine similarity between the two sentence embeddings 
 and 
 is computed. We use means quared-error loss as the objective function.

(Manhatten / Euclidean distance, semantically similar sentences can be found.)

In [21]:
def configurations(u,v):
    # build the |u-v| tensor
    uv = torch.sub(u, v)   # batch_size,hidden_dim
    uv_abs = torch.abs(uv) # batch_size,hidden_dim
    
    # concatenate u, v, |u-v|
    x = torch.cat([u, v, uv_abs], dim=-1) # batch_size, 3*hidden_dim
    return x

def cosine_similarity(u, v):
    dot_product = np.dot(u, v)
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    similarity = dot_product / (norm_u * norm_v)
    return similarity

In [22]:
classifier_head = torch.nn.Linear(768*3, 3).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)
optimizer_classifier = torch.optim.Adam(classifier_head.parameters(), lr=2e-5)

criterion = nn.CrossEntropyLoss()

In [23]:
from transformers import get_linear_schedule_with_warmup

# and setup a warmup for the first ~10% steps
total_steps = int(len(raw_dataset) / batch_size)
warmup_steps = int(0.1 * total_steps)
scheduler = get_linear_schedule_with_warmup(
		optimizer, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler.step()

scheduler_classifier = get_linear_schedule_with_warmup(
		optimizer_classifier, num_warmup_steps=warmup_steps,
  	num_training_steps=total_steps - warmup_steps
)

# then during the training loop we update the scheduler per step
scheduler_classifier.step()

  scheduler.step()
  scheduler_classifier.step()


# 6. Training

In [24]:
max_seq_length = 128

In [25]:
from tqdm.auto import tqdm

num_epoch = 2
# 1 epoch should be enough, increase if wanted
for epoch in range(num_epoch):
    model.train()
    classifier_head.train()
    # initialize the dataloader loop with tqdm (tqdm == progress bar)
    for step, batch in enumerate(tqdm(train_dataloader, leave=True)):
        # zero all gradients on each new step
        optimizer.zero_grad()
        optimizer_classifier.zero_grad()

        # prepare batches and more all to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        segment_ids = torch.zeros(batch_size, max_seq_length, dtype=torch.int32).to(device)  # each input contains only one sentence hence we define them all as sentence '0'
        label = batch['labels'].to(device)

        # extract token embeddings from BERT at last_hidden_state
        u_last_hidden_state = model.get_last_hidden_state(inputs_ids_a, segment_ids)
        v_last_hidden_state = model.get_last_hidden_state(inputs_ids_b, segment_ids)

        # get the mean pooled vectors
        u_mean_pool = mean_pool(u_last_hidden_state, attention_a) # batch_size, hidden_dim
        v_mean_pool = mean_pool(v_last_hidden_state, attention_b) # batch_size, hidden_dim

        # build the |u-v| tensor
        uv = torch.sub(u_mean_pool, v_mean_pool)   # batch_size,hidden_dim
        uv_abs = torch.abs(uv) # batch_size,hidden_dim

        # concatenate u, v, |u-v|
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1) # batch_size, 3*hidden_dim

        # process concatenated tensor through classifier_head
        x = classifier_head(x) #batch_size, classifer

        # calculate the 'softmax-loss' between predicted and true label
        loss = criterion(x, label)

        # using loss, calculate gradients and then optimizerize
        loss.backward()
        optimizer.step()
        optimizer_classifier.step()

        scheduler.step() # update learning rate scheduler
        scheduler_classifier.step()

    print(f'Epoch: {epoch + 1} | loss = {loss.item():.6f}')

  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 1 | loss = 2.930530


  0%|          | 0/125 [00:00<?, ?it/s]

Epoch: 2 | loss = 1.892005


In [26]:
labels = []
predictions = []
probabilities = []
classes = ["entailment", "neutral", "contradiction"]

In [27]:
model.eval()
classifier_head.eval()
total_similarity = 0
num_samples = 0

with torch.no_grad():
    for step, batch in enumerate(eval_dataloader):
        # Move batches to the active device
        inputs_ids_a = batch['premise_input_ids'].to(device)
        inputs_ids_b = batch['hypothesis_input_ids'].to(device)
        attention_a = batch['premise_attention_mask'].to(device)
        attention_b = batch['hypothesis_attention_mask'].to(device)
        segment_ids = torch.zeros(inputs_ids_a.shape[0], inputs_ids_a.shape[1], dtype=torch.int32).to(device)
        label = batch['labels'].to(device)

        # Extract token embeddings from BERT
        u = model.get_last_hidden_state(inputs_ids_a, segment_ids)  # (batch_size, seq_len, hidden_dim)
        v = model.get_last_hidden_state(inputs_ids_b, segment_ids)  # (batch_size, seq_len, hidden_dim)

        # Get the mean pooled vectors
        u_mean_pool = mean_pool(u, attention_a)  # (batch_size, hidden_dim)
        v_mean_pool = mean_pool(v, attention_b)  # (batch_size, hidden_dim)

        # Computing cosine similarity using PyTorch (more efficient)
        similarity_scores = F.cosine_similarity(u_mean_pool, v_mean_pool, dim=-1)  # (batch_size,)
        total_similarity += similarity_scores.sum().item()
        num_samples += len(similarity_scores)

        # Concatenate [u, v, |u - v|]
        uv_abs = torch.abs(u_mean_pool - v_mean_pool)  # (batch_size, hidden_dim)
        x = torch.cat([u_mean_pool, v_mean_pool, uv_abs], dim=-1)  # (batch_size, 3*hidden_dim)

        # Classification
        logit_fn = classifier_head(x)  # (batch_size, num_classes)
        probs = F.softmax(logit_fn, dim=-1)
        preds = torch.argmax(logit_fn, dim=-1)

        labels.extend(label.cpu().tolist())
        probabilities.extend(probs.cpu().tolist())
        predictions.extend(preds.cpu().tolist())

# Calculate average similarity
average_similarity = total_similarity / num_samples
print(f"Average Cosine Similarity: {average_similarity:.4f}")

Average Cosine Similarity: 0.9983


In [28]:
from sklearn.metrics import classification_report

print(classification_report(labels, predictions, target_names=classes))

               precision    recall  f1-score   support

   entailment       0.34      0.99      0.51       338
      neutral       0.43      0.02      0.04       328
contradiction       0.00      0.00      0.00       334

     accuracy                           0.34      1000
    macro avg       0.26      0.34      0.18      1000
 weighted avg       0.26      0.34      0.18      1000



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [29]:
# saving the model
torch.save([model.params, model.state_dict()], 'model/sen_bert.pth')

# Inference

In [30]:
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(model, tokenizer, sentence_a, sentence_b, device):
    # Tokenize and convert sentences to input IDs and attention masks
    inputs_a = tokenizer(sentence_a, return_tensors='pt', max_length=max_seq_length, truncation=True, padding='max_length').to(device)
    inputs_b = tokenizer(sentence_b, return_tensors='pt', max_length=max_seq_length, truncation=True, padding='max_length').to(device)

    # Move input IDs and attention masks to the active device
    inputs_ids_a = inputs_a['input_ids']
    attention_a = inputs_a['attention_mask']
    inputs_ids_b = inputs_b['input_ids']
    attention_b = inputs_b['attention_mask']
    segment_ids = torch.zeros(1, max_seq_length, dtype=torch.int32).to(device)

    # Extract token embeddings from BERT
    u = model.get_last_hidden_state(inputs_ids_a, segment_ids)
    v = model.get_last_hidden_state(inputs_ids_b, segment_ids)

    # Get the mean-pooled vectors
    u_mean_pool = mean_pool(u, attention_a)  # (1, hidden_dim)
    v_mean_pool = mean_pool(v, attention_b)  # (1, hidden_dim)

    # Calculate cosine similarity using PyTorch
    similarity_score = F.cosine_similarity(u_mean_pool, v_mean_pool, dim=-1).item()

    return similarity_score

In [31]:
# Example usage:
sentence_a = 'Your contribution helped make it possible for us to provide our students with a quality education.'
sentence_b = "Your contributions were of no help with our students' education."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9989


In [32]:
sentence_a = 'An older man is drinking orange juice at a restaurant.'
sentence_b = "A man is drinking juice."
similarity = calculate_similarity(model, tokenizer, sentence_a, sentence_b, device)
print(f"Cosine Similarity: {similarity:.4f}")

Cosine Similarity: 0.9989


# Task-3

In [33]:
from transformers import AutoTokenizer, AutoModel
import torch.nn.functional as F

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
pre_trained_model = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')



In [34]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

In [35]:
pos_sentence = ["The cat is sleeping on the couch.", "The feline is resting on the sofa."]
opp_sentence = ["He is very punctual and reliable.", "You can never count on him to be on time."]

In [36]:
encoded_input = tokenizer(pos_sentence, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = pre_trained_model(**encoded_input)

In [37]:
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

sent_a_emb = sentence_embeddings[0].cpu().numpy().reshape(1, -1)
sent_b_emb = sentence_embeddings[1].cpu().numpy().reshape(1, -1)
cosine_similarity(sent_a_emb, sent_b_emb)[0][0]

0.73199284

In [38]:
encoded_input = tokenizer(opp_sentence, padding=True, truncation=True, return_tensors='pt')

with torch.no_grad():
    model_output = pre_trained_model(**encoded_input)

In [39]:
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

sent_a_emb = sentence_embeddings[0].cpu().numpy().reshape(1, -1)
sent_b_emb = sentence_embeddings[1].cpu().numpy().reshape(1, -1)
cosine_similarity(sent_a_emb, sent_b_emb)[0][0]

0.48303467

# Evaluation of the model

## Evaluation of our model

| Model Type  | Training Loss with SNLI and MNLI | Cosine Similarity(SNLI and MNLI) | Cosine Similarity (Similar Sentences) | Cosine Similarity (Dissimilar Sentences) |
|------------|------------------------------|----------------------------------------------|--------------------------------------------------|--------------------------------------------------|
| Custom model   | 1.89                        | 0.9983                                         | 0.9989                                             | 0.9989                                             |



### Classification Report

| Class          | Precision | Recall | F1-score | Support |
|---------------|-----------|--------|----------|---------|
| entailment    | 0.34      | 0.99   | 0.51     | 338     |
| neutral       | 0.43      | 0.02   | 0.04     | 328     |
| contradiction | 0.00      | 0.00   | 0.00     | 334     |
| **Accuracy**  |           |        | **0.34** | 1000    |
| **Macro Avg** | 0.26      | 0.34   | 0.18     | 1000    |
| **Weighted Avg** | 0.26   | 0.34   | 0.18     | 1000    |


## Comparison of our model with pre-trained model

| Model Type | Cosine Similarity (Similar sentence) | Cosine Similarity (Dissisimilar sentence) |
|----------|----------|----------|
| Our Model    | 0.9989    | 0.9989    |
| Pre-trained    | 0.731     | 0.483     |

### Observation

Our custom model achieves a cosine similarity score of `0.9992` for similar sentences and `0.999` for dissimilar sentences. The fact that both values are almost identical indicates that the model fails to meaningfully differentiate between semantically similar and dissimilar sentence pairs. This suggests that the learned embeddings lack discriminative power and may have collapsed into highly similar vector representations regardless of semantic content. Such behavior is commonly associated with poor representation learning or overfitting to limited patterns in the training data.

In contrast, the pre-trained model achieves a cosine similarity of `0.731` for similar sentences and `0.483` for dissimilar sentences. The clear separation between these two values demonstrates that the pre-trained model captures semantic relationships more effectively, producing embeddings that meaningfully distinguish similar from dissimilar inputs.

Furthermore, the classification report reinforces this observation. Our custom model achieves an overall accuracy of `34%`, which is close to random guessing for a 3-class classification problem. The macro-average F1-score (`0.18`) and weighted-average F1-score (`0.18`) indicate consistently weak performance across all classes.

Looking at the class-level performance:

Entailment shows very high recall (`0.99`) but low precision (`0.34`), meaning the model predicts entailment for most samples, including many incorrect ones.

Neutral has extremely low recall (`0.02`), indicating that the model almost never correctly identifies neutral cases.

Contradiction has precision and recall of `0.00`, meaning the model completely fails to recognize contradiction instances.

Overall, the results indicate that our custom model does not learn robust semantic representations and struggles to distinguish between classes. The pre-trained model, benefiting from large-scale pretraining, demonstrates significantly stronger semantic discrimination and generalization capability.

# Discussion

The implementation of BERT from scratch was carried out with reference to the professor’s provided notebook. The *Wikipedia* dataset from Hugging Face was used as the primary corpus for pretraining. Due to hardware limitations (RTX 2080 Ti with limited GPU memory), the dataset was reduced to approximately **100k samples** to make training feasible.

During pretraining, several computational constraints were encountered. Large sequence lengths and batch sizes resulted in out-of-memory (OOM) errors. Consequently, the batch size was reduced to **3**, and the maximum sequence length was adjusted to a manageable size. Additionally, the number of training epochs was limited to avoid GPU memory crashes. Although initial attempts were made to train for a large number of epochs, the training loss plateaued while memory issues persisted, indicating diminishing returns from prolonged training under constrained resources.

For **Task 2**, the model was fine-tuned on the SNLI and MNLI datasets to classify logical relationships between sentence pairs (*entailment*, *neutral*, *contradiction*). After preprocessing, the model was trained for **5 epochs**. Again, memory limitations required reducing the batch size from 32 to **8** in order to successfully complete training.

During evaluation (**Task 3**), the performance of the custom model was significantly below expectations. The classification accuracy was **34%**, which is close to random guessing for a three-class problem (baseline ≈ 33%). The macro-average F1-score (**0.18**) and weighted-average F1-score (**0.18**) further confirm weak overall performance.

## Class-level Analysis

- **Entailment**: Very high recall (**0.99**) but low precision (**0.34**), indicating the model predicts entailment for most examples.
- **Neutral**: Extremely low recall (**0.02**), meaning neutral cases are rarely identified correctly.
- **Contradiction**: Precision and recall of **0.00**, indicating complete failure to recognize contradiction instances.

Additionally, cosine similarity analysis shows that the custom model produces nearly identical similarity scores for similar (**0.9992**) and dissimilar (**0.999**) sentence pairs. This indicates poor semantic discrimination and suggests that the learned embeddings lack meaningful representational structure.

In contrast, the pre-trained `all-mpnet-base-v2` model demonstrates clear separation between similar (**0.731**) and dissimilar (**0.483**) sentence pairs, reflecting significantly stronger semantic representation and generalization capability.



## Challenges Faced

- Limited GPU memory restricting batch size and sequence length  
- Reduced dataset size compared to standard BERT pretraining  
- Insufficient training scale for robust representation learning  
- Difficulty achieving stable optimization from scratch  



## Proposed Improvements

To improve the performance of the custom model, the following enhancements are recommended:

- Increase the size and diversity of the pretraining corpus  
- Train for more epochs with stable memory management  
- Use larger effective batch sizes via gradient accumulation  
- Experiment with optimized learning rates and schedulers  
- Increase model depth or hidden dimensions  
- Apply mixed precision training to better utilize GPU memory  
- Limit vocabulary size to reduce embedding dimensionality  



In conclusion, the results highlight the substantial gap between training a transformer model from scratch under constrained resources and leveraging large-scale pre-trained models. Large-scale pretraining plays a critical role in learning meaningful and discriminative semantic representations.
