In [1]:
# %%capture
# !pip install -q accelerate>=0.27.2
# !pip install -q peft>=0.9.0
# !pip install -q bitsandbytes>=0.43.0
# !pip install -q transformers>=4.38.2
# !pip install -q trl>=0.7.11
# !pip install -q sentencepiece>=0.1.99
# !pip install -q sentence-transformers>=3.0.0
# !pip install -q mteb>=1.1.2
# !pip install -q datasets>=2.18.0

In [2]:
%%capture

!pip install -U \
  torch>=2.1 \
  transformers==4.41.2 \
  accelerate==0.30.1 \
  # peft==0.10.0 \
  sentence-transformers==3.1.0 \
  datasets==2.20.0 \
  mteb==1.1.2 \
  # bitsandbytes==0.43.1 \
  # trl==0.8.6

!pip uninstall -y bitsandbytes peft trl

In [3]:
import numpy as np
import pandas as pd

## Creating an embedding model


### Data


In [4]:
from datasets import load_dataset

# Load MNLI dataset from GLUE
# 0 = entrailment, 1 = neutral, 2 = contradiction
train_dataset = load_dataset('glue', 'mnli', split='train').select(range(50_000))
train_dataset = train_dataset.remove_columns('idx')

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


In [5]:
train_dataset[2]

{'premise': 'One of our number will carry out your instructions minutely.',
 'hypothesis': 'A member of my team will execute your orders with immense precision.',
 'label': 0}

### Model


In [6]:
from sentence_transformers import SentenceTransformer

# Use a base model
embedding_model = SentenceTransformer('bert-base-uncased')



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

### Loss function


In [7]:
from sentence_transformers import losses

# Define the loss function. In soft-max loss, we will also need to explicitly set the number of labels
train_loss = losses.SoftmaxLoss(
    model=embedding_model,
    sentence_embedding_dimension=embedding_model.get_sentence_embedding_dimension(),
    num_labels=3
)



### Evaluation


In [12]:
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score / 5 for score in val_sts['label']]
)

### Training


In [13]:
from sentence_transformers.training_args import SentenceTransformerTrainingArguments

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir = 'base_embeddding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

In [14]:
from sentence_transformers.trainer import SentenceTransformerTrainer

# Train embedding model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Use an existing W&B account'
[34m[1mwandb[0m: Logging into https://api.wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: Find your API key here: https://wandb.ai/authorize?ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33makshaysoam8[0m ([33makshaysoam8-dan-com-a-godaddy-brand[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,1.0666
200,0.943
300,0.8996
400,0.854
500,0.8434
600,0.8445
700,0.8368
800,0.8109
900,0.8043
1000,0.8072


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.8353727564015453, metrics={'train_runtime': 572.8705, 'train_samples_per_second': 87.28, 'train_steps_per_second': 2.728, 'total_flos': 0.0, 'train_loss': 0.8353727564015453, 'epoch': 1.0})

In [15]:
evaluator(embedding_model)

{'pearson_cosine': 0.3863382135097644,
 'spearman_cosine': 0.4697721800666814,
 'pearson_manhattan': 0.43500507451245074,
 'spearman_manhattan': 0.46969341132904824,
 'pearson_euclidean': 0.4216060375280885,
 'spearman_euclidean': 0.4642096121709544,
 'pearson_dot': 0.3703433952380605,
 'spearman_dot': 0.3830957750546397,
 'pearson_max': 0.43500507451245074,
 'spearman_max': 0.4697721800666814}

## MTEB


In [16]:
from mteb import MTEB

# Choose the evaluation task
evaluation = MTEB(tasks=['Banking77Classification'])

# Calculate results
results = evaluation.run(embedding_model)
results

{'Banking77Classification': {'mteb_version': '1.1.2',
  'dataset_revision': '0fd18e25b25c072e09e0d92ab615fda904d66300',
  'mteb_dataset_name': 'Banking77Classification',
  'test': {'accuracy': 0.514512987012987,
   'f1': 0.5135513322534722,
   'accuracy_stderr': 0.013312519771309043,
   'f1_stderr': 0.013380497422222443,
   'main_score': 0.514512987012987,
   'evaluation_time': 30.51}}}

In [17]:
# Empty and delete trainer / model
trainer.accelerator.clear()
del trainer, embedding_model

# Garbage collection and empty cache
import gc
import torch

gc.collect()
torch.cuda.empty_cache()

## Loss functions


### Cosine similarity


In [18]:
from datasets import Dataset, load_dataset

# Load MNLI dataset from GLUE
# 0 = entailment, 1 = neutral, 2 = contradiction
train_dataset =  load_dataset('glue', 'mnli', split='train').select(range(50_000))
train_dataset = train_dataset.remove_columns('idx')

# (neutral / contradiction) = 0 and (entailment) = 1
mapping = {2:0, 1:0, 0:1}
train_dataset = Dataset.from_dict({
    'sentence1': train_dataset['premise'],
    'sentence2': train_dataset['hypothesis'],
    'label': [float(mapping[label]) for label in train_dataset['label']]
})

In [19]:
# Create an embedding similarity evaluator for stsb
val_sts = load_dataset('glue', 'stsb', split='validation')

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']]
)

In [20]:
# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.CosineSimilarityLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir = 'cosineloss_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

# Trainer model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
100,0.2293
200,0.1705
300,0.1719
400,0.1587
500,0.1517
600,0.157
700,0.1492
800,0.158
900,0.1484
1000,0.1465


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.15686305760574584, metrics={'train_runtime': 407.314, 'train_samples_per_second': 122.755, 'train_steps_per_second': 3.837, 'total_flos': 0.0, 'train_loss': 0.15686305760574584, 'epoch': 1.0})

In [21]:
evaluator(embedding_model)

{'pearson_cosine': 0.7318090477811238,
 'spearman_cosine': 0.7346247062606067,
 'pearson_manhattan': 0.7405310471794844,
 'spearman_manhattan': 0.7393916241410549,
 'pearson_euclidean': 0.7403520159301146,
 'spearman_euclidean': 0.7393262433130168,
 'pearson_dot': 0.673291176665717,
 'spearman_dot': 0.6759749362484078,
 'pearson_max': 0.7405310471794844,
 'spearman_max': 0.7393916241410549}

In [22]:
gc.collect()
torch.cuda.empty_cache()

### Multiple negatives ranking loss


In [24]:
import random
from tqdm import tqdm
from datasets import Dataset, load_dataset

# Load MNLI dataset from GLUE
mnli = load_dataset('glue', 'mnli', split='train').select(range(50_000))
mnli = mnli.remove_columns('idx')
mnli = mnli.filter(lambda x: True if x['label'] == 0 else False)

# Prepare data and add a soft negative
train_dataset = {
    'anchor': [],
    'positive': [],
    'negative': []
}

soft_negatives = mnli['hypothesis']
random.shuffle(soft_negatives)

for row, soft_negative in tqdm(zip(mnli, soft_negatives)):
    train_dataset['anchor'].append(row['premise'])
    train_dataset['positive'].append(row['hypothesis'])
    train_dataset['negative'].append(soft_negative)
    
train_dataset = Dataset.from_dict(train_dataset)
len(train_dataset)

16875it [00:01, 14183.14it/s]


16875

In [25]:
val_sts = load_dataset('glue', 'stsb', split='validation')

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']]
)

In [26]:
# Define model
embedding_model = SentenceTransformer('bert-base-uncased')

# Loss function
train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

# Define the training arguments
args = SentenceTransformerTrainingArguments(
    output_dir='mnr_loss_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=32,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

# Train model
trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
100,0.2447
200,0.0971
300,0.0744
400,0.0652
500,0.0661


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=528, training_loss=0.10698446654009097, metrics={'train_runtime': 158.5625, 'train_samples_per_second': 106.425, 'train_steps_per_second': 3.33, 'total_flos': 0.0, 'train_loss': 0.10698446654009097, 'epoch': 1.0})

In [27]:
evaluator(embedding_model)

{'pearson_cosine': 0.8076489711386212,
 'spearman_cosine': 0.8102235288947944,
 'pearson_manhattan': 0.820786344772883,
 'spearman_manhattan': 0.8164625189655126,
 'pearson_euclidean': 0.820260945107055,
 'spearman_euclidean': 0.8158641431077273,
 'pearson_dot': 0.7474544375818565,
 'spearman_dot': 0.7357596625598664,
 'pearson_max': 0.820786344772883,
 'spearman_max': 0.8164625189655126}

## Fine tuning


In [28]:
gc.collect()
torch.cuda.empty_cache()

### Supervised


In [29]:
train_dataset = load_dataset('glue', 'mnli', split='train').select(range(50_000))
train_dataset = train_dataset.remove_columns('idx')

val_sts = load_dataset('glue', 'stsb', split='validation')

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']]
)

In [31]:
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

train_loss = losses.MultipleNegativesRankingLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir='finetuned_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
dataset = dataset.select_columns(['hypothesis', 'entailment', 'contradiction'])


Step,Training Loss
100,0.1583
200,0.1131
300,0.1224
400,0.1196
500,0.1104
600,0.1017
700,0.1213
800,0.1017
900,0.1025
1000,0.1043


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.11035686079233942, metrics={'train_runtime': 115.7601, 'train_samples_per_second': 431.928, 'train_steps_per_second': 13.502, 'total_flos': 0.0, 'train_loss': 0.11035686079233942, 'epoch': 1.0})

In [32]:
evaluator(embedding_model)

{'pearson_cosine': 0.8492841775353083,
 'spearman_cosine': 0.8491177984837988,
 'pearson_manhattan': 0.8516385664743567,
 'spearman_manhattan': 0.8479618182835865,
 'pearson_euclidean': 0.852709419803894,
 'spearman_euclidean': 0.849117635807207,
 'pearson_dot': 0.8492841775679763,
 'spearman_dot': 0.8491177984837988,
 'pearson_max': 0.852709419803894,
 'spearman_max': 0.8491177984837988}

In [33]:
original_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
evaluator(original_model)



{'pearson_cosine': 0.8696194518832261,
 'spearman_cosine': 0.8671631197908374,
 'pearson_manhattan': 0.8670399003909526,
 'spearman_manhattan': 0.8663946139224048,
 'pearson_euclidean': 0.8678715924178553,
 'spearman_euclidean': 0.8671631197908374,
 'pearson_dot': 0.8696194534675575,
 'spearman_dot': 0.8671631197908374,
 'pearson_max': 0.8696194534675575,
 'spearman_max': 0.8671631197908374}

In [34]:
gc.collect()
torch.cuda.empty_cache()

### Augmented SBERT


In [None]:
# Step 1: Fine tune a cross encoder
from sentence_transformers import InputExample
from torch.utils.data import DataLoader

dataset = load_dataset('glue', 'mnli', split='train').select(range(10_000))

# Convert MNLI labels:
# 0 = entailment → positive
# 1 = neutral    → negative
# 2 = contradiction → negative
mapping = {
    2: 0,
    1: 0,
    0: 1
}

gold_examples = [
    InputExample(
        texts=[row["premise"], row["hypothesis"]],
        label=mapping[row["label"]]
    )
    for row in tqdm(dataset)
]

gold_dataloader = DataLoader(
    gold_examples,
    shuffle=True,
    batch_size=32
)

100%|██████████| 10000/10000 [00:00<00:00, 26769.46it/s]


In [40]:
# Pandas DataFrame for easier data handling
gold = pd.DataFrame({
    'sentence1': dataset['premise'],
    'sentence2': dataset['hypothesis'],
    'label': [mapping[label] for label in dataset['label']]
})

In [49]:
gold.head()

Unnamed: 0,sentence1,sentence2,label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,0
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,1
2,One of our number will carry out your instruct...,A member of my team will execute your orders w...,1
3,How do you know? All this is their information...,This information belongs to them.,1
4,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.,0


In [54]:
from sentence_transformers.cross_encoder import CrossEncoder

cross_encoder = CrossEncoder('bert-base-uncased', num_labels=2)

cross_encoder.fit(
    train_dataloader=gold_dataloader,
    epochs=1,
    show_progress_bar=True,
    warmup_steps=100,
    use_amp=False
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/313 [00:00<?, ?it/s]

In [55]:
# Step 2: Creating new sentence pairs
silver_dataset = load_dataset('glue', 'mnli', split='train').select(range(10_000, 50_000))
pairs = list(zip(silver_dataset['premise'], silver_dataset['hypothesis']))

In [56]:
silver_dataset

Dataset({
    features: ['premise', 'hypothesis', 'label', 'idx'],
    num_rows: 40000
})

In [57]:
# Step 3: Label new sentence pairs with the fine-tuned cross encoder
output = cross_encoder.predict(pairs, apply_softmax=True, show_progress_bar=True)

Batches:   0%|          | 0/1250 [00:00<?, ?it/s]

In [58]:
silver = pd.DataFrame({
    'sentence1': silver_dataset['premise'],
    'sentence2': silver_dataset['hypothesis'],
    'label': np.argmax(output, axis=1)
})

In [59]:
silver.head()

Unnamed: 0,sentence1,sentence2,label
0,Hindus and Buddhists still bathe where he bathed.,Hindus and Buddhists bathe in the same location.,1
1,"Probably no one will even notice you at all.""",Everyone will know who you are.,0
2,well what what do you mean if they can prove i...,You don't need to say anymore about the matter...,0
3,I feel dizzy.,The dizziness I feel is from drinking.,0
4,"Well, he did, sir.","Sir, well, he did complete it before he left l...",0


In [60]:
# Step 4: Train a bi-encoder (SBERT) on the extended dataset (gold+silver) dataset
data = pd.concat([gold, silver], ignore_index=True, axis=0)

data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep='first')

train_dataset = Dataset.from_pandas(data, preserve_index=False)

In [61]:
val_sts = load_dataset('glue', 'stsb', split='validation')

evaluator = EmbeddingSimilarityEvaluator(
    sentences1=val_sts['sentence1'],
    sentences2=val_sts['sentence2'],
    scores=[score/5 for score in val_sts['label']]
)

In [62]:
embedding_model = SentenceTransformer('bert-base-uncased')

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir='augmented_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
100,0.2134
200,0.1568
300,0.1451
400,0.1434
500,0.1404
600,0.1378
700,0.1376
800,0.1319
900,0.1357
1000,0.1334


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

TrainOutput(global_step=1563, training_loss=0.14198706658010063, metrics={'train_runtime': 332.6275, 'train_samples_per_second': 150.312, 'train_steps_per_second': 4.699, 'total_flos': 0.0, 'train_loss': 0.14198706658010063, 'epoch': 1.0})

In [63]:
evaluator(embedding_model)

{'pearson_cosine': 0.7059703606091826,
 'spearman_cosine': 0.7212813500771252,
 'pearson_manhattan': 0.7133607512559957,
 'spearman_manhattan': 0.7128231840977057,
 'pearson_euclidean': 0.7130173245947472,
 'spearman_euclidean': 0.7127357338594196,
 'pearson_dot': 0.6795552046302962,
 'spearman_dot': 0.685944192238618,
 'pearson_max': 0.7133607512559957,
 'spearman_max': 0.7212813500771252}

In [64]:
trainer.accelerator.clear()

[]

In [65]:
# Step 5: Evaluate without silver dataset

# Combine gold + silver

data = pd.concat([gold], ignore_index=True, axis=0)
data = data.drop_duplicates(subset=['sentence1', 'sentence2'], keep='first')
train_dataset = Dataset.from_pandas(data, preserve_index=False)

embedding_model = SentenceTransformer('bert-base-uncased')

train_loss = losses.CosineSimilarityLoss(model=embedding_model)

args = SentenceTransformerTrainingArguments(
    output_dir='gold_only_embedding_model',
    num_train_epochs=1,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=100,
    fp16=True,
    eval_steps=100,
    logging_steps=100
)

trainer = SentenceTransformerTrainer(
    model=embedding_model,
    args=args,
    train_dataset=train_dataset,
    loss=train_loss,
    evaluator=evaluator
)

trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Step,Training Loss
100,0.2269
200,0.172
300,0.1596


TrainOutput(global_step=313, training_loss=0.18540488188259138, metrics={'train_runtime': 60.9433, 'train_samples_per_second': 164.087, 'train_steps_per_second': 5.136, 'total_flos': 0.0, 'train_loss': 0.18540488188259138, 'epoch': 1.0})

In [66]:
evaluator(embedding_model)

{'pearson_cosine': 0.655947157295788,
 'spearman_cosine': 0.6749810245304704,
 'pearson_manhattan': 0.6802668020358761,
 'spearman_manhattan': 0.6850184175783788,
 'pearson_euclidean': 0.6802861494278656,
 'spearman_euclidean': 0.685017275228836,
 'pearson_dot': 0.5714606253935371,
 'spearman_dot': 0.5698544929067957,
 'pearson_max': 0.6802861494278656,
 'spearman_max': 0.6850184175783788}

In [94]:
gc.collect()
torch.cuda.empty_cache()