In [1]:
"""
The system trains BERT (or any other transformer model like RoBERTa, DistilBERT etc.) on the SNLI + MultiNLI (AllNLI) dataset
with MultipleNegativesRankingLoss. Entailments are positive pairs and the contradiction on AllNLI dataset is added as a hard negative.
At every 1% of the training steps, the model is evaluated on the STS benchmark dataset.
Usage:
python training_nli_v2.py
OR
python training_nli_v2.py pretrained_transformer_model_name
"""
import csv
import gzip
import json
import logging
import math
import os
import random
import sys
from datetime import datetime

import torch
from sentence_transformers import models, losses, datasets
from sentence_transformers import LoggingHandler, SentenceTransformer, util, InputExample
from sentence_transformers.evaluation import EmbeddingSimilarityEvaluator

#### Logging setup: print INFO-level records (timestamp + message) to stdout
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)
#### /logging setup

# Hyper-parameters a reader may want to tune
model_name = 'distilroberta-base'   # checkpoint handed to models.Transformer
train_batch_size = 128              # larger usually gives better results, at the cost of more GPU memory
max_seq_length = 75                 # token limit handed to models.Transformer
num_epochs = 1

# Directory the trained model is written to: model name ("/" replaced by "-") plus a start-time stamp
model_save_path = 'output/training_nli_v2_{}-{}'.format(
    model_name.replace("/", "-"),
    datetime.now().strftime("%Y-%m-%d_%H-%M-%S"),
)


class LearnedPooling(torch.nn.Module):
    """Pooling layer that learns a linear map from all token embeddings to one sentence embedding.

    The attention-masked token embeddings of each sentence are zero-padded (or
    truncated) to ``max_seq_length`` positions, flattened into a single vector of
    size ``max_seq_length * hidden_size`` and projected down to ``hidden_size``
    by a dense layer, followed by dropout.

    Args:
        max_seq_length: number of token positions the dense layer expects (default 75).
        hidden_size: size of each token embedding and of the produced sentence
            embedding (default 768).
        dropout: dropout probability applied to the projected embedding (default 0.1).
    """

    def __init__(self, max_seq_length=75, hidden_size=768, dropout=0.1):
        super().__init__()
        self.max_seq_length = max_seq_length
        self.hidden_size = hidden_size

        flat_size = max_seq_length * hidden_size
        self.dense = torch.nn.Linear(flat_size, hidden_size)
        # Weights are drawn around 1/flat_size, so every output unit starts out
        # as roughly the mean of all flattened inputs plus a little noise.
        self.dense.weight.data.normal_(mean=1.0 / flat_size, std=1.0 / flat_size)
        self.dense.bias.data.normal_(mean=0, std=1.0 / flat_size)

        self.dropout = torch.nn.Dropout(dropout)

    def forward(self, features):
        """Compute the sentence embedding for a batch.

        Args:
            features: mapping with ``'token_embeddings'``
                ([batch_size, num_tokens, hidden_size]) and ``'attention_mask'``
                ([batch_size, num_tokens]).

        Returns:
            A new dict holding every incoming feature plus ``'sentence_embedding'``
            ([batch_size, hidden_size]). The input mapping is not modified.
        """
        token_embeddings = features['token_embeddings']
        attention_mask = features['attention_mask']
        # Zero out the embeddings of padding tokens
        x = token_embeddings * attention_mask.unsqueeze(-1).float()

        num_tokens = x.size(1)
        if num_tokens < self.max_seq_length:
            # Right-pad the token axis with zeros up to the length the dense layer expects
            x = torch.nn.functional.pad(x, (0, 0, 0, self.max_seq_length - num_tokens), "constant", value=0)
        elif num_tokens > self.max_seq_length:
            # Longer inputs would not fit the dense layer; keep the leading tokens only
            x = x[:, :self.max_seq_length]

        x = x.flatten(start_dim=1)
        x = self.dropout(self.dense(x))

        return {**features, 'sentence_embedding': x}

    def get_config_dict(self):
        """Return the constructor arguments needed to rebuild this module."""
        return {
            'max_seq_length': self.max_seq_length,
            'hidden_size': self.hidden_size,
            'dropout': self.dropout.p,
        }

    def save(self, output_path=None, *args, **kwargs):
        """Write the configuration and the learned weights to ``output_path``.

        Creates ``config.json`` and ``pytorch_model.bin`` inside the directory so
        the weights can be restored with :meth:`load`. Extra positional/keyword
        arguments are accepted and ignored. Calling without a path is a no-op.
        """
        if output_path is None:
            return
        os.makedirs(output_path, exist_ok=True)
        with open(os.path.join(output_path, 'config.json'), 'w') as fOut:
            json.dump(self.get_config_dict(), fOut, indent=2)
        torch.save(self.state_dict(), os.path.join(output_path, 'pytorch_model.bin'))

    @classmethod
    def load(cls, input_path, *args, **kwargs):
        """Rebuild a module from a directory previously written by :meth:`save`."""
        with open(os.path.join(input_path, 'config.json')) as fIn:
            config = json.load(fIn)
        module = cls(**config)
        state_dict = torch.load(os.path.join(input_path, 'pytorch_model.bin'), map_location=torch.device('cpu'))
        module.load_state_dict(state_dict)
        return module

# Assemble the SentenceTransformer: the transformer encoder followed by the learned pooling layer
word_embedding_model = models.Transformer(model_name, max_seq_length=max_seq_length)
pooling_model = LearnedPooling()
model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

# Keep the pooling parameters as a list rather than the one-shot generator that
# `parameters()` returns: a generator stored in a notebook global is empty after
# its first use, so re-running the training cell would pass an empty group.
pooling_parameters = list(pooling_model.parameters())

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2023-03-08 13:57:55 - Use pytorch device: cuda


In [2]:

# Make sure both datasets are available locally; download whichever file is missing
nli_dataset_path = 'data/AllNLI.tsv.gz'
sts_dataset_path = 'data/stsbenchmark.tsv.gz'

for dataset_url, local_path in (
    ('https://sbert.net/datasets/AllNLI.tsv.gz', nli_dataset_path),
    ('https://sbert.net/datasets/stsbenchmark.tsv.gz', sts_dataset_path),
):
    if not os.path.exists(local_path):
        util.http_get(dataset_url, local_path)

In [3]:

# Read the AllNLI.tsv.gz file and create the training dataset
logging.info("Read AllNLI train dataset")

def add_to_samples(sent1, sent2, label):
    """Record in the global ``train_data`` that ``sent2`` relates to ``sent1`` by ``label``.

    ``train_data`` maps a sentence to three sets ('contradiction', 'entailment',
    'neutral'); the entry for ``sent1`` is created on first sight and ``sent2``
    is added to the set selected by ``label``.
    """
    entry = train_data.get(sent1)
    if entry is None:
        entry = train_data[sent1] = {'contradiction': set(), 'entailment': set(), 'neutral': set()}
    entry[label].add(sent2)


# Group every training sentence with the sentences it is paired with, per label
train_data = {}
with gzip.open(nli_dataset_path, 'rt', encoding='utf8') as fIn:
    reader = csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
    for row in reader:
        if row['split'] == 'train':
            sent1 = row['sentence1'].strip()
            sent2 = row['sentence2'].strip()

            add_to_samples(sent1, sent2, row['label'])
            add_to_samples(sent2, sent1, row['label'])  #Also add the opposite


# Build the training triplets: each sentence that has at least one entailment and
# one contradiction contributes two examples, with the sentence once in first and
# once in second position and a randomly drawn contradiction as third text.
# Draws come from a dedicated, seeded generator over *sorted* candidates so the
# selected triplets are identical on every run (set iteration order for strings
# is not stable across interpreter sessions). The batch order produced by the
# data loader below is not governed by this generator.
sample_rng = random.Random(42)
train_samples = []
for sent1, others in train_data.items():
    if len(others['entailment']) > 0 and len(others['contradiction']) > 0:
        entailments = sorted(others['entailment'])
        contradictions = sorted(others['contradiction'])
        train_samples.append(InputExample(texts=[sent1, sample_rng.choice(entailments), sample_rng.choice(contradictions)]))
        train_samples.append(InputExample(texts=[sample_rng.choice(entailments), sent1, sample_rng.choice(contradictions)]))

logging.info("Train samples: {}".format(len(train_samples)))



# Special data loader that avoid duplicates within a batch
train_dataloader = datasets.NoDuplicatesDataLoader(train_samples, batch_size=train_batch_size)


# Our training loss
train_loss = losses.MultipleNegativesRankingLoss(model)


2023-03-08 13:57:56 - Read AllNLI train dataset
2023-03-08 13:58:08 - Train samples: 563648


In [4]:
# Development set: the 'dev' split of STSbenchmark, gold scores rescaled from 0..5 to 0..1
logging.info("Read STSbenchmark dev dataset")
with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    dev_samples = [
        InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['score']) / 5.0)
        for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        if row['split'] == 'dev'
    ]

# Evaluator run periodically during training on the dev samples
dev_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    dev_samples,
    batch_size=train_batch_size,
    name='sts-dev',
)



2023-03-08 13:58:09 - Read STSbenchmark dev dataset


In [7]:

# Training schedule
steps_per_epoch = len(train_dataloader)
warmup_steps = math.ceil(steps_per_epoch * num_epochs * 0.1)  # warm up over the first 10% of all training steps
logging.info("Warmup-steps: {}".format(warmup_steps))


# Run the training; the dev evaluator is invoked every 1% of an epoch's steps
# NOTE(review): `extra_parameter_group` is presumably provided by the installed
# sentence-transformers variant to give the pooling layer its own learning rate - confirm.
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    evaluator=dev_evaluator,
    epochs=num_epochs,
    evaluation_steps=int(steps_per_epoch * 0.01),
    warmup_steps=warmup_steps,
    output_path=model_save_path,
    use_amp=False,  # set to True if your GPU supports FP16 operations
    extra_parameter_group={'params': pooling_parameters, 'lr': 1e-4},
)

2023-03-08 14:11:40 - Warmup-steps: 441


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Iteration:   0%|          | 0/4403 [00:00<?, ?it/s]

2023-03-08 14:11:58 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 44 steps:
2023-03-08 14:11:59 - Cosine-Similarity :	Pearson: 0.8424	Spearman: 0.8557
2023-03-08 14:11:59 - Manhattan-Distance:	Pearson: 0.6824	Spearman: 0.6913
2023-03-08 14:11:59 - Euclidean-Distance:	Pearson: 0.6824	Spearman: 0.6911
2023-03-08 14:11:59 - Dot-Product-Similarity:	Pearson: 0.5147	Spearman: 0.6035
2023-03-08 14:11:59 - Save model to output/training_nli_v2_distilroberta-base-2023-03-08_13-57-52
2023-03-08 14:12:19 - EmbeddingSimilarityEvaluator: Evaluating the model on sts-dev dataset in epoch 0 after 88 steps:
2023-03-08 14:12:19 - Cosine-Similarity :	Pearson: 0.8429	Spearman: 0.8566
2023-03-08 14:12:19 - Manhattan-Distance:	Pearson: 0.6809	Spearman: 0.6898
2023-03-08 14:12:19 - Euclidean-Distance:	Pearson: 0.6809	Spearman: 0.6897
2023-03-08 14:12:19 - Dot-Product-Similarity:	Pearson: 0.5146	Spearman: 0.6030
2023-03-08 14:12:19 - Save model to output/training_nli_

In [None]:

##############################################################################
#
# Reload the model written to model_save_path and score it on the STS benchmark test split
#
##############################################################################

with gzip.open(sts_dataset_path, 'rt', encoding='utf8') as fIn:
    test_samples = [
        InputExample(texts=[row['sentence1'], row['sentence2']], label=float(row['score']) / 5.0)  # score rescaled to 0 ... 1
        for row in csv.DictReader(fIn, delimiter='\t', quoting=csv.QUOTE_NONE)
        if row['split'] == 'test'
    ]

model = SentenceTransformer(model_save_path)
test_evaluator = EmbeddingSimilarityEvaluator.from_input_examples(
    test_samples,
    batch_size=train_batch_size,
    name='sts-test',
)
test_evaluator(model, output_path=model_save_path)

2023-03-08 13:57:10 - Load pretrained SentenceTransformer: output/training_nli_v2_distilroberta-base-2023-03-08_13-55-17


In [17]:
# Show the weight matrix of the dense layer in the model's second module (the pooling layer)
model[1].dense.weight

Parameter containing:
tensor([[ 9.7746e-05,  2.6604e-04,  1.0380e-03,  ...,  6.8808e-05,
          5.6014e-04, -1.3104e-04],
        [ 7.2840e-04, -1.3008e-03, -8.4996e-05,  ...,  1.6934e-04,
          5.7050e-04, -7.1757e-04],
        [ 5.0579e-04,  8.9666e-04,  4.9768e-04,  ...,  2.5305e-04,
         -3.7609e-05,  1.0197e-04],
        ...,
        [ 7.9973e-05,  3.2136e-03,  6.8894e-04,  ...,  1.8319e-03,
          4.5109e-04,  3.5373e-04],
        [-2.2889e-04, -1.4041e-03, -1.1984e-03,  ...,  4.9773e-04,
          6.0275e-04, -1.0861e-03],
        [ 5.8650e-04,  5.8228e-04, -5.9429e-04,  ...,  1.3394e-04,
         -1.1174e-03,  5.8718e-04]], device='cuda:0', requires_grad=True)