In [None]:
import torch
# Ask PyTorch to release its cached CUDA memory so that re-running the notebook
# in the same kernel starts with as much free GPU memory as possible.
torch.cuda.empty_cache()

In [None]:
from transformers import DistilBertConfig, DistilBertTokenizerFast, DistilBertForSequenceClassification

# Pretrained checkpoint that the tokenizer, config and model weights all come from.
model_path = 'distilbert-base-uncased'

# The tokenizer does not depend on the head configuration, so load it first.
tokenizer = DistilBertTokenizerFast.from_pretrained(model_path)

# num_labels=1 gives the sequence-classification head a single output, which is
# trained here as a similarity score (the reported validation loss equals the MSE).
config = DistilBertConfig.from_pretrained(model_path, num_labels=1)
model = DistilBertForSequenceClassification.from_pretrained(model_path, config=config)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import datasets
from datasets import load_dataset

# Number of shuffled validation rows kept for model selection; every remaining
# row of the validation split becomes the held-out test set.
VAL_SIZE = 750

stsb_train = load_dataset('glue', 'stsb', split='train')

# Shuffle the official validation split with a fixed seed so the val/test cut
# is reproducible across runs.
stsb_validation = load_dataset('glue', 'stsb', split='validation').shuffle(seed=42)

# `select` keeps the dataset's original features/schema and avoids copying the
# rows through a Python dict (which `Dataset.from_dict(ds[a:b])` would do).
stsb_val = stsb_validation.select(range(VAL_SIZE))
stsb_test = stsb_validation.select(range(VAL_SIZE, len(stsb_validation)))

Downloading and preparing dataset glue/stsb (download: 784.05 KiB, generated: 1.09 MiB, post-processed: Unknown size, total: 1.86 MiB) to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=802872.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.


Reusing dataset glue (/root/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


In [None]:
import pandas as pd
# Preview the training split as a DataFrame; pandas elides the middle rows when
# rendering, so only the head and tail are shown.
pd.DataFrame(stsb_train)

Unnamed: 0,sentence1,sentence2,label,idx
0,A plane is taking off.,An air plane is taking off.,5.00,0
1,A man is playing a large flute.,A man is playing a flute.,3.80,1
2,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...,3.80,2
3,Three men are playing chess.,Two men are playing chess.,2.60,3
4,A man is playing the cello.,A man seated is playing the cello.,4.25,4
...,...,...,...,...
5744,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia,0.00,5744
5745,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...,0.00,5745
5746,President heading to Bahrain,President Xi: China to continue help to fight ...,0.00,5746
5747,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders,0.00,5747


In [None]:
# Sanity-check the (rows, columns) of each split before tokenizing.
stsb_train.shape, stsb_val.shape, stsb_test.shape

((5749, 4), (750, 4), (750, 4))

In [None]:
def tokenize_pairs(examples):
    """Tokenize a batch of sentence pairs.

    Args:
        examples: A batch from `Dataset.map(batched=True)` holding the
            'sentence1' and 'sentence2' columns as parallel lists of strings.

    Returns:
        The tokenizer output for the batch; `map` adds its fields as new
        columns next to the original ones.

    No padding is applied here: padding inside `map` would pad every row to
    the longest example among up to 1000 rows. The Trainer in this notebook is
    given the tokenizer, so its collator pads each mini-batch to that
    mini-batch's own longest sequence instead.
    """
    return tokenizer(examples['sentence1'], examples['sentence2'], truncation=True)


# Apply the same tokenization to every split instead of repeating the call.
enc_train, enc_val, enc_test = (
    split.map(tokenize_pairs, batched=True, batch_size=1000)
    for split in (stsb_train, stsb_val, stsb_test)
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=6.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1.0), HTML(value='')))




In [None]:
# Inspect the tokenized training split: `input_ids` and `attention_mask` now sit
# alongside the original columns.
pd.DataFrame(enc_train)

Unnamed: 0,attention_mask,idx,input_ids,label,sentence1,sentence2
0,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",0,"[101, 1037, 4946, 2003, 2635, 2125, 1012, 102,...",5.00,A plane is taking off.,An air plane is taking off.
1,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",1,"[101, 1037, 2158, 2003, 2652, 1037, 2312, 8928...",3.80,A man is playing a large flute.,A man is playing a flute.
2,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",2,"[101, 1037, 2158, 2003, 9359, 14021, 5596, 209...",3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",3,"[101, 2093, 2273, 2024, 2652, 7433, 1012, 102,...",2.60,Three men are playing chess.,Two men are playing chess.
4,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",4,"[101, 1037, 2158, 2003, 2652, 1996, 10145, 101...",4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...,...,...,...
5744,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5744,"[101, 5729, 14554, 2015, 2004, 4040, 18856, 13...",0.00,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia
5745,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5745,"[101, 9877, 1997, 23437, 19323, 2579, 2011, 19...",0.00,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...
5746,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5746,"[101, 2343, 5825, 2000, 15195, 102, 2343, 8418...",0.00,President heading to Bahrain,President Xi: China to continue help to fight ...
5747,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...",5747,"[101, 2859, 1010, 2634, 19076, 2000, 2582, 177...",0.00,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders


In [None]:
from transformers import TrainingArguments, Trainer 

training_args = TrainingArguments(
    # --- output locations ---
    output_dir='./stsb-model',
    logging_dir='./logs',

    # --- phases to run ---
    do_train=True,
    do_eval=True,

    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=100,
    weight_decay=0.01,
    fp16=True,  # mixed-precision training

    # --- per-epoch logging / evaluation / checkpointing ---
    logging_strategy='epoch',
    # NOTE(review): probably has no effect while logging_strategy='epoch' — confirm.
    logging_steps=50,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    # Reload the best-scoring checkpoint once training finishes.
    load_best_model_at_end=True,
)

In [None]:
import numpy as np
from scipy.stats import pearsonr
from scipy.stats import spearmanr

def compute_metrics(pred):
    """Compute regression metrics for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (model outputs; singleton
            dimensions are squeezed away) and `label_ids` (target scores).

    Returns:
        dict with the keys "MSE", "RMSE", "MAE", "Pearson" and
        "Spearman's Rank", in that order.
    """
    targets = pred.label_ids
    # Drop singleton axes so predictions line up element-wise with the targets
    # instead of broadcasting against them.
    preds = np.squeeze(pred.predictions)

    residual = preds - targets
    mse = (residual ** 2).mean()
    mae = np.abs(residual).mean()

    metrics = {}
    metrics["MSE"] = mse.item()
    metrics["RMSE"] = np.sqrt(mse).item()
    metrics["MAE"] = mae.item()
    # Both scipy calls return (statistic, p-value); only the statistic is kept.
    metrics["Pearson"] = pearsonr(preds, targets)[0]
    metrics["Spearman's Rank"] = spearmanr(preds, targets)[0]
    return metrics

In [None]:
trainer = Trainer(
    # What is being trained, and how.
    model=model,
    args=training_args,
    # Passing the tokenizer means its files are saved with every checkpoint.
    tokenizer=tokenizer,
    # Training data, plus the split scored at each evaluation.
    train_dataset=enc_train,
    eval_dataset=enc_val,
    # Extra regression metrics reported next to the loss.
    compute_metrics=compute_metrics,
)

Using amp fp16 backend


In [None]:
# Fine-tune the model; with the epoch-based strategies configured above it is
# evaluated and checkpointed once per epoch. The returned TrainOutput is shown
# as the cell result.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2.


***** Running training *****


  Num examples = 5749


  Num Epochs = 3


  Instantaneous batch size per device = 8


  Total train batch size (w. parallel, distributed & accumulation) = 8


  Gradient Accumulation steps = 1


  Total optimization steps = 2157


Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33madrianmoses[0m (use `wandb login --relogin` to force relogin)


[34m[1mwandb[0m: wandb version 0.12.5 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


Epoch,Training Loss,Validation Loss,Mse,Rmse,Mae,Pearson,Spearman's rank
1,1.4973,0.698019,0.698019,0.835476,0.644279,0.851078,0.848007
2,0.4796,0.577492,0.577492,0.759929,0.601091,0.873337,0.868375
3,0.2464,0.522315,0.522315,0.722714,0.552892,0.873993,0.866944


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2.


***** Running Evaluation *****


  Num examples = 750


  Batch size = 8


Saving model checkpoint to ./stsb-model/checkpoint-719


Configuration saved in ./stsb-model/checkpoint-719/config.json


Model weights saved in ./stsb-model/checkpoint-719/pytorch_model.bin


tokenizer config file saved in ./stsb-model/checkpoint-719/tokenizer_config.json


Special tokens file saved in ./stsb-model/checkpoint-719/special_tokens_map.json


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2.


***** Running Evaluation *****


  Num examples = 750


  Batch size = 8


Saving model checkpoint to ./stsb-model/checkpoint-1438


Configuration saved in ./stsb-model/checkpoint-1438/config.json


Model weights saved in ./stsb-model/checkpoint-1438/pytorch_model.bin


tokenizer config file saved in ./stsb-model/checkpoint-1438/tokenizer_config.json


Special tokens file saved in ./stsb-model/checkpoint-1438/special_tokens_map.json


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2.


***** Running Evaluation *****


  Num examples = 750


  Batch size = 8


Saving model checkpoint to ./stsb-model/checkpoint-2157


Configuration saved in ./stsb-model/checkpoint-2157/config.json


Model weights saved in ./stsb-model/checkpoint-2157/pytorch_model.bin


tokenizer config file saved in ./stsb-model/checkpoint-2157/tokenizer_config.json


Special tokens file saved in ./stsb-model/checkpoint-2157/special_tokens_map.json




Training completed. Do not forget to share your model on huggingface.co/models =)




Loading best model from ./stsb-model/checkpoint-2157 (score: 0.5223150253295898).


TrainOutput(global_step=2157, training_loss=0.7411218253229854, metrics={'train_runtime': 256.7816, 'train_samples_per_second': 67.166, 'train_steps_per_second': 8.4, 'total_flos': 522206667215532.0, 'train_loss': 0.7411218253229854, 'epoch': 3.0})

In [None]:
# Score the trained model on every split; the mapping order fixes the row order.
eval_splits = {"train": enc_train, "val": enc_val, "test": enc_test}
q = []
for split_data in eval_splits.values():
    q.append(trainer.evaluate(eval_dataset=split_data))

# Only the first five columns are shown (loss, MSE, RMSE, MAE, Pearson).
# NOTE(review): this leaves out the Spearman column — confirm that is intended.
pd.DataFrame(q, index=list(eval_splits)).iloc[:, :5]

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2.


***** Running Evaluation *****


  Num examples = 5749


  Batch size = 8


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2.


***** Running Evaluation *****


  Num examples = 750


  Batch size = 8


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: idx, sentence1, sentence2.


***** Running Evaluation *****


  Num examples = 750


  Batch size = 8


Unnamed: 0,eval_loss,eval_MSE,eval_RMSE,eval_MAE,eval_Pearson
train,0.158867,0.158867,0.398581,0.301548,0.963016
val,0.522315,0.522315,0.722714,0.552892,0.873993
test,0.502323,0.502323,0.708747,0.549059,0.883875


In [None]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# run model for inference

s1, s2 = "A plane is taking off.", "An air plane is taking off."

# Make the prediction independent of whatever state earlier cells left the
# model in: put it on the same device as the inputs and switch to eval mode so
# dropout is disabled.
model.to(device)
model.eval()

encoding = tokenizer(s1, s2, return_tensors='pt', padding=True, truncation=True,
    max_length=512)
input_ids = encoding['input_ids'].to(device)
attention_mask = encoding['attention_mask'].to(device)

# Gradients are not needed for inference; skipping them avoids building an
# autograd graph for the forward pass.
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# The head has a single output, so the logit itself is the predicted score.
outputs.logits.item()

4.518992900848389

In [None]:
# save the model

# NOTE: `model_path` is rebound here — earlier it held the pretrained checkpoint
# name; from this point on it is the output directory of the fine-tuned model.
model_path = "sentence-pair-regression-model"
# Writes the config and weights; per the log below it also writes the tokenizer
# files, since the Trainer was constructed with the tokenizer.
trainer.save_model(model_path)
# Saves the tokenizer files into the same directory (a second time, per the
# log); the returned tuple of written paths is the cell's displayed result.
tokenizer.save_pretrained(model_path)

Saving model checkpoint to sentence-pair-regression-model


Configuration saved in sentence-pair-regression-model/config.json


Model weights saved in sentence-pair-regression-model/pytorch_model.bin


tokenizer config file saved in sentence-pair-regression-model/tokenizer_config.json


Special tokens file saved in sentence-pair-regression-model/special_tokens_map.json


tokenizer config file saved in sentence-pair-regression-model/tokenizer_config.json


Special tokens file saved in sentence-pair-regression-model/special_tokens_map.json


('sentence-pair-regression-model/tokenizer_config.json',
 'sentence-pair-regression-model/special_tokens_map.json',
 'sentence-pair-regression-model/vocab.txt',
 'sentence-pair-regression-model/added_tokens.json',
 'sentence-pair-regression-model/tokenizer.json')