In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch

np.random.seed(42)

In [40]:
from datasets import load_dataset, Dataset
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding


GLUE, the General Language Understanding Evaluation benchmark (https://gluebenchmark.com/) is a collection of resources for training, evaluating, and analyzing natural language understanding systems.

https://huggingface.co/datasets/nyu-mll/glue

The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.

sentence1: a string feature.  
sentence2: a string feature.  
label: a classification label, with possible values including not_equivalent (0), equivalent (1).  
idx: a int32 feature.  

In [3]:
dataset = load_dataset("glue", "mrpc", split="train")
vdataset = load_dataset("glue", "mrpc", split="validation")

In [4]:
for feature in dataset.features.keys():
    print(feature)
    print(dataset[feature][:2])
    print()

sentence1
['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion ."]

sentence2
['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 ."]

label
[1, 0]

idx
[0, 1]



In [5]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [6]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=2, 
    id2label=id2label, 
    label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
for name, value in model.named_parameters():
    print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [8]:
model.can_generate()

False

In [9]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [10]:
list(tokenizer.get_vocab().keys())[-10:]

['freeing',
 'conflicts',
 'facilitates',
 'soundtrack',
 '##anza',
 'priscilla',
 '301',
 'swing',
 'act',
 '[unused255]']

In [11]:
tokenizer.decode(token_ids=2000)

'to'

In [12]:
tokenizer.encode(text='to')

[101, 2000, 102]

In [13]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [14]:
def encode(examples):
    return tokenizer(
        examples["sentence1"], 
        examples["sentence2"], 
        truncation=True, 
        padding="max_length"
        )

In [15]:
dataset = dataset.map(encode, batched=True)
vdataset = vdataset.map(encode, batched=True)

Map: 100%|██████████| 408/408 [00:00<00:00, 2794.31 examples/s]


We must change the label from 'labels' to 'label'

In [16]:
dataset = dataset.map(lambda examples: {"labels": examples["label"]}, batched=True)
vdataset = vdataset.map(lambda examples: {"labels": examples["label"]}, batched=True)

Map: 100%|██████████| 408/408 [00:00<00:00, 17744.65 examples/s]


In [17]:
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
vdataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])


input_ids refers to the tokens present in each document.

In [18]:
print(dataset["input_ids"][:2])

tensor([[ 101, 2572, 3217,  ...,    0,    0,    0],
        [ 101, 9805, 3540,  ...,    0,    0,    0]])


Token_type_ids contains an integer for every token. These indicate which text segment they correspond to.

In [19]:
print(dataset["token_type_ids"][:2])

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])


The attention_mask is a binary tensor.  
Its purpose is to distinguish between real tokens and padding.  
It is created by the tokenizer to capture long-range dependencies in text relationships

In [20]:
print(dataset["attention_mask"][:2])

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [21]:
len(dataset["attention_mask"][2])

512

In [22]:
#Aca iria dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)

Evaluation of predictions:

In [23]:
accuracy = evaluate.load("accuracy")

In [24]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Now we define train hyperparams. 

In [25]:
training_args = TrainingArguments(
    output_dir="GLUEBERT",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [26]:
dataset.split

NamedSplit('train')

Now we  Pass training arguments to Trainer (model, dataset, tokenizer, data collator metrics function).


In [27]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=vdataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [28]:
trainer.train()

                                                         
 50%|█████     | 459/918 [4:55:58<1:49:39, 14.33s/it]

{'eval_loss': 0.38046321272850037, 'eval_accuracy': 0.8333333333333334, 'eval_runtime': 370.6677, 'eval_samples_per_second': 1.101, 'eval_steps_per_second': 0.07, 'epoch': 1.0}


 54%|█████▍    | 500/918 [5:07:35<2:01:25, 17.43s/it]  

{'loss': 0.5195, 'grad_norm': 13.121167182922363, 'learning_rate': 9.106753812636166e-06, 'epoch': 1.09}


                                                     
100%|██████████| 918/918 [7:04:58<00:00, 11.89s/it]

{'eval_loss': 0.3679942190647125, 'eval_accuracy': 0.8700980392156863, 'eval_runtime': 328.6869, 'eval_samples_per_second': 1.241, 'eval_steps_per_second': 0.079, 'epoch': 2.0}


100%|██████████| 918/918 [7:05:00<00:00, 27.78s/it]

{'train_runtime': 25500.627, 'train_samples_per_second': 0.288, 'train_steps_per_second': 0.036, 'train_loss': 0.42271552989685457, 'epoch': 2.0}





TrainOutput(global_step=918, training_loss=0.42271552989685457, metrics={'train_runtime': 25500.627, 'train_samples_per_second': 0.288, 'train_steps_per_second': 0.036, 'total_flos': 1930182702120960.0, 'train_loss': 0.42271552989685457, 'epoch': 2.0})

We now create a homebrewed test set

In [73]:
test_ex1="These are the same sentence"
test_ex2="These aren't the same sentence"

test_ex3="i am alfredo-sampron"
test_ex4="i don't know who alfredo-sampron is"

In [82]:
homebrewed_test_set = Dataset.from_dict({"sentence1": [test_ex1, test_ex3],
                                         "sentence2": [test_ex2, test_ex4], 
                                         "idx": [100000, 100001]})

We must also encode the test set

In [83]:
homebrewed_test_set = homebrewed_test_set.map(encode, batched=True)

Map: 100%|██████████| 2/2 [00:00<00:00, 391.94 examples/s]


trainer.predict returns a namedtuple with
1) predictions (np.ndarray)
2) label_ids 
3) metrics

In [84]:
predictions = trainer.predict(homebrewed_test_set)

100%|██████████| 1/1 [00:00<00:00, 1988.76it/s]


In [86]:
predictions[0]

array([[-2.5633454,  1.658151 ],
       [-1.1505736,  0.9426273]], dtype=float32)