In [2]:
import numpy as np
import matplotlib.pyplot as plt
import torch

np.random.seed(42)

In [3]:
from datasets import load_dataset
import evaluate
from transformers import AutoModelForSequenceClassification, AutoTokenizer, TrainingArguments, Trainer, DataCollatorWithPadding


GLUE, the General Language Understanding Evaluation benchmark (https://gluebenchmark.com/) is a collection of resources for training, evaluating, and analyzing natural language understanding systems.

https://huggingface.co/datasets/nyu-mll/glue

The Microsoft Research Paraphrase Corpus (Dolan & Brockett, 2005) is a corpus of sentence pairs automatically extracted from online news sources, with human annotations for whether the sentences in the pair are semantically equivalent.

sentence1: a string feature.  
sentence2: a string feature.  
label: a classification label, with possible values including not_equivalent (0), equivalent (1).  
idx: a int32 feature.  

In [4]:
dataset = load_dataset("glue", "mrpc", split="train")
vdataset = load_dataset("glue", "mrpc", split="validation")

In [5]:
for feature in dataset.features.keys():
    print(feature)
    print(dataset[feature][:2])
    print()

sentence1
['Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .', "Yucaipa owned Dominick 's before selling the chain to Safeway in 1998 for $ 2.5 billion ."]

sentence2
['Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .', "Yucaipa bought Dominick 's in 1995 for $ 693 million and sold it to Safeway for $ 1.8 billion in 1998 ."]

label
[1, 0]

idx
[0, 1]



In [6]:
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [7]:
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased", 
    num_labels=2, 
    id2label=id2label, 
    label2id=label2id)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
for name, value in model.named_parameters():
    print(name)

bert.embeddings.word_embeddings.weight
bert.embeddings.position_embeddings.weight
bert.embeddings.token_type_embeddings.weight
bert.embeddings.LayerNorm.weight
bert.embeddings.LayerNorm.bias
bert.encoder.layer.0.attention.self.query.weight
bert.encoder.layer.0.attention.self.query.bias
bert.encoder.layer.0.attention.self.key.weight
bert.encoder.layer.0.attention.self.key.bias
bert.encoder.layer.0.attention.self.value.weight
bert.encoder.layer.0.attention.self.value.bias
bert.encoder.layer.0.attention.output.dense.weight
bert.encoder.layer.0.attention.output.dense.bias
bert.encoder.layer.0.attention.output.LayerNorm.weight
bert.encoder.layer.0.attention.output.LayerNorm.bias
bert.encoder.layer.0.intermediate.dense.weight
bert.encoder.layer.0.intermediate.dense.bias
bert.encoder.layer.0.output.dense.weight
bert.encoder.layer.0.output.dense.bias
bert.encoder.layer.0.output.LayerNorm.weight
bert.encoder.layer.0.output.LayerNorm.bias
bert.encoder.layer.1.attention.self.query.weight
bert.enc

In [9]:
model.can_generate()

False

In [10]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

In [11]:
list(tokenizer.get_vocab().keys())[-10:]

['rendered',
 'ashamed',
 '180',
 'appetite',
 '##ist',
 'slick',
 'organization',
 'conserved',
 '##ele',
 'fun']

In [12]:
tokenizer.decode(token_ids=2000)

'to'

In [13]:
tokenizer.encode(text='to')

[101, 2000, 102]

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
def encode(examples):
    return tokenizer(
        examples["sentence1"], 
        examples["sentence2"], 
        truncation=True, 
        padding="max_length"
        )

In [16]:
dataset = dataset.map(encode, batched=True)
vdataset = vdataset.map(encode, batched=True)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

We must change the label from 'labels' to 'label'

In [17]:
dataset = dataset.map(lambda examples: {"labels": examples["label"]}, batched=True)
vdataset = vdataset.map(lambda examples: {"labels": examples["label"]}, batched=True)

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

In [18]:
dataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])
vdataset.set_format(type="torch", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])


input_ids refers to the tokens present in each document.

In [19]:
print(dataset["input_ids"][:2])

tensor([[ 101, 2572, 3217,  ...,    0,    0,    0],
        [ 101, 9805, 3540,  ...,    0,    0,    0]])


Token_type_ids contains an integer for every token. These indicate which text segment they correspond to.

In [20]:
print(dataset["token_type_ids"][:2])

tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])


The attention_mask is a binary tensor.  
Its purpose is to distinguish between real tokens and padding.  
It is created by the tokenizer to capture long-range dependencies in text relationships

In [21]:
print(dataset["attention_mask"][:2])

tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])


In [22]:
len(dataset["attention_mask"][2])

512

In [23]:
#Aca iria dataloader = torch.utils.data.DataLoader(dataset, batch_size=32)

Evaluation of predictions:

In [24]:
accuracy = evaluate.load("accuracy")

In [25]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

Now we define train hyperparams. 

In [None]:
training_args = TrainingArguments(
    output_dir="GLUEBERT",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [27]:
dataset.split

NamedSplit('train')

Now we  Pass training arguments to Trainer (model, dataset, tokenizer, data collator metrics function).


In [28]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=vdataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [29]:
trainer.train()

  0%|          | 0/460 [00:00<?, ?it/s]

: 

Sigue en https://huggingface.co/docs/transformers/tasks/sequence_classification