In [1]:
# !pip install transformers
# !pip install datasets
# !pip install evaluate

# Full Training

## Load Dataset

In [2]:
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
checkpoint = 'bert-base-uncased'

In [4]:
dataset = load_dataset("glue", "mrpc"); dataset

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Downloading data: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx'],
        num_rows: 1725
    })
})

In [5]:
num_labels = len(set(dataset['test']['label']))

## Tokenization

In [6]:
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [8]:
def tokenize(entity):
    return tokenizer(entity['sentence1'], entity['sentence2'], truncation=True)

In [9]:
tokenized_dataset = dataset.map(tokenize, batched=True); tokenized_dataset

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

In [10]:
tokenized_dataset = tokenized_dataset.remove_columns(['sentence1', 'sentence2', 'idx'])
tokenized_dataset = tokenized_dataset.rename_column('label', 'labels')

In [11]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3668
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 408
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1725
    })
})

## Training & Evaluation

In [12]:
from transformers import DataCollatorWithPadding

In [13]:
data_collator = DataCollatorWithPadding(tokenizer)

In [14]:
from torch.utils.data import DataLoader

In [15]:
train_dl = DataLoader(tokenized_dataset['train'], batch_size = 32, collate_fn = data_collator)
valid_dl = DataLoader(tokenized_dataset['validation'], batch_size = 64, collate_fn = data_collator)

In [16]:
for batch in train_dl:
    batch = batch.to(device)
    break
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([32]),
 'input_ids': torch.Size([32, 79]),
 'token_type_ids': torch.Size([32, 79]),
 'attention_mask': torch.Size([32, 79])}

In [17]:
outputs = model(**batch); outputs.logits.shape, outputs.loss

(torch.Size([32, 2]),
 tensor(0.7167, device='cuda:0', grad_fn=<NllLossBackward0>))

In [18]:
from transformers import AdamW
from torch.nn import functional as F

In [20]:
from transformers import get_scheduler

In [25]:
num_epochs = 3
num_training_steps = num_epochs * len(train_dl)

lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

345


In [26]:
from tqdm.auto import tqdm

In [27]:
progress_bar = tqdm(range(num_training_steps))
metric = evaluate.load("glue", "mrpc")

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels = num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=5e-5)

model.train()
for epoch in range(num_epochs):
    for batch in train_dl:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        
        progress_bar.update(1)

  0%|          | 0/345 [00:00<?, ?it/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
model.eval()
for batch in valid_dl:
    batch = batch.to(device)
    with torch.no_grad():
        outputs = model(**batch)
    predictions = torch.argmax(outputs.logits, dim=-1)
    actuals = batch['labels']
    metric.add_batch(predictions = predictions, references = actuals)
print(metric.compute())

{'accuracy': 0.8455882352941176, 'f1': 0.887298747763864}


In [29]:
model.save_pretrained('mrpc')

In [30]:
tokenizer.save_pretrained('mrpc')

('mrpc/tokenizer_config.json',
 'mrpc/special_tokens_map.json',
 'mrpc/vocab.txt',
 'mrpc/added_tokens.json',
 'mrpc/tokenizer.json')