In [25]:
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from accelerate import Accelerator

In [26]:
accelerator=Accelerator()

In [None]:
raw_datasets=load_dataset("glue","mrpc")
checkpoint="bert-base-uncased"
tokenizer=AutoTokenizer.from_pretrained(checkpoint)

README.md: 0.00B [00:00, ?B/s]

mrpc/train-00000-of-00001.parquet:   0%|          | 0.00/649k [00:00<?, ?B/s]

mrpc/validation-00000-of-00001.parquet:   0%|          | 0.00/75.7k [00:00<?, ?B/s]

mrpc/test-00000-of-00001.parquet:   0%|          | 0.00/308k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3668 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/408 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1725 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def tokenize_function(example):
  return tokenizer(example['sentence1'],example['sentence2'],truncation=True)

In [None]:
tokenized_datasets=raw_datasets.map(tokenize_function,batched=True)
data_collator=DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/3668 [00:00<?, ? examples/s]

Map:   0%|          | 0/408 [00:00<?, ? examples/s]

Map:   0%|          | 0/1725 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets=tokenized_datasets.remove_columns(['sentence1','sentence2','idx'])
tokenized_datasets=tokenized_datasets.rename_column("label","labels")
tokenized_datasets.set_format("torch")


In [None]:
tokenized_datasets['train'].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
from torch.utils.data import DataLoader
train_dataloader=DataLoader(tokenized_datasets['train'],shuffle=True,batch_size=8,collate_fn=data_collator)
eval_dataloader=DataLoader(tokenized_datasets['validation'],batch_size=8,collate_fn=data_collator)



In [None]:
for batch in train_dataloader:
  pass

{k: v.shape for k,v in batch.items()}

{'labels': torch.Size([4]),
 'input_ids': torch.Size([4, 57]),
 'token_type_ids': torch.Size([4, 57]),
 'attention_mask': torch.Size([4, 57])}

In [None]:
from transformers import AutoModelForSequenceClassification
model=AutoModelForSequenceClassification.from_pretrained(checkpoint,num_labels=2)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
outputs=model(**batch)
print(outputs.loss,outputs.logits.shape)

tensor(1.1245, grad_fn=<NllLossBackward0>) torch.Size([4, 2])


In [None]:
from torch.optim import AdamW
optimizer=AdamW(model.parameters(),lr=5e-5)

In [None]:
from transformers import get_scheduler
num_epochs=3
num_training_steps=num_epochs * len(train_dataloader)
lr_scheduler=get_scheduler("linear",optimizer=optimizer,num_warmup_steps=0,num_training_steps=num_training_steps)

In [None]:
print(num_training_steps)

1377


In [None]:
import torch
device=torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)
device

device(type='cuda')

In [17]:
from tqdm.auto import tqdm
progress_bar=tqdm(range(num_training_steps))

  0%|          | 0/1377 [00:00<?, ?it/s]

In [19]:
model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch={k:v.to(device) for k, v in batch.items()}
    outputs=model(**batch)
    loss=outputs.loss
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

In [24]:
import evaluate

metric = evaluate.load("glue", "mrpc")
model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.8725490196078431, 'f1': 0.9090909090909091}