In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from datasets import load_dataset

In [2]:
# Name of the pretrained checkpoint; the same value is used further down to load the model.
checkpoint = "bert-base-uncased"
# Tokenizer that belongs to the checkpoint; used below for tokenization and by the padding collator.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [3]:
# Load the MRPC configuration of the GLUE dataset; its "train" and "validation" splits are used later on.
raw_dataset = load_dataset("glue", "mrpc")

def tokenize_fn(example):
    """Tokenize the ``sentence1`` / ``sentence2`` columns as a sentence pair.

    This function is applied with ``raw_dataset.map(..., batched=True)``, so
    ``example`` is expected to hold a batch of rows (column name -> list of
    values) rather than a single row.

    Truncation is enabled; no padding is requested here because the notebook
    pads per batch with ``DataCollatorWithPadding``.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair input.
    """
    first_sentences = example["sentence1"]
    second_sentences = example["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)

# Tokenize the whole dataset; batched=True passes tokenize_fn batches of rows instead of single rows.
tokenized_datasets = raw_dataset.map(tokenize_fn, batched=True)
# Collator built from the tokenizer; it is passed as collate_fn to the DataLoaders below.
data_collator = DataCollatorWithPadding(tokenizer)

Found cached dataset glue (/home/asankar/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

Loading cached processed dataset at /home/asankar/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-a38a86d74df4e729.arrow


  0%|          | 0/1 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/asankar/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad/cache-e4fe20fbdd854ec9.arrow


### Prepare data for PyTorch training

In [4]:
# NOTE: this cell rebinds `tokenized_datasets`, so it cannot be re-run on its
# own -- a second run fails because the columns are already gone. Re-run the
# tokenization cell first.
# * Drop the raw-text and index columns, then rename "label" to "labels",
#   the argument name the model expects.
tokenized_datasets = (
    tokenized_datasets
    .remove_columns(["sentence1", "sentence2", "idx"])
    .rename_column("label", "labels")
)
# * Return torch tensors instead of lists (set_format is applied in place).
tokenized_datasets.set_format("torch")

# Show the remaining columns of the training split.
tokenized_datasets["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

### Create DataLoader

In [5]:
from torch.utils.data import DataLoader

# Training batches: reshuffled on every pass; drop_last=True discards the
# final incomplete batch so every optimisation step sees exactly 8 examples.
train_loader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=8,
    collate_fn=data_collator,
    drop_last=True,
)
# Validation batches: keep the dataset order and keep the last partial batch.
# With drop_last=True up to batch_size - 1 examples would silently be left
# out of the reported metrics, and shuffling adds nothing to evaluation.
val_loader = DataLoader(
    tokenized_datasets["validation"],
    shuffle=False,
    batch_size=8,
    collate_fn=data_collator,
    drop_last=False,
)

In [6]:
# * Sanity check: pull one collated batch from the training loader.
#   The batch is fed to the model a few cells further down.
sample_data = next(iter(train_loader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


### Train the model using PyTorch

In [7]:
# Load the checkpoint as a sequence-classification model with two output labels.
# The message printed below about unused / not-initialized weights is the library's
# own notice for loading a checkpoint that was trained on another task.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [8]:
# Forward pass on the sample batch; the batch includes "labels", so the output also carries a loss.
outputs = model(**sample_data)

In [9]:
# * When "labels" is part of the input, transformers models return a loss in addition to the logits.
outputs.keys()

odict_keys(['loss', 'logits'])

In [10]:
# Loss computed for the sample batch (still attached to the autograd graph, see grad_fn in the output).
outputs.loss

tensor(0.8452, grad_fn=<NllLossBackward0>)

In [11]:
from transformers import get_scheduler
import torch
from tqdm import tqdm, trange
# * Create train function
def train_model(model, train_loader, val_loader, optimizer, n_epochs=3):
    """Fine-tune ``model`` in place and report the mean loss after every epoch.

    Runs on CUDA when it is available and on the CPU otherwise. The learning
    rate follows a linear schedule without warm-up that spans all
    ``n_epochs * len(train_loader)`` optimisation steps.

    Args:
        model: Model that accepts a batch as keyword arguments and returns an
            object with a ``loss`` attribute (so the batch must contain the
            labels).
        train_loader: Iterable of training batches (dicts of tensors). Must
            support ``len()`` so the schedule length can be computed.
        val_loader: Iterable of validation batches in the same format. After
            each epoch the mean validation loss is computed on it; pass
            ``None`` to skip that pass.
        optimizer: Optimizer already bound to ``model``'s parameters.
        n_epochs: Number of full passes over ``train_loader``.

    Returns:
        None. The model is updated in place and left in training mode; the
        per-epoch mean of the batch losses is shown on the progress bar.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    n_train_steps = n_epochs * len(train_loader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=n_train_steps,
    )

    model.to(device)
    # Clear gradients left behind by any backward pass that ran before this call.
    optimizer.zero_grad()

    epoch_bar = trange(n_epochs, desc="Epoch")
    for _ in epoch_bar:
        # Re-enter training mode every epoch: the validation pass below
        # switches the model to eval mode.
        model.train()
        train_loss_total, n_train_batches = 0.0, 0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(**batch).loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            train_loss_total += loss.item()
            n_train_batches += 1
        # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
        epoch_report = {"train_loss": train_loss_total / max(n_train_batches, 1)}

        if val_loader is not None:
            model.eval()
            val_loss_total, n_val_batches = 0.0, 0
            with torch.no_grad():
                for batch in val_loader:
                    batch = {k: v.to(device) for k, v in batch.items()}
                    val_loss_total += model(**batch).loss.item()
                    n_val_batches += 1
            epoch_report["val_loss"] = val_loss_total / max(n_val_batches, 1)

        epoch_bar.set_postfix(**epoch_report)

    # Hand the model back in training mode, regardless of the validation pass.
    model.train()



In [12]:
# AdamW over all model parameters; lr=5e-5 is the starting learning rate that
# train_model's linear scheduler is attached to.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)

In [None]:

# * Start model training (n_epochs is left at train_model's default of 3).
train_model(model, train_loader, val_loader, optimizer)

In [18]:
!pip install scipy sklearn scikit-learn

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting scikit-learn
  Downloading scikit_learn-1.2.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.6/9.6 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Collecting joblib>=1.1.1
  Downloading joblib-1.2.0-py3-none-any.whl (297 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.2.0 scikit-learn-1.2.1 threadpoolctl-3.1.0


In [19]:
import evaluate

# Same device selection as train_model, so this cell also runs on CPU-only
# machines instead of failing on a hard-coded "cuda".
device = "cuda" if torch.cuda.is_available() else "cpu"

metric = evaluate.load("glue", "mrpc")
model.eval().to(device)
for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit along the last dimension.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

# Aggregate over all batches added above; displayed as the cell's output.
metric.compute()

{'accuracy': 0.8504901960784313, 'f1': 0.8946459412780656}