# Fine-tuning a pre-trained Transformer model on a custom dataset

In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset
from transformers import BertTokenizer, BertForSequenceClassification

In [2]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=2
)

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
text = [
    "I love this movie",
    "This film was terrible",
    "What a fantastic experience",
    "I hate this so much"
]

labels = [1, 0, 1, 0]

In [5]:
encoding= tokenizer(
    text, 
    padding=True, 
    truncation=True, 
    max_length=32, 
    return_tensors='pt'
)

labels = torch.tensor(labels)

dataset = TensorDataset(
    encoding['input_ids'], 
    encoding['attention_mask'], 
    labels
)

dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

  labels = torch.tensor(labels)


In [11]:
# Freeze Full Bert
for param in model.bert.parameters():
    param.requires_grad = False

# Unfreeze last 2 layers of BERT
for layer in model.bert.encoder.layer[-2:]:
    for param in layer.parameters():
        param.requires_grad = True

# Optimizer
optimizer = torch.optim.AdamW(
    filter(lambda p: p.requires_grad, model.parameters()),
    lr = 2e-5
)

In [12]:
# Training Loop
for epoch in range(5):
    print(f"Epoch {epoch+1}")
    for batch in dataloader:
        input_ids, attention_mask, labels = batch

        optimizer.zero_grad()

        output = model(
            input_ids=input_ids,
            attention_mask = attention_mask,
            labels = labels
        )

        loss = output.loss 
        loss.backward()
        optimizer.step()

        print("loss: ", loss.item())

Epoch 1
loss:  0.697765588760376
Epoch 2
loss:  0.6590269804000854
Epoch 3
loss:  0.6246843338012695
Epoch 4
loss:  0.594849169254303
Epoch 5
loss:  0.5677815079689026


In [31]:
# Inference
model.eval()

test_text = [
    "I really enjoyed this movie",
    "Terrible film this was"
]

text_encoding = tokenizer(
    test_text,
    padding = True,
    truncation = True,
    max_length = 32,
    return_tensors = 'pt'
)

with torch.no_grad():
    output = model(
        input_ids = text_encoding['input_ids'],
        attention_mask = text_encoding['attention_mask']
    )
logits = output.logits
prediction = torch.argmax(logits, dim=1)

print("Prediction class:", prediction.tolist())

Prediction class: [1, 0]


In [24]:
# checking weights were trained or not in different layers

for name, param in model.named_parameters():
    if 'encoder.layer.11' in name or "classifier" in name:
        print(name, param.requires_grad)

bert.encoder.layer.11.attention.self.query.weight True
bert.encoder.layer.11.attention.self.query.bias True
bert.encoder.layer.11.attention.self.key.weight True
bert.encoder.layer.11.attention.self.key.bias True
bert.encoder.layer.11.attention.self.value.weight True
bert.encoder.layer.11.attention.self.value.bias True
bert.encoder.layer.11.attention.output.dense.weight True
bert.encoder.layer.11.attention.output.dense.bias True
bert.encoder.layer.11.attention.output.LayerNorm.weight True
bert.encoder.layer.11.attention.output.LayerNorm.bias True
bert.encoder.layer.11.intermediate.dense.weight True
bert.encoder.layer.11.intermediate.dense.bias True
bert.encoder.layer.11.output.dense.weight True
bert.encoder.layer.11.output.dense.bias True
bert.encoder.layer.11.output.LayerNorm.weight True
bert.encoder.layer.11.output.LayerNorm.bias True
classifier.weight True
classifier.bias True


(even 2 BERT layers â‰ˆ 15M)
With extremely small datasets, linear probing often outperforms fine-tuning because frozen pretrained representations are already strong, while fine-tuning introduces overfitting and representation drift