In [22]:
import torch
import numpy as np
from datasets import load_dataset, load_metric
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
from transformers import EarlyStoppingCallback, TrainerCallback


**Fine-tuning only the classifier can be effective in the following situations:**

- Your task is similar to BERT's pre-training data.
- You have a small dataset.
- You need fast training with limited resources.
- You're building a baseline model or handling a simple task.


In [42]:
# Load only 10% of the IMDB dataset for training
train_dataset = load_dataset("imdb", split='train[:100%]')
val_dataset = load_dataset("imdb", split='test[:90%]') 
test_dataset = load_dataset("imdb", split='test[90%:]')

# Define the tokenizer and model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# # Freeze all layers except the last classification layer
# for name, param in model.named_parameters():
#     if "classifier" not in name:
#         param.requires_grad = False
        
        
# for name, param in model.named_parameters():
#     if "classifier" in name or "encoder.layer.11" in name or "encoder.layer.10" in name:
#         param.requires_grad = True
#     else:
#         param.requires_grad = False
        
class CustomEarlyStoppingCallback(EarlyStoppingCallback):
    def __init__(self, early_stopping_patience=3, target_accuracy=0.98):
        super().__init__(early_stopping_patience=early_stopping_patience)
        self.target_accuracy = target_accuracy

    def on_evaluate(self, args, state, control, **kwargs):
        # Call the parent method to keep the original early stopping functionality
        super().on_evaluate(args, state, control, **kwargs)

        # Get the latest evaluation accuracy
        eval_metric = kwargs.get('metrics', {})
        accuracy = eval_metric.get('eval_accuracy', 0)

        # If the accuracy reaches the target, stop training
        if accuracy >= self.target_accuracy:
            print(f"Stopping training as accuracy reached {accuracy}")
            control.should_training_stop = True
        
for name, param in model.named_parameters():
    if "classifier" in name:
        param.requires_grad = True
    else:
        param.requires_grad = False

# Tokenize the datasets
def tokenize_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=128)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/22500 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]

In [47]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",
    eval_steps=50,
    save_steps=100,
    save_total_limit=2,
    load_best_model_at_end=True,
    fp16=True,  # Enable mixed precision training
    learning_rate=5e-5,  # Try lowering the learning rate
)


# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=lambda p: load_metric("accuracy").compute(predictions=p.predictions.argmax(-1), references=p.label_ids),
)

# Add EarlyStoppingCallback to the Trainer
trainer.add_callback(CustomEarlyStoppingCallback(early_stopping_patience=6, target_accuracy=0.90))

# Train the model
trainer.train()
trainer.save_model("./fine-tuned-bert-last-layer")

# Evaluate the model
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Validation loss: {eval_results['eval_loss']}")
print(f"Validation accuracy: {eval_results['eval_accuracy']}")



Step,Training Loss,Validation Loss,Accuracy
50,0.6569,0.664926,0.639644
100,0.6547,0.665366,0.636089
150,0.6741,0.664224,0.639733
200,0.6588,0.664395,0.6368
250,0.6626,0.667881,0.612489
300,0.6699,0.66672,0.615689
350,0.6644,0.666015,0.618044
400,0.6608,0.662108,0.636756
450,0.6689,0.669052,0.588222
500,0.6472,0.65565,0.657556


Validation loss: 0.6718271374702454
Validation accuracy: 0.5904


In [29]:
# Evaluate the model
eval_results = trainer.evaluate(eval_dataset=test_dataset)
print(f"Validation loss: {eval_results['eval_loss']}")
print(f"Validation accuracy: {eval_results['eval_accuracy']}")


Validation loss: 0.5301107168197632
Validation accuracy: 0.998


- Weights

In [22]:
# from transformers import BertTokenizer, BertForSequenceClassification

# # Load the tokenizer and fine-tuned model from the saved directory
# model_name = "bert-base-uncased"
# tokenizer = BertTokenizer.from_pretrained(model_name)
# model = BertForSequenceClassification.from_pretrained("./fine-tuned-bert-last-layer")

# After training, print the last layer (classifier) weights and biases
classifier_layer = model.classifier

# Print the weights and biases of the last layer
print("Classifier weights:\n", classifier_layer.weight)
print("\nClassifier biases:\n", classifier_layer.bias)

# Print the shapes of the weights and biases
print("\nClassifier weights shape:", classifier_layer.weight.size())
print("Classifier biases shape:", classifier_layer.bias.size())


Classifier weights:
 Parameter containing:
tensor([[ 0.0054,  0.0080, -0.0289,  ...,  0.0033, -0.0055,  0.0132],
        [ 0.0151,  0.0332, -0.0240,  ...,  0.0010,  0.0351, -0.0088]],
       device='cuda:0', requires_grad=True)

Classifier biases:
 Parameter containing:
tensor([ 0.0100, -0.0100], device='cuda:0', requires_grad=True)

Classifier weights shape: torch.Size([2, 768])
Classifier biases shape: torch.Size([2])


In [16]:
from transformers import BertModel

# Load BERT-base model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained("./fine-tuned-bert-last-layer")  # Path to your saved model

# Print the structure of the BERT-base model
for name, param in model.named_parameters():
    print(name, param.size())




bert.embeddings.word_embeddings.weight torch.Size([30522, 768])
bert.embeddings.position_embeddings.weight torch.Size([512, 768])
bert.embeddings.token_type_embeddings.weight torch.Size([2, 768])
bert.embeddings.LayerNorm.weight torch.Size([768])
bert.embeddings.LayerNorm.bias torch.Size([768])
bert.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.query.bias torch.Size([768])
bert.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.key.bias torch.Size([768])
bert.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.self.value.bias torch.Size([768])
bert.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
bert.encoder.layer.0.attention.output.dense.bias torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
bert.encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
bert.encoder

In [39]:
# Load the tokenizer and fine-tuned model
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertForSequenceClassification.from_pretrained("./fine-tuned-bert-last-layer")  # Path to your saved model
# Function to predict the sentiment of a review
def predict_sentiment(review_text):
    # Tokenize the input review
    inputs = tokenizer(review_text, truncation=True, padding='max_length', max_length=128, return_tensors="pt")

    # Get the model predictions
    with torch.no_grad():
        outputs = model(**inputs)

    # Get the predicted label (0 for negative, 1 for positive)
    predicted_label = torch.argmax(outputs.logits, dim=1).item()

    # Map label to sentiment
    sentiment = "Positive" if predicted_label == 1 else "Negative"
    return sentiment

# Example usage: Write a review and predict its sentiment
review = "I like this movie, really good."
predicted_sentiment = predict_sentiment(review)
print(f"Review: {review}")
print(f"Predicted Sentiment: {predicted_sentiment}")

Review: I like this movie, really good.
Predicted Sentiment: Positive
