<a href="https://colab.research.google.com/github/advik-7/NLP_projects/blob/main/fine_tuning_distilberwidLORA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch transformers datasets scikit-learn


Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m24.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score

# Check if GPU is available and set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load AG News dataset
dataset = load_dataset("ag_news")

# Load tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Tokenization function
def tokenize(batch):
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

# Apply tokenization and set format
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.rename_column("label", "labels")
dataset.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

# Define LoRA Layer for Low-Rank Adaptation
class LoRALayer(nn.Module):
    def __init__(self, in_features, out_features, rank=4):
        super(LoRALayer, self).__init__()
        self.down_proj = nn.Linear(in_features, rank, bias=False)
        self.up_proj = nn.Linear(rank, out_features, bias=False)

    def forward(self, x):
        return self.up_proj(self.down_proj(x))

# Modify DistilBERT to include LoRA layers in the attention heads
class LoRADistilBERT(nn.Module):
    def __init__(self, distilbert_model, rank=4):
        super(LoRADistilBERT, self).__init__()
        self.distilbert = distilbert_model

        # Replace attention layers with LoRA layers
        for name, module in self.distilbert.transformer.layer.named_modules():
            if name.endswith("attention.q_lin"):
                in_features, out_features = module.in_features, module.out_features
                lora_layer = LoRALayer(in_features, out_features, rank=rank)
                setattr(self.distilbert.transformer.layer, name, lora_layer)

    def forward(self, input_ids, attention_mask=None, labels=None):
        return self.distilbert(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

# Load pre-trained DistilBERT model and apply LoRA modifications
distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
model = LoRADistilBERT(distilbert)
model.to(device)

# Metrics for evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": accuracy, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True
)

# Initialize Trainer with model and data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"].select(range(500)),  # Limit to 500 samples for practice
    eval_dataset=dataset["test"].select(range(100)),   # Limit to 100 samples for practice
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Test Results:", results)


Using device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/8.07k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/18.6M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/1.23M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/120000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/7600 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]



Map:   0%|          | 0/120000 [00:00<?, ? examples/s]

Map:   0%|          | 0/7600 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'DistilBertForSequenceClassification' object has no attribute 'transformer'

In [4]:
# Modify DistilBERT to include LoRA layers in the attention heads
class LoRADistilBERT(nn.Module):
    def __init__(self, distilbert_model, rank=4):
        super(LoRADistilBERT, self).__init__()
        self.distilbert = distilbert_model.distilbert  # Access base transformer layers
        self.classifier = distilbert_model.classifier

        # Replace attention layers with LoRA layers
        for layer in self.distilbert.transformer.layer:
            in_features, out_features = layer.attention.q_lin.in_features, layer.attention.q_lin.out_features
            layer.attention.q_lin = LoRALayer(in_features, out_features, rank=rank)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.distilbert(input_ids=input_ids, attention_mask=attention_mask)
        logits = self.classifier(outputs.last_hidden_state[:, 0, :])  # Classifier on [CLS] token output
        return torch.nn.functional.cross_entropy(logits, labels) if labels is not None else logits


# Load pre-trained DistilBERT model and apply LoRA modifications
distilbert = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=4)
model = LoRADistilBERT(distilbert)
model.to(device)

# Metrics for evaluation
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    accuracy = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average="weighted")
    return {"accuracy": accuracy, "f1": f1}

# Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True
)

# Initialize Trainer with model and data
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"].select(range(500)),  # Limit to 500 samples for practice
    eval_dataset=dataset["test"].select(range(100)),   # Limit to 100 samples for practice
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Test Results:", results)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: [32m[41mERROR[0m API key must be 40 characters long, yours was 3


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:


Abort: 

In [7]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
import torch

# Define optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)


    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 2.3117177486419678
Epoch 2, Loss: 4.293351650238037
Epoch 3, Loss: 1.6298283338546753


In [9]:

optimizer = AdamW(model.parameters(), lr=5e-6)


In [10]:

# Training loop
model.train()
for epoch in range(10):
    for batch in train_dataloader:
        optimizer.zero_grad()

        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)

        # Check if the output is a tuple with loss as the first element or just the loss
        if isinstance(outputs, tuple):
            # If it's a tuple (loss, logits)
            loss = outputs[0]
        else:
            # If it's just the loss
            loss = outputs

        # Backward pass
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")


Epoch 1, Loss: 3.5418100357055664
Epoch 2, Loss: 3.51240611076355
Epoch 3, Loss: 3.4717671871185303
Epoch 4, Loss: 3.4287099838256836
Epoch 5, Loss: 3.5165562629699707
Epoch 6, Loss: 3.4337897300720215
Epoch 7, Loss: 3.4048819541931152
Epoch 8, Loss: 3.380728244781494
Epoch 9, Loss: 3.377145767211914
Epoch 10, Loss: 3.3931360244750977
