<a href="https://colab.research.google.com/github/Yash-Yelave/LLM/blob/main/LLM_PR1_Transformer_Finetuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
#------------------------------
# 1. Load Synthetic Dataset
#------------------------------
# The CSV must provide a "text" column and a "label" column; both are
# converted to plain Python lists.
DATASET_PATH = "synthetic_dataset.csv"
data = pd.read_csv(DATASET_PATH)
texts = data["text"].to_list()
labels = data["label"].to_list()
#------------------------------
# 2. Train-Test Split
#------------------------------
# 80/20 split; the fixed seed keeps the partition reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(texts, labels, **split_options)
#------------------------------
# 3. Load Tokenizer
#------------------------------
TOKENIZER_NAME = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(TOKENIZER_NAME)
#------------------------------
# 4. Custom Dataset Class
#------------------------------
class TextDataset(Dataset):
    """Dataset that tokenizes one text per lookup and pairs it with its label.

    Args:
        texts: Indexable, sized collection of input texts.
        labels: Indexable, sized collection of integer class labels, aligned
            with ``texts`` by position.
        tokenizer: Callable accepting a text plus the ``truncation``,
            ``padding``, ``max_length`` and ``return_tensors`` keyword
            arguments and returning a mapping with "input_ids" and
            "attention_mask" tensors.
        max_length: Length every sequence is truncated/padded to.
            Defaults to 128, the value that was previously hard-coded.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Fail early: a mismatch would otherwise surface as an IndexError in
        # the middle of an epoch, or silently ignore the surplus labels.
        if len(texts) != len(labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(texts)} and {len(labels)}"
            )
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors.

        Returns:
            Dict with "input_ids" and "attention_mask" (dimension 0 of the
            tokenizer output squeezed away) and "labels" (a ``torch.long``
            scalar tensor).
        """
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt"
        )
        return {
            # squeeze(0) removes dimension 0 when it has size 1 -- presumably
            # the batch axis added by return_tensors="pt" for a single text.
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }
#------------------------------
# 5. DataLoaders
#------------------------------
# Only the training loader shuffles; the test loader keeps dataset order.
BATCH_SIZE = 8
train_dataset, test_dataset = (
    TextDataset(X_train, y_train, tokenizer),
    TextDataset(X_test, y_test, tokenizer),
)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)
#------------------------------
# 6. Load Pre-trained Model
#------------------------------
# Three-way sequence classifier on top of the pre-trained BERT encoder.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
# Use the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model = model.to(device)
#------------------------------
# 7. Evaluation Function
#------------------------------
def evaluate(model, dataloader, device):
    """Return the classification accuracy of ``model`` over ``dataloader``.

    The model is switched to evaluation mode and is left in that mode on
    return, so callers that resume training must call ``model.train()``.
    Batches are processed without gradient tracking; the predicted class is
    the arg-max of the logits along dimension 1.

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask``
            keyword arguments and returning an object with a ``logits``
            attribute.
        dataloader: Iterable of dicts holding "input_ids", "attention_mask"
            and "labels" tensors.
        device: Device the batch tensors are moved to before the forward
            pass.

    Returns:
        Fraction of correctly classified examples, or ``0.0`` when the
        dataloader yields no examples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)
            # Labels are deliberately not passed: only the logits are needed.
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    # An empty dataloader would otherwise raise ZeroDivisionError.
    if total == 0:
        return 0.0
    return correct / total
#------------------------------
# 8. Evaluation BEFORE Fine-Tuning
#------------------------------
# Baseline accuracy on the held-out split, measured before any weight update.
before_accuracy = evaluate(model, test_loader, device)
print(f"Accuracy BEFORE fine-tuning: {before_accuracy:.4f}")
#------------------------------
# 9. Optimizer
#------------------------------
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
#------------------------------
# 10. Fine-Tuning (Training Loop)
#------------------------------
# evaluate() leaves the model in eval mode, so switch back before training.
model.train()
epochs = 3
for epoch in range(epochs):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        # Use batch_-prefixed names: this loop runs at module scope, and
        # plain `labels` would overwrite the label list loaded from the CSV.
        batch_input_ids = batch["input_ids"].to(device)
        batch_attention_mask = batch["attention_mask"].to(device)
        batch_labels = batch["labels"].to(device)
        # Passing labels makes the model compute the loss itself.
        outputs = model(
            input_ids=batch_input_ids,
            attention_mask=batch_attention_mask,
            labels=batch_labels
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # total_loss is the sum of the per-batch losses of the epoch, not the mean.
    print(f"Epoch {epoch + 1}/{epochs}, Loss: {total_loss:.4f}")
#------------------------------
# 11. Evaluation AFTER Fine-Tuning
#------------------------------
# Same held-out split as the baseline, so the two accuracies are comparable.
after_accuracy = evaluate(model, test_loader, device)
print(f"Accuracy AFTER fine-tuning: {after_accuracy:.4f}")
#------------------------------
# 12. Save Fine-Tuned Model
#------------------------------
# Model and tokenizer are written to the same directory.
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")
print("Model and tokenizer saved successfuly.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Accuracy BEFORE fine-tuning: 0.0000
Epoch 1/3, Loss: 1.0104
Epoch 2/3, Loss: 0.9539
Epoch 3/3, Loss: 1.0065
Accuracy AFTER fine-tuning: 0.0000


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Model and tokenizer saved successfuly.
