In [None]:
!pip install --upgrade transformers accelerate bitsandbytes

Collecting transformers
  Downloading transformers-4.57.1-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate
  Downloading accelerate-1.11.0-py3-none-any.whl.metadata (19 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.48.2-py3-none-manylinux_2_24_x86_64.whl.metadata (10 kB)
Collecting tokenizers<=0.23.0,>=0.22.0 (from transformers)
  Downloading tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelera

## Import libraries

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
### Data pre-processing imports ###

import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

### Custom Model Imports ###
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
from torch.nn import functional as F

from transformers import (
    AutoTokenizer,
    AutoModel,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
    TrainerCallback
)

import wandb
import huggingface_hub
from huggingface_hub import PyTorchModelHubMixin

from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_kbit_training,
)

In [None]:
from kaggle_secrets import UserSecretsClient

# Client for reading the API tokens stored as Kaggle user secrets.
user_secrets = UserSecretsClient()

# The Weights & Biases token is stored under the label "wandb_api".
# If you saved it under a different label, change the name below.
wandb_api = user_secrets.get_secret("wandb_api") 

# Log in to Weights & Biases (the Trainer below reports to wandb).
wandb.login(key=wandb_api)

# The Hugging Face token is stored under the label "huggingface_api";
# logging in is needed because the Trainer below pushes the model to the Hub.
huggingface_api = user_secrets.get_secret("huggingface_api") 
huggingface_hub.login(token=huggingface_api)

In [None]:
# from huggingface_hub import HfApi
# api = HfApi()
# api.create_repo(repo_id="bheshaj/deberta-v3-base-pairwise-sequence-classifier", private=False)

In [None]:
# Sanity check: is a CUDA device visible to torch?
print(torch.cuda.is_available())

In [None]:
# Sanity check: does the GPU support bfloat16? (The 4-bit quantization config below requests bfloat16 compute.)
torch.cuda.is_bf16_supported()

## Set config

Set configurations for model

In [None]:
# Use the GPU when torch can see one, otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Hugging Face model id of the backbone encoder.
MODEL_NAME = "microsoft/deberta-v3-base"

# Max sequence length (in tokens) per tokenized conversation; longer inputs are truncated at tokenization.
MAX_SEQUENCE_LENGTH = 512

# Quantization mode for the backbone: True loads it in 4-bit, False in 8-bit.
USE_4BIT = True

# PEFT/LoRA params: module names to adapt, adapter rank, scaling alpha, and adapter dropout.
LORA_TARGETS = ["query_proj", "key_proj", "value_proj", "dense"]
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

## Import dataset

In [None]:
# Load the competition's train and test CSVs from the Kaggle input directory.
raw_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/train.csv")
test_df = pd.read_csv("/kaggle/input/llm-classification-finetuning/test.csv")

In [None]:
# Preview the first rows of the training data.
raw_df.head()

In [None]:
# Preview the first rows of the test data.
test_df.head()

## EDA

In [None]:
# Inspect the dtype pandas inferred for each training column.
raw_df.dtypes

Bar chart of how often each model appears in the training data (as either model A or model B)

In [None]:
# Count how often each model appears on either side (A or B) of a comparison.
model_counts = pd.concat([raw_df['model_a'], raw_df['model_b']]).value_counts()

# Draw on an explicit Axes so the figure can be labelled and stands alone when skimmed.
# (`stacked=True` was dropped: it has no effect when plotting a single Series.)
fig, ax = plt.subplots(figsize=(12, 5))
model_counts.plot(kind='bar', ax=ax)
ax.set(
    title='Appearances per model (as model A or model B)',
    xlabel='Model',
    ylabel='Number of appearances',
)

plt.show()

## Data pre-processing

The prompt and response columns are stored as JSON-encoded strings and need to be parsed before use.

In [None]:
import json

def safe_parse_json(x):
    """
    Decode a JSON-encoded cell value without raising on bad input.

    Behaviour by input:
      - non-string value            -> returned unchanged
      - string that is not valid JSON -> ""
      - JSON list with elements     -> the list, with every None entry replaced by ''
      - empty JSON list             -> ''
      - any other JSON value        -> the decoded value as-is
    """
    if not isinstance(x, str):
        return x

    try:
        decoded = json.loads(x)
    except json.JSONDecodeError:
        return ""

    if not isinstance(decoded, list):
        return decoded
    if not decoded:
        return ''
    # Null entries become empty strings so later string formatting never sees None.
    return ['' if entry is None else entry for entry in decoded]

# Decode the JSON-encoded text columns of the training data into "<column>_processed" columns.
for _col in ("response_a", "response_b", "prompt"):
    raw_df[f"{_col}_processed"] = raw_df[_col].apply(safe_parse_json)

In [None]:
# Decode the JSON-encoded text columns of the test data into "<column>_processed" columns.
for _col in ("response_a", "response_b", "prompt"):
    test_df[f"{_col}_processed"] = test_df[_col].apply(safe_parse_json)

Format each prompt and its response as a chat-style question–answer transcript. Multi-turn conversations are rendered as consecutive Query/Response pairs.

Example:

Query:
What is the difference between marriage license and marriage certificate?

Response:
A marriage license is a legal document that allows a couple to get married. It is issued by a government agency, such as a county clerk's office or a state government, and is valid for a certain period of time, usually one year. After the marriage has taken place, the couple must obtain a marriage certificate, which is a document that records the marriage and is used to prove that the marriage took place. The marriage certificate is usually issued by the same government agency that issued the marriage license, and it is typically used for legal purposes, such as to change a name on a driver's license or to prove that a couple is married when applying for government benefits.

In [None]:
def format_conversation(query_list, response_list):
    """
    Render a conversation as consecutive "Query:" / "Response:" blocks.

    Args:
        query_list: Prompts, one per turn. A bare non-empty string is treated
            as a single turn; an empty string or None means "no turns".
        response_list: Responses, one per turn, with the same conventions as
            `query_list`.

    Returns:
        One string in which each turn is formatted as
        "Query:\\n<q>\\n\\nResponse:\\n<r>" and turns are separated by a blank
        line. If the two inputs differ in length, the extra turns of the
        longer one are dropped (zip semantics). Returns '' when there are no
        turns.
    """
    def _as_turns(value):
        # Without this, zip() would iterate a bare string character by character.
        if value is None:
            return []
        if isinstance(value, str):
            return [value] if value else []
        return value

    turns = zip(_as_turns(query_list), _as_turns(response_list))
    return '\n\n'.join(f"Query:\n{q}\n\nResponse:\n{r}" for q, r in turns)

# Build one transcript per side of the comparison: same prompts, side-specific responses.
for _side in ("a", "b"):
    raw_df[f"text_{_side}"] = raw_df.apply(
        lambda row, side=_side: format_conversation(
            row["prompt_processed"], row[f"response_{side}_processed"]
        ),
        axis=1,
    )

In [None]:
# Same transcript construction for the test data.
for _side in ("a", "b"):
    test_df[f"text_{_side}"] = test_df.apply(
        lambda row, side=_side: format_conversation(
            row["prompt_processed"], row[f"response_{side}_processed"]
        ),
        axis=1,
    )

Summary statistics for the number of words in the conversation texts. This helps to determine the maximum input sequence length to the model and hence helps decide the model to choose.

Most conversations contain fewer than 550 words. Assuming $ \text{Tokens per conversation} \approx 1.5 \times \text{Words per conversation} $, we would ideally need a model that can handle around 825–850 tokens. However, to keep the model simple and cheap to train, we use a model with a smaller maximum sequence length and truncate longer conversations.

In [None]:
# Space-delimited word counts of the A-side transcripts, summarised at selected percentiles.
word_split = raw_df["text_a"].str.split(' ')
word_split.map(len).describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.90])

In [None]:
# Space-delimited word counts of the B-side transcripts, summarised at selected percentiles.
word_split = raw_df["text_b"].str.split(' ')
word_split.map(len).describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.90])

### Create target label

Create a single target column which can help determine the true class

In [None]:
def create_target_col(encoding):
    """
    Translate a one-hot winner encoding into its class name.

    `encoding` is the list [winner_model_a, winner_model_b, winner_tie].
    Returns 'model_a', 'model_b' or 'tie' for the three valid one-hot
    patterns, and NaN for anything else.
    """
    known_outcomes = (
        ([1, 0, 0], 'model_a'),
        ([0, 1, 0], 'model_b'),
        ([0, 0, 1], 'tie'),
    )
    for pattern, outcome in known_outcomes:
        if encoding == pattern:
            return outcome

    return np.nan

# Collapse the three one-hot winner columns into a single named outcome per row.
raw_df['target'] = (
    raw_df[['winner_model_a', 'winner_model_b', 'winner_tie']]
    .apply(lambda row: create_target_col(list(row)), axis=1)
)

# Integer class ids used as training labels.
raw_df['label'] = raw_df['target'].map({'model_a': 0, 'model_b': 1, 'tie': 2})

### Train-test split

In [None]:

# Hold out 20% of the rows for evaluation; stratify on the label so both splits keep
# the same class balance, and fix the seed so the split is reproducible.
train_df, eval_df = train_test_split(raw_df, test_size=0.2, random_state=42, stratify=raw_df["label"])

## Model Architecture

In [None]:
def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    """
    Average token vectors over the sequence axis, counting only unmasked positions.

    Args:
        last_hidden_state: Token representations; expected as (batch, seq_len, hidden).
        attention_mask: Per-token weights (1 = keep, 0 = ignore); expected as (batch, seq_len).

    Returns:
        The masked mean along dim 1. The divisor is clamped to at least 1e-9,
        so a fully masked row yields zeros instead of a division by zero.
    """
    weights = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    token_total = torch.sum(last_hidden_state * weights, dim=1)
    token_count = torch.clamp(weights.sum(dim=1), min=1e-9)
    return token_total / token_count

class PairwiseBiEncoder(
    nn.Module, 
    PyTorchModelHubMixin # Required to push the model to huggingface hub
    ):
    """
    Pairwise classifier: encode A and B with the same encoder, then compare.

    Each side is encoded independently into one vector. The two vectors are
    combined as [hA, hB, |hA - hB|, hA * hB] and fed to an MLP head that
    produces `num_labels` logits.

    Expects tokenized inputs for A and B: {input_ids, attention_mask}.

    Args:
        encoder: Backbone module called as
            encoder(input_ids=..., attention_mask=..., return_dict=True).
        hidden_size: Width of the vector the encoder produces per sequence.
        num_labels: Number of output classes.
        dropout: Dropout probability, applied to the combined feature vector
            and inside the classifier head.
    """
    def __init__(self, encoder: nn.Module, hidden_size: int, num_labels: int = 3, dropout: float = 0.2):
        super().__init__()
        self.num_labels = num_labels
        self.encoder = encoder
        self.dropout = nn.Dropout(dropout)
        self.config = getattr(encoder, "config", None) # Needed for peft/LoRA config
        # Names of encoder params that were trainable when freeze_backbone() last
        # froze something; unfreeze_backbone() restores exactly these.
        self._trainable_before_freeze = None
        self.classifier = nn.Sequential(
            nn.Linear(4 * hidden_size, 2*hidden_size),
            nn.GELU(),
            nn.Linear(2 * hidden_size, hidden_size),
            nn.GELU(),
            # Use the constructor's `dropout` here too (previously hard-coded to 0.2,
            # which silently ignored the argument inside the head).
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, self.num_labels)
        )

    def freeze_backbone(self):
        """
        Disable gradients for every floating-point encoder parameter.

        The names of the parameters that were trainable beforehand are recorded
        so that `unfreeze_backbone` can restore exactly that set.
        """
        previously_trainable = set()
        for name, param in self.encoder.named_parameters():
            # skip non-floating params (e.g., int8/quantized wrappers)
            dt = getattr(param, "dtype", None)
            if dt is None or not dt.is_floating_point:
                # leave as-is (cannot require grad)
                continue
            if param.requires_grad:
                previously_trainable.add(name)
            param.requires_grad = False
        # Only overwrite the record when something was actually frozen, so calling
        # this twice in a row does not erase the set to restore.
        if previously_trainable:
            self._trainable_before_freeze = previously_trainable

    def unfreeze_backbone(self):
        """
        Re-enable gradients on the encoder.

        If `freeze_backbone` recorded which parameters were trainable, only
        those are re-enabled; parameters that were already frozen before
        (e.g. base weights under an adapter setup) stay frozen. If nothing was
        recorded, every floating-point encoder parameter is made trainable.
        Non-floating-point parameters are skipped and reported.
        """
        restore = getattr(self, "_trainable_before_freeze", None)
        skipped = []
        for name, param in self.encoder.named_parameters():
            dt = getattr(param, "dtype", None)
            if dt is None or not dt.is_floating_point:
                skipped.append(name)
                continue
            if restore is None or name in restore:
                param.requires_grad = True
        if skipped:
            # small debug print — in heavy logging environments prefer logger.warning
            print(f"Warning: skipped unfreezing {len(skipped)} non-float params (examples): {skipped[:6]}")


    def is_backbone_frozen(self) -> bool:
        """Return True when no encoder parameter currently requires gradients."""
        return not any(p.requires_grad for p in self.encoder.parameters())

    def _encode(self, input_ids, attention_mask):
        """
        Encode one side into a single vector per sequence.

        Uses the encoder's `pooler_output` when it provides one, otherwise the
        attention-masked mean of `last_hidden_state`.

        Raises:
            ValueError: If the encoder output has neither field.
        """
        # ensure return_dict=True for HF models; some wrappers still return tuple
        out = self.encoder(input_ids=input_ids, attention_mask=attention_mask, return_dict=True)

        # handle tuple outputs (PEFT/LoRA/backbone wrappers sometimes return tuple)
        if isinstance(out, tuple):
            last_hidden_state = out[0]
            pooler_output = out[1] if len(out) > 1 else None
        else:
            last_hidden_state = getattr(out, "last_hidden_state", None)
            pooler_output = getattr(out, "pooler_output", None)

        if pooler_output is not None:
            return pooler_output
        if last_hidden_state is not None:
            return mean_pool(last_hidden_state, attention_mask)

        raise ValueError("Encoder output missing last_hidden_state and pooler_output.")

    def forward(
        self,
        a_input_ids: torch.Tensor,
        a_attention_mask: torch.Tensor,
        b_input_ids: torch.Tensor,
        b_attention_mask: torch.Tensor,
        labels: Optional[torch.Tensor] = None,
        **kwargs,  # catch everything else, needed with peft
    ) -> Dict[str, torch.Tensor]:
        """
        Standard forward pass compatible with HF Trainer.

        Args:
            a_input_ids, a_attention_mask: Tokenized conversation A.
            b_input_ids, b_attention_mask: Tokenized conversation B.
            labels: Optional class ids; when given, cross-entropy loss is computed.
            **kwargs: Ignored; absorbs extra arguments injected by wrappers.

        Returns:
            {"loss": cross-entropy loss or None, "logits": [B, num_labels] logits}.
        """
        hA = self._encode(a_input_ids, a_attention_mask)  # [B, H]
        hB = self._encode(b_input_ids, b_attention_mask)  # [B, H]

        # Comparison features: both vectors, their absolute difference and their product.
        comb = torch.cat([hA, hB, torch.abs(hA - hB), hA * hB], dim=-1)
        logits = self.classifier(self.dropout(comb))

        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return {"loss": loss, "logits": logits}

class FreezeUnfreezeCallback(TrainerCallback):
    """
    Freezes backbone for initial epochs, then unfreezes later.

    At the start of every epoch before `unfreeze_at_epoch` the backbone is
    frozen (if it is not already); at the first epoch at or after it, the
    backbone is unfrozen once. The model must provide `is_backbone_frozen()`,
    `freeze_backbone()` and `unfreeze_backbone()`.

    Args:
        unfreeze_at_epoch: First epoch (0-based) during which the backbone trains.
    """

    def __init__(self, unfreeze_at_epoch: int = 1):
        self.unfreeze_at_epoch = unfreeze_at_epoch
        self.has_unfrozen = False

    def on_train_begin(self, args, state, control, **kwargs):
        # Reset per run: otherwise a second trainer.train() with the same callback
        # instance would re-freeze the backbone and never unfreeze it again.
        self.has_unfrozen = False

    def on_epoch_begin(self, args, state, control, model=None, **kwargs):
        if model is None:
            # Nothing to (un)freeze if the trainer did not hand over a model.
            return

        # state.epoch may be unset before the first step; treat that as epoch 0.
        epoch = state.epoch if state.epoch is not None else 0

        if epoch < self.unfreeze_at_epoch:
            if not model.is_backbone_frozen():
                print(f"Epoch {int(epoch)}: Freezing backbone.")
                model.freeze_backbone()
        elif not self.has_unfrozen:
            print(f"Epoch {int(epoch)}: Unfreezing backbone.")
            model.unfreeze_backbone()
            self.has_unfrozen = True

### Quantization and LoRA config

In [None]:
# Build the bitsandbytes config for the selected mode only. BitsAndBytesConfig
# type-checks the bnb_4bit_* options, so passing None for them in 8-bit mode
# (as the single-call form did when USE_4BIT is False) fails with a TypeError.
if USE_4BIT:
    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        # bfloat16 compute on GPU; None leaves the library default in place otherwise.
        bnb_4bit_compute_dtype=torch.bfloat16 if torch.cuda.is_available() else None,
    )
else:
    quant_config = BitsAndBytesConfig(load_in_8bit=True)

# LoRA adapter configuration (only built here; it is applied later via get_peft_model).
lora_cfg = LoraConfig(
    r=LORA_R,                      # adapter rank
    lora_alpha=LORA_ALPHA,         # adapter scaling factor
    lora_dropout=LORA_DROPOUT,     # dropout applied on the adapter path
    target_modules=LORA_TARGETS,   # names of the modules that receive adapters
    bias="none",                   # do not train bias terms
    task_type="SEQ_CLS",           # sequence-classification task type
)

### Initialize backbone model and tokenizer

In [None]:
# Load the tokenizer and the quantized backbone encoder for MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
backbone = AutoModel.from_pretrained(MODEL_NAME, quantization_config=quant_config, device_map="auto")

# Width of the encoder's hidden states, as reported by its config.
hidden_size = backbone.config.hidden_size

# Fail early if the configured input length exceeds what the backbone's config allows.
assert MAX_SEQUENCE_LENGTH <= backbone.config.max_position_embeddings, f"Config 'max_sequence_length' must be <= the max sequence length allowed by the model i.e. {backbone.config.max_position_embeddings}"

### Tokenize and format data as per requirement in huggingface trainer

In [None]:
from dataclasses import dataclass
from typing import Dict, List, Any
import torch

def tokenize_pairwise(batch, padding="max_length", max_length=None):
    """
    Tokenize both sides of a batch of comparison pairs.

    Args:
        batch: Mapping with "text_a" and "text_b" (lists of strings) and,
            optionally, "label".
        padding: Padding strategy forwarded to the tokenizer. The default
            "max_length" pads every sequence to `max_length`; pass False to
            leave sequences unpadded so a data collator can pad each batch
            to its longest member instead.
        max_length: Truncation (and, with "max_length" padding, padding)
            length. Defaults to MAX_SEQUENCE_LENGTH.

    Returns:
        Dict with "a_input_ids", "a_attention_mask", "b_input_ids",
        "b_attention_mask", plus "labels" when the batch has a "label" column.
    """
    if max_length is None:
        max_length = MAX_SEQUENCE_LENGTH

    out_dict = {}
    for side in ("a", "b"):
        encodings = tokenizer(
            batch[f"text_{side}"],
            padding=padding,
            truncation=True,
            max_length=max_length,
        )
        out_dict[f"{side}_input_ids"] = encodings["input_ids"]
        out_dict[f"{side}_attention_mask"] = encodings["attention_mask"]

    # The test data has no labels, so only forward them when present.
    if "label" in batch:
        out_dict["labels"] = batch["label"]

    return out_dict


In [None]:
from datasets import Dataset

# Wrap the train/eval DataFrames as Hugging Face Datasets.
# NOTE: both objects are rebuilt from the same DataFrames in the tokenization cell
# further down, so the values created here are overwritten there.
train_dataset = Dataset.from_pandas(train_df)
eval_dataset = Dataset.from_pandas(eval_df)

In [None]:
from dataclasses import dataclass
from typing import List, Any, Dict
import torch

@dataclass
class PairwiseDataCollator:
    """
    Collate pre-tokenized A/B pairs into one padded batch.

    Each feature must be a dict holding "a_input_ids", "a_attention_mask",
    "b_input_ids" and "b_attention_mask"; "labels" and "id" are optional.
    Side A and side B are padded separately through `tokenizer.pad`.
    """
    # Object exposing a `pad(features, **kwargs)` method.
    tokenizer: Any
    padding: str = "longest"   # "longest" or "max_length"
    # Only forwarded to `pad` when padding == "max_length".
    max_length: int = 128

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """
        Build the batch dict for a list of features.

        Raises:
            ValueError: If no dict features remain after filtering, or if a
                feature lacks one of the four required keys.
        """
        # Defensive: keep only real dict features (drops None and anything else).
        rows = [row for row in features if isinstance(row, dict)]
        if not rows:
            raise ValueError("Empty batch after filtering None features in collator.")

        # Report the first missing required key of the first offending feature.
        required = ("a_input_ids", "a_attention_mask", "b_input_ids", "b_attention_mask")
        for pos, row in enumerate(rows):
            missing = next((key for key in required if key not in row), None)
            if missing is not None:
                raise ValueError(f"Missing key {missing} in feature at batch pos {pos}: keys={list(row.keys())}")

        # Truncation is expected to have happened at tokenization time, so only
        # padding options are passed on to pad().
        pad_kwargs = {"padding": self.padding, "return_tensors": "pt"}
        if self.padding == "max_length":
            pad_kwargs["max_length"] = self.max_length

        def _pad_side(side):
            # Re-key one side's fields to the names tokenizer.pad expects.
            side_feats = [
                {"input_ids": row[f"{side}_input_ids"], "attention_mask": row[f"{side}_attention_mask"]}
                for row in rows
            ]
            return self.tokenizer.pad(side_feats, **pad_kwargs)

        a_batch = _pad_side("a")
        b_batch = _pad_side("b")

        out = {
            "a_input_ids": a_batch["input_ids"],
            "a_attention_mask": a_batch["attention_mask"],
            "b_input_ids": b_batch["input_ids"],
            "b_attention_mask": b_batch["attention_mask"],
        }

        first = rows[0]

        # Labels are optional (absent at inference time); presence is decided by the first feature.
        if "labels" in first:
            out["labels"] = torch.tensor([int(row["labels"]) for row in rows], dtype=torch.long)

        # Pass ids through as a plain list (not tensorized) when present.
        if "id" in first:
            out["ids"] = [row.get("id") for row in rows]

        return out


In [None]:
from datasets import Dataset

# Rebuild the datasets from the DataFrames so this cell can be re-run safely
# (map() below removes the original columns). preserve_index=False stops the shuffled
# pandas index left by the train/eval split from being carried along as an extra
# `__index_level_0__` column that `remove_columns` would not drop.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
eval_dataset = Dataset.from_pandas(eval_df, preserve_index=False)

# Tokenize both sides of every pair in batches, keeping only the tokenizer outputs (and labels).
train_dataset = train_dataset.map(tokenize_pairwise, batched=True, remove_columns=list(train_df.columns))
eval_dataset = eval_dataset.map(tokenize_pairwise, batched=True, remove_columns=list(eval_df.columns))


In [None]:
# Return the model inputs and labels as torch tensors for both labelled splits.
_tensor_columns = ["a_input_ids", "a_attention_mask", "b_input_ids", "b_attention_mask", "labels"]

for _split in (train_dataset, eval_dataset):
    _split.set_format(type="torch", columns=list(_tensor_columns))

In [None]:
# Wrap the test DataFrame as a Hugging Face Dataset.
test_dataset = Dataset.from_pandas(test_df)

# Tokenize both sides of every pair; the original DataFrame columns are dropped.
test_dataset = test_dataset.map(tokenize_pairwise, batched=True, remove_columns=list(test_df.columns))

# Return the model inputs as torch tensors (no "labels" column: it is not requested for the test split).
test_dataset.set_format(
    type="torch",
    columns=["a_input_ids","a_attention_mask","b_input_ids","b_attention_mask"]
)

In [None]:
# Prepare the quantized backbone for k-bit training (PEFT helper).
backbone = prepare_model_for_kbit_training(backbone)

# Wrap the backbone in the pairwise comparison model with a 3-class head.
custom_model = PairwiseBiEncoder(encoder=backbone, hidden_size=backbone.config.hidden_size, num_labels=3)

# Ensure the custom model has a .config (PEFT expects it).
# PairwiseBiEncoder.__init__ already copies encoder.config, so this is a belt-and-braces re-assignment.
custom_model.config = backbone.config

# Wrap the custom model with LoRA adapters as described by lora_cfg.
custom_model = get_peft_model(custom_model, lora_cfg)

In [None]:
# Collator that pads side A and side B of each batch separately (default: pad to the longest sequence).
data_collator = PairwiseDataCollator(tokenizer=tokenizer)

# Training configuration: evaluate and checkpoint once per epoch, log to Weights & Biases,
# and push results to the Hugging Face Hub repo named in hub_model_id.
training_args = TrainingArguments(
    output_dir="/kaggle/working/deberta-v3-base-pairwise-sequence-classifier",
    num_train_epochs=6,
    per_device_train_batch_size=16,
    eval_strategy="epoch",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    report_to='wandb',
    push_to_hub=True,
    hub_model_id='bheshaj/deberta-v3-base-pairwise-sequence-classifier',
    hub_private_repo=False,
    save_strategy="epoch",
    save_total_limit=1,           # cap on the number of checkpoints kept on disk
    load_best_model_at_end=True,    # reload the best checkpoint when training finishes
    save_safetensors=True,           # write checkpoints in safetensors format
)

trainer = Trainer(
    model=custom_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    callbacks=[FreezeUnfreezeCallback(unfreeze_at_epoch=3)],  # backbone frozen for epochs 0-2, unfrozen from epoch 3
)

# Run training (6 epochs, as configured above).
trainer.train()

In [None]:
# Upload the trained model to the Hugging Face Hub repo configured in training_args (hub_model_id).
trainer.push_to_hub()