In [None]:
!pip install --upgrade transformers accelerate bitsandbytes

## Import libraries

In [None]:
# Mount Google Drive at /content/drive; the training CSV is read from this mount further down.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import torch

In [None]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

In [None]:
# Check bfloat16 support; bfloat16 is requested as the 4-bit compute dtype in the quantization configs below.
torch.cuda.is_bf16_supported()

## Set config

In [None]:
# Intended cap on input sequence length.
# NOTE(review): no visible tokenizer call applies this value yet — confirm where it is enforced.
MAX_SEQUENCE_LENGTH = 512

In [None]:
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [None]:
# Backbone checkpoint and quantization switch (True -> load in 4-bit, False -> load in 8-bit).
MODEL_NAME = "microsoft/deberta-v3-base"
USE_4BIT = True

# LoRA hyper-parameters handed to LoraConfig when the backbone is wrapped.
LORA_TARGETS = ["query_proj", "key_proj", "value_proj", "dense"]  # module names that receive adapters
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT = 0.05

## Import dataset

In [None]:
# Switch the working directory to /content/ and echo it as the cell output.
import os
os.chdir('/content/')
os.getcwd()

In [None]:
# Load train.csv from the mounted Drive folder into `raw_df`.
raw_df = pd.read_csv("/content/drive/MyDrive/Data Science Projects/llm-finetuning/nlp-playground/data/raw/llm-classification-finetuning/train.csv")
# Relative-path alternative for running outside the Drive mount:
# raw_df = pd.read_csv("./data/raw/llm-classification-finetuning/train.csv")

In [None]:
# Preview the first rows of the raw frame.
raw_df.head()

## EDA

In [None]:
# Column dtypes as read from the CSV.
raw_df.dtypes

In [None]:
# Show the Python type held in the first row of each text column.
for text_col in ('prompt', 'response_a', 'response_b'):
    print(type(raw_df[text_col].iloc[0]))

In [None]:
# Count how often each model appears on either side (A or B) of a comparison.
model_counts = pd.concat([raw_df['model_a'], raw_df['model_b']]).value_counts()

# Explicit figure/axes so the chart can carry a title and axis labels.
fig, ax = plt.subplots(figsize=(12, 5))
model_counts.plot(kind='bar', ax=ax)
ax.set(
    title='Model appearances across both sides (model_a + model_b)',
    xlabel='Model',
    ylabel='Number of appearances',
)

plt.show()

In [None]:
# Distinct raw `response_a` strings that are shorter than 10 characters.
short_response_mask = raw_df['response_a'].str.len() < 10
raw_df.loc[short_response_mask, 'response_a'].unique()

## Data pre-processing

In [None]:
import json

def safe_parse_json(x):
    """
    Decode a JSON-encoded cell value.

    - Non-string input is returned unchanged.
    - A string that is not valid JSON yields "".
    - A decoded empty list yields ''.
    - A decoded non-empty list is returned with every None entry replaced by ''.
    - Any other decoded value is returned as-is.
    """
    if not isinstance(x, str):
        return x

    try:
        parsed = json.loads(x)
    except json.JSONDecodeError:
        return ""

    if not isinstance(parsed, list):
        return parsed
    if not parsed:
        return ''
    # Keep every position so lists stay aligned; only nulls are blanked out.
    return ['' if element is None else element for element in parsed]

# Decode the JSON-encoded text columns into new *_processed columns.
for processed_col, source_col in (
    ("response_a_processed", "response_a"),
    ("response_b_processed", "response_b"),
    ("prompt_processed", "prompt"),
):
    raw_df[processed_col] = raw_df[source_col].apply(safe_parse_json)

In [None]:
# Length of each parsed response_a value, then how often each length occurs
len_resp = raw_df["response_a_processed"].apply(len)

len_resp.value_counts()

In [None]:
def format_conversation(query_list, response_list):
    """
    Render paired queries and responses as one text block.

    Each (query, response) pair becomes "Query:\\n<q>\\n\\nResponse:\\n<r>", and the
    pairs are joined with a blank line. Pairing uses `zip`, so extra items in the
    longer input are ignored and empty inputs produce an empty string.
    """
    return '\n\n'.join(
        f"Query:\n{query}\n\nResponse:\n{response}"
        for query, response in zip(query_list, response_list)
    )

# Build one formatted conversation per side by pairing the prompts with that side's responses.
for conversation_col, response_col in (
    ('conversation_a', 'response_a_processed'),
    ('conversation_b', 'response_b_processed'),
):
    raw_df[conversation_col] = raw_df.apply(
        lambda row, response_col=response_col: format_conversation(row['prompt_processed'], row[response_col]),
        axis=1,
    )

In [None]:
# Space-delimited piece counts for conversation A, summarised with selected percentiles.
word_split = raw_df["conversation_a"].apply(lambda text: text.split(' '))
word_split.apply(len).describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.90])

In [None]:
# Space-delimited piece counts for conversation B, summarised with selected percentiles.
word_split = raw_df["conversation_b"].apply(lambda text: text.split(' '))
word_split.apply(len).describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.90])

Most conversations contain fewer than 1000 words. Assuming $ \text{Tokens per conversation} \approx 1.5 \times \text{Words per conversation} $, we would need a model that can handle ~1500 tokens per conversation.

In [None]:
def create_target_col(encoding):
    """
    Translate a one-hot winner encoding into its label.

    [1, 0, 0] -> 'model_a', [0, 1, 0] -> 'model_b', [0, 0, 1] -> 'tie';
    anything else maps to NaN.
    """
    known_encodings = (
        ([0, 0, 1], 'tie'),
        ([0, 1, 0], 'model_b'),
        ([1, 0, 0], 'model_a'),
    )
    for pattern, label in known_encodings:
        if encoding == pattern:
            return label

    return np.nan

# Collapse the three one-hot winner columns into a single label column.
winner_flags = raw_df[['winner_model_a', 'winner_model_b', 'winner_tie']]
raw_df['target'] = winner_flags.apply(lambda row: create_target_col(list(row)), axis=1)

## Setting up modelling architecture

In [None]:
# Everything this cell needs is imported here so it runs on a fresh kernel.
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModel, AutoTokenizer, BitsAndBytesConfig

# Tokenizer for the backbone checkpoint chosen in the config cell (MODEL_NAME).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Pass only the options that belong to the selected precision: the 4-bit
# options are type-checked, so they must not be sent as None on the 8-bit path.
if USE_4BIT:
    bnb_options = {
        "load_in_4bit": True,
        "bnb_4bit_quant_type": "nf4",
        "bnb_4bit_use_double_quant": True,
    }
    if torch.cuda.is_available():
        bnb_options["bnb_4bit_compute_dtype"] = torch.bfloat16
else:
    bnb_options = {"load_in_8bit": True}
quant_config = BitsAndBytesConfig(**bnb_options)

backbone = AutoModel.from_pretrained(MODEL_NAME, quantization_config=quant_config, device_map="auto")

# Prepare for k-bit training (fixes layer norms, casts, etc.)
backbone = prepare_model_for_kbit_training(backbone)

# Apply LoRA. The backbone is a headless encoder (AutoModel), so the
# feature-extraction task type is used: the sequence-classification wrapper
# forwards a `labels` argument that a headless encoder does not accept.
lora_cfg = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    target_modules=LORA_TARGETS,
    bias="none",
    task_type="FEATURE_EXTRACTION",
)
backbone = get_peft_model(backbone, lora_cfg)
backbone.print_trainable_parameters()

# Width of the encoder's hidden states; sizes the comparison head defined later.
hidden_size = backbone.config.hidden_size

In [None]:
# Fail fast if the configured cap exceeds the backbone's `max_position_embeddings`.
assert MAX_SEQUENCE_LENGTH <= backbone.config.max_position_embeddings, (
    f"Config 'MAX_SEQUENCE_LENGTH' must be <= the max sequence length allowed by the model "
    f"i.e. {backbone.config.max_position_embeddings}"
)

## Create Model Architecture

In [None]:
from typing import Dict, List, Optional, Tuple

import torch
import torch.nn as nn
from torch.nn import functional as F

def mean_pool(last_hidden_state: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
    mask = attention_mask.unsqueeze(-1).type_as(last_hidden_state)
    summed = (last_hidden_state * mask).sum(dim=1)
    count = mask.sum(dim=1).clamp(min=1e-9)
    return summed / count

class PairwiseBiEncoder(nn.Module):
    """
    Two independent encodes + comparison head -> 3-way logits.
    Expects tokenized dicts for A and B: {input_ids, attention_mask}.

    Args:
        encoder: module called as ``encoder(input_ids=..., attention_mask=...)``;
            its output must expose ``last_hidden_state`` or ``pooler_output``.
        hidden_size: width of the pooled encoder representation.
        num_labels: number of output classes.
        dropout: dropout probability used on the combined features and inside
            the classifier head.
        freeze_encoder: when True the encoder runs without gradient tracking
            (only the head trains). The default lets gradients reach the
            encoder so that trainable encoder parameters are updated.
    """
    def __init__(
        self,
        encoder: nn.Module,
        hidden_size: int,
        num_labels: int = 3,
        dropout: float = 0.2,
        freeze_encoder: bool = False,
    ):
        super().__init__()
        self.encoder = encoder
        self.freeze_encoder = freeze_encoder
        self.dropout = nn.Dropout(dropout)
        # Layer order is kept unchanged so existing state_dict keys still match.
        self.classifier = nn.Sequential(
            nn.Linear(4 * hidden_size, 2 * hidden_size),
            nn.GELU(),
            nn.Linear(2 * hidden_size, hidden_size),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, hidden_size // 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size // 2, num_labels)
        )

    def _encode(self, input_ids, attention_mask):
        """Encode one side of the pair and return a pooled vector per example."""
        # Gradients are disabled only when the encoder is explicitly frozen or
        # when the caller has already turned autograd off (e.g. evaluation).
        track_grad = torch.is_grad_enabled() and not self.freeze_encoder
        with torch.set_grad_enabled(track_grad):
            out = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Prefer mean pooling for stability across backbones
        if hasattr(out, "last_hidden_state"):
            pooled = mean_pool(out.last_hidden_state, attention_mask)
        else:
            # Some models expose .pooler_output
            pooled = out.pooler_output
        return pooled

    def forward(self, batch: Dict[str, torch.Tensor]) -> torch.Tensor:
        """
        Score a batch of (A, B) pairs.

        `batch` must contain a_input_ids, a_attention_mask, b_input_ids and
        b_attention_mask. Returns logits of shape [B, num_labels].
        """
        hA = self._encode(batch["a_input_ids"], batch["a_attention_mask"])  # [B, H]
        hB = self._encode(batch["b_input_ids"], batch["b_attention_mask"])  # [B, H]
        # Combine both vectors with their absolute difference and element-wise product.
        comb = torch.cat([hA, hB, torch.abs(hA - hB), hA * hB], dim=-1)
        logits = self.classifier(self.dropout(comb))
        return logits

In [None]:
# Smoke test: run one formatted conversation through the backbone and pool it.
# The inputs are built here so the cell does not rely on leftover kernel state.
inputs_a = tokenizer(
    raw_df["conversation_a"].iloc[0],
    truncation=True,
    max_length=MAX_SEQUENCE_LENGTH,
    return_tensors="pt",
).to(DEVICE)
input_ids = inputs_a["input_ids"]
attention_mask = inputs_a["attention_mask"]

# Inspection only, so autograd bookkeeping is skipped.
with torch.no_grad():
    out = backbone(input_ids=input_ids, attention_mask=attention_mask)
# Prefer mean pooling for stability across backbones
if hasattr(out, "last_hidden_state"):
    pooled = mean_pool(out.last_hidden_state, attention_mask)

In [None]:
# Display `inputs_a` (presumably the tokenized inputs for conversation A — confirm);
# the name must already exist from an earlier cell for this to run.
inputs_a

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit NF4 quantization with double quantization and bfloat16 compute.
bnb_4bit_options = dict(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)
quantization_config = BitsAndBytesConfig(**bnb_4bit_options)

In [None]:
from transformers import AutoModelForSequenceClassification

# Load the quantized classifier from the checkpoint named in the config cell.
# The target has three classes (model_a, model_b, tie), so the head is sized
# explicitly instead of relying on the library default.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,
    quantization_config=quantization_config,
)

In [None]:
from peft import prepare_model_for_kbit_training
# Run PEFT's k-bit training preparation on the quantized classification model.
# NOTE(review): this rebinds `model`, so re-running the cell feeds the already-prepared model back in.
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig

# LoRA settings for the sequence-classification model.
config = LoraConfig(
    r=16,
    lora_alpha=8,
    # DeBERTa-v3 names its linear layers query_proj / key_proj / value_proj / dense
    # (LORA_TARGETS in the config cell); q_proj / k_proj / v_proj / o_proj do not
    # exist in this architecture, so no module would match them.
    target_modules=LORA_TARGETS,
    lora_dropout=0.05,
    bias="lora_only",
    task_type="SEQ_CLS",
    use_rslora=True,
    # NOTE(review): 'eva' initialisation appears to need an extra data-driven
    # initialisation step before training — confirm against the PEFT docs.
    init_lora_weights='eva',
)