In [None]:
!pip install --upgrade transformers accelerate bitsandbytes

## Import libraries

In [None]:
# Mount Google Drive at /content/drive. Requires the `google.colab` package.
# NOTE(review): the dataset is read from a relative path further down (the Drive path
# is commented out) — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import torch

In [None]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

In [None]:
# Check bfloat16 support; bfloat16 is the compute dtype requested by the 4-bit
# quantization config defined later in this notebook.
torch.cuda.is_bf16_supported()

## Set config

In [None]:
# Token budget per example.
# NOTE(review): this name is reassigned to 512 just before the tokenizer call further
# down, so the 2048 set here is not the value the tokenizer receives.
max_sequence_length = 2048

## Import dataset

In [None]:
import os

# Step up one directory so the relative "./data/..." path used below resolves
# (presumably the notebook lives in a sub-folder of the project — confirm).
# Guarded on the presence of the data folder so that re-running this cell does not
# keep climbing one directory higher on every execution.
if not os.path.isdir("data"):
    os.chdir("..")
os.getcwd()

In [None]:
# Location of the training CSV, relative to the current working directory.
# Colab/Drive alternative:
# "/content/drive/MyDrive/Data Science Projects/llm-finetuning/nlp-playground/data/raw/llm-classification-finetuning/train.csv"
train_csv_path = "./data/raw/llm-classification-finetuning/train.csv"
raw_df = pd.read_csv(train_csv_path)

In [None]:
# Preview the first rows of the raw training frame.
raw_df.head()

## EDA

In [None]:
# Show the dtype pandas assigned to each column on load.
raw_df.dtypes

In [None]:
# Python type of the first value in each text column
for text_column in ('prompt', 'response_a', 'response_b'):
    first_value = raw_df[text_column].iloc[0]
    print(type(first_value))

In [None]:
# How often each model appears, counting both the model_a and model_b columns
fig, ax = plt.subplots(figsize=(12, 5))
model_counts = pd.concat([raw_df['model_a'], raw_df['model_b']]).value_counts()
model_counts.plot(kind='bar', stacked=True, ax=ax)

plt.show()

In [None]:
# Distinct raw response_a strings that are shorter than 10 characters
is_short_response = raw_df['response_a'].str.len() < 10
raw_df.loc[is_short_response, 'response_a'].unique()

## Data pre-processing

In [None]:
import json

def safe_parse_json(x):
    """Decode a JSON-encoded cell into a Python value, tolerating bad input.

    Parameters
    ----------
    x : Any
        Raw cell value. Anything that is not a ``str`` is returned unchanged.

    Returns
    -------
    Any
        * For a JSON list: the list with every ``None`` entry replaced by ``''``
          (an empty JSON list yields ``[]``).
        * For a string that is not valid JSON: ``[]``.
        * For any other JSON value (object, number, string, ...): the decoded value.
    """
    if not isinstance(x, str):
        return x
    try:
        val = json.loads(x)
    except json.JSONDecodeError:
        # Unparseable text is treated as an empty list so the column stays
        # list-typed; `len` and `zip` behave the same as for the empty string.
        return []
    if isinstance(val, list):
        # Replace JSON nulls with empty strings so every element can be formatted as text.
        return ['' if item is None else item for item in val]
    return val

raw_df["response_a_processed"] = raw_df["response_a"].apply(safe_parse_json)
raw_df["response_b_processed"] = raw_df["response_b"].apply(safe_parse_json)
raw_df["prompt_processed"] = raw_df["prompt"].apply(safe_parse_json)

In [None]:
# Distribution of the number of entries in each row's parsed response A
len_resp = raw_df["response_a_processed"].apply(len)

len_resp.value_counts()

In [None]:
def format_conversation(query_list, response_list):
    """Pair each query with its response and render every turn as one text block.

    Parameters
    ----------
    query_list : iterable
        Queries, in turn order.
    response_list : iterable
        Responses, in the same order as ``query_list``.

    Returns
    -------
    list of str
        One ``"Query:\\n<q>\\n\\nResponse:\\n<r>"`` string per turn. Pairing uses
        ``zip``, so if the two inputs differ in length the extra items of the
        longer one are dropped.
    """
    return [
        f"Query:\n{q}\n\nResponse:\n{r}"
        for q, r in zip(query_list, response_list)
    ]

# Build one formatted conversation column per response side (a, then b)
for side in ('a', 'b'):
    response_column = f'response_{side}_processed'
    raw_df[f'conversation_{side}'] = raw_df.apply(
        lambda row: format_conversation(row['prompt_processed'], row[response_column]),
        axis=1,
    )

In [None]:
# Words per conversation A. Turns are joined with a newline and split on any
# whitespace, so words separated only by line breaks are counted individually
# and the separator itself is not counted as a word.
word_split = raw_df["conversation_a"].apply(lambda turns: '\n'.join(turns).split())
word_split.apply(len).describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])

In [None]:
# Words per conversation B. Turns are joined with a newline and split on any
# whitespace, so words separated only by line breaks are counted individually
# and the separator itself is not counted as a word.
word_split = raw_df["conversation_b"].apply(lambda turns: '\n'.join(turns).split())
word_split.apply(len).describe(percentiles=[0.05, 0.25, 0.5, 0.75, 0.95])

Most conversations contain fewer than 1,000 words. Assuming $ \text{Tokens per conversation} \approx 1.5 \times \text{Words per conversation} $, we would need a model that can handle roughly 1,500 tokens per conversation.

In [None]:
def create_target_col(encoding):
    """
    Translate a one-hot winner encoding into its target label.

    The encoding is compared against three known patterns, checked in this
    order: [0, 0, 1] -> 'tie', [0, 1, 0] -> 'model_b', [1, 0, 0] -> 'model_a'.
    Anything else yields NaN.
    """
    known_patterns = (
        ([0, 0, 1], 'tie'),
        ([0, 1, 0], 'model_b'),
        ([1, 0, 0], 'model_a'),
    )

    for pattern, label in known_patterns:
        if encoding == pattern:
            return label

    return np.nan

# Derive the label column from the three one-hot winner columns (order: a, b, tie)
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
raw_df['target'] = raw_df[winner_columns].apply(
    lambda row: create_target_col(list(row)),
    axis=1,
)

In [None]:
from transformers import AutoTokenizer, AutoModel

# Checkpoint name and its tokenizer. Only the tokenizer is loaded in this cell; the
# same checkpoint is loaded as a sequence-classification model further down.
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModel.from_pretrained(model_name)

In [None]:
# Overrides the 2048 set in the config cell near the top; this is the value passed as
# `max_length` to the tokenizer call that follows.
max_sequence_length=512

In [None]:
# Each `conversation_a` entry is a list of per-turn strings. The tokenizer expects
# one string per example (a list of strings is interpreted as a text pair or as
# pre-tokenized words), so join every conversation's turns into a single document.
text = ["\n\n".join(turns) for turns in raw_df['conversation_a']]
inputs_a = tokenizer(text, return_tensors="pt", padding='max_length', truncation=True, max_length=max_sequence_length)

In [None]:
from transformers import BitsAndBytesConfig

# 4-bit loading settings: NF4 quant type, double quantization, bfloat16 compute dtype
nf4_settings = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
quantization_config = BitsAndBytesConfig(**nf4_settings)

In [None]:
from transformers import AutoModelForSequenceClassification

# The target column has three classes (model_a, model_b, tie), so the classification
# head must be sized explicitly; the library default is two labels.
NUM_LABELS = 3

# Load the same checkpoint the tokenizer was created from, with 4-bit quantization.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=NUM_LABELS,
    quantization_config=quantization_config,
)

In [None]:
from peft import prepare_model_for_kbit_training
# Run the quantized model through PEFT's k-bit training preparation helper.
# `model` is rebound to the result, so re-running this cell applies the helper again
# to the already-prepared model.
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig

# LoRA adapter settings for the sequence-classification model.
# The target module names must match the base model's layer names: DeBERTa's
# self-attention projections are named query_proj / key_proj / value_proj
# (q_proj / k_proj / v_proj / o_proj are LLaMA-style names and match nothing here).
# NOTE(review): init_lora_weights='eva' is a data-driven initialisation — confirm the
# corresponding EVA initialisation step is run once the PEFT model has been created.
config = LoraConfig(
    r=16,
    lora_alpha=8,
    target_modules=["query_proj", "key_proj", "value_proj"],
    lora_dropout=0.05,
    bias="lora_only",
    task_type="SEQ_CLS",
    use_rslora=True,
    init_lora_weights='eva',
)