In [1]:
# Install libs
# NOTE(review): only torch and torch_xla carry a version constraint below; the other
# packages resolve to whatever pip picks at run time.
!pip install -qq peft
!pip install -qq bitsandbytes
!pip install -qq accelerate
# Not run quietly, so the upgrade log is printed in the cell output.
!pip install --upgrade transformers
# CPU wheel of torch, taken from the PyTorch CPU index.
!pip install -qq torch~=2.1.0 --index-url https://download.pytorch.org/whl/cpu -q 
!pip install -qq torch_xla[tpu]~=2.1.0 -f https://storage.googleapis.com/libtpu-releases/index.html -q
!pip uninstall -qq tensorflow -y # If we don't do this, TF will take over TPU and cause permission error for PT
# Copies the helper module imported later as `spmd_util` into the working directory.
!cp /kaggle/input/utils-xla/spmd_util.py . # From this repo: https://github.com/HeegyuKim/torch-xla-SPMD
# Print the installed transformers package metadata.
!pip show transformers

Collecting transformers
  Downloading transformers-4.44.1-py3-none-any.whl (9.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.5/9.5 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.44.0
    Uninstalling transformers-4.44.0:
      Successfully uninstalled transformers-4.44.0
Successfully installed transformers-4.44.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torchvision 0.19.0 requires torch==2.4.0, but you have torch 2.1.2+cpu which i

In [2]:
!pip show transformers

Name: transformers
Version: 4.44.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft


In [3]:
import os
import gc
import re
import json
from time import time
import random
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm

import torch
import transformers
from sklearn.metrics import accuracy_score
from transformers import AutoTokenizer, LlamaModel, LlamaForSequenceClassification, BitsAndBytesConfig
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import torch.nn.functional as F

import torch_xla.debug.profiler as xp
import torch_xla.core.xla_model as xm
import torch_xla.experimental.xla_sharding as xs
import torch_xla.runtime as xr

xr.use_spmd()

from torch_xla.experimental.xla_sharded_tensor import XLAShardedTensor
from torch_xla.experimental.xla_sharding import Mesh
from spmd_util import partition_module

tqdm.pandas()

print(f'Torch Version: {torch.__version__}')

  from .autonotebook import tqdm as notebook_tqdm


Torch Version: 2.1.2+cpu


In [4]:
class CFG:
    """Hyper-parameters and input paths read by the rest of the notebook."""

    # --- Input locations ---------------------------------------------------
    MODEL_NAME = '/kaggle/input/llama-3.1/transformers/8b-instruct/1'
    WEIGHTS_PATH = '/kaggle/input/llama-train-output/llama_3_finetuned_model_25000.pth'

    # --- Reproducibility ---------------------------------------------------
    SEED = 1024

    # --- Data --------------------------------------------------------------
    MAX_LENGTH = 1024   # token length every example is padded / truncated to
    NUM_LABELS = 3      # number of output classes of the classification head

    # --- Optimisation ------------------------------------------------------
    NUM_EPOCHS = 1
    BATCH_SIZE = 16
    NUM_WARMUP_STEPS = 128
    LR_MAX = 5e-5

    # --- LoRA adapter ------------------------------------------------------
    LORA_RANK = 4
    LORA_ALPHA = 8
    DROPOUT = 0.05      # passed to LoraConfig as lora_dropout
    LORA_MODULES = ['o_proj', 'v_proj']
    
# Handle to the XLA device that batches are moved to later in the notebook.
DEVICE = xm.xla_device() # Initialize TPU Device
# Bare expression so the device is shown as the cell output.
DEVICE

device(type='xla', index=0)

In [5]:
def set_seeds(seed):
    """Seed Python, NumPy, PyTorch (plus CUDA when present) and the XLA device RNG.

    Args:
        seed: Integer used for every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Host-side generators, seeded in the same order as before.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # CUDA generators are only touched when a GPU is visible.
    if torch.cuda.is_available():
        for cuda_seed_fn in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
            cuda_seed_fn(seed)

    # RNG state of the device returned by xm.xla_device().
    xla_device = xm.xla_device()
    xm.set_rng_state(seed, device=xla_device)

set_seeds(seed=CFG.SEED)

In [6]:
# Load the tokenizer stored alongside the base model checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CFG.MODEL_NAME)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Pad on the right-hand side of each sequence.
tokenizer.padding_side = 'right'
# NOTE(review): whether this attribute changes the encoding depends on the tokenizer
# class that AutoTokenizer returns for this checkpoint — confirm it has an effect.
tokenizer.add_eos_token = True

# save tokenizer to load offline during inference
tokenizer.save_pretrained('tokenizer')

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/tokenizer.json')

In [7]:
# Utility function giving token length
def get_token_lengths(texts):
    """Return the number of tokens in each text, without padding or truncation.

    Uses the module-level `tokenizer`.

    Args:
        texts: Iterable of strings (a pandas Series or a plain list both work).

    Returns:
        list[int]: one token count per input text, in input order.
    """
    # Keep the encoder output as plain Python lists. The texts have different
    # lengths, so requesting a NumPy tensor here would mean building a ragged
    # array only to measure the length of each row.
    input_ids = tokenizer(list(texts))['input_ids']
    # Length of input_ids for each text.
    return [len(ids) for ids in input_ids]

In [8]:
# Read the training table from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/chatbot-arena/Data/train.csv")
# Optional row slices, currently disabled (the full table is used):
# train = train.iloc[:25000, :]
# train = train.iloc[25000:, :]


def process(input_str, str_type):
    """Render a JSON-encoded list of conversation turns as one labelled text block.

    Args:
        input_str: JSON array stored as text, e.g. '["hi", null]'.
        str_type: Label written in front of every turn index.

    Returns:
        str: every non-empty turn formatted as "<<{str_type} {i}>>: {text}" followed
        by a line of ten dashes, joined with newlines; "" when no turn has text.

    Raises:
        json.JSONDecodeError: if input_str is not valid JSON.
    """
    # Parse with json.loads instead of eval(): the value comes from a data file and
    # must never be executed as code. json.loads also decodes JSON escapes (for
    # example surrogate pairs) into the characters they stand for.
    turns = json.loads(input_str)
    separator = "\n" + "-" * 10
    return "\n".join(
        f"<<{str_type} {i}>>: {text}{separator}"
        for i, text in enumerate(turns)
        # null turns (None) and empty strings are skipped but still consume an index
        if text is not None and text != ""
    )


# Column name -> label written in front of each turn when the column is flattened.
turn_labels = {
    "prompt": "prompt",
    "response_a": "Model A's response to prompt",
    "response_b": "Model B's response to prompt",
}
for column, label in turn_labels.items():
    train.loc[:, column] = train[column].apply(lambda cell: process(cell, label))

# Rows in which both responses flattened to an empty string are removed.
both_empty = (train["response_a"] == "") & (train["response_b"] == "")
indexes = train.loc[both_empty].index
train.drop(indexes, inplace=True)
train.reset_index(inplace=True, drop=True)

print(f"Total {len(indexes)} Null response rows dropped")
print("Total train samples: ", len(train))

Total 19 Null response rows dropped
Total train samples:  57458


In [9]:
# Assemble the classification prompt for every row: a fixed instruction, then the
# user prompt(s), Model A's response(s) and Model B's response(s), with a rule of
# fifty "=" characters between the sections.
# NOTE(review): the instruction wording below is model input, spelling included
# ("Model A :" and "Model B:" also differ by a space). Presumably the checkpoint at
# CFG.WEIGHTS_PATH was trained on this exact text — confirm before editing it.
train["text"] = (
    "INSTRUCTION: "
    "Below are user's prompts and responses from Model A and Model B for the corresponding prompts. "
    "Which model grenerate the better responses for the prompts? "
    "Be carefull about Hallucinations, long answer with explaination in details do not means that response is better. "
    "The answer must be in one of three form: Model A, Model B, or Tie.\n"
    + "=" * 50
    + "\n\nUser prompt: "
    + train["prompt"]
    + "\n"
    + "=" * 50
    + "\n\nModel A :\n"
    + train["response_a"]
    + "\n"
    + "=" * 50
    + "\n\nModel B:\n"
    + train["response_b"]
)
# Print one assembled example (index label 24993) as a visual check.
print(train["text"][24993])

INSTRUCTION: Below are user's prompts and responses from Model A and Model B for the corresponding prompts. Which model grenerate the better responses for the prompts? Be carefull about Hallucinations, long answer with explaination in details do not means that response is better. The answer must be in one of three form: Model A, Model B, or Tie.

User prompt: <<prompt 0>>: Who did listen to lana del rey in 2012
----------
<<prompt 1>>: What were likely those people listening in 2010
----------
<<prompt 2>>: What did fans of indie pop, alternative pop, and dream pop listen to in 2010
----------
<<prompt 3>>: What were fashionable to wear among such fans
----------
<<prompt 4>>: Did baroque pop exist in 2010
----------
<<prompt 5>>: What baroque pop were popular in late 60 early 70
----------

Model A :
<<Model A's response to prompt 0>>: In 2012, Lana Del Rey gained significant popularity with the release of her second studio album, "Born to Die." At that time, her music appealed to a w

In [None]:
# Token count of every assembled prompt.
train.loc[:, 'token_count'] = get_token_lengths(train['text'])

# prepare label for model: position of the largest value among the three winner columns
winner_columns = ['winner_model_a', 'winner_model_b', 'winner_tie']
train.loc[:, 'label'] = train[winner_columns].to_numpy().argmax(axis=1)

# Display data
display(train.head())

In [None]:
# Number of rows for each value of the `label` column.
train.label.value_counts()

In [None]:
# Summary statistics of the per-row token counts, cast to integers for display.
display(train['token_count'].describe().to_frame().astype(int))

In [None]:
# 90th percentile of the token counts, shown for reference only:
# CFG.MAX_LENGTH (1024) is the length used for tokenization regardless.
np.percentile(train['token_count'], 90)

In [None]:
# Tokenize Data: each prompt is padded or truncated to CFG.MAX_LENGTH tokens.
tokens = tokenizer(
    train['text'].tolist(),
    max_length=CFG.MAX_LENGTH,
    padding='max_length',
    truncation=True,
    return_tensors='np',
)

# Token ids, plus the attention mask used to ignore padding tokens.
INPUT_IDS, ATTENTION_MASKS = tokens['input_ids'], tokens['attention_mask']

# Labels: the three winner columns, one row per text.
LABELS = train[['winner_model_a','winner_model_b','winner_tie']].values

print(f'INPUT_IDS shape: {INPUT_IDS.shape}, ATTENTION_MASKS shape: {ATTENTION_MASKS.shape}')
print(f'LABELS shape: {LABELS.shape}')

In [None]:
def train_dataset(batch_size):
    """Endlessly yield (input_ids, attention_mask, labels) batches placed on DEVICE.

    Reads the module-level INPUT_IDS, ATTENTION_MASKS and LABELS arrays and the
    module-level `mesh`, which must exist by the time the first batch is requested.
    The usable indices are reshuffled before every pass; the samples beyond the last
    full batch are excluded from the index range.

    Args:
        batch_size: Number of samples per yielded batch.

    Yields:
        tuple: (input_ids, attention_mask, labels) tensors, each marked for sharding
        over `mesh` with partition spec (0, 1).
    """
    n_total = LABELS.shape[0]
    order = np.arange(n_total - (n_total % batch_size))
    while True:
        # New random order for this pass (shuffled in place).
        np.random.shuffle(order)
        # One row of the reshaped index array per batch.
        for picked in order.reshape(-1, batch_size):
            batch = tuple(
                torch.tensor(source[picked]).to(DEVICE)
                for source in (INPUT_IDS, ATTENTION_MASKS, LABELS)
            )
            for tensor in batch:
                xs.mark_sharding(tensor, mesh, (0, 1))

            yield batch

# Single shared generator instance; every next() call advances it.
TRAIN_DATASET = train_dataset(CFG.BATCH_SIZE)

In [None]:
# Load model for classification with 3 target label
# Weights are requested in bfloat16; the head has CFG.NUM_LABELS outputs.
base_model = LlamaForSequenceClassification.from_pretrained(
    CFG.MODEL_NAME,
    num_labels=CFG.NUM_LABELS,
    torch_dtype=torch.bfloat16)

# NOTE(review): presumably turns off the pre-training tensor-parallel code path — confirm.
base_model.config.pretraining_tp = 1 

# Assign Padding TOKEN
# Uses the tokenizer's pad token id (the tokenizer's pad token was set to its EOS token earlier).
base_model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# LoRA adapter settings for the sequence-classification task.
lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=CFG.LORA_RANK,                  # the dimension of the low-rank matrices
    lora_alpha=CFG.LORA_ALPHA,        # scaling factor for LoRA activations vs pre-trained weight activations
    lora_dropout=CFG.DROPOUT,
    bias='none',
    target_modules=CFG.LORA_MODULES,  # Only Use Output and Values Projection
)

In [None]:
# Create LoRa Model
model = get_peft_model(base_model, lora_config)

# Restore the previously fine-tuned weights. strict=False is kept because the
# checkpoint is expected to hold only a subset of the model's parameters, so
# missing keys are normal. Keys that the model does NOT have are a different
# matter: they mean part of the checkpoint was not applied, so fail loudly instead
# of silently training from the wrong starting point.
checkpoint = torch.load(CFG.WEIGHTS_PATH, map_location='cpu')
load_result = model.load_state_dict(checkpoint, strict=False)
if load_result.unexpected_keys:
    preview = ', '.join(load_result.unexpected_keys[:5])
    raise RuntimeError(
        f'{len(load_result.unexpected_keys)} of {len(checkpoint)} checkpoint entries in '
        f'{CFG.WEIGHTS_PATH} do not match any model parameter (first ones: {preview})'
    )
print(f'Restored {len(checkpoint)} tensors from {CFG.WEIGHTS_PATH}')

# Trainable Parameters
model.print_trainable_parameters()

In [None]:
# Number of TPU Nodes
num_devices = xr.global_runtime_device_count()
# Three-axis logical mesh: all devices lie along the second axis ('fsdp');
# the 'dp' and 'mp' axes have size 1.
mesh_shape = (1, num_devices, 1)
device_ids = np.array(range(num_devices))
mesh = Mesh(device_ids, mesh_shape, ('dp', 'fsdp', 'mp'))
# distribute model
# NOTE(review): partition_module is imported from spmd_util.py; how it shards the
# model over the mesh is not visible in this notebook.
partition_module(model, mesh)

print(f'num_devices: {num_devices}')

In [None]:
# Verify the trainable layers: collect every parameter that has requires_grad=True.
MODEL_LAYERS_ROWS = []
TRAINABLE_PARAMS = []
N_TRAINABLE_PARAMS = 0

for name, param in model.named_parameters():
    # Element count of this parameter
    n_parameters = param.numel()
    # Frozen parameters are not reported
    if not param.requires_grad:
        continue
    # One table row per trainable parameter
    MODEL_LAYERS_ROWS.append(dict(param=n_parameters, name=name, dtype=param.data.dtype))
    # Keep the parameter itself, wrapped as a param-group style dict
    TRAINABLE_PARAMS.append(dict(params=param))
    # Running total of trainable elements
    N_TRAINABLE_PARAMS += n_parameters

display(pd.DataFrame(MODEL_LAYERS_ROWS))

print(f"""
===============================
N_TRAINABLE_PARAMS: {N_TRAINABLE_PARAMS:,}
N_TRAINABLE_LAYERS: {len(TRAINABLE_PARAMS)}
===============================
""")

In [None]:
# LR & Optimizer
N_SAMPLES = len(train)
# Integer division: a trailing partial batch does not count as a step.
STEPS_PER_EPOCH = N_SAMPLES // CFG.BATCH_SIZE

# Every model parameter is handed to AdamW, not only those with requires_grad=True.
OPTIMIZER = torch.optim.AdamW(model.parameters(), lr=CFG.LR_MAX)

# Cosine Learning Rate With Warmup
# The schedule spans STEPS_PER_EPOCH * CFG.NUM_EPOCHS steps, the first
# CFG.NUM_WARMUP_STEPS of which are warm-up.
lr_scheduler = transformers.get_cosine_schedule_with_warmup(
    optimizer=OPTIMIZER,
    num_warmup_steps=CFG.NUM_WARMUP_STEPS,
    num_training_steps=STEPS_PER_EPOCH * CFG.NUM_EPOCHS)

print(f'BATCH_SIZE: {CFG.BATCH_SIZE}, N_SAMPLES: {N_SAMPLES}, STEPS_PER_EPOCH: {STEPS_PER_EPOCH}')

In [None]:
# Set the data type for the optimizer's state (e.g., momentum buffers) to float32.
# NOTE(review): torch optimizers normally create per-parameter state lazily on the
# first step(), so on a freshly built optimizer this loop presumably finds nothing
# to convert — confirm whether it should run after the first step instead.
for state in OPTIMIZER.state.values():
    # Iterate over a snapshot so entries can be reassigned inside the loop.
    for k, v in list(state.items()):
        if isinstance(v, torch.Tensor) and v.is_floating_point() and v.dtype != torch.float32:
            # Store the converted tensor back under its own key. The previous code
            # used the tensor itself as the key, which added a new entry to the
            # dict while it was being iterated.
            state[k] = v.to(dtype=torch.float32)

In [None]:
# Pull one batch from the shared generator to inspect shapes and dtypes.
# The generator is advanced by this call, so later next() calls start from the batch after it.
input_ids, attention_mask, labels = next(TRAIN_DATASET)

print(f'input_ids shape: {input_ids.shape}, dtype: {input_ids.dtype}')
print(f'attention_mask shape: {attention_mask.shape}, dtype: {attention_mask.dtype}')
print(f'labels shape: {labels.shape}, dtype: {labels.dtype}')

In [None]:
# Put Model In Train Mode
model.train()

# Loss Function, Cross Entropy
# The training loop passes float32 logits and float32-cast labels to this loss.
LOSS_FN = torch.nn.CrossEntropyLoss().to(dtype=torch.float32)

In [None]:
print("Begin to train model")
st = time()
# From here on every Python warning is raised as an exception.
warnings.filterwarnings("error")
# 'loss' holds one float per step; 'y_true' / 'y_pred' hold one class index per sample.
METRICS = {
    'loss': [],
    'accuracy': {'y_true': [], 'y_pred': [] }}

for epoch in tqdm(range(CFG.NUM_EPOCHS)):
    ste = time()
    for step in range(STEPS_PER_EPOCH):
        # Zero Out Gradients
        OPTIMIZER.zero_grad()

        # Get Batch
        input_ids, attention_mask, labels = next(TRAIN_DATASET)

        # Forward Pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)

        # Logits Float32
        logits = outputs.logits.to(dtype=torch.float32)

        # Backward Pass (labels are cast to float32 and used as class probabilities)
        loss = LOSS_FN(logits, labels.to(dtype=torch.float32))
        loss.backward()

        # optimizer step
        OPTIMIZER.step()
        xm.mark_step()

        # Update Learning Rate Scheduler
        lr_scheduler.step()

        # Update Metrics
        METRICS['loss'].append(float(loss))
        # Store class indices directly. Reducing over the last axis keeps one entry
        # per sample for any batch size (squeeze() would drop the batch axis when
        # it has size 1), and argmax of the raw logits equals argmax of their softmax.
        METRICS['accuracy']['y_true'] += torch.argmax(labels, dim=-1).cpu().tolist()
        METRICS['accuracy']['y_pred'] += torch.argmax(logits, dim=-1).cpu().tolist()

        # Progress report every 200 steps
        if (step + 1) % 200 == 0:
            metrics = 'µ_loss: {:.3f}'.format(np.mean(METRICS['loss']))
            metrics += ', step_loss: {:.3f}'.format(METRICS['loss'][-1])
            # Running accuracy over all samples seen so far (labelled as accuracy, not AUC).
            metrics += ', µ_acc: {:.3f}'.format(accuracy_score(METRICS['accuracy']['y_true'],
                                                               METRICS['accuracy']['y_pred']))
            lr = OPTIMIZER.param_groups[0]['lr']
            print(f'{epoch+1:02}/{CFG.NUM_EPOCHS:02} | {step+1:04}/{STEPS_PER_EPOCH} lr: {lr:.2E}, {metrics}', end='')
            print(f'\nSteps per epoch: {step+1} complete | Time elapsed: {time()- st}')

    print(f'\nEpoch {epoch+1} Completed | Total time for epoch: {time() - ste} ' )

    # If stopped, and to continue training in future on tpu we save model and optimizer
    # Only parameters with requires_grad=True are written to the model checkpoint.
    xm.save({k: v.cpu() for k, v in model.named_parameters() if v.requires_grad}, f'model_llama_3_cp_{epoch+1}_v1.pth')
    xm.save(OPTIMIZER.state_dict(), f'optimizer_llama_3_cp_{epoch+1}_v1.pth')

    print(f'Model saved at epoch {epoch+1}| Elapsed time: {time() - st} ')

In [None]:
# Loss recorded at every training step, in the order the steps ran.
fig, ax = plt.subplots(figsize=(15, 6))
ax.plot(METRICS['loss'])
ax.set_xlabel('Step per epoch')
ax.set_ylabel('Loss')
ax.set_title('Loss Plot step per epoch')
plt.show()

In [None]:
# Move the model to the CPU before serialising.
model = model.cpu()
# Persist only the parameters that have requires_grad=True, keyed by parameter name.
trainable_state = {
    name: param
    for name, param in model.named_parameters()
    if param.requires_grad
}
torch.save(trainable_state, 'llama_3_finetuned_full_1.pth')