### Code Reference:

- Hugging Face: https://github.com/huggingface/trl/tree/main/examples/research_projects/stack_llama_2/scripts
- Helpful Notebook: 
    - https://colab.research.google.com/drive/12dVqXZMIVxGI0uutU6HG9RWbWPXL3vts?usp=sharing#scrollTo=yWEM89A48NrU
    - https://colab.research.google.com/drive/1PEQyJO1-f6j0S_XJ8DV50NkpzasXkrzd?usp=sharing#scrollTo=OJXpOgBFuSrc
    
- Interesting Reads:
    - https://www.assemblyai.com/blog/how-rlhf-preference-model-tuning-works-and-how-things-may-go-wrong/ 
    - https://huyenchip.com/2023/05/02/rlhf.html
    - https://lightning.ai/pages/community/lora-insights/?utm_medium=social&utm_source=twitter&utm_campaign=Education_10132023

### Imports

In [None]:
import os
from datasets import load_dataset

# Set Cache Dir
CACHE_DIR = os.getcwd()+'/cache'

import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer
from peft import LoraConfig, get_peft_model
from transformers import TrainingArguments
from trl import SFTTrainer

### Set Hugging Face Token

In [1]:
token = ###

from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

### Load Custom Dataset

In [1]:
dataset_name = 'ProcessedData/lima_train.csv'
train_dataset = load_dataset('csv', data_files=dataset_name, split="train")

dataset_name = 'ProcessedData/lima_test.csv'
eval_dataset = load_dataset('csv', data_files=dataset_name, split="train")

### Data Pre-Processing

In [2]:
# Trimming to a maximum length of 1024 for SFT
MAX_LEN = 1024

def map_trim(samples):
    return {'text': samples['text'][:MAX_LEN]}

train_dataset = train_dataset.map(map_trim)
eval_dataset = eval_dataset.map(map_trim)

### Set-up Hugging Face Package

In [3]:
# Using Llama for Fine-Tuning
model_name = "meta-llama/Llama-2-7b-hf"

# Quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load Model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": Accelerator().local_process_index},
    use_auth_token=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)
model.config.use_cache = False

# Set-Up Tokenizer 
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, cache_dir=CACHE_DIR)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" 

In [16]:
# Set-Up Lora
lora_alpha = 16
lora_dropout = 0.05
target_modules = ['q_proj', 'v_proj'] #'k_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj', 'lm_head']
lora_r = 8

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

### SFT Trainer

In [18]:
output_dir = "sft"
per_device_train_batch_size = 2
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 5000
logging_steps = 2500
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = -1
# warmup_ratio = 0.03
warmup_steps=100
weight_decay = 0.05
lr_scheduler_type = "cosine"
num_train_epochs = 4

training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    evaluation_strategy="steps",
    do_eval=True,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_steps=warmup_steps,
    group_by_length=False,
    weight_decay=weight_decay,
    lr_scheduler_type=lr_scheduler_type,
    
)

max_seq_length = MAX_LEN

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

### Train Model

In [20]:
# Train model
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Currently logged in as: [33myashsharma0906[0m. Use [1m`wandb login --relogin`[0m to force relogin
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss


TrainOutput(global_step=512, training_loss=1.7740617990493774, metrics={'train_runtime': 619.0457, 'train_samples_per_second': 6.655, 'train_steps_per_second': 0.827, 'total_flos': 4.495840496369664e+16, 'train_loss': 1.7740617990493774, 'epoch': 3.98})

### Store Model

In [21]:
SAVE_MODEL_NAME = "sft/llama2_sft_lima_1024"

trainer.save_model(SAVE_MODEL_NAME)
trainer.model.save_pretrained(SAVE_MODEL_NAME+"/final_checkpoint")

del model
torch.cuda.empty_cache()

### Combine Model

In [23]:
from peft import AutoPeftModelForCausalLM

model = AutoPeftModelForCausalLM.from_pretrained(SAVE_MODEL_NAME, \
                                                 device_map="auto", torch_dtype=torch.bfloat16, \
                                                cache_dir=CACHE_DIR)
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [24]:
model.save_pretrained(SAVE_MODEL_NAME+"/merged_model", safe_serialization=True)

### Inference

In [18]:
CACHE_DIR = os.getcwd()+'/cache'
model_name = "sft/llama2_sft_lima"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map={"": Accelerator().local_process_index},
    use_auth_token=True,
    trust_remote_code=True,
    cache_dir=CACHE_DIR
)
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [4]:
from transformers import pipeline

# Run text generation pipeline with our next model
prompt = "What is the meaning of life, universe, and everything?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"###Question:\n{prompt}\n\n###Answer:\n")
print(result[0]['generated_text'])