<a href="https://colab.research.google.com/github/alga-hopf/alpaca_lora_sage/blob/main/sage_finetuning_github.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip3 install datasets Accelerate bitsandbytes sentencepiece wandb
!pip3 install git+https://github.com/huggingface/transformers
!pip3 install git+https://github.com/huggingface/peft
!pip3 install pynvml

In [None]:
import json
import numpy as np
import timeit
import copy
import torch
import sys
from transformers import Trainer, TrainingArguments, LlamaForCausalLM, LlamaTokenizer, DataCollatorWithPadding, DataCollatorForSeq2Seq, get_scheduler, AdamW
from datasets import load_dataset, DatasetDict, Dataset
from torch.utils.data import DataLoader
import wandb
import os
from tqdm.auto import tqdm
import random
from pynvml import *
from peft import LoraConfig, get_peft_model, get_peft_model_state_dict, prepare_model_for_kbit_training, set_peft_model_state_dict

In [None]:
# Shell out to nvidia-smi to show the GPU(s) available to this runtime.
!nvidia-smi

# Set some custom variables before the training.
Choose the path to the training dataset, the output directory, the name under which the fine-tuned model is saved, and the names of the wandb project and run.

In [None]:
# User-supplied settings: replace every placeholder below before running the
# rest of the notebook.

# Path to the JSON file holding the training dataset.
path = "your path to dataset"

# Output directory used for training.
out_dir = "your output dir"

# Name under which the fine-tuned model is saved.
model_name = "your model name"

# Name of the wandb project.
wandb_project = "your wandb project"

# Name of the wandb run (can also be empty).
wandb_run_name = "your wandb run name"

# Load and inspect dataset

In [None]:
# Mount Google Drive at /content/drive. The import lives in this cell because
# `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Parse the training file into memory. Later cells iterate `raw_dataset` as a
# sequence of example dicts.
# The encoding is pinned to UTF-8 so that parsing does not depend on the
# platform's default locale encoding.
with open(path, "r", encoding="utf-8") as dataset_file:
    raw_dataset = json.load(dataset_file)

In [None]:
# Display how many top-level entries were loaded from the dataset file.
len(raw_dataset)

# Fine tuning

In [None]:
# Log in to Weights & Biases before training starts.
wandb.login()

In [None]:
# --- Process topology --------------------------------------------------------
# WORLD_SIZE is read from the environment; 1 is assumed when it is not set.
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1  # True whenever WORLD_SIZE is anything other than 1

# --- Sequence length and batching --------------------------------------------
model_max_length = 512  # max_length handed to the tokenizer
batch_size = 128        # divided by micro_batch_size to get accumulation steps
micro_batch_size = 4    # per-device batch size handed to the Trainer

# --- LoRA settings -----------------------------------------------------------
lora_r = 8
lora_alpha = 16
lora_dropout = 0.05
lora_target_modules = ["q_proj", "v_proj"]

In [None]:
# Label value that tokenize_function writes over the prompt positions.
IGNORE_INDEX = -100

# Special-token strings assigned to the tokenizer below.
DEFAULT_PAD_TOKEN = "[PAD]"
DEFAULT_EOS_TOKEN = "</s>"
DEFAULT_BOS_TOKEN = "<s>"
DEFAULT_UNK_TOKEN = "<unk>"

# Slow (non-fast) LLaMA tokenizer, right-padded, capped at model_max_length.
tokenizer = LlamaTokenizer.from_pretrained(
    "decapoda-research/llama-7b-hf",
    model_max_length=model_max_length,
    padding_side="right",
    use_fast=False,
)

# Set the special tokens explicitly: pad, eos, bos, unk.
tokenizer.pad_token = DEFAULT_PAD_TOKEN
tokenizer.eos_token = DEFAULT_EOS_TOKEN
tokenizer.bos_token = DEFAULT_BOS_TOKEN
tokenizer.unk_token = DEFAULT_UNK_TOKEN

In [None]:
# Prompt templates. "prompt_input" is for examples that carry extra context in
# an "input" field; "prompt_no_input" is for examples that only have an
# instruction.
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}
list_data_dict = raw_dataset
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]


def _build_prompt(example):
    """Return the prompt text for one example dict.

    Uses the "prompt_input" template when the example has a non-empty
    "input" field, so that context is not silently dropped, and the
    "prompt_no_input" template otherwise (missing, None or empty "input").
    """
    if example.get("input"):
        return prompt_input.format_map(example)
    return prompt_no_input.format_map(example)


# Prompt (source) text for every example, in dataset order.
sources_list = [_build_prompt(example) for example in list_data_dict]
# Expected completion for every example, terminated by the tokenizer's EOS token.
targets = [f"{example['output']}{tokenizer.eos_token}" for example in list_data_dict]

In [None]:
# To train on a smaller portion of the dataset, lower pop_size.
pop_size = len(raw_dataset)

# Random visiting order over the first pop_size indices.
order = list(range(pop_size))
random.shuffle(order)

# Full training text (prompt followed by target) for every example, then both
# the full texts and the prompts re-ordered according to `order`.
examples_list = [source + target for source, target in zip(sources_list, targets)]
examples = [examples_list[idx] for idx in order]
sources = [sources_list[idx] for idx in order]

# Separate copies of the two lists, gathered into the column dict consumed by
# Dataset.from_dict.
all_examples = list(examples)
all_sources = list(sources)
full_examples = {"example": all_examples, "source": all_sources}

In [None]:
# Build a Dataset from the column dict (keys "example" and "source").
full_examples_dataset = Dataset.from_dict(full_examples)

In [None]:
def tokenize_function(example):
    """Tokenize one record and attach its label sequence.

    ``example["example"]`` (prompt followed by target) is tokenized to form
    the model input. ``example["source"]`` (prompt only) is tokenized solely
    to measure how many leading positions belong to the prompt.

    The labels are ``IGNORE_INDEX`` repeated for that many positions,
    followed by the input ids from that position onwards.

    Returns the tokenizer output for the full text with an extra "labels"
    entry.
    """
    tokenizer_kwargs = dict(padding="longest", max_length=model_max_length, truncation=True)
    encoded = tokenizer(example["example"], **tokenizer_kwargs)
    prompt_length = len(tokenizer(example["source"], **tokenizer_kwargs)["input_ids"])
    encoded["labels"] = [IGNORE_INDEX] * prompt_length + encoded["input_ids"][prompt_length:]
    return encoded

In [None]:
# Run tokenize_function over the dataset; map is called without batching
# arguments, so the function receives one record at a time.
train_dataset = full_examples_dataset.map(tokenize_function)

In [None]:
# Drop the raw text columns now that the tokenized fields exist.
# NOTE: this rebinds train_dataset, so re-running this cell alone will fail
# because the columns are already gone; re-run the map cell first.
train_dataset = train_dataset.remove_columns(["example", "source"])

In [None]:
# Batch collator built on the tokenizer: padding enabled, padded length
# rounded to a multiple of 8, PyTorch tensors returned.
data_collator = DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True)

In [None]:
# Number of micro-batches accumulated to reach the configured batch_size.
gradient_accumulation_steps = batch_size // micro_batch_size

# Re-read the process topology from the environment.
world_size = int(os.environ.get("WORLD_SIZE", 1))
ddp = world_size != 1

if not ddp:
    # Single process: let the loader place the model automatically.
    device_map = "auto"
else:
    # Distributed run: pin the whole model to this process's LOCAL_RANK device
    # and split the accumulation steps across the processes.
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    gradient_accumulation_steps //= world_size

In [None]:
# Load the base checkpoint with 8-bit loading enabled, float16 as the torch
# dtype, and the device placement chosen in the previous cell.
model = LlamaForCausalLM.from_pretrained(
    "decapoda-research/llama-7b-hf",
    load_in_8bit=True,
    torch_dtype=torch.float16,
    device_map=device_map,
)

In [None]:
# LoRA adapter configuration, assembled from the hyperparameters defined
# earlier in the notebook.
config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=lora_target_modules,
    bias="none",
)

# Prepare the loaded base model for k-bit training, then wrap it with the
# LoRA adapters described by `config`.
model = get_peft_model(prepare_model_for_kbit_training(model), config)

In [None]:
def print_gpu_utilization(device_index=0):
    """Print the memory currently in use on one GPU, in MB.

    Args:
        device_index: NVML index of the GPU to query. Defaults to 0, the
            device this function always queried before the parameter existed.

    NVML is initialised for the duration of the query and shut down again
    afterwards, even if the query raises.
    """
    nvmlInit()
    try:
        handle = nvmlDeviceGetHandleByIndex(device_index)
        info = nvmlDeviceGetMemoryInfo(handle)
    finally:
        # Pair every nvmlInit with a shutdown so repeated calls do not leave
        # NVML initialised.
        nvmlShutdown()
    print(f"GPU memory occupied: {info.used//1024**2} MB.")

In [None]:
# --- Training settings -------------------------------------------------------
num_epochs = 3
learning_rate = 3e-4
val_set_size = 0
group_by_length = False  # faster, but produces an odd training loss curve
resume_from_checkpoint = None
use_wandb = True
os.environ["WANDB_PROJECT"] = wandb_project

# NOTE(review): the three settings below are not read by any cell visible in
# this notebook (tokenization always masks the prompt and uses
# model_max_length); they are kept only so existing references keep working.
cutoff_len = 256
train_on_inputs = True  # if False, masks out inputs in loss
add_eos_token = False

# gradient_accumulation_steps is deliberately NOT recomputed here: the
# device-map cell already derived it from batch_size / micro_batch_size and
# divided it by world_size for distributed runs. Recomputing it would discard
# that division.

arguments = TrainingArguments(
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=100,
    num_train_epochs=num_epochs,
    learning_rate=learning_rate,
    fp16=True,
    logging_strategy="steps",
    logging_steps=1,
    optim="adamw_torch",
    # No evaluation is run. The evaluation-strategy argument is left at its
    # default ("no") instead of being passed explicitly, because the keyword
    # was renamed between transformers releases and the old name may be
    # rejected by the git-main build this notebook installs.
    save_strategy="no",
    output_dir=out_dir,
    save_total_limit=3,
    load_best_model_at_end=val_set_size > 0,
    ddp_find_unused_parameters=False if ddp else None,
    group_by_length=group_by_length,
    # "none" disables reporting explicitly; None does not disable it on older
    # transformers versions.
    report_to="wandb" if use_wandb else "none",
    run_name=wandb_run_name if use_wandb else None,
)

trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=None,
    args=arguments,
    data_collator=DataCollatorForSeq2Seq(tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True),
)
model.config.use_cache = False

# The model's state_dict is intentionally left unpatched. Replacing it with a
# wrapper around get_peft_model_state_dict makes save_pretrained filter the
# adapter weights twice with current peft releases, which writes an empty
# adapter file.
#
# torch.compile is not applied here either: the Trainer above already holds a
# reference to the model, so compiling afterwards only rebound the notebook
# variable and never affected training.

trainer.train(resume_from_checkpoint=resume_from_checkpoint)

print_gpu_utilization()

In [None]:
# Save the fine-tuned model under <out_dir>/<model_name>. os.path.join is used
# so the result does not depend on out_dir ending with a path separator.
model.save_pretrained(os.path.join(out_dir, model_name))