In [1]:
!pip install -r requirements.txt

Collecting setuptools_rust==1.7.0
  Downloading setuptools_rust-1.7.0-py3-none-any.whl (25 kB)
Collecting setuptools==68.1.2
  Downloading setuptools-68.1.2-py3-none-any.whl (805 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m805.1/805.1 kB[0m [31m14.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting wheel==0.41.2
  Downloading wheel-0.41.2-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.8/64.8 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting peft==0.5.0
  Downloading peft-0.5.0-py3-none-any.whl (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.6/85.6 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate==0.22.0
  Downloading accelerate-0.22.0-py3-none-any.whl (251 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m251.2/251.2 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets==2.14.4
  Downloading datasets-2.1

In [2]:
!pip install --upgrade gradio

Collecting gradio
  Downloading gradio-3.44.1-py3-none-any.whl (20.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.2/20.2 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting altair<6.0,>=4.2.0
  Downloading altair-5.1.1-py3-none-any.whl (520 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m520.6/520.6 kB[0m [31m53.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydantic!=1.8,!=1.8.1,!=2.0.0,!=2.0.1,<3.0.0,>=1.7.4
  Downloading pydantic-2.3.0-py3-none-any.whl (374 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.5/374.5 kB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting gradio-client==0.5.0
  Downloading gradio_client-0.5.0-py3-none-any.whl (298 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.2/298.2 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
Collecting python-multipart
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━

In [3]:
!pip install typing_extensions==4.7.1 --upgrade

[0m

In [6]:
from dataclasses import dataclass, field
from typing import Optional, Dict
import logging
import nvidia_smi
import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    PreTrainedTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    TrainerCallback
)
from peft import (
    LoraConfig,
    AutoPeftModelForCausalLM
)
from transformers.trainer_callback import TrainerControl, TrainerState
from transformers.training_args import TrainingArguments
from trl import SFTTrainer
from helpers.storj import Storj

# Module-level logger with its threshold set to INFO.
# NOTE(review): no visible cell logs through it yet; progress is reported with print().
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

@dataclass
class ModelArguments:
    """Command-line options naming the base model and the fine-tuned model.

    Parsed by ``HfArgumentParser`` inside ``finetune``; ``model_name`` is the
    identifier handed to ``from_pretrained`` for both model and tokenizer.
    """
    model_name: Optional[str] = field(
        default="meta-llama/Llama-2-7b-hf",
        # The help text previously advertised the 7B-chat model, which is not the default.
        metadata={"help": "The model that you want to train from Huggingface. Defaults to Meta's Llama2 7B base model (`meta-llama/Llama-2-7b-hf`) and requires a HF login"}
    )
    # NOTE(review): not read by any visible code path yet.
    new_model_name: Optional[str] = field(
        default="agora-llama-7b-chat",
        metadata={"help": "The name for your fine-tuned model"}
    )

@dataclass
class DataArguments:
    """Command-line options that select the training data."""

    # Dataset identifier forwarded to `load_dataset` in `finetune`.
    hf_data_path: str = field(default="iamtarun/python_code_instructions_18k_alpaca", metadata={"help": "The path to the HF dataset. Defaults to `iamtarun/python_code_instructions_18k_alpaca`"})

    # TODO: should "train" be the default split?
    split: Optional[str] = field(default="train", metadata={"help": "Which portion of the dataset you want to use"})

    # NOTE(review): not consumed by any visible code yet.
    personal_data: Optional[str] = field(default=None, metadata={"help": "The path to your proprietary data"})

@dataclass
class ModelTrainingArguments(TrainingArguments):
    """``TrainingArguments`` with project-specific defaults and a few extra fields.

    The extra fields (``job_id``, ``bucket_name``, ``cache_dir``,
    ``model_max_length``, ``max_seq_length``, ``packing``, ``device_map``) are
    declared here; the remaining fields re-declare options of the parent class
    with different defaults. Field order and defaults are unchanged.
    """
    job_id : Optional[int] = field(
        default=None,
        metadata={"help": "Unique id for the model training job"}
    )
    bucket_name : Optional[str] = field(
        default=None,
        metadata={"help": "The name of the Storj bucket to upload/download checkpoints to and from"}
    )
    # Specify an additional cache dir for files downloaded during training
    # Usually things are downloaded into ~/.cache/huggingface
    # Adding this is helpful for distributed training where all workers should read from a central cache
    cache_dir : Optional[str] = field(
        default=None,
        metadata={"help": "Optional path where you want model checkpoints and final model to be saved"}
    )
    model_max_length: int = field(
        default=512,
        metadata={"help": "Different models have different max lengths but this keeps it at a standard 512 incase you don't specify. Seq might be truncated"}
    )
    output_dir : str = field(
        default="./results",
        metadata={"help": "Directory where the Trainer writes checkpoints (defaults to ./results)"}
    )
    # num_train_epochs : int = field(
    #     default=1,
    #     metadata={"help": "Number of training epochs"}
    # )
    fp16 : bool = field(
        default=True,
        metadata={"help": "Enable fp16 training"}
    )
    bf16 : bool = field(
        default=False,
        metadata={"help": "Enable bf16 training. Only possible on A100 GPUs"}
    )
    per_device_train_batch_size : int = field(
        default=1,
        metadata={"help": "Training batch size per device"}
    )
    gradient_accumulation_steps : int = field(
        default=4,
        metadata={"help": "Number of updates steps to accumulate the gradients for, before performing a backward/update pass."}
    )
    gradient_checkpointing : bool = field(
        default=True,
        metadata={"help": "If True, use gradient checkpointing to save memory at the expense of slower backward pass."}
    )
    max_grad_norm : float = field(
        default=0.0,
        metadata={"help": "Maximum gradient norm (gradient clipping)"}
    )
    learning_rate : float = field(
        default=2e-4,
        metadata={"help": "Initial learning rate (AdamW optimizer)"}
    )
    weight_decay : float = field(
        default=0.001,
        metadata={"help": "Weight decay to apply to all layers except bias/LayerNorm weights"}
    )
    optim : str = field(
        default="paged_adamw_32bit",
        metadata={"help": "Optimizer to use for training"}
    )
    lr_scheduler_type : str = field(
        default="constant",
        metadata={"help": "Learning rate schedule (constant a bit better than cosine)"}
    )
    warmup_ratio : float = field(
        default=0.03,
        metadata={"help": "Ratio of steps for a linear warmup (from 0 to learning rate)"}
    )
    group_by_length : bool = field(
        default=True,
        metadata={"help": "Group sequences into batches with same length - Saves memory and speeds up training considerably"}
    )
    save_steps : int = field(
        default=10,
        metadata={"help": "Save checkpoint every X updates steps"}
    )
    save_total_limit: int = field(
        default=2,
        metadata={"help": "Maximum number of checkpoints to keep on disk"}
    )
    # NOTE(review): 25 is larger than the default max_steps (20) below, so a default
    # run finishes before the first log step - consistent with the empty
    # "Step / Training Loss" table in the recorded output. Confirm this is intended.
    logging_steps : int = field(
        default=25,
        metadata={"help": "Log every X updates steps"}
    )
    max_seq_length : Optional[int] = field(
        default=None,
        metadata={"help":"Maximum sequence length to use"}
    )
    packing : bool = field(
        default=False,
        metadata={"help":"Pack multiple short examples in the same input sequence to increase efficiency"}
    )
    # NOTE(review): `any` is the builtin function, not `typing.Any`. Left untouched
    # because HfArgumentParser inspects field types - confirm before changing it.
    device_map : any = field(
        default_factory=(lambda: {"":0}),
        metadata={"help":"Device mapping for the SFTTrainer"}
    )
    max_steps: int = field(
        default=20,
        metadata={"help": "Total number of training steps to perform"}
    )

@dataclass
class QuanitzationArguments:
    """Quantization options; every field carries a default so none is required on the CLI."""

    load_in_4bit: bool = field(default=True, metadata={"help": "Load a model in 4bit"})

    bnb_4bit_compute_dtype: torch.dtype = field(default=torch.float16, metadata={"help": "Compute dtype for 4-bit base models"})

    bnb_4bit_quant_type: Optional[str] = field(default="nf4", metadata={"help": "Quantization type (fp4 or nf4)"})

    use_nested_quant: Optional[bool] = field(default=False, metadata={"help": "Activate nested quantization for 4-bit base models (double quantization)"})

@dataclass
class QloraArguments:
    """LoRA adapter options; every field carries a default so none is required on the CLI."""

    lora_r: Optional[int] = field(default=64, metadata={"help": "LoRA attention dimension"})

    lora_alpha: Optional[int] = field(default=16, metadata={"help": "Alpha parameter for LoRA scaling"})

    lora_dropout: Optional[float] = field(default=0.1, metadata={"help": "Dropout probability for LoRA layers"})

    # The two fields below are forwarded to LoraConfig as `bias` and `task_type`.
    bias: Optional[str] = field(default="none", metadata={})

    task_type: Optional[str] = field(default="CAUSAL_LM", metadata={})

class CheckpointCallback(TrainerCallback):
    """Trainer callback that hands every saved checkpoint to the Storj helper."""

    def __init__(self, training_args: TrainingArguments, storj: Storj) -> None:
        # `training_args` is only stored; `on_save` reads the `args` the Trainer passes in.
        self.storj = storj
        self.training_args = training_args

    def on_save(self, args: TrainingArguments, state: TrainerState, control: TrainerControl, **kwargs):
        """Forward the output directory, current global step and job id to Storj."""
        checkpoint_root = args.output_dir
        current_step = state.global_step
        self.storj.save_checkpoints_to_cloud(checkpoint_root, current_step, args.job_id)
        
def huggingface_login():
    """Log in to the Hugging Face Hub with a token taken from the environment.

    The token is read from the ``HUGGING_FACE_TOKEN`` environment variable so
    that no credential lives in the notebook. A token used to be hard-coded
    here; any token that was ever committed must be treated as compromised
    and revoked.

    Raises:
        RuntimeError: if ``HUGGING_FACE_TOKEN`` is unset or empty.
    """
    token = os.environ.get("HUGGING_FACE_TOKEN")
    if not token:
        raise RuntimeError(
            "Need to pass hugging face access token as environment variable "
            "(HUGGING_FACE_TOKEN)."
        )

    login(token=token)

def safe_save_model_for_hf_trainer(trainer: Trainer, output_dir: str):
    """Save the trainer's current model weights to ``output_dir`` via CPU copies.

    The state dict is always fetched from ``trainer.model``. Only when
    ``trainer.args.should_save`` is true is each tensor copied with ``.cpu()``,
    the original dict reference dropped, and ``trainer._save`` called with the
    CPU copies.
    """
    weights = trainer.model.state_dict()
    if not trainer.args.should_save:
        return

    weights_on_cpu = {}
    for name, tensor in weights.items():
        weights_on_cpu[name] = tensor.cpu()
    # Drop the reference to the original dict before writing.
    del weights
    trainer._save(output_dir, state_dict=weights_on_cpu)

def preprocess_data(source, tokenizer: PreTrainedTokenizer) -> Dict:
    """Placeholder: ignores both arguments and returns a new empty dict."""
    return dict()

def print_gpu_utilization():
    """Print one line of memory statistics for each device NVML reports.

    NVML is initialised on entry and always shut down on exit, including when
    one of the per-device queries raises.
    """
    nvidia_smi.nvmlInit()
    try:
        deviceCount = nvidia_smi.nvmlDeviceGetCount()
        for i in range(deviceCount):
            handle = nvidia_smi.nvmlDeviceGetHandleByIndex(i)
            info = nvidia_smi.nvmlDeviceGetMemoryInfo(handle)
            print("Device {}: {}, Memory : ({:.2f}% free): {}(total), {} (free), {} (used)".format(i, nvidia_smi.nvmlDeviceGetName(handle), 100*info.free/info.total, info.total, info.free, info.used))
    finally:
        # Previously skipped when a query above failed, leaving NVML initialised.
        nvidia_smi.nvmlShutdown()

def build_bnb_config(quant_args) -> BitsAndBytesConfig:
    """Build the bitsandbytes quantization config from the parsed quantization args.

    Args:
        quant_args: object exposing ``load_in_4bit``, ``bnb_4bit_quant_type`` and
            ``bnb_4bit_compute_dtype``; ``use_nested_quant`` is optional and
            treated as False when missing or None.

    Returns:
        The ``BitsAndBytesConfig`` built from those values.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=quant_args.load_in_4bit,
        bnb_4bit_quant_type=quant_args.bnb_4bit_quant_type,
        bnb_4bit_compute_dtype=quant_args.bnb_4bit_compute_dtype,
        # `use_nested_quant` was declared on the arguments class but never forwarded.
        bnb_4bit_use_double_quant=bool(getattr(quant_args, "use_nested_quant", False)),
    )
    return bnb_config

def build_lora_config(qlora_args) -> LoraConfig:
    """Translate the parsed QLoRA arguments into a ``LoraConfig``.

    Reads ``lora_r``, ``lora_alpha``, ``lora_dropout``, ``bias`` and
    ``task_type`` from ``qlora_args`` and passes them as keyword arguments.
    """
    lora_kwargs = {
        "r": qlora_args.lora_r,
        "lora_alpha": qlora_args.lora_alpha,
        "lora_dropout": qlora_args.lora_dropout,
        "bias": qlora_args.bias,
        "task_type": qlora_args.task_type,
    }
    return LoraConfig(**lora_kwargs)

def finetune():
    """Run the fine-tuning job end to end.

    Logs in to the Hugging Face Hub, parses the five argument dataclasses from
    the command line, optionally pulls checkpoints from the configured Storj
    bucket, loads the quantized base model and its tokenizer, trains with
    ``SFTTrainer``, then calls ``trainer.save_state()`` and writes the final
    weights to ``results/finalsave``.
    """
    huggingface_login()

    parser = HfArgumentParser(
        (ModelArguments, DataArguments, ModelTrainingArguments, QuanitzationArguments, QloraArguments)
    )
    # return_remaining_strings=True makes the parser hand back unknown argv entries
    # instead of failing on them. The original TODO says this was added for "a weird
    # error on Vast"; presumably the extra arguments the Jupyter kernel launcher puts
    # in sys.argv - TODO confirm.
    model_args, data_args, training_args, quant_args, qlora_args, _unparsed = parser.parse_args_into_dataclasses(return_remaining_strings=True)

    # If a bucket name was given, look for checkpoints in the user's bucket.
    storj = None
    resume_from_checkpoint = False
    if training_args.bucket_name:
        storj = Storj(training_args.bucket_name)
        resume_from_checkpoint = storj.pull_checkpoints_from_cloud(training_args)

    bnb_config = build_bnb_config(quant_args=quant_args)
    peft_config = build_lora_config(qlora_args=qlora_args)

    model = AutoModelForCausalLM.from_pretrained(
        model_args.model_name,
        quantization_config = bnb_config,
        device_map = "auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(model_args.model_name)
    # Reuse the end-of-sequence token for padding.
    tokenizer.pad_token = tokenizer.eos_token

    dataset = load_dataset(data_args.hf_data_path, split=data_args.split)
    dataset = dataset.remove_columns(['instruction', 'input', 'output']) #TODO: this is python dataset specific preprocessing. Will need to handle this inside preprocess function somehow

    trainer = SFTTrainer(
        model=model,
        train_dataset=dataset,
        peft_config=peft_config,
        dataset_text_field="prompt", #TODO: this will change based on dataset. I would add this as an optional default into DataArguments
        # These two used to be hard-coded to None / False, which silently ignored the
        # matching ModelTrainingArguments fields; their defaults are the same values.
        max_seq_length=training_args.max_seq_length,
        tokenizer=tokenizer,
        args=training_args,
        packing=training_args.packing,
    )
    if storj is not None:
        trainer.add_callback(CheckpointCallback(training_args, storj))
    trainer.train(resume_from_checkpoint=resume_from_checkpoint)
    trainer.save_state() #grabbed from skypilot but need to understand state better
    print("state saved")
    # Later cells load the adapter from this exact path, so it stays fixed.
    safe_save_model_for_hf_trainer(trainer=trainer,
                                   output_dir="results/finalsave")
    print("all done")

In [7]:
# Start the fine-tuning run defined in the previous cell.
finetune()

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/905 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/18612 [00:00<?, ? examples/s]



Map:   0%|          | 0/18612 [00:00<?, ? examples/s]

You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss


state saved
all done


In [13]:
def build_lora_config(qlora_args=None) -> LoraConfig:
    """Build the LoRA config, falling back to the notebook defaults.

    This definition shadows the earlier ``build_lora_config(qlora_args)``.
    It accepts the same optional argument so that ``finetune()`` still works
    after this cell has run.

    Args:
        qlora_args: optional object exposing ``lora_alpha``, ``lora_dropout``,
            ``lora_r``, ``bias`` and ``task_type``. Any attribute that is
            missing (or ``qlora_args`` being None) falls back to the default
            previously hard-coded here.
    """
    peft_config = LoraConfig(
        lora_alpha=getattr(qlora_args, "lora_alpha", 16),
        lora_dropout=getattr(qlora_args, "lora_dropout", 0.1),
        r=getattr(qlora_args, "lora_r", 64),
        bias=getattr(qlora_args, "bias", "none"),
        task_type=getattr(qlora_args, "task_type", "CAUSAL_LM")
    )
    return peft_config

def build_bnb_config(quant_args=None) -> BitsAndBytesConfig:
    """Build the bitsandbytes config, falling back to the notebook defaults.

    This definition shadows the earlier ``build_bnb_config(quant_args)``.
    It accepts the same optional argument so that ``finetune()`` still works
    after this cell has run.

    Args:
        quant_args: optional object exposing ``load_in_4bit``,
            ``bnb_4bit_quant_type``, ``bnb_4bit_compute_dtype`` and
            ``use_nested_quant``. Any attribute that is missing (or
            ``quant_args`` being None) falls back to the default previously
            hard-coded here; nested quantization defaults to off.
    """
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=getattr(quant_args, "load_in_4bit", True),
        bnb_4bit_quant_type=getattr(quant_args, "bnb_4bit_quant_type", "nf4"),
        bnb_4bit_compute_dtype=getattr(quant_args, "bnb_4bit_compute_dtype", torch.float16),
        bnb_4bit_use_double_quant=bool(getattr(quant_args, "use_nested_quant", False)),
    )
    return bnb_config

# Default quantization config; reused by the quantized base-model load further down.
bnb = build_bnb_config()
# NOTE(review): `peft` is not referenced by any later visible cell.
peft = build_lora_config()

In [8]:
from peft import AutoPeftModelForCausalLM, get_peft_model, PeftModel, PeftConfig

In [9]:
# Load the fine-tuned adapter saved by finetune() from "results/finalsave".
model = AutoPeftModelForCausalLM.from_pretrained(
    "results/finalsave", 
    device_map="auto", 
    torch_dtype=torch.bfloat16,
    # quantization_config=bnb is deliberately left out: per the original note,
    # merge_and_unload does not work on the quantized model. (That note said
    # "8bit"; `bnb` as defined in this notebook is a 4-bit config.)
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
# Alternative way to load the same adapter: load the base model first, then apply
# the adapter from "results/finalsave" on top of it.
# Unlike the AutoPeftModelForCausalLM load, the base model here is loaded with the
# `bnb` quantization config, so the two resulting models are not stored identically.
base_model = AutoModelForCausalLM.from_pretrained(
        "meta-llama/Llama-2-7b-hf",
        quantization_config = bnb,
        device_map = "auto",
    )

# Despite the name, this is the directory holding the saved adapter.
peft_config_path = "results/finalsave"
pmodel = PeftModel.from_pretrained(base_model, peft_config_path)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# Only the last expression of a cell is displayed, so the type() result is not shown.
type(model)
# Parameter count of the model loaded through AutoPeftModelForCausalLM.
model.num_parameters()

6771970048

In [15]:
# Only the last expression of a cell is displayed, so the type() result is not shown.
type(pmodel)
# Parameter count of the quantized base model plus adapter.
pmodel.num_parameters()

3533967360

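Both ways of loading the fine-tuned adapter give a usable model, which is good to know. Note, however, that the parameter counts printed above are not equal (6,771,970,048 vs 3,533,967,360): the second model was loaded with the 4-bit quantization config, and quantized weights are most likely stored in packed form, so `num_parameters()` reports fewer elements. The two counts are therefore not directly comparable.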

In [None]:
# `pipeline` is not imported by any earlier cell, so this cell raised NameError on a
# fresh kernel; import it here.
from transformers import pipeline

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
# Reuse the end-of-sequence token for padding.
tokenizer.pad_token = tokenizer.eos_token

# Text-generation pipeline around the already loaded adapter model.
# NOTE(review): `model` was loaded with torch.bfloat16 but float16 is requested here;
# confirm whether torch_dtype/device_map have any effect for an already loaded model.
llama_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    torch_dtype=torch.float16,
    device_map="auto")

In [None]:
import gradio as gr

In [None]:
# Wrap the text-generation pipeline in a Gradio interface.
demo = gr.Interface.from_pipeline(llama_pipeline)
# NOTE(review): share=True requests a public share link - confirm that exposing the
# model outside this machine is intended.
demo.launch(share=True)

In [18]:
# Merge the adapter into the (non-quantized) model loaded via AutoPeftModelForCausalLM.
merged_model = model.merge_and_unload(progressbar=True)

Unloading and merging model: 100%|██████████| 454/454 [00:00<00:00, 29059.10it/s]


In [19]:
# Save the merged model and the base tokenizer side by side in one directory.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")

output_merged_dir = "results/llama2/final_merged_model"
# exist_ok=True makes the cell safe to re-run.
os.makedirs(output_merged_dir, exist_ok=True)
merged_model.save_pretrained(output_merged_dir)
# Last expression of the cell: the tuple of written tokenizer files is displayed.
tokenizer.save_pretrained(output_merged_dir)

('results/llama2/final_merged_model/tokenizer_config.json',
 'results/llama2/final_merged_model/special_tokens_map.json',
 'results/llama2/final_merged_model/tokenizer.json')

In [20]:
#needed for fastchat
!pip3 install "fschat[model_worker,webui]"

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting fschat[model_worker,webui]
  Downloading fschat-0.2.28-py3-none-any.whl (196 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m196.9/196.9 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m38.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting pydantic<2,>=1
  Downloading pydantic-1.10.12-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m63.4 MB/s[0m eta [36m0:0