# Install Docker CLI

First, let's install the Docker CLI, which is required to use SageMaker Local Mode.

With Local Mode, developers can now train and test models, debug code, and validate end-to-end pipelines directly on their SageMaker Studio notebook instance without the need for spinning up remote compute resources. This reduces the iteration cycle from minutes down to seconds, boosting developer productivity. Docker support in SageMaker Studio notebooks enables developers to effortlessly build Docker containers and access pre-built containers, providing a consistent development environment across the team and avoiding time-consuming setup and dependency management.

Learn more --> [Accelerate ML workflows with Amazon SageMaker Studio Local Mode and Docker support](https://aws.amazon.com/blogs/machine-learning/accelerate-ml-workflows-with-amazon-sagemaker-studio-local-mode-and-docker-support/)

In [None]:
%%bash
# Refresh the package index first: on a fresh image the apt lists may be empty,
# in which case the install below cannot locate the packages.
sudo apt-get -y update
sudo apt-get -y install ca-certificates curl gnupg

# Register Docker's GPG key in a dedicated keyring directory.
sudo install -m 0755 -d /etc/apt/keyrings
# --yes lets gpg overwrite an existing keyring, so re-running this cell does not
# stop on an interactive "overwrite?" prompt.
curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --yes --dearmor -o /etc/apt/keyrings/docker.gpg
sudo chmod a+r /etc/apt/keyrings/docker.gpg
# Add the Docker apt repository for this machine's architecture and Ubuntu codename.
echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] https://download.docker.com/linux/ubuntu \
  $(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
  sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

In [None]:
%%bash
sudo apt-get -y update

# pick the latest patch from:
# apt-cache madison docker-ce | awk '{ print $3 }' | grep -i 20.10
# The package version embeds the Ubuntu codename. Derive it from the OS (the same
# way the repository line is built) instead of hard-coding "jammy", so the pin
# matches the repository that was actually configured.
CODENAME=$(. /etc/os-release && echo "$VERSION_CODENAME")
VERSION_STRING="5:20.10.24~3-0~ubuntu-${CODENAME}"
sudo apt-get install "docker-ce-cli=${VERSION_STRING}" docker-compose-plugin -y

In [None]:
%%bash 
# validate the Docker Client is able to access Docker Server at [unix:///docker/proxy.sock]
# NOTE(review): presumably the command reports an error for the server section when
# the daemon socket is unreachable — confirm in your Studio environment.
docker version

## Install Prerequisites

In [None]:
# Install pinned versions of the libraries used by the data-preparation cells below
# (-U upgrades an existing install, -q keeps pip's output quiet).
%pip install -Uq datasets==2.18.0
%pip install -Uq transformers==4.39.0

In [None]:
import os
import glob
import boto3
import pprint
from tqdm import tqdm
import sagemaker
from sagemaker.collection import Collection
from sagemaker.utils import name_from_base

In [None]:
# Shared SageMaker/boto3 handles used throughout the notebook.
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()
role = sagemaker.get_execution_role()
sm_client = boto3.client("sagemaker", region_name=region)
model_collector = Collection(sagemaker_session=sagemaker_session)

## Define Parameters

In [None]:
# Base model to fine-tune.
model_id = "mistralai/Mistral-7B-v0.1"
# define a base dataset to finetune this base model
dataset_name = "databricks/databricks-dolly-15k"
# data s3 path
s3_data_uri = f"s3://{default_bucket}/dataset-for-training/dolly2"
# your hf token: taken from the HF_TOKEN environment variable when it is set, so the
# real secret does not have to be typed into (and saved with) the notebook; the
# placeholder below is kept as the fallback.
hf_token = os.environ.get("HF_TOKEN", "hf_xxxxx")

## Prepare Dataset

In [None]:
from datasets import load_dataset
from random import randrange

# Both splits are carved out of the hub "train" split:
# the first 5% for training and the last 5% for validation.
train_dataset = load_dataset(dataset_name, split="train[:05%]")
validation_dataset = load_dataset(dataset_name, split="train[95%:]")

train_size, validation_size = len(train_dataset), len(validation_dataset)
print(f"Training size: {train_size} | Validation size: {validation_size}")

# Show one randomly chosen record from each split as a sanity check.
for heading, split in (
    ("\nTraining sample:\n", train_dataset),
    ("\nValidation sample:\n", validation_dataset),
):
    print(heading)
    print(split[randrange(len(split))])

In [None]:
def format_dolly(sample):
    """Render one dolly record as a prompt string.

    The prompt is made of "### Instruction", an optional "### Context" (only
    when ``sample["context"]`` is non-empty) and "### Answer" sections, joined
    by blank lines.
    """
    sections = [f"### Instruction\n{sample['instruction']}"]
    if len(sample["context"]) > 0:
        sections.append(f"### Context\n{sample['context']}")
    sections.append(f"### Answer\n{sample['response']}")
    # join all the parts together
    return "\n\n".join(sections)

In [None]:
from random import randrange

# Render one randomly selected training record with the prompt template.
random_index = randrange(len(train_dataset))
print(format_dolly(train_dataset[random_index]))

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published with `model_id`; `hf_token` is passed to the hub download.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_token)
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer ships without a pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from random import randint
from itertools import chain
from functools import partial


# template dataset to add prompt to each sample
def template_dataset(sample):
    """Attach a "text" field holding the formatted prompt followed by the EOS token."""
    prompt = format_dolly(sample)
    sample["text"] = f"{prompt}{tokenizer.eos_token}"
    return sample


# apply prompt template per sample
# train
train_dataset = train_dataset.map(template_dataset, remove_columns=list(train_dataset.features))
# validation
validation_dataset = validation_dataset.map(template_dataset, remove_columns=list(validation_dataset.features))
# print random sample
# randint() includes BOTH endpoints, so the upper bound must be len - 1;
# using len() would occasionally index one past the end and raise IndexError.
print(validation_dataset[randint(0, len(validation_dataset) - 1)]["text"])

# Leftover tokens (per column) that did not fill a whole chunk in the previous
# batch; `chunk` prepends them to the next batch it receives.
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}


def chunk(sample, chunk_length=2048):
    """Pack a batch of tokenized examples into fixed-length chunks.

    Every column of ``sample`` (a mapping of column name -> list of token
    lists) is flattened, prefixed with the leftover tokens stored in the
    module-level ``remainder``, and cut into consecutive pieces of exactly
    ``chunk_length`` items. Tokens that do not fill a complete chunk are
    stored back into ``remainder`` for the next call.

    Args:
        sample: batch mapping; must contain an "input_ids" column.
        chunk_length: number of tokens per output chunk.

    Returns:
        dict with the same columns as ``sample`` (each a list of chunks) plus
        "labels", a copy of the "input_ids" chunk list. When the batch plus
        remainder is shorter than ``chunk_length`` every column is an empty
        list and all tokens are carried over.
    """
    # define global remainder variable to save remainder from batches to use in next batch
    global remainder
    # Concatenate all texts and add remainder from previous batch.
    # .get() tolerates columns that have no carried-over entry yet.
    concatenated_examples = {
        k: remainder.get(k, []) + list(chain.from_iterable(sample[k])) for k in sample.keys()
    }
    # get total number of tokens for batch (all columns share the same length)
    batch_total_length = len(next(iter(concatenated_examples.values()), []))

    # Largest multiple of chunk_length that fits; this is 0 when the batch is
    # shorter than one chunk, which the original left undefined (UnboundLocalError).
    batch_chunk_length = (batch_total_length // chunk_length) * chunk_length

    # Split by chunks of max_len.
    result = {
        k: [t[i : i + chunk_length] for i in range(0, batch_chunk_length, chunk_length)]
        for k, t in concatenated_examples.items()
    }
    # add remainder to global variable for next batch
    remainder = {k: t[batch_chunk_length:] for k, t in concatenated_examples.items()}
    # prepare labels
    result["labels"] = result["input_ids"].copy()
    return result


# tokenize and chunk dataset

# training
lm_train_dataset = train_dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(train_dataset.features)
).map(
    partial(chunk, chunk_length=2048),
    batched=True,
)

# `chunk` carries leftover tokens between calls in the module-level `remainder`.
# Clear it here so the tail of the training split is not prepended to the
# first validation chunk.
remainder = {"input_ids": [], "attention_mask": [], "token_type_ids": []}

# validation
lm_valid_dataset = validation_dataset.map(
    lambda sample: tokenizer(sample["text"]), batched=True, remove_columns=list(validation_dataset.features)
).map(
    partial(chunk, chunk_length=2048),
    batched=True,
)

# Print total number of samples: report the chunked datasets that will actually
# be trained/evaluated on, not the raw (un-chunked) validation split.
print(f"Total number of samples: {len(lm_train_dataset)} train | {len(lm_valid_dataset)} validation")

## Upload dataset to S3

This step is optional if you are only using Local Mode. Uploading to S3 is required when you scale training out to a multi-node/multi-GPU SageMaker training job.

In [None]:
# Display the S3 prefix that the training split is uploaded to in the next cell.
f'{s3_data_uri}/train'

In [None]:
def _export_split(dataset, local_dir, s3_uri):
    """Write `dataset` to `local_dir`, then upload that directory to `s3_uri`."""
    dataset.save_to_disk(local_dir)
    sagemaker.s3.S3Uploader.upload(
        local_path=local_dir,
        desired_s3_uri=s3_uri,
        sagemaker_session=sagemaker_session,
    )


# save train_dataset to s3
training_input_path = f'{s3_data_uri}/train'
_export_split(lm_train_dataset, "./train", training_input_path)

print(f"saving training dataset to: {training_input_path}")

# save validation dataset to s3
validation_input_path = f'{s3_data_uri}/validation'
_export_split(lm_valid_dataset, "./validation", validation_input_path)

print(f"saving validation dataset to: {validation_input_path}")

In [None]:
from datetime import datetime
from sagemaker.huggingface import HuggingFace
from sagemaker.local import LocalSession

In [None]:
# define Training Job Name
# A minute-resolution timestamp is shared by the job and run names.
time_suffix = datetime.now().strftime('%y%m%d%H%M')
job_name = f'huggingface-qlora-{time_suffix}'
run_name = f"qlora-finetune-run-{time_suffix}"
# The experiment name is derived from the model id with "/" replaced by "-".
model_slug = model_id.replace('/', '-')
experiments_name = f"exp-{model_slug}"

In [None]:
# Placeholder role ARN handed to the estimator below.
# NOTE(review): presumably Local Mode never assumes this role — confirm.
DUMMY_IAM_ROLE = 'arn:aws:iam::111111111111:role/service-role/AmazonSageMaker-ExecutionRole-20200101T000001'
# Session object that makes the estimator run locally instead of on remote instances.
LOCAL_SESSION = LocalSession()
# NOTE(review): 'local_code' presumably tells Local Mode to use the source directory
# in place rather than staging it through S3 — confirm against the SageMaker SDK docs.
LOCAL_SESSION.config = {'local': {'local_code': True}}

### Write Session Files

In [None]:
!mkdir -p ./code

In [None]:
%%writefile code/finetune_llm.py

import os
import boto3
import json
import tarfile
import argparse
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    set_seed,
    default_data_collator,
    BitsAndBytesConfig,
    Trainer,
    TrainingArguments,
)
from peft import (
    get_peft_model,
    LoraConfig,
    prepare_model_for_kbit_training,
)
from peft.tuners.lora import LoraLayer
from datasets import load_from_disk
import torch
import bitsandbytes as bnb
import sagemaker
import shutil


# Configure PyTorch's CUDA caching allocator through its environment variable.
# NOTE(review): presumably caps the size of blocks the allocator will split at
# 512 MB to limit fragmentation — confirm against the PyTorch CUDA memory notes.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"


# Reference: https://github.com/artidoro/qlora/blob/main/qlora.py
def print_trainable_parameters(
    model,
    use_4bit=False
):
    """
    Prints the number of trainable parameters in the model.

    Args:
        model: object exposing ``named_parameters()``; each parameter must
            provide ``numel()`` and ``requires_grad``.
        use_4bit: when True the trainable count is halved before printing.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        # Floor division keeps the count an int; true division produced a
        # float, which the ":,d" format below rejects with ValueError.
        trainable_params //= 2
    # Guard against a model without parameters (division by zero).
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0
    print(
        f"all params: {all_param:,d} || trainable params: {trainable_params:,d} || trainable%: {trainable_pct}"
    )


# Reference: https://github.com/artidoro/qlora/blob/main/qlora.py
def find_all_linear_names(
    model
):
    """Return the distinct leaf names of all ``bnb.nn.Linear4bit`` modules in `model`.

    The leaf name is the last dot-separated component of the module's
    qualified name. "lm_head" is excluded from the result.
    """
    leaf_names = {
        qualified_name.split(".")[-1]
        for qualified_name, module in model.named_modules()
        if isinstance(module, bnb.nn.Linear4bit)
    }
    # needed for 16-bit
    leaf_names.discard("lm_head")
    return list(leaf_names)


def create_peft_model(
    model,
    r_value,
    lora_alpha,
    lora_dropout,
    task_type,
    gradient_checkpointing=True,
    bf16=True
):
    """Prepare `model` for k-bit training and wrap it with LoRA adapters.

    Args:
        model: base model to adapt.
        r_value: passed to ``LoraConfig`` as ``r``.
        lora_alpha: passed to ``LoraConfig`` as ``lora_alpha``.
        lora_dropout: passed to ``LoraConfig`` as ``lora_dropout``.
        task_type: passed to ``LoraConfig`` as ``task_type``.
        gradient_checkpointing: forwarded to ``prepare_model_for_kbit_training``
            and, when true, also enabled on the prepared model.
        bf16: accepted for interface compatibility; not read by this function.

    Returns:
        The model returned by ``get_peft_model``.
    """
    # prepare int-4 model for training
    prepared = prepare_model_for_kbit_training(
        model, use_gradient_checkpointing=gradient_checkpointing
    )
    if gradient_checkpointing:
        prepared.gradient_checkpointing_enable()

    # get lora target modules
    target_modules = find_all_linear_names(prepared)
    print(f"Found {len(target_modules)} modules to quantize: {target_modules}")

    lora_model = get_peft_model(
        prepared,
        LoraConfig(
            r=r_value,
            lora_alpha=lora_alpha,
            target_modules=target_modules,
            lora_dropout=lora_dropout,
            bias="none",
            task_type=task_type,
        ),
    )
    lora_model.print_trainable_parameters()

    return lora_model


def _copy_inference_files(source_dir, target_dir):
    """Copy the regular files found in `source_dir` into `target_dir`.

    Returns the number of files copied. A missing `source_dir` is reported and
    treated as "nothing to copy", so an absent optional folder cannot abort
    the run after the model has already been trained and saved.
    """
    if not os.path.isdir(source_dir):
        print(f"{source_dir} not found; skipping copy of inference files")
        return 0

    copied = 0
    for entry in os.listdir(source_dir):
        source_f = os.path.join(source_dir, entry)
        # shutil.copy handles files only; sub-directories are skipped
        if os.path.isfile(source_f):
            shutil.copy(source_f, target_dir)
            copied += 1
    return copied


def finetune_llm(args):
    """Fine-tune a 4-bit quantized causal LM with LoRA and save the result.

    Steps: seed, load the train/validation datasets from disk, load the model
    in 4-bit (NF4, double quantization, bfloat16 compute), attach LoRA
    adapters, train, evaluate, then save to ``args.sm_model_dir`` either the
    merged model (``args.merge_weights``) or the adapter, plus the tokenizer.

    Args:
        args: namespace as produced by ``read_parameters``. Fields read here:
            seed, sm_train_dir, sm_validation_dir, region, model_id, hf_token,
            gradient_checkpointing, bf16, lora_r, lora_alpha, lora_dropout,
            task_type, sm_output_dir, per_device_train_batch_size,
            learning_rate, epochs, logging_steps, merge_weights, sm_model_dir.
    """
    # set seed
    set_seed(args.seed)

    print(f"loading dataset from {args.sm_train_dir} and {args.sm_validation_dir}")

    train_dataset = load_from_disk(args.sm_train_dir)
    validation_dataset = load_from_disk(args.sm_validation_dir)

    print(f"Training Dataset: {len(train_dataset)} || Validation Dataset: {len(validation_dataset)}")

    print(f"region: {args.region}")

    # load model from the hub with a bnb config
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    model = AutoModelForCausalLM.from_pretrained(
        args.model_id,
        # the KV cache is turned off whenever gradient checkpointing is on
        use_cache=not args.gradient_checkpointing,
        device_map="auto",
        quantization_config=bnb_config,
        token=args.hf_token
    )

    tokenizer = AutoTokenizer.from_pretrained(
        args.model_id,
        token=args.hf_token
    )

    # create peft config
    model = create_peft_model(
        model,
        r_value=args.lora_r,
        lora_alpha=args.lora_alpha,
        lora_dropout=args.lora_dropout,
        task_type=args.task_type,
        gradient_checkpointing=args.gradient_checkpointing,
        bf16=args.bf16
    )

    # Define training args
    training_args = TrainingArguments(
        output_dir=f"{args.sm_output_dir}/{args.model_id}/trainer-outputs",
        per_device_train_batch_size=args.per_device_train_batch_size,
        bf16=args.bf16,  # Use BF16 if available
        learning_rate=args.learning_rate,
        num_train_epochs=args.epochs,
        gradient_checkpointing=args.gradient_checkpointing,
        logging_dir=f"{args.sm_output_dir}/{args.model_id}/logs",
        logging_strategy="steps",
        logging_steps=args.logging_steps,
        save_strategy="no",
    )

    # Create Trainer instance (the SageMaker Experiments callback is currently disabled)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=validation_dataset,
        data_collator=default_data_collator,
        # callbacks=[SageMakerExperimentsCallback(region=args.region)]
    )

    # mutes warnings during training, reenable during inference
    model.config.use_cache = False

    # Start training
    trainer.train()

    # Start evaluation and surface the metrics instead of discarding them
    eval_metrics = trainer.evaluate()
    print(f"Evaluation metrics: {eval_metrics}")

    temp_dir = "/tmp/model/"

    if args.merge_weights:
        # Save the adapter to a scratch directory so it can be reloaded in fp16
        trainer.model.save_pretrained(temp_dir, safe_serialization=False)
        # clear memory
        del model
        del trainer
        torch.cuda.empty_cache()

        from peft import AutoPeftModelForCausalLM

        # load PEFT model in fp16
        model = AutoPeftModelForCausalLM.from_pretrained(
            temp_dir,
            low_cpu_mem_usage=True,
            torch_dtype=torch.float16,
        )
        # Merge LoRA and base model and save
        model = model.merge_and_unload()
        model.save_pretrained(
            args.sm_model_dir, safe_serialization=True, max_shard_size="2GB"
        )

        # copy djl-inference files to model directory; the folder is optional,
        # so its absence must not fail the job after training has finished
        _copy_inference_files('./djl-inference/', args.sm_model_dir)

    else:
        # save finetuned LoRA model and then the tokenizer for inference
        trainer.model.save_pretrained(
            args.sm_model_dir,
            safe_serialization=True
        )
    tokenizer.save_pretrained(
        args.sm_model_dir
    )

    print("Done!")
    

def _str_to_bool(value):
    """Parse a command-line boolean such as "True"/"false"/"1"/"0"/"yes"/"no"."""
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in {"true", "t", "1", "yes", "y"}:
        return True
    if normalized in {"false", "f", "0", "no", "n"}:
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


def read_parameters(argv=None):
    """Parse hyperparameters and SageMaker environment settings.

    Args:
        argv: optional list of argument strings; ``None`` (default) parses
            ``sys.argv[1:]``. Unknown arguments are ignored.

    Returns:
        argparse.Namespace with the parsed values.
    """
    parser = argparse.ArgumentParser()
    # add model id and dataset path argument
    parser.add_argument("--hf_token", type=str, help="Hugging face token for gated models")
    parser.add_argument("--model_id", type=str, help="Hugging face model id to use for training.")
    parser.add_argument("--epochs", type=int, default=1, help="No of epochs to train the model")
    parser.add_argument("--per_device_train_batch_size", type=int, default=2, help="Batch size to use for training.")
    # "--lr" is accepted as an alias so a hyperparameter named "lr" is not silently dropped.
    parser.add_argument("--learning_rate", "--lr", dest="learning_rate", type=float, default=1e-5, help="Model learning rate")
    parser.add_argument("--seed", type=int, default=8, help="Seed to use for training")
    # type=bool would turn ANY non-empty string (including "False") into True.
    parser.add_argument("--gradient_checkpointing", type=_str_to_bool, default=True, help="Whether to use gradient checkpointing.")
    # Default to bf16 only on GPUs with compute capability >= 8; the capability
    # probe is skipped when CUDA is unavailable because it would raise.
    bf16_default = torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8
    parser.add_argument("--bf16", type=_str_to_bool, default=bf16_default, help="Whether to use bf16.")
    parser.add_argument("--lora_r", type=int, default=64, help="Lora attention dimension value")
    parser.add_argument("--lora_alpha", type=int, default=16, help="The alpha parameter for Lora scaling")
    parser.add_argument("--lora_dropout", type=float, default=0.1, help="The dropout probability for Lora layers")
    parser.add_argument(
        "--task_type",
        type=str, default="CAUSAL_LM",
        help="Choose from: CAUSAL_LM, FEATURE_EXTRACTION, QUESTION_ANS, SEQ_2_SEQ_LM, SEQ_CLS, TOKEN_CLS"
    )
    # Works both as a bare flag ("--merge_weights") and with an explicit value
    # ("--merge_weights True/False"), so an explicit False is honoured.
    parser.add_argument(
        "--merge_weights",
        type=_str_to_bool,
        nargs="?",
        const=True,
        default=False,
        help="Whether to merge LoRA weights with base model.",
    )
    parser.add_argument("--logging_steps", type=int, default=2, help="Step interval to start logging to console/sagemaker experiments")
    parser.add_argument("--region", type=str, default="us-east-1", help="SageMaker job execution region")

    # sagemaker env args: refer to this for more arguments: https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md
    # Fallbacks let the script start outside a SageMaker container as well.
    parser.add_argument("--sm_model_dir", type=str, default=os.environ.get("SM_MODEL_DIR", "/opt/ml/model"))
    parser.add_argument("--sm_train_dir", type=str, default=os.environ.get("SM_CHANNEL_TRAINING", "/opt/ml/input/data/training"))
    parser.add_argument("--sm_validation_dir", type=str, default=os.environ.get("SM_CHANNEL_VALIDATION", "/opt/ml/input/data/validation"))
    parser.add_argument("--sm_current_host", type=str, default=os.environ.get("SM_CURRENT_HOST", ""))
    # argparse applies `type` to string defaults too, so type=list split these
    # values into single characters. SM_HOSTS is a JSON list; the output
    # directory is a path; the GPU count is an integer.
    parser.add_argument("--sm_hosts", type=json.loads, default=os.environ.get("SM_HOSTS", "[]"))
    parser.add_argument("--sm_output_dir", type=str, default=os.environ.get("SM_OUTPUT_DIR", "/opt/ml/output"))
    parser.add_argument("--n_gpus", type=int, default=os.environ.get("SM_NUM_GPUS", "0"))

    args, _ = parser.parse_known_args(argv)
    return args


def main():
    """Script entry point: parse the arguments, echo them, and run fine-tuning."""
    args = read_parameters()
    # Echo the configuration with the Hugging Face token masked so the secret
    # does not end up in the training job logs.
    printable = dict(vars(args))
    if printable.get("hf_token"):
        printable["hf_token"] = "***"
    print(argparse.Namespace(**printable))
    finetune_llm(args)


if __name__ == "__main__":
    main()


In [None]:
%%writefile code/smexperiments_callback.py
""" SageMaker Experiments callback implementation"""

import importlib
import logging
from transformers import TrainerCallback


# Raise the "sagemaker" logger threshold to CRITICAL to prevent a flood of WARNs;
# note this silences every record below CRITICAL on that logger (INFO, WARNING and ERROR).
logging.getLogger("sagemaker").setLevel(logging.CRITICAL)


def is_sagemaker_available():
    """Return True when the `sagemaker` package can be located (without importing it)."""
    # `import importlib` alone does not guarantee that the `util` submodule is
    # loaded; import it explicitly instead of relying on another module having done so.
    import importlib.util

    return importlib.util.find_spec("sagemaker") is not None


class SageMakerExperimentsCallback(TrainerCallback):
    """
    SageMaker Experiments Plus transformer callback.
    Designed to allow auto logging from transformer API.

    On construction the callback loads the current SageMaker Experiments run
    and remembers its experiment/run names; each event handler then re-opens
    that run to log parameters or metrics.
    """
    def __init__(
        self,
        region,
        _has_sagemaker_experiments=is_sagemaker_available()
    ):
        """
        Args:
            region: AWS region used to build the boto3/SageMaker session.
            _has_sagemaker_experiments: whether the `sagemaker` package is
                available (evaluated once, when the class is defined).

        Raises:
            ImportError: if `sagemaker` is not available.
        """
        # Explicit raise instead of `assert`, which is stripped under `python -O`.
        if not _has_sagemaker_experiments:
            raise ImportError(
                "SageMakerExperimentsCallback requires sagemaker to be install. Run 'pip install -U sagemaker'"
            )

        import boto3
        import sagemaker
        from sagemaker.experiments.run import load_run

        self.sagemaker_session = sagemaker.session.Session(
            boto3.session.Session(region_name=region)
        )
        self.local_load_run = load_run

        # epoch tracker
        self.last_epoch = None

        with load_run(sagemaker_session=self.sagemaker_session) as run:
            self.sm_experiments_run = run
            self.ctx_exp_name = run.experiment_name
            self.ctx_run_name = run.run_name

            print(f"[sm-callback] loaded sagemaker Experiment (name: {self.ctx_exp_name}) with run: {self.ctx_run_name}!")

    def _open_run(self):
        """Return the context manager for the run captured at construction time."""
        return self.local_load_run(
            experiment_name=self.ctx_exp_name,
            run_name=self.ctx_run_name,
            sagemaker_session=self.sagemaker_session
        )

    def on_init_end(self, args, state, control, **kwargs):
        """Log the scalar (str/int/float/bool) training arguments as run parameters."""
        print(f"[sm-callback] adding parameters to {self.ctx_exp_name}: {self.ctx_run_name}")

        with self._open_run() as ctx_run:
            ctx_run.log_parameters(
                {
                    # empty strings are logged as None
                    k: str(v) if str(v) else None
                        for k, v in vars(args).items()
                            if isinstance(v, (str, int, float, bool))
                }
            )

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Log every non-eval entry of `logs` as a `train/step:<key>` metric."""
        # `logs` defaults to None; there is nothing to record in that case
        if not logs:
            return

        with self._open_run() as ctx_run:
            for k, v in logs.items():
                if not k.startswith('eval'):
                    ctx_run.log_metric(
                        name=f"train/step:{k}",
                        value=v,
                        step=int(state.global_step)
                    )

    def on_epoch_end(self, args, state, control, logs=None, **kwargs):
        """
        On epoch end we average the training losses logged since the previous
        epoch boundary and log the average with the epoch number as the step.
        Nothing is logged when no training loss was recorded in that window.
        """
        epoch_history = state.log_history
        # No log rows yet (e.g. logging interval longer than the epoch)
        if not epoch_history:
            return

        if self.last_epoch is None:
            self.last_epoch = 0

        current_epoch = int(round(epoch_history[-1].get('epoch', state.epoch or 0)))

        print(f"[sm-callback] start: {self.last_epoch} ep to end: {current_epoch} ep!")

        # Only rows that carry a training loss take part in the average; rows
        # without one (such as evaluation rows) used to raise KeyError here.
        epoch_loss_values = {
            row['epoch']: row['loss']
            for row in epoch_history
            if 'loss' in row and 'epoch' in row
            and self.last_epoch < row['epoch'] <= current_epoch
        }

        # An empty window would otherwise divide by zero
        if epoch_loss_values:
            average_epoch_loss = sum(epoch_loss_values.values()) / len(epoch_loss_values)

            with self._open_run() as ctx_run:
                ctx_run.log_metric(
                    name="train/epoch:loss",
                    value=average_epoch_loss,
                    step=int(current_epoch)
                )

        self.last_epoch = current_epoch

    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        """
        After an evaluation, log the most recent `eval_loss` found in the
        trainer's log history as `final/eval:loss`.
        """
        # Search backwards rather than assuming the last row is the eval row
        eval_loss = next(
            (row["eval_loss"] for row in reversed(state.log_history) if "eval_loss" in row),
            None,
        )
        if eval_loss is None:
            print("[sm-callback] no eval_loss found in log history; nothing to log")
            return

        with self._open_run() as ctx_run:
            ctx_run.log_metric(
                name="final/eval:loss",
                value=eval_loss
            )

In [None]:
# create the Estimator
# Hyperparameters are handed to finetune_llm.py as command-line arguments, so
# each key must match an argument name defined in that script.
huggingface_estimator = HuggingFace(
    entry_point='finetune_llm.py',
    source_dir='code',
    instance_type='local_gpu',
    instance_count=1,
    role=DUMMY_IAM_ROLE,
    base_job_name=job_name,
    transformers_version='4.36',
    pytorch_version='2.1',
    py_version='py310',
    hyperparameters={
        'hf_token': hf_token,
        'model_id': model_id,
        'dataset_path': '/opt/ml/input/data/training',
        'epochs': 1,
        'per_device_train_batch_size': 2,
        # the script defines --learning_rate; the previous key 'lr' was not a
        # known argument, so the script kept its own default of 1e-5
        'learning_rate': 1e-4,
        'merge_weights':True,
        'region':region,
    },
    output_path='file://model/',
    sagemaker_session=LOCAL_SESSION
)

In [None]:
# Start the training job. The channels point at the local dataset folders saved
# earlier (file:// URIs); the S3 locations are left in the trailing comments as
# the alternative inputs.
data = {
    'training': "file://./train", #training_input_path, 
    'validation': "file://./validation", #validation_input_path
}
# wait=True is passed so the call blocks until the job finishes.
huggingface_estimator.fit(
    data, 
    wait=True
)