In [None]:
!pip install transformers==4.38.1 datasets==2.17.1 peft==0.8.2 bitsandbytes==0.42.0 trl==0.7.11 --upgrade --quiet

In [None]:
from datasets import load_dataset
from random import randrange

# Fetch the train split of the Dolly 15k dataset from the Hugging Face Hub.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# To test the fine-tuning code locally, use this 20-sample subset instead:
# dataset = load_dataset("databricks/databricks-dolly-15k", split="train").select(range(20))

# Report how many records were loaded and show one of them, picked at random.
num_samples = len(dataset)
print(f"dataset size: {num_samples}")
print(dataset[randrange(num_samples)])

In [None]:
# Write the loaded dataset to ./dataset/dolly.hf, the path the training runs below read from.
dataset.save_to_disk(dataset_path="./dataset/dolly.hf")

### Local test with Python script

In [None]:
# Run the training script directly (no SageMaker involved) against the dataset
# saved at ./dataset/dolly.hf: one epoch, per-device train batch size 1, with
# ./results passed as both the model save path and the job output path.
!python ./training/train.py --dataset_path "./dataset/dolly.hf" --model_save_path "./results" --job_output_path "./results" --per_device_train_batch_size 1 --epochs 1

### Local test with SageMaker SDK

In [None]:
import sagemaker
import boto3
# S3 bucket the SageMaker session uses for uploading data, models and logs.
# Leave it as None to use the default bucket; SageMaker creates that bucket
# automatically if it does not exist yet.
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role the training job runs with.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    # Fallback when get_execution_role() raises ValueError:
    # fetch the role ARN from IAM using its hard-coded role name.
    iam = boto3.client("iam")
    role = iam.get_role(RoleName="sagemaker_execution_role")["Role"]["Arn"]

# Recreate the session, now bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# Hugging Face Hub id of the pre-trained model handed to the training job.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"

# Instance type used for the training job.
instance_type = "local_gpu"

# file:// URI of the local directory that holds the saved dataset.
training_input_path = "file://./dataset"

In [None]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# Define the base name of the training job from the model id.
# SageMaker training job names may only contain alphanumeric characters and
# hyphens, so every other character in the model id (the "/" separator, but
# also e.g. the "." in "v0.1" or an "_") is replaced with "-".
job_name = "huggingface-qlora-" + "".join(
    ch if (ch.isascii() and ch.isalnum()) else "-" for ch in model_id
).lower()

# Hyperparameters that are passed into the training job.
hyperparameters = dict(
    model_id=model_id,                                    # pre-trained model
    dataset_path="/opt/ml/input/data/training/dolly.hf",  # path where SageMaker will save the training dataset
    epochs=1,                                             # number of training epochs
    per_device_train_batch_size=1,                        # batch size for training
    lr=2e-4,                                              # learning rate used during training
)
# Metric definitions: one entry per logged key, each with a regex that captures
# the number following `'<key>':` in the training log output.
metric = [
    {"Name": metric_name, "Regex": rf"'{metric_name}':\s*([0-9.]+)"}
    for metric_name in ("loss", "epoch")
]
# Create the Hugging Face estimator that describes the training job.
huggingface_estimator = HuggingFace(
    # --- what to run ---
    entry_point="train.py",                # training script
    source_dir="training",                 # directory that includes all files needed for training
    hyperparameters=hyperparameters,       # hyperparameters passed to the training job
    metric_definitions=metric,             # regexes used to extract metrics from the logs
    environment={"HUGGINGFACE_HUB_CACHE": "/opt/ml/.cache"},  # cache Hub downloads under /opt/ml/.cache
    # --- where to run ---
    instance_type=instance_type,           # instance type used for the training job
    instance_count=1,                      # number of instances used for training
    volume_size=300,                       # size of the EBS volume in GB
    role=role,                             # IAM role used in the training job to access AWS resources, e.g. S3
    base_job_name=job_name,                # base name of the training job
    # --- container versions ---
    transformers_version="4.28",           # transformers version used in the training job
    pytorch_version="2.0",                 # PyTorch version used in the training job
    py_version="py310",                    # Python version used in the training job
)

In [None]:
# Define the data input dictionary: the "training" channel points at the dataset location.
data = dict(training=training_input_path)

# Start the training job with that input and wait for it to complete.
huggingface_estimator.fit(inputs=data, wait=True)