In [None]:
!pip install transformers==4.38.1 datasets==2.17.1 peft==0.8.2 bitsandbytes==0.42.0 trl==0.7.11 --upgrade --quiet

This notebook has been tested on an Amazon SageMaker notebook instance with a single GPU (ml.g5.2xlarge).

In [45]:
from datasets import load_dataset

from random import randrange

# Fetch the train split of the Dolly 15k dataset from the hub.
dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

# To try the fine-tuning code locally on only 20 samples, use this line instead:
# dataset = load_dataset("databricks/databricks-dolly-15k", split="train").select(range(20))

# Report how many records were loaded, then show one record picked at random.
num_records = len(dataset)
print(f"dataset size: {num_records}")
random_index = randrange(num_records)
print(dataset[random_index])


dataset size: 15011
{'instruction': 'Given this paragraph, which highs school did Drake Maye attend?', 'context': 'Drake Maye was born on August 30, 2002, in Charlotte, North Carolina. He attended and played high school football for Myers Park High School in Charlotte, where he was named MaxPreps North Carolina player of the year. He was a four-star prospect and originally committed to Alabama before flipping to North Carolina.', 'response': 'Based on this text, Drake Maye attended Myers Park High School in Charlotte, North Carolina.', 'category': 'closed_qa'}


In [37]:
# Write the dataset to a local directory; a later cell uploads this directory to S3.
local_path = "./dataset/dolly.hf"
dataset.save_to_disk(dataset_path=local_path)

Saving the dataset (0/1 shards):   0%|          | 0/15011 [00:00<?, ? examples/s]

In [38]:
import sagemaker
import boto3
# Bucket used for uploading data, models and logs. Leave as None to use the
# session's default bucket (SageMaker creates it automatically if it does not exist).
sagemaker_session_bucket = None

# A provisional session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role for the training job. If get_execution_role() raises
# ValueError, fall back to looking up the role named 'sagemaker_execution_role'.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    role_info = iam.get_role(RoleName="sagemaker_execution_role")
    role = role_info["Role"]["Arn"]

# Re-create the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::70768*******:role/service-role/AmazonSageMaker-ExecutionRole-20191024T163188
sagemaker session region: us-east-1


In [39]:
# Upload the locally saved training dataset directory to S3.

s3_data_prefix = "train/data/dolly.hf"  # S3 key prefix under which the dataset files are stored
bucket = sagemaker_session_bucket  # bucket to house artifacts
training_input_path = sess.upload_data(path=local_path, bucket=bucket, key_prefix=s3_data_prefix)
# Plain ">" in the message: the previous "&gt;" was an HTML entity that printed literally.
print(f"training dataset uploaded to --- > {training_input_path}")

training dataset uploaded to --- &gt; s3://sagemaker-us-east-1-70768*******/train/data/dolly.hf


### Training with SageMaker

In [40]:
# Identifier of the pre-trained model handed to the training script.
model_id = "mistralai/Mistral-7B-Instruct-v0.1"
# Instance type on which the training job runs.
instance_type = "ml.g5.4xlarge"

In [41]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# Define the training job name. SageMaker job names may only contain ASCII
# alphanumerics and hyphens, so every other character of the model id
# (e.g. "/" and ".") is mapped to "-" before it is used as the base name.
sanitized_model_id = "".join(
    ch if (ch.isascii() and ch.isalnum()) else "-" for ch in model_id
).lower()
job_name = f"huggingface-qlora-{sanitized_model_id}"

# Hyperparameters, which are passed into the training job.
hyperparameters = {
    'model_id': model_id,                                     # pre-trained model
    'dataset_path': '/opt/ml/input/data/training/dolly.hf',   # path where SageMaker places the training dataset
    'epochs': 1,                                              # number of training epochs
    'per_device_train_batch_size': 1,                         # batch size for training
    'lr': 2e-5,                                               # learning rate used during training
}

# Regular expressions that extract the 'loss' and 'epoch' values from the training logs.
metric = [
    {"Name": "loss", "Regex": r"'loss':\s*([0-9.]+)"},
    {"Name": "epoch", "Regex": r"'epoch':\s*([0-9.]+)"},
]

# Create the Estimator.
# NOTE(review): the container is pinned to transformers 4.28 while the notebook
# installs transformers==4.38.1 locally; confirm that the training code in
# source_dir installs the version it needs.
huggingface_estimator = HuggingFace(
    entry_point          = 'train.py',        # train script
    source_dir           = 'training',        # directory which includes all the files needed for training
    metric_definitions   = metric,            # metrics parsed from the job logs
    instance_type        = instance_type,     # instance type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # base name of the training job
    role                 = role,              # IAM role used in the training job to access AWS resources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.28',            # the transformers version used in the training job
    pytorch_version      = '2.0',             # the pytorch version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      = hyperparameters,   # the hyperparameters passed to the training job
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
)

In [42]:
# Define the data input dictionary for the training job.
# The 'training' channel must point at the S3 prefix that CONTAINS the uploaded
# "dolly.hf" directory, because the 'dataset_path' hyperparameter expects the
# dataset at /opt/ml/input/data/training/dolly.hf. The URI is derived from the
# bucket and key prefix used for the upload instead of a hard-coded,
# account-specific string.
channel_prefix = s3_data_prefix.rsplit("/", 1)[0]
training_input_path = f"s3://{bucket}/{channel_prefix}"
data = {'training': training_input_path}

# Start the training job with the uploaded dataset as input.
# wait=False returns immediately instead of blocking until the job finishes.
huggingface_estimator.fit(data, wait=False)

INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: huggingface-qlora-mistralai-mistral-7b--2024-03-04-07-40-17-256


### Download the model weights from the SageMaker training job

In [26]:
# Name of the training job whose output archive should be fetched.
from sagemaker.s3 import S3Downloader

training_job_name = 'huggingface-qlora-mistralai-mistral-7b--2024-03-04-07-40-17-256'
print(sagemaker_session_bucket)

# S3 location of the job's model archive inside the session bucket.
key = f'{training_job_name}/output/model.tar.gz'
model_artifact_uri = f's3://{bucket}/{key}'

# Fetch the archive into a local results directory.
# NOTE: this rebinds `local_path`, which earlier held the dataset directory;
# re-running the S3 upload cell after this one would upload this directory instead.
local_path = './results/training_job/'
S3Downloader.download(s3_uri=model_artifact_uri, local_path=local_path)

sagemaker-us-east-1-70768********


['./results/training_job/model.tar.gz']

### The deployable model artifact in Hugging Face safetensors format

In [30]:
import os
import tarfile

# Path of the downloaded archive. os.path.join works whether or not
# local_path ends with a path separator.
tar_gz_file = os.path.join(local_path, "model.tar.gz")

# Extract the archive into the directory it was downloaded to.
with tarfile.open(tar_gz_file, 'r:gz') as tar:
    if hasattr(tarfile, "data_filter"):
        # Extraction filters are available: use the 'data' filter so archive
        # members cannot be written outside the destination directory.
        tar.extractall(local_path, filter="data")
    else:
        # Older Python without extraction filters: plain extraction.
        tar.extractall(local_path)