In [None]:
!pip install "sagemaker>=2.140.0" "transformers==4.26.1" "datasets[s3]==2.10.1" --upgrade

In [None]:
import sagemaker
import boto3
from datasets import load_dataset
from sagemaker.huggingface import HuggingFace

In [None]:

# Bootstrap session: only needed to look up the account's default bucket.
sess = sagemaker.Session()

# Bucket used for uploading data, models and logs.
# Leave as None to use the session default (SageMaker creates it if missing).
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the IAM role. When `get_execution_role` raises ValueError
# (presumably when running outside a SageMaker-managed environment),
# fall back to looking up a role named 'sagemaker_execution_role' via IAM.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client("iam")
    _role_info = iam.get_role(RoleName="sagemaker_execution_role")
    role = _role_info["Role"]["Arn"]

# Final session, pinned to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Summarize the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# Key prefix inside the session's default bucket for this project's dataset.
s3_prefix = "phi3-dataset"
# Full S3 URI of the `train` folder under that prefix.
dataset_cache = "s3://{}/{}/train".format(sess.default_bucket(), s3_prefix)

In [None]:

base_job_name = 'phi3-finetuning'
# enables managed spot training; set to False to train on on-demand instances
use_spot_instances = True
# max time (seconds) including spot start + training time; must be >= max_run
max_wait = 7200
# expected training time (seconds)
max_run = 4000
# S3 location that checkpoints are synced to
checkpoint_s3_uri = f's3://{sess.default_bucket()}/{base_job_name}/checkpoints'
# hyperparameters, which are passed into the training job
hyperparameters = {
    'dataset_dir': dataset_cache,
    # NOTE(review): this hands the training script an S3 URI as its output
    # directory. SageMaker syncs the container's local checkpoint directory to
    # `checkpoint_s3_uri`, so scripts usually write to that local path instead.
    # Confirm that scripts/train.py really expects an s3:// path here.
    'output_dir': checkpoint_s3_uri,
}

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='./scripts',
    instance_type='ml.p3.2xlarge',
    instance_count=1,
    base_job_name=base_job_name,
    role=role,
    transformers_version='4.26.0',
    pytorch_version='1.13.1',
    py_version='py39',
    hyperparameters=hyperparameters,
    # honor the flag defined above instead of hard-coding True
    use_spot_instances=use_spot_instances,
    # max_wait only applies to spot training; omit it for on-demand jobs
    max_wait=max_wait if use_spot_instances else None,
    max_run=max_run,
    checkpoint_s3_uri=checkpoint_s3_uri,
)

In [None]:
# Start the training job defined by `huggingface_estimator`.
# No input channels are passed here; the dataset location reaches the
# training script through the `dataset_dir` hyperparameter instead.
huggingface_estimator.fit()