# Fine-tune Llama-7B with Dolly dataset

In [25]:
%pip install -U sagemaker -q

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [26]:
import logging 
sagemaker_config_logger = logging.getLogger("sagemaker.config") 
sagemaker_config_logger.setLevel(logging.WARNING)
from sagemaker import get_execution_role, Session
from sagemaker.pytorch import PyTorch

sess = Session()
default_bucket = sess.default_bucket()

In [28]:
## Upload Dolly Dataset to S3 bucket
from sagemaker.s3 import S3Uploader
import sagemaker
import random

output_bucket = sagemaker.Session().default_bucket()
dataset_file = f"dolly-dataset.json"
s3_location = f"s3://{output_bucket}/dolly_dataset"
S3Uploader.upload(dataset_file, s3_location)
print(f"Training data is located here: {s3_location}")

Training data is located here: s3://sagemaker-us-west-2-495365983931/dolly_dataset


In [29]:
##Define the training container
training_image = "763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-training-neuronx:1.13.1-neuronx-py310-sdk2.15.0-ubuntu20.04"

In [30]:
# Pytorch Estimator
pt_estimator = PyTorch(
        entry_point="run_clm.py",
        role=get_execution_role(),
        sagemaker_session=sess,
        source_dir='./',
        instance_count=1,
        instance_type="ml.trn1.32xlarge",
        framework_version='1.13.1',
        py_version='py310',
        image_uri=training_image,
        disable_profiler=True,
        output_path=f"s3://{default_bucket}/llama-finetuning",
        base_job_name="llama-trn1",
        code_bucket=f"s3://{default_bucket}/llama-finetuning_code",
        checkpoint_s3_uri=f"s3://{default_bucket}/llama-finetuning_output",
        distribution={"torch_distributed": {"enabled": True} },  # Required for torchrun-based job launch
        environment={ "FI_EFA_FORK_SAFE": "1", "FI_EFA_USE_DEVICE_RDMA": "1", "FI_PROVIDER": "efa"},
        disable_output_compression=True,
        hyperparameters={
            "model_name_or_path": "/opt/ml/input/data/model",
            "per_device_train_batch_size": 1,
            "train_file": "/opt/ml/input/data/train/processed-train-information_extraction.json",
            "do_train": "",
            "max_steps": 25,
            "block_size": 150,
            "tensor_parallel_size": 8,
            "output_dir": "/opt/ml/model",
            "gradient_accumulation_steps": 8,
            "logging_steps": 5,
            "bf16": True,
            "disable_tqdm": True,
            "learning_rate": 0.0001,
            "num_train_epochs": 5
        }
    )

In [31]:
## Provide your model S3 url here
model_uri=f"s3://sagemaker-us-west-2-495365983931/llama-2-7b/"
train_uri=f"s3://sagemaker-us-west-2-495365983931/dolly_dataset_trn1/"
print(f"{model_uri}")

s3://sagemaker-us-west-2-495365983931/llama-2-7b/


In [32]:
pt_estimator.fit({"model": model_uri, "train": train_uri}, wait=False, logs=False)

INFO:sagemaker:Creating training-job with name: llama-trn1-2023-12-15-19-48-15-133


Using provided s3_resource


In [33]:
model_archive_path = pt_estimator.jobs[-1].describe()['ModelArtifacts']['S3ModelArtifacts']
print(f"Your fine-tuned model is available here:\n\n{model_archive_path}")

KeyError: 'ModelArtifacts'