## Convert HF weights to Nemo format

To use pretrained model weights with NeMo, the Hugging Face (HF) weights must first be converted into a NeMo checkpoint. This notebook runs a SageMaker job that converts the HF pretrained weights into a NeMo checkpoint.

In [None]:
#retrive the docker image URL stored in step 1
%store -r docker_image 

use_fsx = False # set this to true and check other fsx parameters to use FSxL for the job

In [None]:
import sagemaker

# Bucket used for uploading data, models and logs. Leave it as None to fall
# back to the session's default bucket (SageMaker creates it automatically
# if it does not exist yet).
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    # No bucket name was given: use the session default.
    sagemaker_session_bucket = sess.default_bucket()

# Re-create the session so it is bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# IAM role used for the training job.
role = sagemaker.get_execution_role()

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")

In [None]:
# Retrive the FSX details from Store Magic 

if use_fsx:
    #retrive fsx details
    %store -r fsx_id
    %store -r sec_group
    %store -r private_subnet_id
    %store -r fsx_mount
    %store -r fsx_file_system_path
else:
    use_fsx = False

In [None]:
import os

# Hyperparameters passed to the conversion entry point.
hyperparameters = {
    # Hugging Face access token. Read from the HF_TOKEN environment variable
    # when it is set, so a real token does not have to be saved in the
    # notebook; otherwise replace the placeholder with your token from HF.
    "access_token": os.environ.get("HF_TOKEN", "hf_xxxxx"),
    "model_name": "meta-llama/Llama-2-7b-hf",
    # presumably the tensor-/pipeline-parallel degrees of the converted
    # checkpoint — confirm against the conversion script
    "tp_degree": 8,
    "pp_degree": 1,
}

In [None]:
# Choose where the converted weights are written: the S3-synced checkpoint
# directory, or an FSx for Lustre file system mounted as the "train" channel.
from sagemaker.inputs import FileSystemInput

if not use_fsx:
    # The SageMaker S3 checkpoint mechanism is used here because the job
    # needs read/write access to the paths.
    checkpoint_s3_uri = f"s3://{sagemaker_session_bucket}/nemo_llama_experiment"
    hyperparameters["output_path"] = "/opt/ml/checkpoints/llama7b_weights"
    hyperparameters["checkpoint-dir"] = "/opt/ml/checkpoints"
else:
    FS_ID = fsx_id  # FSx file system ID
    # Path in the file system that needs to be mounted.
    FS_BASE_PATH = "/".join(["", fsx_mount, fsx_file_system_path])
    SUBNET_ID = private_subnet_id  # subnet to launch SageMaker jobs in
    SEC_GRP = [sec_group]

    fsx_train_input = FileSystemInput(
        file_system_id=FS_ID,
        file_system_type="FSxLustre",
        directory_path=FS_BASE_PATH + "/nemo_llama",
        file_system_access_mode="rw",
    )
    data_channels = {"train": fsx_train_input}
    hyperparameters["output_path"] = "/opt/ml/input/data/train/llama7b_weights"

In [None]:
from sagemaker.pytorch import PyTorch

# Storage-dependent settings, resolved up front:
#  - FSx: the job is given a subnet and security group so it can access the
#    FSx resources in the VPC; no S3 checkpoint settings are passed.
#  - S3:  the local checkpoint directory and its S3 URI are passed instead.
if use_fsx:
    _vpc_subnets = [SUBNET_ID]
    _vpc_security_groups = SEC_GRP
    _ckpt_s3_uri = None
    _ckpt_local_path = None
else:
    _vpc_subnets = None
    _vpc_security_groups = None
    _ckpt_s3_uri = checkpoint_s3_uri
    _ckpt_local_path = hyperparameters["checkpoint-dir"]

# Need to check if this works on multinode with torchrun.
# NOTE(review): base_job_name says "data-prep" although this job converts a
# checkpoint — confirm whether the name is intentional.
estimator = PyTorch(
    entry_point="convert_hf_checkpoint_to_nemo.py",
    source_dir="./scripts",
    base_job_name="nemo-megatron-data-prep",
    image_uri=docker_image,
    role=role,
    sagemaker_session=sess,
    instance_type="ml.trn1.2xlarge",
    instance_count=1,
    volume_size=512,
    hyperparameters=hyperparameters,
    debugger_hook_config=False,
    disable_output_compression=True,
    checkpoint_s3_uri=_ckpt_s3_uri,
    checkpoint_local_path=_ckpt_local_path,
    subnets=_vpc_subnets,
    security_group_ids=_vpc_security_groups,
)

In [None]:
# Launch the job. Only FSx runs pass input channels; otherwise fit() is
# called with no arguments.
_fit_args = (data_channels,) if use_fsx else ()
estimator.fit(*_fit_args)

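When `use_fsx` is `False`, the above job stores the converted model under `checkpoint_s3_uri` in the S3 bucket configured above. When `use_fsx` is `True`, the converted model is written to the mounted FSx file system instead.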