## Fine-tune Llama with DeepSpeed and SageMaker

In [None]:
!pip install "transformers" "datasets[s3]" "sagemaker" --upgrade --quiet


In [None]:
import sagemaker
import boto3
# S3 bucket used for uploading data, models and logs.
# Leave as None to fall back to the session's default bucket, which
# SageMaker creates automatically if it does not exist yet.
sagemaker_session_bucket = None

sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role ARN; when get_execution_role() raises
# ValueError, look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Re-create the session so that it is bound to the chosen bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

# Echo the resolved configuration.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")


## 2. Load and prepare the dataset

As the base dataset, we will use the [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset, but before fine-tuning the model, we need to preprocess the data. The code below tokenizes the dataset's prompt-formatted `text` field as-is; it does not add `<user>`/`<bot>` tokens or an explicit end-of-sequence `<|endoftext|>` token between examples. Additionally, we create chunks of `4096` tokens ([model max length](https://huggingface.co/meta-llama/Llama-2-13b-chat-hf)) to avoid unnecessary padding and computing.

The first step is to load our dataset from Hugging Face.

In [None]:
import os

# Hugging Face access token used to download the tokenizer/model.
# Prefer exporting HF_TOKEN in the environment over pasting a real token
# into the notebook; the placeholder is kept as the fallback value.
access_token = os.environ.get("HF_TOKEN") or "hf_XXXXXX"

# Model id from huggingface.co/models that will be fine-tuned.
model_id = "meta-llama/Llama-2-13b-chat-hf"

# Dataset id from huggingface.co/datasets used for fine-tuning.
dataset_name = "tatsu-lab/alpaca"

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer

from huggingface_hub.hf_api import HfFolder

# Store the access token through HfFolder before talking to the Hub.
HfFolder.save_token(access_token)

# Load the tokenizer that belongs to the model being fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)

# Load dataset from huggingface.co
dataset = load_dataset(dataset_name)

# Shuffle every split with a fixed seed so runs are reproducible.
# NOTE: no downsampling happens here; the full dataset is kept.
dataset = dataset.shuffle(seed=42)

#### Split dataset into Train and Valid.


In [None]:
# When the dataset ships without a validation split, hold out 5% of the
# training data for validation.
if "validation" not in dataset:
    # Split the data that is already loaded (and shuffled) instead of
    # calling load_dataset() again: reloading would replace the shuffled
    # "train" split with the unshuffled original and load the dataset two
    # extra times. The fixed seed keeps the split reproducible.
    holdout = dataset["train"].train_test_split(test_size=0.05, seed=42)
    dataset["train"] = holdout["train"]
    dataset["validation"] = holdout["test"]

The [Alpaca](https://huggingface.co/datasets/tatsu-lab/alpaca) dataset contains 4 fields: `instruction`, `input`, `output` and `text`. We will use the `text` field, as it combines the other 3 fields in a template that can be used for training.

In [None]:

from itertools import chain
from functools import partial

def group_texts(examples, block_size=4096):
    """Pack a batch of tokenized examples into blocks of ``block_size`` items.

    Every column of ``examples`` is flattened into one long sequence and
    then cut into consecutive slices of ``block_size`` items. When at least
    one full block is available, the trailing partial block is dropped;
    when the whole batch is shorter than ``block_size``, it is kept as a
    single short block.

    Args:
        examples: Mapping from column name to a list of sequences. It must
            contain an ``input_ids`` column.
        block_size: Number of items per block.

    Returns:
        A dict with the same columns, each holding the list of blocks, plus
        a ``labels`` column that is a shallow copy of the ``input_ids``
        blocks.
    """
    # Flatten each column into a single sequence.
    flattened = {}
    for column in examples.keys():
        flattened[column] = list(chain.from_iterable(examples[column]))

    # All columns are assumed to be equally long, so the first one is
    # used to determine the length.
    usable_length = len(flattened[list(examples.keys())[0]])
    if usable_length >= block_size:
        # Trim to a whole number of blocks.
        usable_length -= usable_length % block_size

    # Slice every column at the same block boundaries.
    block_starts = range(0, usable_length, block_size)
    result = {}
    for column, sequence in flattened.items():
        result[column] = [sequence[start : start + block_size] for start in block_starts]

    result["labels"] = list(result["input_ids"])
    return result

def _tokenize(sample):
    """Tokenize the ``text`` column of a batch, without token type ids."""
    return tokenizer(sample["text"], return_token_type_ids=False)


# Columns of the raw dataset; they are removed once the text is tokenized.
column_names = dataset["train"].column_names

# Step 1: tokenize the text and keep only the tokenizer outputs.
tokenized_dataset = dataset.map(
    _tokenize,
    batched=True,
    remove_columns=list(column_names),
)

# Step 2: pack the tokenized examples into blocks of 4096 tokens.
pack_into_blocks = partial(group_texts, block_size=4096)
lm_dataset = tokenized_dataset.map(pack_into_blocks, batched=True)

After processing the datasets, we use the [FileSystem integration](https://huggingface.co/docs/datasets/filesystems) to upload our dataset to S3. We are using `sess.default_bucket()`; adjust this if you want to store the dataset in a different S3 bucket. We will use the S3 path later in our training script.

In [None]:

# S3 prefix inside the session bucket that receives the processed dataset.
bucket_name = sess.default_bucket()
training_input_path = f's3://{bucket_name}/processed/data/'
print(f"training dataset to: {training_input_path}")

# save the processed dataset to s3
lm_dataset.save_to_disk(training_input_path)
print(f"uploaded data to: {training_input_path}")

### Fine-tune the Llama model using DeepSpeed on SageMaker

In [None]:
import time
from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch import PyTorch

# Timestamped name, used below as the base name of the training job.
job_name = f'huggingface-deepspeed-{time.strftime("%Y-%m-%d-%H-%M-%S", time.localtime())}'


# hyperparameters, which are passed into the training job
# NOTE(review): 'access_token' travels as a plain hyperparameter and is
# therefore stored with the training job configuration; consider a secrets
# mechanism if that is a concern.
hyperparameters = {
    'model_id': model_id,  # model id from huggingface.co/models
    'train_dataset_path': '/opt/ml/input/data/train',  # path where sagemaker will save training dataset
    'per_device_train_batch_size': 1,  # batch size for training
    'per_device_eval_batch_size': 1,  # batch size for evaluation
    'epochs': 1,  # number of epochs to train
    'max_steps': 100,
    'deepspeed': 'dsconfig.json',
    'optimizer': "adamw_torch",  # optimizer
    'access_token': access_token,
}

# These environment variables are useful when training on P4d instances
# in order to enable EFA-based training.
env = {
    'FI_PROVIDER': 'efa',
    'NCCL_PROTO': 'simple',
    'FI_EFA_USE_DEVICE_RDMA': '1',
    'RDMAV_FORK_SAFE': '1',
    'NCCL_DEBUG': 'INFO',
}

# estimator
huggingface_estimator = HuggingFace(
    entry_point='train.py',
    source_dir='./scripts',
    instance_type="ml.p4d.24xlarge",
    instance_count=2,
    role=role,
    # The estimator constructor takes `base_job_name`; a `job_name` keyword
    # is not one of its parameters and would be silently ignored, leaving
    # the timestamped name above unused.
    base_job_name=job_name,
    transformers_version='4.28.1',
    pytorch_version='2.0.0',
    py_version="py310",
    environment=env,
    hyperparameters=hyperparameters,
    disable_output_compression=True,
    distribution={"torch_distributed": {"enabled": True}},  # enable torchrun
)

In [None]:
# Input channels for the job: channel name -> S3 URI of the uploaded data.
data = {"train": training_input_path}

# Start the training job on the uploaded dataset and wait for it to finish.
huggingface_estimator.fit(inputs=data, wait=True)