In [1]:
!pip install "transformers==4.31.0" "datasets[s3]==2.13.0" sagemaker --upgrade --quiet

In [1]:
%env AWS_PROFILE=dev-admin
%env AWS_REGION=us-east-1
%env HF_HOME=~/.cache/huggingface
%env TOKENIZERS_PARALLELISM=fale

env: AWS_PROFILE=dev-admin
env: AWS_REGION=us-east-1
env: HF_HOME=~/.cache/huggingface
env: TOKENIZERS_PARALLELISM=fale


In [11]:
from scripts.aws_init import init_sagemaker

sagemaker_session_bucket = "sagemaker-ms-thesis-llm"
# role = "arn:aws:iam::171706357329:role/service-role/SageMaker-ComputeAdmin"
role = "arn:aws:iam::171706357329:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole"

sess = init_sagemaker(sagemaker_session_bucket)

INFO:botocore.tokens:Loading cached SSO token for slu-sso


sagemaker bucket: sagemaker-ms-thesis-llm
sagemaker session region: us-east-1


In [12]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf" # sharded weights
tokenizer = AutoTokenizer.from_pretrained(model_id,use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token

## 3. Fine-Tune LLaMA 13B with QLoRA on Amazon SageMaker

We are going to use the recently introduced method in the paper "[QLoRA: Quantization-aware Low-Rank Adapter Tuning for Language Generation](https://arxiv.org/abs/2106.09685)" by Tim Dettmers et al. QLoRA is a new technique to reduce the memory footprint of large language models during finetuning, without sacrificing performance. The TL;DR; of how QLoRA works is: 

* Quantize the pretrained model to 4 bits and freezing it.
* Attach small, trainable adapter layers. (LoRA)
* Finetune only the adapter layers, while using the frozen quantized model for context.

We prepared a [run_clm.py](./scripts/run_clm.py), which implements QLora using PEFT to train our model. The script also merges the LoRA weights into the model weights after training. That way you can use the model as a normal model without any additional code. The model will be temporally offloaded to disk, if it is too large to fit into memory.

In order to create a sagemaker training job we need an `HuggingFace` Estimator. The Estimator handles end-to-end Amazon SageMaker training and deployment tasks. The Estimator manages the infrastructure use. 
SagMaker takes care of starting and managing all the required ec2 instances for us, provides the correct huggingface container, uploads the provided scripts and downloads the data from our S3 bucket into the container at `/opt/ml/input/data`. Then, it starts the training job by running.

### Harwarde requirements

We also ran several experiments to determine, which instance type can be used for the different model sizes. The following table shows the results of our experiments. The table shows the instance type, model size, context length, and max batch size. 

| Model        | Instance Type     | Max Batch Size | Context Length |
|--------------|-------------------|----------------|----------------|
| [LLama 7B]() | `(ml.)g5.4xlarge` | `3`            | `2048`         |
| [LLama 13B]() | `(ml.)g5.4xlarge` | `2`            | `2048`         |
| [LLama 70B]() | `(ml.)p4d.24xlarge` | `1++` (need to test more configs)            | `2048`         |


> You can also use `g5.2xlarge` instead of the `g5.4xlarge` instance type, but then it is not possible to use `merge_weights` parameter, since to merge the LoRA weights into the model weights, the model needs to fit into memory. But you could save the adapter weights and merge them using [merge_adapter_weights.py](./scripts/merge_adapter_weights.py) after training.

_Note: We plan to extend this list in the future. feel free to contribute your setup!_

In [13]:
# define Training Job Name 
job_name = f'goatOrig-QLORA-test'
model_output_path = f's3://{sagemaker_session_bucket}/models'

use_spot_instances = True # wether to use spot instances or not
max_wait = 400000 # max time including spot start + training time
max_run = 175000 # max expected training time
checkpoint_s3_uri = f's3://{sagemaker_session_bucket}/checkpoints/{job_name}-9.7'

In [14]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# hyperparameters, which are passed into the training job
hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  # 'train_dataset_path': '/opt/ml/input/data/training',    # path where sagemaker will save training dataset
  # 'val_dataset_path': '/opt/ml/input/data/validation',    # path where sagemaker will save validation dataset
  'dataset': "abeiler/GOAT",
  'epochs': 1,                                      # number of training epochs
  'per_device_train_batch_size': 4,                 # batch size for training
  'lr': 1e-4,                                       # learning rate used during training
  'hf_token': HfFolder.get_token(),                 # huggingface token to access llama 2
  'merge_weights': True,                            # wether to merge LoRA into the model (needs more memory)
  'lora_r': 64,
  'lora_alpha': 16,
  'lora_dropout': 0.1,
  'output_data_path': '/opt/ml/output',
  'push_to_hub': True,                            # Defines if we want to push the model to the hub
  'hub_model_id': job_name, # The model id of the model to push to the hub
  'hub_strategy': 'every_save',                   # The strategy to use when pushing the model to the hub
  'hub_token': HfFolder.get_token()   
}

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'run_clm.py',      # train script
    source_dir           = 'phil-examples',         # directory which includes all the files needed for training
    instance_type        = 'ml.g5.4xlarge',   # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.28',            # the transformers version used in the training job
    pytorch_version      = '2.0',             # the pytorch_version version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      =  hyperparameters,  # the hyperparameters passed to the training job
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
    output_path          = f"{model_output_path}/",
    code_location        = model_output_path,
    # use_spot_instances   = use_spot_instances,# wether to use spot instances or not
    # max_wait             = max_wait,          # max time including spot start + training time
    # max_run              = max_run,           # max expected training time
    # checkpoint_s3_uri    = checkpoint_s3_uri
)

In [15]:
print(model_id)
print(job_name)


meta-llama/Llama-2-7b-hf
goatOrig-QLORA-test


We can now start our training job, with the `.fit()` method passing our S3 path to the training script.

In [17]:

import telegram
import asyncio

api_key = '***REMOVED***'
usr_id = '***REMOVED***'

ver = "v10"
training_input_path = f's3://{sess.default_bucket()}/datasets/goat/{ver}/training'
validation_input_path = f's3://{sess.default_bucket()}/datasets/goat/{ver}/validation'

# define a data input dictonary with our uploaded s3 uris
data = {'training': training_input_path,
        'validation': validation_input_path}

# starting the train job with our uploaded datasets as input
try:
        huggingface_estimator.fit(data, wait=True)
        msg = 'SageMaker Training Finished!'
except:
        msg = 'SageMaker Training Finished with Error'
finally:
        bot = telegram.Bot(token=api_key)
        async with bot:
                await bot.send_message(chat_id=usr_id, text=msg)

Using provided s3_resource


INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: goatOrig-QLORA-test-2023-09-18-01-53-22-542


2023-09-18 01:53:23 Starting - Starting the training job...
2023-09-18 01:53:39 Starting - Preparing the instances for training......
2023-09-18 01:54:37 Downloading - Downloading input data...
2023-09-18 01:55:02 Training - Downloading the training image...........................
2023-09-18 02:00:03 Training - Training image download completed. Training in progress........bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-09-18 02:01:01,616 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-09-18 02:01:01,629 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2023-09-18 02:01:01,638 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-09-18 02:01:01,639 sagemaker_pytorch_container.training INFO     Invoking user training script.
2023-09-18 02:01:02,979 sagemaker-training-toolkit INFO     Installin

In [21]:
import json
number_of_gpu = 1
config = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048), # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),
  # 'HF_MODEL_QUANTIZE': "bitsandbytes",# Comment in to quantize
# 
}
image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04-v1.0"

huggingface_estimator.create_model(
    role=role,
    image_uri=image_uri,
    env=config
    )

<sagemaker.huggingface.model.HuggingFaceModel at 0x1541e4160>

In [None]:
import telegram
import asyncio

api_key = '***REMOVED***'
usr_id = '***REMOVED***'

bot = telegram.Bot(token=api_key)
async with bot:
        await bot.send_message(chat_id=usr_id, text='SageMaker Training Finished!')

INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/getMe "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/sendMessage "HTTP/1.1 200 OK"


In our example for LLaMA 13B, the SageMaker training job took `31728 seconds`, which is about `8.8 hours`. The ml.g5.4xlarge instance we used costs `$2.03 per hour` for on-demand usage. As a result, the total cost for training our fine-tuned LLaMa 2 model was only ~`$18`.

## Next Steps 

You can deploy your fine-tuned LLaMA model to a SageMaker endpoint and use it for inference. Check out the [Deploy Falcon 7B & 40B on Amazon SageMaker](https://www.philschmid.de/sagemaker-falcon-llm) and [Securely deploy LLMs inside VPCs with Hugging Face and Amazon SageMaker](https://www.philschmid.de/sagemaker-llm-vpc) for more details.

### Pull Model from S3 

In [26]:
import json
from sagemaker.huggingface import HuggingFaceModel
from huggingface_hub import HfFolder

s3_model_uri = "s3://sagemaker-ms-thesis-llm/models/goatV10-testData-withAutoInference-with-2023-09-02-11-38-37-525/output/model.tar.gz"
image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04-v1.0"

# sagemaker config
instance_type = "ml.g5.4xlarge"
number_of_gpu = 1
health_check_timeout = 300

# Define Model and Endpoint configuration parameter
config = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048), # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),
  # 'HF_MODEL_QUANTIZE': "bitsandbytes",# Comment in to quantize
# 
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
    role=role,
    image_uri=image_uri,
    model_data=s3_model_uri,
    env=config,
    # source_dir="GOAT/code/",
    # entry_point="inference.py"
)

print(llm_model)


<sagemaker.huggingface.model.HuggingFaceModel object at 0x155132770>


In [27]:
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

INFO:botocore.tokens:Loading cached SSO token for slu-sso
INFO:sagemaker:Creating model with name: huggingface-pytorch-tgi-inference-2023-09-02-11-49-34-978
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-tgi-inference-2023-09-02-11-49-35-809
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-tgi-inference-2023-09-02-11-49-35-809


------------------*

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-09-02-11-49-35-809: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

In [113]:
import telegram
import asyncio

api_key = '***REMOVED***'
usr_id = '***REMOVED***'

bot = telegram.Bot(token=api_key)
async with bot:
        await bot.send_message(chat_id=usr_id, text='SageMaker Model Deploy Finished!')

INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/getMe "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/sendMessage "HTTP/1.1 200 OK"


### Build Input Data for Inference

#### Simple String Input

In [58]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"  # Allow batched inference

In [119]:
data = {
   "inputs": "<s>[INST] <<SYS>>\n\n You are a helpful math assistant\n\n<<SYS>>\n\n10 + 6 = \n[/INST]<s>"
   # "inputs": """You are a helpful AI assistant who responds to question simple and straightforward questions.
   # Question: What is the Capital of California?"""
}

In [108]:
data = {
    "prompt": "[INST] What is the capital of CA? [/INST]",
    "system_prompt": "You are a helpful assistant"
}

In [120]:
payload = {
  "inputs":  json.dumps(data),
  "parameters": {
    # "do_sample": True,
    "top_p": 0.6,
    "temperature": 0.7,
    "top_k": 50,
    "max_new_tokens": 512,
    "repetition_penalty": 1.1,
    # "stop": ["</s>"]
  }
}

# send request to endpoint
response = llm.predict(payload)

print(response[0]["generated_text"])




{
"outputs": " <<SYS>>\n\n You are a helpful math assistant\n\n<<SYS>>\n\n10 + 6 = 16"
}



## Load weights and merge

In [33]:
import json
from sagemaker.huggingface import HuggingFaceModel
from huggingface_hub import HfFolder

s3_model_uri = "s3://sagemaker-ms-thesis-llm/checkpoints/huggingface-qlora-goatV8-testData"
image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04-v1.0"

# sagemaker config
instance_type = "ml.g5.4xlarge"
number_of_gpu = 1
health_check_timeout = 300

# Define Model and Endpoint configuration parameter
config = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048), # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),
  # 'HF_MODEL_QUANTIZE': "bitsandbytes",# Comment in to quantize
# 
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
    role=role,
    image_uri=image_uri,
    model_data=s3_model_uri,
    env=config
)

print(llm_model)


<sagemaker.huggingface.model.HuggingFaceModel object at 0x157278400>


In [34]:
# from peft import AutoPeftModelForCausalLM

# model = AutoPeftModelForCausalLM.from_pretrained(
#             output_dir,
#             low_cpu_mem_usage=True,
#             device_map="auto", # Added from 28_train_llms_with_qlora example
#             torch_dtype=torch.float16,
#             trust_remote_code=True, # Added from 28_train_llms_with_qlora example
#         )  
        # Merge LoRA and base model and save
model = llm_model.merge_and_unload()
print(model)

AttributeError: 'HuggingFaceModel' object has no attribute 'merge_and_unload'