In [1]:
!pip install "transformers==4.31.0" "datasets[s3]==2.13.0" sagemaker --upgrade --quiet

In [1]:
%env AWS_PROFILE=dev-admin
%env AWS_REGION=us-east-1
%env HF_HOME=~/.cache/huggingface
%env TOKENIZERS_PARALLELISM=false

env: AWS_PROFILE=dev-admin
env: AWS_REGION=us-east-1
env: HF_HOME=~/.cache/huggingface
env: TOKENIZERS_PARALLELISM=fale


In [37]:
from scripts.aws_init import init_sagemaker

sagemaker_session_bucket = "sagemaker-ms-thesis-llm"
# role = "arn:aws:iam::171706357329:role/service-role/SageMaker-ComputeAdmin"
role = "arn:aws:iam::171706357329:role/service-role/AmazonSageMakerServiceCatalogProductsExecutionRole"

sess = init_sagemaker(sagemaker_session_bucket)

INFO:botocore.tokens:Loading cached SSO token for slu-sso


sagemaker bucket: sagemaker-ms-thesis-llm
sagemaker session region: us-east-1


In [38]:
from huggingface_hub import login, HfFolder

login(token=HfFolder.get_token())

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /Users/andrewbeiler/.cache/huggingface/token
Login successful


In [39]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf" # sharded weights
tokenizer = AutoTokenizer.from_pretrained(model_id,use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token

In [40]:
# define Training Job Name 
job_name = f'goatNumAndAlphaInstruct-75-25-100K-QLORA'
model_output_path = f's3://{sagemaker_session_bucket}/models'

In [44]:
import time
from sagemaker.huggingface import HuggingFace
from huggingface_hub import HfFolder

# hyperparameters, which are passed into the training job
hyperparameters ={
  'model_id': model_id,                             # pre-trained model
  'dataset': "abeiler/GOAT_Numeric_and_Alpha_Instruct",
  'data_rev': "75_25_100K",
  'epochs': 1,                                      # number of training epochs
  'per_device_train_batch_size': 4,                 # batch size for training
  'lr': 1e-4,                                       # learning rate used during training
  'merge_weights': True,                            # wether to merge LoRA into the model (needs more memory)
  'lora_r': 64,
  'lora_alpha': 16,
  'lora_dropout': 0.1,
  'output_data_path': '/opt/ml/output',
  'push_to_hub': True,                            # Defines if we want to push the model to the hub
  'hub_model_id': job_name, # The model id of the model to push to the hub
  'hub_strategy': 'every_save',                   # The strategy to use when pushing the model to the hub
  'hub_token': HfFolder.get_token()   
}

# create the Estimator
huggingface_estimator = HuggingFace(
    entry_point          = 'run_clm.py',      # train script
    source_dir           = 'phil-examples',   # directory which includes all the files needed for training
    instance_type        = 'ml.g5.4xlarge',   # instances type used for the training job
    instance_count       = 1,                 # the number of instances used for training
    base_job_name        = job_name,          # the name of the training job
    role                 = role,              # Iam role used in training job to access AWS ressources, e.g. S3
    volume_size          = 300,               # the size of the EBS volume in GB
    transformers_version = '4.28',            # the transformers version used in the training job
    pytorch_version      = '2.0',             # the pytorch_version version used in the training job
    py_version           = 'py310',           # the python version used in the training job
    hyperparameters      =  hyperparameters,  # the hyperparameters passed to the training job
    environment          = { "HUGGINGFACE_HUB_CACHE": "/tmp/.cache" }, # set env variable to cache models in /tmp
    output_path          = f"{model_output_path}/",
    code_location        = model_output_path,
)

In [45]:
print(model_id)
print(job_name)

meta-llama/Llama-2-7b-hf
goatNumAndAlphaInstruct-75-25-100K-QLORA


We can now start our training job, with the `.fit()` method passing our S3 path to the training script.

In [3]:
import os
! source ~/.zshrc
! printenv
print(os.getenv("TELE_API"))

AWS_PROFILE=dev-admin
AWS_REGION=us-east-1
COMMAND_MODE=unix2003
HF_HOME=/Users/andrewbeiler/.cache/huggingface
HOME=/Users/andrewbeiler
HOMEBREW_CELLAR=/opt/homebrew/Cellar
HOMEBREW_PREFIX=/opt/homebrew
HOMEBREW_REPOSITORY=/opt/homebrew
INFOPATH=/opt/homebrew/share/info:
LOGNAME=andrewbeiler
LaunchInstanceID=96BD37E2-9837-4CCA-AAD3-BA07FFD77775
MANPATH=/opt/homebrew/share/man::
MallocNanoZone=0
OLDPWD=/Users/andrewbeiler/projects/llm-data-driven-optimization
ORIGINAL_XDG_CURRENT_DESKTOP=undefined
PATH=/Users/andrewbeiler/projects/llm-data-driven-optimization/venv_310/bin:/Library/Frameworks/Python.framework/Versions/3.8/bin:/opt/homebrew/bin:/opt/homebrew/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/Library/TeX/texbin:/Applications/VMware Fusion.app/Contents/Public:/Library/Apple/usr/bin:/Library/Frameworks/Mono.framework/Versions/Current/Commands
PWD=/Users/andrewbeiler/projects/llm-data-driven-optimization
SECURITYSESSIONID=186b1
SHELL=/bin/zsh
SHLVL=1
SSH_AUTH_SOCK=/private/t

In [5]:
import my_const

api_key = my_const.TELE_API_KEY
usr_id = my_const.TELE_USER 

***REMOVED***
***REMOVED***


In [46]:

import telegram
import my_const

api_key = my_const.TELE_API_KEY#'***REMOVED***'
usr_id = my_const.TELE_USER #'***REMOVED***'

# starting the train job with our uploaded datasets as input
try:
        huggingface_estimator.fit(wait=True)
        msg = 'SageMaker Training Finished!'
except Exception as e:
        msg = 'SageMaker Training Finished with Error'
        print("Error: ", e)
finally:
        bot = telegram.Bot(token=api_key)
        async with bot:
                await bot.send_message(chat_id=usr_id, text=msg)

Using provided s3_resource


INFO:botocore.tokens:Loading cached SSO token for slu-sso
INFO:sagemaker.image_uris:image_uri is not presented, retrieving image_uri based on instance_type, framework etc.
INFO:sagemaker:Creating training-job with name: goatNumAndAlphaInstruct-75-25-100K-QLOR-2023-10-09-17-10-39-967


2023-10-09 17:10:41 Starting - Starting the training job...
2023-10-09 17:10:57 Starting - Preparing the instances for training......
2023-10-09 17:12:14 Downloading - Downloading input data...
2023-10-09 17:12:39 Training - Downloading the training image..............................
2023-10-09 17:17:36 Training - Training image download completed. Training in progress.....bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2023-10-09 17:18:29,534 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training
2023-10-09 17:18:29,548 sagemaker-training-toolkit INFO     No Neurons detected (normal if no neurons installed)
2023-10-09 17:18:29,556 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.
2023-10-09 17:18:29,558 sagemaker_pytorch_container.training INFO     Invoking user training script.
2023-10-09 17:18:30,943 sagemaker-training-toolkit INFO     Installin

INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/getMe "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/sendMessage "HTTP/1.1 200 OK"


## Next Steps 

You can deploy your fine-tuned LLaMA model to a SageMaker endpoint and use it for inference. Check out the [Deploy Falcon 7B & 40B on Amazon SageMaker](https://www.philschmid.de/sagemaker-falcon-llm) and [Securely deploy LLMs inside VPCs with Hugging Face and Amazon SageMaker](https://www.philschmid.de/sagemaker-llm-vpc) for more details.

### Pull Model from S3 

In [26]:
import json
from sagemaker.huggingface import HuggingFaceModel
from huggingface_hub import HfFolder

s3_model_uri = "s3://sagemaker-ms-thesis-llm/models/goatV10-testData-withAutoInference-with-2023-09-02-11-38-37-525/output/model.tar.gz"
image_uri = "763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04-v1.0"

# sagemaker config
instance_type = "ml.g5.4xlarge"
number_of_gpu = 1
health_check_timeout = 300

# Define Model and Endpoint configuration parameter
config = {
  'HF_MODEL_ID': "/opt/ml/model", # path to where sagemaker stores the model
  'SM_NUM_GPUS': json.dumps(number_of_gpu), # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(1024), # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(2048), # Max length of the generation (including input text)
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(8192),
  # 'HF_MODEL_QUANTIZE': "bitsandbytes",# Comment in to quantize
# 
}

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
    role=role,
    image_uri=image_uri,
    model_data=s3_model_uri,
    env=config,
    # source_dir="GOAT/code/",
    # entry_point="inference.py"
)

print(llm_model)


<sagemaker.huggingface.model.HuggingFaceModel object at 0x155132770>


In [27]:
llm = llm_model.deploy(
  initial_instance_count=1,
  instance_type=instance_type,
  container_startup_health_check_timeout=health_check_timeout, # 10 minutes to be able to load the model
)

INFO:botocore.tokens:Loading cached SSO token for slu-sso
INFO:sagemaker:Creating model with name: huggingface-pytorch-tgi-inference-2023-09-02-11-49-34-978
INFO:sagemaker:Creating endpoint-config with name huggingface-pytorch-tgi-inference-2023-09-02-11-49-35-809
INFO:sagemaker:Creating endpoint with name huggingface-pytorch-tgi-inference-2023-09-02-11-49-35-809


------------------*

UnexpectedStatusException: Error hosting endpoint huggingface-pytorch-tgi-inference-2023-09-02-11-49-35-809: Failed. Reason: The primary container for production variant AllTraffic did not pass the ping health check. Please check CloudWatch logs for this endpoint..

In [113]:
import telegram
import asyncio

api_key = '***REMOVED***'
usr_id = '***REMOVED***'

bot = telegram.Bot(token=api_key)
async with bot:
        await bot.send_message(chat_id=usr_id, text='SageMaker Model Deploy Finished!')

INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/getMe "HTTP/1.1 200 OK"
INFO:httpx:HTTP Request: POST https://api.telegram.org/bot***REMOVED***/sendMessage "HTTP/1.1 200 OK"


### Build Input Data for Inference

#### Simple String Input

In [58]:
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
tokenizer.pad_token = tokenizer.eos_token
# tokenizer.pad_token_id = 0
# tokenizer.padding_side = "left"  # Allow batched inference

In [119]:
data = {
   "inputs": "<s>[INST] <<SYS>>\n\n You are a helpful math assistant\n\n<<SYS>>\n\n10 + 6 = \n[/INST]<s>"
   # "inputs": """You are a helpful AI assistant who responds to question simple and straightforward questions.
   # Question: What is the Capital of California?"""
}

In [108]:
data = {
    "prompt": "[INST] What is the capital of CA? [/INST]",
    "system_prompt": "You are a helpful assistant"
}

In [120]:
payload = {
  "inputs":  json.dumps(data),
  "parameters": {
    # "do_sample": True,
    "top_p": 0.6,
    "temperature": 0.7,
    "top_k": 50,
    "max_new_tokens": 512,
    "repetition_penalty": 1.1,
    # "stop": ["</s>"]
  }
}

# send request to endpoint
response = llm.predict(payload)

print(response[0]["generated_text"])




{
"outputs": " <<SYS>>\n\n You are a helpful math assistant\n\n<<SYS>>\n\n10 + 6 = 16"
}

