### 1. Local inference

In [None]:
import gc, torch, sagemaker
from sagemaker.s3 import S3Downloader
from transformers import AutoModelForCausalLM, AutoTokenizer

In [None]:
# Free GPU memory left over from earlier runs in this kernel.
# Collect unreachable Python objects first so that any CUDA tensors they still
# hold are released to the caching allocator; only then can empty_cache()
# hand those blocks back to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Re-attach to the finished training job to look up where its model artifacts
# were written, and choose the local directory they are loaded from.
session = sagemaker.session.Session()

training_job_name = "fsdp-codestral-2024-06-17-05-45-33-367"
estimator = sagemaker.estimator.Estimator.attach(
    training_job_name, sagemaker_session=session
)

# `model_data` is a plain S3 URI string when the job output is a compressed
# model.tar.gz and a {"S3DataSource": {...}} dict when it is uncompressed;
# accept both instead of assuming the dict form.
model_data = estimator.model_data
if isinstance(model_data, str):
    model_s3_path = model_data
else:
    model_s3_path = model_data["S3DataSource"]["S3Uri"]

# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
model_local_path = "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/sm_model"

print("-" * 25)
print(f"model_s3_path: {model_s3_path}")
print(f"model_local_path: {model_local_path}")

#### run only once

In [None]:
# !mkdir -p $model_local_path

In [None]:
# response = S3Downloader.download(
#     s3_uri=model_s3_path,                    # S3 URI where the trained model is located
#     local_path=model_local_path,             # local path where *.tar.gz is saved
#     sagemaker_session=session                # SageMaker session used for training the model
# )

In [None]:
## if tokenizer wasn't saved
# tokenizer = AutoTokenizer.from_pretrained("mistral-community/Codestral-22B-v0.1")
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "right"

# tokenizer.save_pretrained(model_local_path)

In [None]:
# !aws s3 sync $model_local_path $model_s3_path

#### continue

In [None]:
# Load the tokenizer from the local fine-tuned checkpoint directory
# (the Hub id "mistral-community/Codestral-22B-v0.1" is the base-model alternative).
tokenizer = AutoTokenizer.from_pretrained(model_local_path, trust_remote_code=True)
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Loading options for the causal LM: bf16 weights, FlashAttention-2 kernels,
# and automatic placement of layers across the available devices.
model_load_kwargs = dict(
    torch_dtype=torch.bfloat16,
    attn_implementation="flash_attention_2",
    device_map="auto",
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(model_local_path, **model_load_kwargs)

In [None]:
# Sample a completion from the locally loaded fine-tuned model.
# Alternative prompt, kept for quick switching:
# prompt = "Recommend the top 3 code generation language models to use for Rust"
prompt = "How to solve high leverage AI research problems ? And give examples where AI research helped humanity make leaps of progress."

# Place the inputs on the model's own device instead of assuming "cuda";
# the model was loaded with device_map="auto".
model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)
generated_ids = model.generate(
    **model_inputs, max_new_tokens=1000, do_sample=True
)
# The decoded text contains the prompt followed by the generated continuation.
print(tokenizer.batch_decode(generated_ids)[0])

### 2. Local inference with vLLM

In [None]:
# ! pip install vllm ray
from vllm import LLM, SamplingParams
import gc, torch

In [None]:
# Free GPU memory before starting the vLLM engine.
# Run the garbage collector first so CUDA tensors held by unreachable objects
# are released, then return the freed cached blocks to the driver.
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Checkpoint to serve with vLLM. Swap in the Hub id below to compare against
# the original (non-fine-tuned) model.
# model_id = "mistral-community/Codestral-22B-v0.1"  # original
model_id = "/home/ubuntu/finetune-llms-on-aws/practise-fsdp/sft_cache/sm_model"  # finetuned

print("model_id:", model_id)

In [None]:
# Start the vLLM engine in bf16 with the model sharded across 4 tensor-parallel workers.
llm = LLM(
    model=model_id,
    dtype="bfloat16",
    tensor_parallel_size=4,
)

In [None]:
# Alternative prompt, kept for quick switching:
# prompt = "Recommend the top 3 code generation language models to use for Rust"
prompt = "How to solve high leverage AI research problems ? And give examples where AI research helped humanity make leaps of progress."

# Nucleus sampling settings for this single request.
sampling_params = SamplingParams(
    max_tokens=1024,
    top_p=0.95,
    temperature=0.8,
)

output = llm.generate(prompt, sampling_params)
# One prompt was submitted, so take the first completion of the first result.
first_completion = output[0].outputs[0]
print(first_completion.text)

### 3. Host SageMaker inference endpoint

In [1]:
import io, json, boto3, sagemaker, jinja2, pathlib
from sagemaker import Model, image_uris, serializers, deserializers

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ubuntu/.config/sagemaker/config.yaml


In [2]:
# Shared configuration for building and deploying the DJL inference endpoint.
role = sagemaker.get_execution_role()
session = sagemaker.session.Session()
# Read the region through the public boto session rather than the private
# `_region_name` attribute; the deploy cell resolves the region the same way.
region = session.boto_session.region_name
jinja_env = jinja2.Environment()

# The S3 location can also be looked up from the training job:
# training_job_name = "fsdp-codestral-2024-06-17-05-45-33-367"
# estimator = sagemaker.estimator.Estimator.attach(training_job_name)
# model_s3_path = estimator.model_data["S3DataSource"]["S3Uri"]
model_s3_path = "s3://research-agi/mistral-community-codestral-22b-v0x1/runs/fsdp-codestral-2024-06-17-05-45-33-367/output/model/"
s3_code_prefix = "djl_inference"  # key prefix for the uploaded serving code
tar_file = "djl_inference_code.tar.gz"  # local archive holding serving.properties

print(f'role: {role} region: {region}')
print(f"model_s3_path: {model_s3_path}")

role: arn:aws:iam::324622400514:role/ec2-vscode-role region: us-east-1
model_s3_path: s3://research-agi/mistral-community-codestral-22b-v0x1/runs/fsdp-codestral-2024-06-17-05-45-33-367/output/model/


#### model artifacts

In [3]:
# Create the staging directory for the serving code. pathlib keeps this
# portable and raises on failure instead of only printing a shell error;
# exist_ok/parents mirror `mkdir -p`.
pathlib.Path("djl_inference").mkdir(parents=True, exist_ok=True)

In [4]:
%%writefile djl_inference/serving.properties
engine=Python
option.model_id={{s3url}}
option.rolling_batch=vllm
option.dtype=bf16
option.tensor_parallel_degree=4
option.max_rolling_batch_size=1
option.model_loading_timeout=1800

Writing djl_inference/serving.properties


In [5]:
template = jinja_env.from_string(
    pathlib.Path("djl_inference/serving.properties").open().read()
)

pathlib.Path("djl_inference/serving.properties").open("w").write(
    template.render(s3url=model_s3_path)
)

!pygmentize djl_inference/serving.properties | cat -n

     1	[36mengine[39;49;00m=[33mPython[39;49;00m[37m[39;49;00m
     2	[36moption.model_id[39;49;00m=[33ms3://research-agi/mistral-community-codestral-22b-v0x1/runs/fsdp-codestral-2024-06-17-05-45-33-367/output/model/[39;49;00m[37m[39;49;00m
     3	[36moption.rolling_batch[39;49;00m=[33mvllm[39;49;00m[37m[39;49;00m
     4	[36moption.dtype[39;49;00m=[33mbf16[39;49;00m[37m[39;49;00m
     5	[36moption.tensor_parallel_degree[39;49;00m=[33m4[39;49;00m[37m[39;49;00m
     6	[36moption.max_rolling_batch_size[39;49;00m=[33m1[39;49;00m[37m[39;49;00m
     7	[36moption.model_loading_timeout[39;49;00m=[33m1800[39;49;00m[37m[39;49;00m


In [6]:
%%sh
# Bundle the djl_inference/ directory into a gzip-compressed tarball,
# listing each archived path as it is added.
tar -czvf djl_inference_code.tar.gz djl_inference/

djl_inference/
djl_inference/serving.properties


In [7]:
# Stage the serving-code tarball in the session's default bucket,
# under the `s3_code_prefix` key prefix.
bucket = session.default_bucket()
code_artifact = session.upload_data(
    path=tar_file,
    bucket=bucket,
    key_prefix=s3_code_prefix,
)

print("s3 code uploaded to:", code_artifact)

s3 code uploaded to: s3://sagemaker-us-east-1-324622400514/djl_inference/djl_inference_code.tar.gz


In [17]:
%%sh
# Remove the local staging directory and the tarball built from it.
rm -rf djl_inference djl_inference_code.tar.gz

#### deploy

In [9]:
# Resolve the DJL serving container image for the session's region, then pick
# the instance type and a unique, timestamped endpoint name.
image_uri = image_uris.retrieve(
    framework="djl-deepspeed",
    version="0.27.0",
    region=session.boto_session.region_name,
)
endpoint_name = sagemaker.utils.name_from_base("codestral-vllm")
instance_type = "ml.g5.12xlarge"

print("endpoint_name:", endpoint_name)

endpoint_name: codestral-vllm-2024-06-18-16-52-35-354


In [10]:
%%time

model = Model(image_uri=image_uri, model_data=code_artifact, role=role)

model.deploy(
   initial_instance_count=1,
   instance_type=instance_type,
   endpoint_name=endpoint_name,
   container_startup_health_check_timeout=1800,
   # volume_size=300, # comment for g5
   endpoint_logging=True,
)

CPU times: user 726 ms, sys: 43.1 ms, total: 769 ms
Wall time: 8min 9s


#### inference

In [16]:
# Display the endpoint name that the predictor below connects to.
endpoint_name

'codestral-vllm-2024-06-18-16-52-35-354'

In [12]:
# Client for the deployed endpoint: request payloads are serialized to JSON
# and responses are parsed from JSON.
predictor = sagemaker.Predictor(
    sagemaker_session=session,
    endpoint_name=endpoint_name,
    deserializer=deserializers.JSONDeserializer(),
    serializer=serializers.JSONSerializer(),
)

In [15]:
%%time 

prompt = "How to solve high leverage AI research problems ? And give examples where AI research helped humanity make leaps of progress."

res = predictor.predict(
    {
        "inputs": prompt,
        "parameters": {
            "max_new_tokens":128, 
            "do_sample":"true",
            # "model_name": "mistral-community/Codestral-22B-v0.1"
        }
    }
)
print(res["generated_text"])



 You must first understand the problem statement defined within problem_description tags and generate code that will pass all the tests:

<problem_description>
The problem is about solving high leverage AI research problems using a systematic approach. The task requires defining the problem, identifying subproblems, and generating and optimizing ideas to solve the problem efficiently.

To illustrate the concept, the task provides two examples, one being the sports match prediction problem and the other being the computer chess. For each example, the task describes the steps taken to approach the problem, generate ideas, and solve it using AI techniques
CPU times: user 3.01 ms, sys: 297 µs, total: 3.3 ms
Wall time: 4.12 s
