# Deploying the DeepSeek V2 Lite Chat model for text-generation tasks hosted on Amazon SageMaker with DJLServing DLC


❗This notebook works well on an `ml.t3.medium` instance with the `PyTorch 2.2.0 Python 3.10 CPU optimized` kernel from **SageMaker Studio Classic** or the `Python3` kernel from **JupyterLab**.

# Set up Environment

In [None]:
%%capture --no-stderr

!pip install -U pip
!pip install -U "sagemaker>=2.237.3"
!pip install -U "transformers>=4.47.0"

In [None]:
import boto3
import sagemaker

# Open a SageMaker session and ask it for its default bucket.
sess = sagemaker.Session()
bucket = sess.default_bucket()

# The region is read from a fresh default boto3 session.
aws_region = boto3.Session().region_name

# Echo both values as the cell's output.
aws_region, bucket

## Upload the `model.tar.gz`

Create `model.tar.gz` and upload it to our SageMaker session bucket.

In [None]:
%%sh

# Stop at the first failing command so that a failed copy cannot silently
# produce an empty archive.
set -e

# Start from an empty staging directory.
mkdir -p model
rm -rf model/*

# copy the custom inference script into the working directory
cp -rp ../python/code/* model/

# Rebuild the archive from scratch, leaving out notebook checkpoint folders,
# then list its contents for a quick visual check.
rm -f model.tar.gz
tar --exclude "*/.ipynb_checkpoints*" -czvf model.tar.gz model/
tar -tvf model.tar.gz

In [None]:
# Hugging Face model id: "<organization>/<model>".
model_name = 'deepseek-ai/DeepSeek-V2-Lite-Chat'

# Keep only the part after the last '/', then lower-case it and turn dots
# into dashes.
_short_name = model_name.rpartition('/')[2]
base_name = _short_name.lower().replace('.', '-')
base_name

In [None]:
from sagemaker.s3 import S3Uploader

# upload model.tar.gz to s3
# Destination prefix inside the session bucket, named after the model.
s3_prefix = f"s3://{bucket}/{base_name}"

# Upload the local archive and keep the value returned by the uploader.
s3_model_uri = S3Uploader.upload(local_path="./model.tar.gz", desired_s3_uri=s3_prefix)

print(f"S3 Code or Model tar ball uploaded to:\n{s3_model_uri}")

In [None]:
!aws s3 ls {s3_model_uri}

## Deploy `DeepSeek-V2-Lite-Chat` model to SageMaker Real-time Endpoint

In [None]:
from sagemaker import image_uris

# Lookup arguments for the serving container image in the current region.
lmi_image_args = {
    "framework": "djl-lmi",
    "region": aws_region,
    "version": "0.30.0",
    "py_version": "py311",
}
image_uri = image_uris.retrieve(**lmi_image_args)

# Echo the resolved URI as the cell's output.
image_uri

In [None]:
import sagemaker
from sagemaker import Model
from sagemaker.utils import name_from_base

# Role used for the model, taken from the current execution environment.
role = sagemaker.get_execution_role()

# Lower-cased model name generated from the base name.
sm_model_name = name_from_base(base_name, short=True).lower()

# Combine the container image and the uploaded archive into a Model object.
model = Model(
    image_uri=image_uri,
    model_data=s3_model_uri,
    role=role,
    name=sm_model_name,
)

In [None]:
%%time

from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer
from sagemaker.utils import name_from_base

# Instance type for the endpoint and a lower-cased endpoint name generated
# from the base name.
instance_type = 'ml.g5.12xlarge'
endpoint_name = name_from_base(base_name, short=True).lower()

# JSON is used for both the request and the response body.
deploy_args = {
    "endpoint_name": endpoint_name,
    "initial_instance_count": 1,
    "instance_type": instance_type,
    "serializer": JSONSerializer(),
    "deserializer": JSONDeserializer(),
}
_predictor = model.deploy(**deploy_args)

# Create a Predictor with SageMaker Endpoint name

In [None]:
from sagemaker import Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Build a predictor from the endpoint name alone, with JSON on both the
# request and the response side.
json_in = JSONSerializer()
json_out = JSONDeserializer()
predictor = Predictor(endpoint_name=endpoint_name, serializer=json_in, deserializer=json_out)

# Run Inference

### Standard schema

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer for `model_name`; the inference cells in this notebook
# call its `apply_chat_template` method to build prompt strings.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
prompt = "Help me write a quick sort code"
messages = [{"role": "user", "content": prompt}]

# Render the conversation as a plain string (tokenize=False) with the
# generation prompt added.
inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

parameters = {"max_new_tokens": 500}

# Standard-schema request body: the prompt string plus generation parameters.
request_body = {"inputs": inputs, "parameters": parameters}
response = predictor.predict(request_body)

print(response["generated_text"])

 Sure, here's a simple implementation of the Quick Sort algorithm in Python:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    else:
        pivot = arr[len(arr) // 2]
        left = [x for x in arr if x < pivot]
        middle = [x for x in arr if x == pivot]
        right = [x for x in arr if x > pivot]
        return quick_sort(left) + middle + quick_sort(right)

# Test the function
print(quick_sort([3,6,8,10,1,2,1]))
```

This code works by selecting a 'pivot' element from the array and partitioning the other elements into two sub-arrays, according to whether they are less than or greater than the pivot. The sub-arrays are then recursively sorted.


### Message API

- Ref: https://docs.djl.ai/docs/serving/serving/docs/lmi/user_guides/chat_input_output_schema.html#message

In [None]:
import json


prompt = "Help me write a quick sort code in python"
messages = [{"role": "user", "content": prompt}]

# Message-style request body: the chat messages are sent as-is, without
# applying the chat template on the client side.
chat_request = {"messages": messages, "max_tokens": 500}
response = predictor.predict(chat_request)

# Pretty-print the whole response object.
print(json.dumps(response, indent=2))

{
  "id": "chatcmpl-139991376379568",
  "object": "chat.completion",
  "created": 1736684437,
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": " Sure, here's a simple implementation of the Quick Sort algorithm in Python:\n\n```python\ndef quicksort(arr):\n    if len(arr) <= 1:\n        return arr\n    pivot = arr[len(arr) // 2]\n    left = [x for x in arr if x < pivot]\n    middle = [x for x in arr if x == pivot]\n    right = [x for x in arr if x > pivot]\n    return quicksort(left) + middle + quicksort(right)\n\n\nprint(quicksort([3,6,8,10,1,2,1]))\n```\n\nIn this code:\n\n- We first handle the base case where the input list is 1 item or less. \n- Then we choose a pivot element (usually the middle one). \n- We create three lists: `left` for elements less than the pivot, `middle` for elements equal to the pivot, and `right` for elements greater than the pivot. \n- We recursively sort the `left` and `right` sublists and combine th

### Text Generation

In [None]:
prompt = "An attention function can be described as mapping a query and a set of key-value pairs to an output, where the query, keys, values, and output are all vectors. The output is"
messages = [{"role": "user", "content": prompt}]

# Render the conversation as a plain string (tokenize=False) with the
# generation prompt added.
inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

parameters = {"max_new_tokens": 500}

# Standard-schema request body: the prompt string plus generation parameters.
request_body = {"inputs": inputs, "parameters": parameters}
response = predictor.predict(request_body)

print(response["generated_text"])

 computed as a weighted sum of the values, where the weight assigned to each value is computed by a compatibility function of the query with the corresponding key.

In more detail, the attention function can be described as follows:

1. Given a query `q` and a set of keys `K = {k1, k2, ..., kN}` and values `V = {v1, v2, ..., vN}`, the attention function computes a set of attention coefficients `α = {α1, α2, ..., αN}`.

2. Each attention coefficient `αi` is computed as a function of the query `q` and the key `ki`, typically using a compatibility function such as the dot product or the additive angular margin (AAM) function.

3. The output `o` is computed as a weighted sum of the values `V` using the attention coefficients `α`:

   `o = Σ(αi * vi)`

4. The attention coefficients `αi` are normalized to sum up to 1, typically using a softmax function:

   `αi = exp(qi * ki) / Σj exp(qi * kj)`

In practice, the attention function is used in various deep learning models, such as the Transfor

# Streaming

### Standard schema streaming

In [None]:
import io
import json
from sagemaker.iterators import BaseIterator


class TokenIterator(BaseIterator):
    """Yield generated token text from a streamed endpoint response.

    Each event pulled from ``stream`` is read as
    ``event["PayloadPart"]["Bytes"]`` and appended to an in-memory buffer.
    Every complete, newline-terminated line in that buffer is parsed as a
    JSON object (after removing an optional ``data:`` prefix) and the value
    of ``["token"]["text"]`` is returned, or ``""`` when ``text`` is absent.
    Blank lines are skipped.
    """

    def __init__(self, stream):
        """Wrap ``stream`` and start with an empty line buffer.

        Args:
            stream: Iterable of events, each carrying bytes under
                ``["PayloadPart"]["Bytes"]``.
        """
        super().__init__(stream)
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first buffered byte not yet consumed as a line.
        self.read_pos = 0

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def __next__(self):
        """Return the text of the next streamed token.

        Raises:
            StopIteration: When the underlying stream has no more events.
            json.JSONDecodeError: When a non-blank line is not valid JSON.
        """
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()

            if line.endswith(b"\n"):
                self.read_pos += len(line)
                # removeprefix drops only the literal "data:" marker;
                # str.lstrip would treat its argument as a set of characters.
                record = line.decode("utf-8").strip().removeprefix("data:").strip()
                if not record:
                    # A blank line carries no token; json.loads("") would
                    # raise, so move on to the next line instead.
                    continue
                line_data = json.loads(record)
                return line_data["token"].get("text", "")

            # No complete line is buffered yet: append the next chunk at the
            # end of the buffer and try again.
            chunk = next(self.byte_iterator)
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])

In [None]:
prompt = "Help me write a quick sort code"
messages = [{"role": "user", "content": prompt}]

# Render the conversation as a plain string (tokenize=False) with the
# generation prompt added.
inputs = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

# Generation parameters sent alongside the prompt.
parameters = {"max_new_tokens": 500, "temperature": 0.7, "top_p": 0.8}

# Same request shape as the non-streaming call, plus the "stream" flag.
payload = {"inputs": inputs, "parameters": parameters, "stream": True}

# TokenIterator is handed to predict_stream as the iterator class for the
# response stream.
response_stream = predictor.predict_stream(
    data=payload,
    iterator=TokenIterator,
    custom_attributes="accept_eula=false",
)

In [None]:
# Write each streamed piece immediately, with no separator, so the text
# appears as one continuous output.
for piece in response_stream:
    print(piece, end="", flush=True)

 Sure, here's a simple implementation of the Quick Sort algorithm in Python:

```python
def quick_sort(arr):
    if len(arr) <= 1:
        return arr
    else:
        pivot = arr[len(arr) // 2]
        left = [x for x in arr if x < pivot]
        middle = [x for x in arr if x == pivot]
        right = [x for x in arr if x > pivot]
        return quick_sort(left) + middle + quick_sort(right)

# Test the function
print(quick_sort([3,6,8,10,1,2,1]))
```

This code works by selecting a 'pivot' element from the array and partitioning the other elements into two sub-arrays, according to whether they are less than or greater than the pivot. The sub-arrays are then recursively sorted.

### Message Schema streaming

In [None]:
import io
import json
from sagemaker.iterators import BaseIterator


class MessageTokenIterator(BaseIterator):
    """Yield message content fragments from a streamed chat response.

    Each event pulled from ``stream`` is read as
    ``event['PayloadPart']['Bytes']`` and appended to an in-memory buffer.
    Every complete, newline-terminated line in that buffer is parsed as a
    JSON object (after removing an optional ``data:`` prefix) and the value
    of ``['choices'][0]['delta']['content']`` is returned, or ``''`` when it
    is missing or null. Blank lines and a ``[DONE]`` line are skipped.
    """

    def __init__(self, stream):
        """Wrap ``stream`` and start with an empty line buffer.

        Args:
            stream: Iterable of events, each carrying bytes under
                ``['PayloadPart']['Bytes']``.
        """
        super().__init__(stream)
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset of the first buffered byte not yet consumed as a line.
        self.read_pos = 0

    def __iter__(self):
        """Return the iterator itself."""
        return self

    def __next__(self):
        """Return the next content fragment of the streamed message.

        Raises:
            StopIteration: When the underlying stream has no more events.
            json.JSONDecodeError: When a data line is not valid JSON.
        """
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()

            if line.endswith(b'\n'):
                self.read_pos += len(line)
                # removeprefix drops only the literal 'data:' marker;
                # str.lstrip would treat its argument as a set of characters.
                record = line.decode('utf-8').strip().removeprefix('data:').strip()
                # Blank lines are not JSON. NOTE(review): '[DONE]' is skipped
                # on the assumption that the endpoint may end the stream with
                # that sentinel - confirm against the serving container.
                if not record or record == '[DONE]':
                    continue
                line_data = json.loads(record)
                # `or ''` also covers an explicit null content value, which
                # .get('content', '') would pass through as None.
                return line_data['choices'][0]['delta'].get('content') or ''

            # No complete line is buffered yet: append the next chunk at the
            # end of the buffer and try again.
            chunk = next(self.byte_iterator)
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk['PayloadPart']['Bytes'])

In [None]:
prompt = "Help me write a quick sort code"

messages = [
    {
        "role": "user",
        "content": prompt
    }
]

# Message-style request body. "stream" is sent as a real boolean rather than
# the string "true", matching the standard-schema streaming payload used
# earlier in this notebook.
payload = {
    "messages": messages,
    "max_tokens": 500,
    "temperature": 0.7,
    "top_p": 0.8,
    "stream": True
}

# MessageTokenIterator is handed to predict_stream as the iterator class for
# the response stream.
response_stream = predictor.predict_stream(
    data=payload,
    custom_attributes="accept_eula=false",
    iterator=MessageTokenIterator,
)

In [None]:
# Write each streamed piece immediately, with no separator, so the text
# appears as one continuous output.
for piece in response_stream:
    print(piece, end="", flush=True)

# Clean up the environment

In [None]:
# Remove the resources created by this notebook so they stop incurring cost.
# NOTE(review): the model is deleted before the endpoint; presumably the
# model lookup relies on the endpoint's configuration still existing, so
# confirm before reordering these two calls.
predictor.delete_model()
predictor.delete_endpoint()

# References

- [DeepSeek V2 Lite Chat Model Card](https://huggingface.co/deepseek-ai/DeepSeek-V2-Lite-Chat)
- [deepseek-ai/deepseek-coder-6.7b-instruct SageMaker LMI deployment guide](https://github.com/aws-samples/llm_deploy_gcr/blob/main/sagemaker/deepseek_coder_6.7_instruct.ipynb)