In [1]:
!pip install sagemaker -U --quiet

#### Before we begin packaging and deploying the model to Amazon SageMaker, we need to set up the notebook environment.

#### This includes:
#### 1) An execution role for SageMaker Studio
#### 2) An S3 bucket
#### 3) The chosen AWS region

In [2]:
import sagemaker
import boto3
# Bucket SageMaker uses for uploaded data, models and logs.
# Leave it as None to use the session's default bucket (SageMaker creates
# that bucket automatically when it does not exist yet).
sagemaker_session_bucket = None

# A temporary session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None:
    if sagemaker_session_bucket is None:
        sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# look the role up by its name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam_client = boto3.client('iam')
    role_description = iam_client.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Final session, bound to the bucket chosen above.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

for summary_line in (
    f"sagemaker role arn: {role}",
    f"sagemaker session region: {sess.boto_region_name}",
):
    print(summary_line)



sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
sagemaker role arn: arn:aws:iam::099732224608:role/service-role/AmazonSageMaker-ExecutionRole-20250215T170368
sagemaker session region: us-west-2


### Get the container image URI and point the Hugging Face model at that image.

In [3]:
from sagemaker.huggingface import get_huggingface_llm_image_uri

# Look up the image URI of the Hugging Face LLM container
# (backend "huggingface", container version 1.1.0).
llm_image = get_huggingface_llm_image_uri("huggingface", version="1.1.0")

# Show which ECR image was resolved.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi1.1.0-gpu-py39-cu118-ubuntu20.04


### Step 2: Deploying the Model from Hugging Face Hub
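### Amazon SageMaker allows direct deployment of models from the Hugging Face Model Hub. Large models such as Med42-70B need a serving container optimized for large-model inference, for example the Large Model Inference (LMI) container provided by SageMaker or, as in this notebook, the Hugging Face Text Generation Inference (TGI) container whose image URI was retrieved above.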

### Define the Model and Deployment Configuration: Utilize the HuggingFaceModel class from the SageMaker SDK:

In [None]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker
import json
import os
# `role` is resolved once in the setup cell, which also has an IAM fallback for
# environments where sagemaker.get_execution_role() raises ValueError. It is
# reused here instead of being looked up again without that fallback.

# Environment variables handed to the inference container.
hub = {
    # Number of GPUs the container should use (must be passed as a string).
    'SM_NUM_GPUS': json.dumps(1),
    # Model ID on the Hugging Face Hub. Alternatives tried previously:
    #   'm42-health/med42-70b', 'm42-health/Llama3-Med42-70B'
    'HF_MODEL_ID': 'm42-health/Llama3-Med42-8B',
    # NOTE(review): the endpoint is used for text generation below; confirm
    # whether the TGI container consults HF_TASK at all.
    'HF_TASK': 'question-answering',
    # Optional settings (uncomment when needed):
    #'HF_API_TOKEN': '',             # token for gated/private models - never commit a real token
    #'HF_MODEL_QUANTIZE': "eetq",    # or "bitsandbytes" to quantize the model
}

# Hugging Face TGI inference container URI retrieved in the previous cell.
image_uri = llm_image

# Create the Hugging Face model object on top of the explicit container image.
# No pytorch_version / py_version is passed: the image is pinned by `image_uri`,
# and the values used before ("2.0.0" / "py310") did not match that image
# (its tag reports PyTorch 2.0.1 and py39).
huggingface_model = HuggingFaceModel(
    env=hub,  # Configuration for loading the model from the Hub
    role=role,  # IAM role with permissions
    image_uri=image_uri,  # Use the TGI inference container
)

### Deploy the Model 

In [5]:
# Deploy the model to a SageMaker endpoint
import time
from time import gmtime, strftime
# Endpoint names must be unique, so the current UTC timestamp is appended.
timestamp_suffix = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
endpoint_name = "vitalstorymed4270B" + timestamp_suffix

# Create the endpoint on a single ml.g5.2xlarge instance and allow the
# container up to 300 seconds to pass its start-up health check.
# (ml.p4d.24xlarge was the alternative considered for more GPU capacity.)
predictor = huggingface_model.deploy(
    endpoint_name=endpoint_name,
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
    container_startup_health_check_timeout=300,
)

-----------!

In [15]:
# Few-shot prompt template. It is a str.format() template: literal JSON braces
# are written as {{ and }}, and {health_log} is the only placeholder.
prompt = """You are a medical question-generation assistant.

Given a patient health log, generate 3 medically relevant follow-up questions. Be concise and only ask short questions. Do **not** give any advice. Do **not** repeat the patient log or the instructions in your response. Only return valid JSON.

The output must follow this format:
```json
{{
  "questions": [
    "First follow-up question here?",
    "Second follow-up question here?",
    "Third follow-up question here?"
  ]
}}```
---
Example 1:
Patient Health Log: "My stomach hurts after I eat anything, and I feel bloated all the time."
Response:
```json
{{
  "questions": [
    "What types of foods trigger your symptoms?",
    "Do you experience nausea or vomiting?",
    "Have you had any recent changes in bowel habits?"
  ]
}}```
---

Example 2:

Patient Health Log: "I keep getting migraines that last all day and don’t respond to painkillers."

Response:
```json
{{
  "questions": [
    "How frequently do the migraines occur?",
    "Do you notice any warning signs before they start?",
    "Have you tried any treatments other than painkillers?"
  ]
}}```

---

Now, generate follow-up questions for the following:

Patient Health Log: {health_log}

Response:
"""

# Health log to generate follow-up questions for.
health_log = "I have a fever"

# Render the template instead of appending the log after "Response:".
# Rendering turns the escaped {{ }} into single braces and puts the log
# (quoted, like in the examples) into the "Patient Health Log:" slot.
payload = {
    "inputs": prompt.format(health_log=f'"{health_log}"'),
    "parameters": {
        "do_sample": True,
        "top_p": 0.6,
        "temperature": 0.8,
        "top_k": 50,
        "max_new_tokens": 512,
        "repetition_penalty": 1.03,
        "stop": ["</s>"]
    }
}

In [16]:
import boto3
# Invoke the endpoint created by the deploy cell. The endpoint name contains a
# timestamp, so it is taken from `predictor` rather than hard-coded from an
# earlier run (which breaks as soon as the notebook is re-run).
ENDPOINT = predictor.endpoint_name
# "sagemaker-runtime" is the same service name the Gradio cell uses.
runtime = boto3.client("sagemaker-runtime")
response = runtime.invoke_endpoint(
    EndpointName=ENDPOINT,
    ContentType="application/json",
    Body=json.dumps(payload),
)
print(response)

{'ResponseMetadata': {'RequestId': '0485fcb9-6507-4906-a9f4-9bd8656d7307', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '0485fcb9-6507-4906-a9f4-9bd8656d7307', 'x-amzn-invoked-production-variant': 'AllTraffic', 'date': 'Sun, 06 Apr 2025 02:57:09 GMT', 'content-type': 'application/json', 'content-length': '1415', 'connection': 'keep-alive'}, 'RetryAttempts': 0}, 'ContentType': 'application/json', 'InvokedProductionVariant': 'AllTraffic', 'Body': <botocore.response.StreamingBody object at 0x7f3fea71e1d0>}


In [17]:
# The body is a stream that can be read only once: read it, then parse the JSON.
raw_body = response["Body"].read()
prediction = json.loads(raw_body.decode("utf-8"))


In [18]:
# Display the generated text of the first entry in the response list.
prediction[0]["generated_text"]



In [21]:
import gradio as gr
import boto3
import json
import io

# Generation hyperparameters sent with every chat request.
# NOTE(review): "</s>" is used as the stop string - confirm it matches the
# end-of-sequence token of the deployed model.
parameters = {
    "do_sample": True,
    "top_p": 0.9,
    "temperature": 0.8,
    "top_k": 50,
    "max_new_tokens": 512,
    "repetition_penalty": 1.1,
    "stop": ["</s>"],
}

# System prompt for the chat assistant. The number of follow-up questions is
# stated consistently (three) and sentences are separated by a space, so the
# model does not receive contradictory instructions.
system_prompt = """
        You are VitalChat, a helpful medical assistant specializing in
        healthcare-related questions. Your goal is to collect enough information
        from the user about their symptom(s) before providing insights. If a
        user's input is vague or lacks details, ask two or three clarifying
        questions before proceeding. Once you have enough context, generate three
        follow-up questions to gather more information. After the three follow-up
        questions, create a summary and advice on next steps.
"""


# Helper for reading lines from a stream
class LineIterator:
    """Iterate over newline-terminated lines of an event stream.

    `stream` is an iterable of event dicts. Events that carry a
    ``"PayloadPart"`` entry contribute ``event["PayloadPart"]["Bytes"]`` to an
    internal byte buffer; any other event is reported and skipped. Each
    iteration returns the next complete line as ``bytes`` without its trailing
    newline. A line may be split across several events; it is only returned
    once its newline has arrived, or once the stream has ended.
    """

    def __init__(self, stream):
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset in `buffer` of the first byte that has not been returned yet.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        while True:
            # Try to read a complete line from the unread part of the buffer.
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line and line[-1] == ord("\n"):
                self.read_pos += len(line)
                return line[:-1]
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                if line:
                    # The stream ended without a trailing newline: hand out the
                    # remaining bytes once instead of retrying forever.
                    self.read_pos += len(line)
                    return line
                raise
            if "PayloadPart" not in chunk:
                # `chunk` is a dict, so it must be formatted rather than
                # concatenated to a string.
                print(f"Unknown event type: {chunk}")
                continue
            # Append the new bytes at the end; the next pass re-reads from read_pos.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])


# define format function for our input
def format_prompt(user_input, history, system_prompt):
    """Build the prompt text for one chat turn.

    Args:
        user_input: The new user message (interpolated into the prompt as-is).
        history: Previous turns as a list of ``(user_text, bot_response)``
            pairs (tuples or lists of two strings). Anything that is not a
            list is treated as an empty history.
        system_prompt: System instructions. When it is ``None`` or empty the
            system section is omitted, so the text "None" never ends up in
            the prompt.

    Returns:
        The prompt string: optional system section, the valid history turns,
        the new user turn, and the trailing assistant tags.

    If any history entry is not a pair of strings, a warning is printed and
    the whole history is dropped, so a partially valid history is never sent.

    NOTE(review): the tag layout is reproduced unchanged; confirm it matches
    the chat template the deployed model expects.
    """
    prompt = ""
    if system_prompt:
        prompt += f"<|system|>\n{system_prompt}\n<|system|>\n\n"

    formatted_history = []
    if isinstance(history, list):
        for entry in history:
            is_text_pair = (
                isinstance(entry, (list, tuple))
                and len(entry) == 2
                and all(isinstance(x, str) for x in entry)
            )
            if not is_text_pair:
                print(f"⚠️ Invalid history entry: {entry}, resetting history.")
                # Drop the entries collected so far as well; the warning
                # promises a reset, not a truncated history.
                formatted_history = []
                break
            formatted_history.append(tuple(entry))

    # Replay the earlier turns.
    for user_text, bot_response in formatted_history:
        prompt += f"<|prompter|>\n{user_text}\n<|prompter|>\n"
        prompt += f"<|assistant|>\n{bot_response}\n<|assistant|>\n"

    # Append the new user input followed by the assistant tags.
    prompt += f"<|prompter|>\n{user_input}\n<|prompter|>\n"
    prompt += "<|assistant|>\n\n<|assistant|>\n"

    return prompt

def create_gradio_app(
    endpoint_name,
    session=boto3,
    parameters=parameters,
    system_prompt=system_prompt,
    format_prompt=format_prompt,
    concurrency_count=4,
    share=True,
):
    """Launch a Gradio chat UI that streams answers from a SageMaker endpoint.

    Args:
        endpoint_name: Name of the SageMaker endpoint to invoke.
        session: Object providing ``client(service_name)`` (the ``boto3``
            module or a boto3 session).
        parameters: Generation parameters sent with every request. An optional
            ``"stop"`` list is also used to trim the streamed text.
        system_prompt: System instructions passed to ``format_prompt``.
        format_prompt: Callable ``(user_input, history, system_prompt) -> str``.
        concurrency_count: Currently unused; kept so existing callers that
            pass it keep working.
        share: Forwarded to ``launch``. The captured output of this notebook
            shows that ``share=True`` publishes a public URL for the chat.
    """
    smr = session.client("sagemaker-runtime")
    data_prefix = "data:"

    def generate(
        prompt,
        history,
    ):
        """Yield the progressively growing answer for one chat message."""
        formatted_prompt = format_prompt(prompt, history, system_prompt)

        request = {"inputs": formatted_prompt, "parameters": parameters, "stream": True}
        resp = smr.invoke_endpoint_with_response_stream(
            EndpointName=endpoint_name,
            Body=json.dumps(request),
            ContentType="application/json",
        )
        stop_sequences = request["parameters"].get("stop", [])

        output = ""
        for raw_line in LineIterator(resp["Body"]):
            line = raw_line.decode("utf-8")
            if not line.startswith(data_prefix):
                continue
            # Slice the prefix off: str.lstrip("data:") strips a *set of
            # characters*, not the prefix, and could eat payload characters.
            chunk = json.loads(line[len(data_prefix):])
            token = chunk["token"]
            if token["special"]:
                continue
            if token["text"] in stop_sequences:
                break
            output += token["text"]
            # Empty stop strings are ignored: every string ends with "", and
            # output[:-0] would wipe the whole answer.
            matched_stop = next(
                (stop for stop in stop_sequences if stop and output.endswith(stop)),
                None,
            )
            if matched_stop is not None:
                # Trim the stop string, emit the final text once and stop.
                output = output[: -len(matched_stop)].rstrip()
                yield output
                break
            yield output
        return output

    demo = gr.ChatInterface(generate, title="Chat with Vital Story", chatbot=gr.Chatbot(layout="panel"))
    demo.queue().launch(share=share)

# Create and launch the Gradio app for the deployed endpoint.
# The system prompt defined above is passed explicitly: with system_prompt=None
# it was discarded and the text "None" was interpolated into every prompt.
# share=True publishes a public URL; set it to False to keep the app local.
create_gradio_app(
    predictor.endpoint_name,
    session=sess.boto_session,
    parameters=parameters,
    system_prompt=system_prompt,
    format_prompt=format_prompt,
    concurrency_count=4,
    share=True,
)

  demo = gr.ChatInterface(generate, title="Chat with Vital Story", chatbot=gr.Chatbot(layout="panel"))


* Running on local URL:  http://127.0.0.1:7860


* Running on public URL: https://63b80fe788bc655425.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
