In [1]:
!pip install "sagemaker>=2.175.0" --upgrade --quiet

In [3]:
import sagemaker
import boto3
# Name of the S3 bucket SageMaker uses for uploaded data, models and logs.
# Leave as None to fall back to the session's default bucket, which SageMaker
# creates automatically if it does not exist yet.
sagemaker_session_bucket = None

# A first session is only needed to look up the default bucket name.
sess = sagemaker.Session()
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Resolve the execution role; when get_execution_role() raises ValueError,
# look the role up by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role_description = iam.get_role(RoleName='sagemaker_execution_role')
    role = role_description['Role']['Arn']

# Session bound explicitly to the chosen bucket; later cells use this one.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker session region: {sess.boto_region_name}")

sagemaker role arn: arn:aws:iam::099732224608:role/service-role/AmazonSageMaker-ExecutionRole-20250215T170368
sagemaker session region: us-west-2


In [4]:
from sagemaker.huggingface import get_huggingface_llm_image_uri
 
# Look up the image URI of the "huggingface" LLM backend, pinned to version 0.9.3.
llm_image = get_huggingface_llm_image_uri("huggingface", version="0.9.3")

# Show the resolved ECR image URI.
print(f"llm image uri: {llm_image}")

llm image uri: 763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-tgi-inference:2.0.1-tgi0.9.3-gpu-py39-cu118-ubuntu20.04


In [None]:
import json
import os

from sagemaker.huggingface import HuggingFaceModel
 
# sagemaker config
instance_type = "ml.p4d.24xlarge"
number_of_gpu = 8
health_check_timeout = 1500  # seconds

# Model and endpoint configuration, handed to the container as environment
# variables; numeric values are serialised to strings with json.dumps.
# NOTE(review): MAX_NEW_TOKENS, TEMPERATURE and TOP_P do not look like server
# launch options -- sampling settings are sent per request in the `parameters`
# payload. Confirm whether the container actually reads them.
config = {
  'HF_MODEL_ID': "m42-health/med42-70b",  # model_id from hf.co/models
  'SM_NUM_GPUS': json.dumps(number_of_gpu),  # Number of GPU used per replica
  'MAX_INPUT_LENGTH': json.dumps(4096),  # Max length of input text
  'MAX_TOTAL_TOKENS': json.dumps(8192),  # Max length of the generation (including input text)
  'MAX_NEW_TOKENS': json.dumps(2048),  # Ensure enough generation tokens
  'MAX_BATCH_TOTAL_TOKENS': json.dumps(16384),  # Limits the number of tokens processed in parallel during generation
  'TEMPERATURE': json.dumps(0.7),
  'TOP_P': json.dumps(0.9),
  'LOG_LEVEL': "debug",
  'HF_USE_SAFETENSORS': "true"
}

# Never hardcode the Hub token in the notebook: read it from the environment
# and only forward it to the container when it is actually set.
# NOTE(review): any token that was previously pasted into this cell should be
# treated as compromised and revoked.
hf_hub_token = os.environ.get("HUGGING_FACE_HUB_TOKEN")
if hf_hub_token:
    config['HUGGING_FACE_HUB_TOKEN'] = hf_hub_token

# check that no placeholder token was left in place; .get() avoids a KeyError
# when no token is configured at all
assert config.get('HUGGING_FACE_HUB_TOKEN') != "<REPLACE WITH YOUR TOKEN>", "Please set your Hugging Face Hub token"

# create HuggingFaceModel with the image uri
llm_model = HuggingFaceModel(
  role=role,
  image_uri=llm_image,
  env=config
)

In [6]:
# Deploy the model to a single-instance endpoint. The container startup health
# check waits up to `health_check_timeout` seconds for the model to load.
llm = llm_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    container_startup_health_check_timeout=health_check_timeout,
)

-----------------------------------------!

In [6]:
!pip install gradio --quiet

In [7]:
import gradio as gr
import boto3
import json
import io

# Generation settings sent along with every request to the endpoint.
parameters = dict(
    do_sample=True,
    top_p=0.9,
    temperature=0.8,
    top_k=50,
    max_new_tokens=512,
    repetition_penalty=1.1,
    stop=["</s>"],
)

# System instructions describing the VitalChat assistant persona.
# NOTE(review): the text asks for "five" follow-up questions and then refers to
# "the three followup questions" -- confirm which number is intended.
system_prompt = """
        You are VitalChat, a helpful medical assistant specializing in
        healthcare-related questions. Your goal is to collect enough information
        from the user about their symptom(s) before providing insights.If a
        user's input is vague or lacks details, ask two or three clarifying
        questions before proceeding.Once you have enough context, generate five
        follow-up questions to gather more information.After the three followup
        questions create a summary and advice on next steps
"""


# Helper for reading lines from a stream
class LineIterator:
    """
    Iterate over newline-delimited lines contained in a stream of event dicts.

    Each event is expected to be a mapping; events that carry a
    ``"PayloadPart"`` entry contribute ``event["PayloadPart"]["Bytes"]`` to an
    internal byte buffer, and complete lines are handed out one at a time
    (as ``bytes``, without the trailing newline). Events without
    ``"PayloadPart"`` are reported and skipped.
    """

    def __init__(self, stream):
        """Wrap ``stream``, an iterable of event dicts."""
        self.byte_iterator = iter(stream)
        self.buffer = io.BytesIO()
        # Offset in the buffer of the first byte not yet returned to the caller.
        self.read_pos = 0

    def __iter__(self):
        return self

    def __next__(self):
        """
        Return the next line as bytes (newline stripped).

        Raises:
            StopIteration: when the stream is exhausted and all buffered
                data has been returned.
        """
        while True:
            self.buffer.seek(self.read_pos)
            line = self.buffer.readline()
            if line.endswith(b"\n"):
                self.read_pos += len(line)
                return line[:-1]
            try:
                chunk = next(self.byte_iterator)
            except StopIteration:
                # The stream ended. Anything left in the buffer is a final line
                # without a trailing newline: return it once instead of
                # retrying forever on an exhausted iterator.
                if line:
                    self.read_pos += len(line)
                    return line
                raise
            if "PayloadPart" not in chunk:
                # chunk is a dict, so it must be formatted rather than
                # concatenated to the message string.
                print(f"Unknown event type: {chunk}")
                continue
            # Append the new payload at the end; the next loop pass re-reads
            # from read_pos, so a line split across events is joined.
            self.buffer.seek(0, io.SEEK_END)
            self.buffer.write(chunk["PayloadPart"]["Bytes"])


# define format function for our input
def format_prompt(user_input, history, system_prompt):
    """
    Build the text prompt sent to the model from the system prompt, the prior
    conversation turns and the new user message.

    Args:
        user_input: The new user message.
        history: Earlier turns, either as ``(user_text, bot_response)`` pairs
            of strings or as ``{"role": ..., "content": ...}`` dicts whose
            role is ``"user"`` or ``"assistant"``. Anything that is not a list
            is treated as an empty history. If any entry is invalid, a warning
            is printed and the whole history is dropped.
        system_prompt: System instructions. When empty or None the system
            section is left out instead of rendering the text "None".

    Returns:
        The prompt string, ending with the assistant marker lines.
    """
    # NOTE(review): the tag layout below is kept as written; confirm it against
    # the prompt format documented for the deployed model.
    segments = []
    if system_prompt:
        segments.append(f"<|system|>\n{system_prompt}\n<|system|>\n\n")

    if not isinstance(history, list):
        history = []

    # Normalise both supported history layouts to (role, text) turns.
    turns = []
    for entry in history:
        if isinstance(entry, dict):
            role, content = entry.get("role"), entry.get("content")
            valid = role in ("user", "assistant") and isinstance(content, str)
            new_turns = [(role, content)] if valid else []
        else:
            valid = (
                isinstance(entry, (list, tuple))
                and len(entry) == 2
                and all(isinstance(x, str) for x in entry)
            )
            new_turns = [("user", entry[0]), ("assistant", entry[1])] if valid else []
        if not valid:
            print(f"⚠️ Invalid history entry: {entry}, resetting history.")
            # Actually discard everything collected so far, as the message says.
            turns = []
            break
        turns.extend(new_turns)

    # Each turn is wrapped in the tag that matches its role.
    tag_for_role = {"user": "<|prompter|>", "assistant": "<|assistant|>"}
    for role, text in turns:
        tag = tag_for_role[role]
        segments.append(f"{tag}\n{text}\n{tag}\n")

    # Append the new user input followed by the assistant marker.
    segments.append(f"<|prompter|>\n{user_input}\n<|prompter|>\n")
    segments.append("<|assistant|>\n\n<|assistant|>\n")

    return "".join(segments)

def create_gradio_app(
    endpoint_name,
    session=boto3,
    parameters=parameters,
    system_prompt=system_prompt,
    format_prompt=format_prompt,
    concurrency_count=4,
    share=True,
):
    """
    Launch a Gradio chat UI that streams answers from a SageMaker endpoint.

    Args:
        endpoint_name: Name of the endpoint to invoke.
        session: Object providing ``client("sagemaker-runtime")``; defaults to
            the ``boto3`` module.
        parameters: Generation parameters sent with every request. An optional
            ``"stop"`` list of strings ends the streamed answer.
        system_prompt: System instructions passed to ``format_prompt``.
        format_prompt: Callable ``(user_input, history, system_prompt)`` that
            returns the prompt string.
        concurrency_count: Currently unused; kept so existing callers keep
            working.
        share: Forwarded to ``launch(share=...)``.
    """
    smr = session.client("sagemaker-runtime")
    data_prefix = "data:"

    def generate(
        prompt,
        history,
    ):
        """Yield the growing answer text as tokens arrive from the endpoint."""
        formatted_prompt = format_prompt(prompt, history, system_prompt)

        request = {"inputs": formatted_prompt, "parameters": parameters, "stream": True}
        resp = smr.invoke_endpoint_with_response_stream(
            EndpointName=endpoint_name,
            Body=json.dumps(request),
            ContentType="application/json",
        )

        # Tolerate parameter sets that define no stop sequences.
        stop_sequences = request["parameters"].get("stop") or []

        output = ""
        for raw_line in LineIterator(resp["Body"]):
            line = raw_line.decode("utf-8")
            if not line.startswith(data_prefix):
                continue
            # Slice the prefix off: str.lstrip/rstrip strip *character sets*,
            # not a prefix/suffix, and could eat into the payload.
            chunk = json.loads(line[len(data_prefix):])
            token = chunk["token"]
            if token["special"]:
                continue
            if token["text"] in stop_sequences:
                break
            output += token["text"]

            # Once the text ends with a stop string, emit the trimmed answer
            # a single time and stop streaming.
            matched_stop = next(
                (s for s in stop_sequences if s and output.endswith(s)), None
            )
            if matched_stop is not None:
                yield output[: -len(matched_stop)].rstrip()
                break

            yield output

    demo = gr.ChatInterface(generate, title="Chat with Vital Story", chatbot=gr.Chatbot(layout="panel"))
    demo.queue().launch(share=share)

# create gradio app
create_gradio_app(
    llm.endpoint_name,
    session=sess.boto_session,
    parameters=parameters,
    # Use the VitalChat system prompt defined earlier in this cell instead of
    # None, so the assistant instructions actually reach the model.
    system_prompt=system_prompt,
    format_prompt=format_prompt,
    concurrency_count=4,
    share=True,
)

  demo = gr.ChatInterface(generate, title="Chat with Vital Story", chatbot=gr.Chatbot(layout="panel"))


* Running on local URL:  http://127.0.0.1:7860


* Running on public URL: https://95411e37445bac10b3.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
