# Open Source LLM serving using the Azure ML Python SDK

[Note] Please use `Python 3.10 - SDK v2 (azureml_py310_sdkv2)` conda environment.


In [1]:
%load_ext autoreload
%autoreload 2

import os, sys
lab_prep_dir = os.getcwd().split("SLMWorkshopCN")[0] + "SLMWorkshopCN/0_lab_preparation"
sys.path.append(os.path.abspath(lab_prep_dir))

from common import check_kernel
check_kernel()

Kernel: python31014jvsc74a57bd01f90a0206bde5cf3732dab79adbbcc7570d5fab64b89fc69d46a8fe33664a709


In [1]:
# %store -r job_name
job_name = "happy_gold_q5lypvqkw5	" # 这里是手动指定job_name。上一行是使用在1_training_custom_phi3.ipynb或1_training_mlflow_phi3.ipynb中保存的job_name
try:
    job_name
except NameError:
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
    print("[ERROR] Please run the previous notebook (model training) again.")
    print("++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++")
print(job_name)#在训练Notebook中要把之前训练模型的job名字记录下来，这里有用

happy_gold_q5lypvqkw5	


## 1. Load config file

---


In [2]:
import os
import yaml
from logger import logger
from datetime import datetime
snapshot_date = datetime.now().strftime("%Y-%m-%d")

with open('config_prd.yml') as f:
    d = yaml.load(f, Loader=yaml.FullLoader)
    
AZURE_SUBSCRIPTION_ID = d['config']['AZURE_SUBSCRIPTION_ID']
AZURE_RESOURCE_GROUP = d['config']['AZURE_RESOURCE_GROUP']
AZURE_WORKSPACE = d['config']['AZURE_WORKSPACE']
AZURE_DATA_NAME = d['config']['AZURE_DATA_NAME']    
DATA_DIR = d['config']['DATA_DIR']
CLOUD_DIR = d['config']['CLOUD_DIR']
HF_MODEL_NAME_OR_PATH = d['config']['HF_MODEL_NAME_OR_PATH']

azure_env_name = d['serve']['azure_env_name']
azure_model_name = d['serve']['azure_model_name']
azure_endpoint_name = d['serve']['azure_endpoint_name']
azure_deployment_name = d['serve']['azure_deployment_name']
azure_serving_cluster_size = d['serve']['azure_serving_cluster_size']


logger.info("===== 0. Azure ML Deployment Info =====")
logger.info(f"AZURE_SUBSCRIPTION_ID={AZURE_SUBSCRIPTION_ID}")
logger.info(f"AZURE_RESOURCE_GROUP={AZURE_RESOURCE_GROUP}")
logger.info(f"AZURE_WORKSPACE={AZURE_WORKSPACE}")
logger.info(f"AZURE_DATA_NAME={AZURE_DATA_NAME}")
logger.info(f"DATA_DIR={DATA_DIR}")
logger.info(f"CLOUD_DIR={CLOUD_DIR}")
logger.info(f"HF_MODEL_NAME_OR_PATH={HF_MODEL_NAME_OR_PATH}")

logger.info(f"azure_env_name={azure_env_name}")
logger.info(f"azure_model_name={azure_model_name}")
logger.info(f"azure_endpoint_name={azure_endpoint_name}")
logger.info(f"azure_deployment_name={azure_deployment_name}")
logger.info(f"azure_serving_cluster_size={azure_serving_cluster_size}")

2025-02-26 22:28:25,704 - logger - INFO - ===== 0. Azure ML Deployment Info =====
2025-02-26 22:28:25,706 - logger - INFO - AZURE_SUBSCRIPTION_ID=49aee8bf-3f02-464f-a0ba-e3467e7d85e2
2025-02-26 22:28:25,707 - logger - INFO - AZURE_RESOURCE_GROUP=rg-slmwrkshp_9
2025-02-26 22:28:25,709 - logger - INFO - AZURE_WORKSPACE=mlw-pgwgybluulpec
2025-02-26 22:28:25,710 - logger - INFO - AZURE_DATA_NAME=lgds-gsm8k-main-demo
2025-02-26 22:28:25,711 - logger - INFO - DATA_DIR=./dataset
2025-02-26 22:28:25,712 - logger - INFO - CLOUD_DIR=./cloud
2025-02-26 22:28:25,713 - logger - INFO - HF_MODEL_NAME_OR_PATH=microsoft/phi-4
2025-02-26 22:28:25,714 - logger - INFO - azure_env_name=llm-srv-2024-11-05
2025-02-26 22:28:25,714 - logger - INFO - azure_model_name=phi4-grpo-2024-11-05
2025-02-26 22:28:25,715 - logger - INFO - azure_endpoint_name=phi4-endpoint-2024-11-05
2025-02-26 22:28:25,716 - logger - INFO - azure_deployment_name=phi4-blue
2025-02-26 22:28:25,717 - logger - INFO - azure_serving_cluster_si

<br>

## 2. Serving preparation

---

### 2.1. Configure workspace details

To connect to a workspace, we need identifying parameters - a subscription, a resource group, and a workspace name. We will use these details in the MLClient from azure.ai.ml to get a handle on the Azure Machine Learning workspace we need. We will use the default Azure authentication for this hands-on.


In [3]:
# import required libraries
import time
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient, Input
from azure.ai.ml import command
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes
from azure.core.exceptions import ResourceNotFoundError, ResourceExistsError

logger.info(f"===== 2. Serving preparation =====")
logger.info(f"Calling DefaultAzureCredential.")
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential, AZURE_SUBSCRIPTION_ID, AZURE_RESOURCE_GROUP, AZURE_WORKSPACE
) # 创建AML workspace client, 其实一个AIF ai prj就对应了一个AML wrkspac
print(ml_client)

2025-02-26 22:28:37,249 - logger - INFO - ===== 2. Serving preparation =====
2025-02-26 22:28:37,250 - logger - INFO - Calling DefaultAzureCredential.


MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7fb14415b670>,
         subscription_id=49aee8bf-3f02-464f-a0ba-e3467e7d85e2,
         resource_group_name=rg-slmwrkshp_9,
         workspace_name=mlw-pgwgybluulpec)


### 2.2. Create model asset


In [6]:
def get_or_create_model_asset(ml_client, model_name, job_name, model_dir="outputs", model_type="custom_model", update=False):
    
    try:
        latest_model_version = max([int(m.version) for m in ml_client.models.list(name=model_name)])
        if update:
            raise ResourceExistsError('Found Model asset, but will update the Model.')
        else:
            model_asset = ml_client.models.get(name=model_name, version=latest_model_version)
            logger.info(f"Found Model asset: {model_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        logger.info(f"Exception: {e}")        
        model_path = f"azureml://jobs/{job_name}/outputs/artifacts/paths/{model_dir}/"    # 从训练job的输出目录拿到模型权重
        run_model = Model(
            name=model_name,        
            path=model_path,
            description="Model created from run.",
            type=model_type # mlflow_model, custom_model, triton_model
        )
        model_asset = ml_client.models.create_or_update(run_model)#AIF/ai prj/Models+Endpoints但只有deployment和endpoint而无权重; AML/wrkspc/Models有权重文件，该代码就是注册模型到这里。
        logger.info(f"Created Model asset: {model_name}")

    return model_asset

In [7]:
model_dir = d['train']['model_dir']
model = get_or_create_model_asset(ml_client, azure_model_name, job_name, model_dir, model_type="custom_model", update=False)
print(model) #AIF/ai prj/Models+Endpoints但只有deployment和endpoint而无权重; AML/wrkspc/Models有权重文件，该代码就是注册模型到这里。

2025-02-26 22:29:18,372 - logger - INFO - Found Model asset: phi4-grpo-2024-11-05. Will not create again


creation_context:
  created_at: '2025-02-26T13:18:56.848645+00:00'
  created_by: Gang Luo
  created_by_type: User
  last_modified_at: '2025-02-26T13:18:56.848645+00:00'
  last_modified_by: Gang Luo
  last_modified_by_type: User
description: Model created from run.
id: azureml:/subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourceGroups/rg-slmwrkshp_9/providers/Microsoft.MachineLearningServices/workspaces/mlw-pgwgybluulpec/models/phi4-grpo-2024-11-05/versions/1
job_name: happy_gold_q5lypvqkw5
name: phi4-grpo-2024-11-05
path: azureml://subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourceGroups/rg-slmwrkshp_9/workspaces/mlw-pgwgybluulpec/datastores/workspaceartifactstore/paths/ExperimentRun/dcid.happy_gold_q5lypvqkw5/outputs
properties: {}
stage: Development
tags: {}
type: custom_model
version: '1'



### 2.3. Create AzureML environment

Azure ML defines containers (called environment asset) in which your code will run. We can use the built-in environment or build a custom environment (Docker container, conda). This hands-on uses Docker container.


#### Docker environment


In [8]:
%%writefile {CLOUD_DIR}/serve/Dockerfile
FROM mcr.microsoft.com/aifx/acpt/stable-ubuntu2004-cu124-py310-torch241:biweekly.202410.2

# Install pip dependencies
COPY requirements.txt .
RUN pip install -r requirements.txt --no-cache-dir

# Inference requirements
COPY --from=mcr.microsoft.com/azureml/o16n-base/python-assets:20230419.v1 /artifacts /var/

RUN /var/requirements/install_system_requirements.sh && \
    cp /var/configuration/rsyslog.conf /etc/rsyslog.conf && \
    cp /var/configuration/nginx.conf /etc/nginx/sites-available/app && \
    ln -sf /etc/nginx/sites-available/app /etc/nginx/sites-enabled/app && \
    rm -f /etc/nginx/sites-enabled/default
ENV SVDIR=/var/runit
ENV WORKER_TIMEOUT=400
EXPOSE 5001 8883 8888

# support Deepspeed launcher requirement of passwordless ssh login
RUN apt-get update
RUN apt-get install -y openssh-server openssh-client

RUN pip install vllm

Overwriting ./cloud/serve/Dockerfile


In [21]:
%%writefile {CLOUD_DIR}/serve/requirements.txt
azureml-mlflow==1.58.0
accelerate==1.4.0
beautifulsoup4==4.13.3
bitsandbytes==0.45.3
datasets==3.3.2
deepspeed==0.15.4
huggingface_hub==0.29.1
latex2sympy2_extended==1.0.6
Markdown==3.7
math_verify==0.5.2
mlflow_skinny==2.15.0
numpy~=1.23.5
openai==1.64.0
packaging==24.2
pandas==2.2.3
peft==0.14.0
python-dotenv==1.0.1
safetensors==0.5.2
torch==2.5.1
tqdm==4.66.4
transformers==4.48.2
trl==0.15.1
unsloth==2025.2.15
unsloth_zoo==2025.2.7
wandb==0.19.7
azureml-sdk==1.58.0
azureml-core==1.58.0
azureml-dataset-runtime==1.58.0
azureml-defaults==1.58.0
azureml-contrib-services==1.58.0
azureml-inference-server-http~=1.3

Overwriting ./cloud/serve/requirements.txt


In [22]:
from azure.ai.ml.entities import Environment, BuildContext

def get_or_create_docker_environment_asset(ml_client, env_name, docker_dir, update=False):
    
    try:
        latest_env_version = max([int(e.version) for e in ml_client.environments.list(name=env_name)])
        if update:
            raise ResourceExistsError('Found Environment asset, but will update the Environment.')
        else:
            env_asset = ml_client.environments.get(name=env_name, version=latest_env_version)
            logger.info(f"Found Environment asset: {env_name}. Will not create again")
    except (ResourceNotFoundError, ResourceExistsError) as e:
        logger.info(f"Exception: {e}")
        env_docker_image = Environment(
            build=BuildContext(path=docker_dir),
            name=env_name,
            description="Environment created from a Docker context.",
        )
        env_asset = ml_client.environments.create_or_update(env_docker_image)
        logger.info(f"Created Environment asset: {env_name}")
    
    return env_asset

env = get_or_create_docker_environment_asset(ml_client, azure_env_name, f"{CLOUD_DIR}/serve", update=True)
print(env)

2025-02-26 23:27:48,416 - logger - INFO - Exception: Found Environment asset, but will update the Environment.
[32mUploading serve (0.0 MBs): 100%|██████████| 1470/1470 [00:01<00:00, 1430.64it/s]
[39m

2025-02-26 23:33:08,390 - logger - INFO - Created Environment asset: llm-srv-2024-11-05


build:
  dockerfile_path: Dockerfile
  path: https://stpgwgybluulpec.blob.core.windows.net/azureml-blobstore-f7c27ee9-fb96-407c-9b8f-a5c76209316e/LocalUpload/30aa0c7a6287fe6fbc127148904fda5a/serve/
creation_context:
  created_at: '2025-02-26T15:33:02.406134+00:00'
  created_by: Gang Luo
  created_by_type: User
  last_modified_at: '2025-02-26T15:33:02.406134+00:00'
  last_modified_by: Gang Luo
  last_modified_by_type: User
description: Environment created from a Docker context.
id: azureml:/subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourceGroups/rg-slmwrkshp_9/providers/Microsoft.MachineLearningServices/workspaces/mlw-pgwgybluulpec/environments/llm-srv-2024-11-05/versions/2
name: llm-srv-2024-11-05
os_type: linux
tags: {}
version: '2'



### 2.4. Serving script

If you are not serving with MLflow but with a custom model, you are free to write your own code.The `score.py` example below shows how to write the code.

-   `init()`: This function is the place to write logic for global initialization operations like loading the LLM model.
-   `run()`: Inference logic is called for every invocation of the endpoint.


In [23]:
%%writefile src_serve/score.py
import os
import logging
import json
import torch
from transformers import pipeline
from unsloth import FastLanguageModel

def init():
    """
    This function is called when the container is initialized/started, typically after create/update of the deployment.
    You can write the logic here to perform init operations like caching the model in memory
    """
    global model
    global tokenizer
    # AZUREML_MODEL_DIR is an environment variable created during deployment.
    # It is the path to the model folder (./azureml-models/$MODEL_NAME/$VERSION)
    # Please provide your model's folder name if there is one
    model_path = os.path.join(
        os.getenv("AZUREML_MODEL_DIR"), "{{score_model_dir}}"
    )
    model_id = "{{hf_model_name_or_path}}"
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = model_id, # Load up `Phi-4 14B`, and set parameters
        # max_seq_length = max_seq_length,
        load_in_4bit = True, # False for LoRA 16bit
        fast_inference = True, # Enable vLLM fast inference
    )
    FastLanguageModel.for_inference(model) 

    model.load_adapter(model_path)
    logging.info("Loaded model.")
    
def run(json_data: str):
    logging.info("Request received")
    data = json.loads(json_data)
    input_data= data["input_data"]
    params = data['params']
    
    pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)
    output = pipe(input_data, **params)
    generated_text = output[0]['generated_text']
    logging.info("Output Response: " + generated_text)
    json_result = {"result": str(generated_text)}
    
    return json_result

Overwriting src_serve/score.py


Plug in the appropriate variables in the model inference script.


In [24]:
import jinja2
from pathlib import Path
TRAINED_MLFLOW = False

jinja_env = jinja2.Environment()  

template = jinja_env.from_string(Path("src_serve/score.py").open().read())
score_model_dir = model_dir #os.path.join(model_dir, "peft") if TRAINED_MLFLOW else model_dir    

Path("src_serve/score.py").open("w").write(
    template.render(score_model_dir=score_model_dir, hf_model_name_or_path=HF_MODEL_NAME_OR_PATH)
)

!pygmentize src_serve/score.py | cat -n

     1	[34mimport[39;49;00m[37m [39;49;00m[04m[36mos[39;49;00m[37m[39;49;00m
     2	[34mimport[39;49;00m[37m [39;49;00m[04m[36mlogging[39;49;00m[37m[39;49;00m
     3	[34mimport[39;49;00m[37m [39;49;00m[04m[36mjson[39;49;00m[37m[39;49;00m
     4	[34mimport[39;49;00m[37m [39;49;00m[04m[36mtorch[39;49;00m[37m[39;49;00m
     5	[34mfrom[39;49;00m[37m [39;49;00m[04m[36mtransformers[39;49;00m[37m [39;49;00m[34mimport[39;49;00m pipeline[37m[39;49;00m
     6	[34mfrom[39;49;00m[37m [39;49;00m[04m[36munsloth[39;49;00m[37m [39;49;00m[34mimport[39;49;00m FastLanguageModel[37m[39;49;00m
     7	[37m[39;49;00m
     8	[34mdef[39;49;00m[37m [39;49;00m[32minit[39;49;00m():[37m[39;49;00m
     9	[37m    [39;49;00m[33m"""[39;49;00m
    10	[33m    This function is called when the container is initialized/started, typically after create/update of the deployment.[39;49;00m
    11	[33m    You can write the logic here to perform init 

<br>

## 3. Serving

---

### 3.1. Create endpoint

Create an endpoint. This process does not provision a GPU cluster yet.


In [25]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    IdentityConfiguration,
    ManagedIdentityConfiguration,
)

logger.info(f"===== 3. Serving =====")

t0 = time.time()

# Check if the endpoint already exists in the workspace
try:
    endpoint = ml_client.online_endpoints.get(azure_endpoint_name)
    logger.info("---Endpoint already exists---")
except:
    # Create an online endpoint if it doesn't exist

    # Define the endpoint
    endpoint = ManagedOnlineEndpoint(
        name=azure_endpoint_name,
        description=f"Test endpoint for {model.name}",
        # identity=IdentityConfiguration(
        #     type="user_assigned",
        #     user_assigned_identities=[ManagedIdentityConfiguration(resource_id=uai_id)],
        # )
        # if uai_id != ""
        # else None,
    )

# Trigger the endpoint creation
try:
    ml_client.begin_create_or_update(endpoint).wait() #AIF/ai prj/Models+Endpoints但只有deployment和endpoint而无权重; AML/wrkspc/Endpoints
    logger.info("\n---Endpoint created successfully---\n")
except Exception as err:
    raise RuntimeError(
        f"Endpoint creation failed. Detailed Response:\n{err}"
    ) from err
    
t1 = time.time()

from humanfriendly import format_timespan
timespan = format_timespan(t1 - t0)
logger.info(f"Creating Endpoint took {timespan}")    
print(endpoint)

2025-02-26 23:33:09,115 - logger - INFO - ===== 3. Serving =====
2025-02-26 23:33:11,439 - logger - INFO - ---Endpoint already exists---
Readonly attribute principal_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
Readonly attribute tenant_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
2025-02-26 23:34:52,032 - logger - INFO - 
---Endpoint created successfully---

2025-02-26 23:34:52,034 - logger - INFO - Creating Endpoint took 1 minute and 42.92 seconds


auth_mode: key
description: Test endpoint for phi4-grpo-2024-11-05
id: /subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourceGroups/rg-slmwrkshp_9/providers/Microsoft.MachineLearningServices/workspaces/mlw-pgwgybluulpec/onlineEndpoints/phi4-endpoint-2024-11-05
identity:
  principal_id: b2583326-5bcf-41c9-a0a9-161f06256881
  tenant_id: 16b3c013-d300-468d-ac64-7eda0820b6d3
  type: system_assigned
kind: Managed
location: eastus
mirror_traffic: {}
name: phi4-endpoint-2024-11-05
openapi_uri: https://phi4-endpoint-2024-11-05.eastus.inference.ml.azure.com/swagger.json
properties:
  AzureAsyncOperationUri: https://management.azure.com/subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/providers/Microsoft.MachineLearningServices/locations/eastus/mfeOperationsStatus/oeidp:f7c27ee9-fb96-407c-9b8f-a5c76209316e:4a7848b9-ce91-4b3a-aebf-831f4ea38e45?api-version=2022-02-01-preview
  azureml.onlineendpointid: /subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourcegroups/rg-slmwrkshp_9/provid

### 3.2. Create Deployment

Create a Deployment. This takes a lot of time as GPU clusters must be provisioned and the serving environment must be built.


In [29]:
%%time
import time
from azure.ai.ml.entities import (    
    OnlineRequestSettings,
    CodeConfiguration,
    ManagedOnlineDeployment,
    ProbeSettings,
    Environment
)

t0 = time.time()
deployment = ManagedOnlineDeployment(
    name=azure_deployment_name,
    endpoint_name=azure_endpoint_name,
    model=model,
    instance_type=azure_serving_cluster_size, # 这里只用了cluster size而没用cluster name，所以它占用的是servless instance
    instance_count=1,
    #code_configuration=code_configuration,
    environment=env,
    scoring_script="score.py",
    code_path="./src_serve",
    #environment_variables=deployment_env_vars,
    request_settings=OnlineRequestSettings(
        max_concurrent_requests_per_instance=3,
        request_timeout_ms=90000, 
        max_queue_wait_ms=60000
    ),
    liveness_probe=ProbeSettings(
        failure_threshold=5,
        success_threshold=1,
        timeout=10,
        period=90,
        initial_delay=500,
    ),
    readiness_probe=ProbeSettings(
        failure_threshold=3,
        success_threshold=1,
        timeout=10,
        period=30,
        initial_delay=30,
    ),
)

# Trigger the deployment creation
try:
    ml_client.begin_create_or_update(deployment).wait()# 在AIF/ai prj/models+endpoints中Model Depolyment标签页; AML/wrkspce/Enpoints选择一个endpoint的Detail标签页右侧就有该ep所关联的所有deployment的信息，在页面创建deployment也是这里
    logger.info("\n---Deployment created successfully---\n")
except Exception as err:
    raise RuntimeError(
        f"Deployment creation failed. Detailed Response:\n{err}"
    ) from err
    
endpoint.traffic = {azure_deployment_name: 100}
endpoint_poller = ml_client.online_endpoints.begin_create_or_update(endpoint)

t1 = time.time()
timespan = format_timespan(t1 - t0)
logger.info(f"Creating deployment took {timespan}")

print(f"--endpoint--: \n{endpoint}")
print(f"--deployment--: \n{deployment}")
print(f"--endpoint_poller--: \n{endpoint_poller}")

Check: endpoint phi4-endpoint-2024-11-05 exists


........................................................................................

2025-02-27 08:41:48,955 - logger - INFO - 
---Deployment created successfully---

Readonly attribute principal_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
Readonly attribute tenant_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
2025-02-27 08:41:56,996 - logger - INFO - Creating deployment took 8 minutes and 40.31 seconds


--endpoint--: 
auth_mode: key
description: Test endpoint for phi4-grpo-2024-11-05
id: /subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourceGroups/rg-slmwrkshp_9/providers/Microsoft.MachineLearningServices/workspaces/mlw-pgwgybluulpec/onlineEndpoints/phi4-endpoint-2024-11-05
identity:
  principal_id: b2583326-5bcf-41c9-a0a9-161f06256881
  tenant_id: 16b3c013-d300-468d-ac64-7eda0820b6d3
  type: system_assigned
kind: Managed
location: eastus
mirror_traffic: {}
name: phi4-endpoint-2024-11-05
openapi_uri: https://phi4-endpoint-2024-11-05.eastus.inference.ml.azure.com/swagger.json
properties:
  AzureAsyncOperationUri: https://management.azure.com/subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/providers/Microsoft.MachineLearningServices/locations/eastus/mfeOperationsStatus/oeidp:f7c27ee9-fb96-407c-9b8f-a5c76209316e:4a7848b9-ce91-4b3a-aebf-831f4ea38e45?api-version=2022-02-01-preview
  azureml.onlineendpointid: /subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourcegroups/rg-slm

In [30]:
print(endpoint_poller.result())

auth_mode: key
description: Test endpoint for phi4-grpo-2024-11-05
id: /subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourceGroups/rg-slmwrkshp_9/providers/Microsoft.MachineLearningServices/workspaces/mlw-pgwgybluulpec/onlineEndpoints/phi4-endpoint-2024-11-05
identity:
  principal_id: b2583326-5bcf-41c9-a0a9-161f06256881
  tenant_id: 16b3c013-d300-468d-ac64-7eda0820b6d3
  type: system_assigned
kind: Managed
location: eastus
mirror_traffic: {}
name: phi4-endpoint-2024-11-05
openapi_uri: https://phi4-endpoint-2024-11-05.eastus.inference.ml.azure.com/swagger.json
properties:
  AzureAsyncOperationUri: https://management.azure.com/subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/providers/Microsoft.MachineLearningServices/locations/eastus/mfeOperationsStatus/oeidp:f7c27ee9-fb96-407c-9b8f-a5c76209316e:86cc5685-6117-4adf-ab9a-64bf0786f47d?api-version=2022-02-01-preview
  azureml.onlineendpointid: /subscriptions/49aee8bf-3f02-464f-a0ba-e3467e7d85e2/resourcegroups/rg-slmwrkshp_9/provid

<br>

## 4. Test

---

### 4.1. Invocation

Try calling the endpoint.


In [31]:
import os
import json

sample = {
    "input_data": 
        [
            {"role": "user", "content": "Tell me Microsoft's brief history."},
            {"role": "assistant", "content": "Microsoft was founded by Bill Gates and Paul Allen on April 4, 1975, to develop and sell a BASIC interpreter for the Altair 8800."},
            {"role": "user", "content": "What about Azure's history?"}
        ],
    "params": {
        "temperature": 0.1,
        "max_new_tokens": 128,
        "do_sample": True,
        "return_full_text": False
    }
}

test_src_dir = "./phi3-inference-test"
os.makedirs(test_src_dir, exist_ok=True)
logger.info(f"Test script directory: {test_src_dir}")
sample_data_path = os.path.join(test_src_dir, "sample-request.json")

with open(sample_data_path, "w") as f:
    json.dump(sample, f)

2025-02-27 08:43:58,804 - logger - INFO - Test script directory: ./phi3-inference-test


In [32]:
result = ml_client.online_endpoints.invoke(
    endpoint_name=azure_endpoint_name,
    deployment_name=azure_deployment_name,
    request_file=sample_data_path,
)

result_json = json.loads(result)
print(result_json['result'])
print("~~~~~~~~~~~~~" * 20)
print(result_json)

Microsoft Azure, Microsoft's cloud computing platform, has a rich history that reflects the evolution of cloud services. Here's a brief overview:

1. **Early Development (2008-2009):** 
   - Microsoft began developing Azure in 2008, initially under the code name "Project Red Dog." The goal was to create a cloud platform that could compete with Amazon Web Services (AWS), which had launched in 2006.
   - In 2009, Microsoft announced the Windows Azure platform, which was designed to provide a range of cloud services, including infrastructure as a service (IaaS) and platform as a service (
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
{'result': 'Microsoft Azure, Microsoft\'s cloud computing platform, has a rich history that reflects the evolution of cloud services. Here\'s a br

### 4.2. LLM latency/throughput benchmarking


In [33]:
import numpy as np
from time import perf_counter

def benchmark_latency(endpoint_name, deployment_name, sample_data_path, num_warmups=1, num_infers=5):
    print(f"Measuring latency for Endpoint '{endpoint_name}' and Deployment '{deployment_name}', num_infers={num_infers}")

    latencies = []
    # warm up
    for _ in range(num_warmups):
        result = ml_client.online_endpoints.invoke(
            endpoint_name=endpoint_name,
            deployment_name=deployment_name,
            request_file=sample_data_path,
        ) 
        
    begin = time.time()        
    # Timed run
    for _ in range(num_infers):
        start_time = perf_counter()
        result = ml_client.online_endpoints.invoke(
            endpoint_name=endpoint_name,
            deployment_name=deployment_name,
            request_file=sample_data_path,
        )
        latency = perf_counter() - start_time
        latencies.append(latency)
    end = time.time() 
        
    # Compute run statistics
    duration = end - begin    
    time_avg_sec = np.mean(latencies)
    time_std_sec = np.std(latencies)
    time_p95_sec = np.percentile(latencies, 95)
    time_p99_sec = np.percentile(latencies, 99)
    
    # Metrics
    metrics = {
        'duration': duration,
        'avg_sec': time_avg_sec,
        'std_sec': time_std_sec,        
        'p95_sec': time_p95_sec,
        'p99_sec': time_p99_sec    
    }
    
    return metrics

def benchmark_latency_multicore(endpoint_name, deployment_name, sample_data_path, num_warmups=1, num_infers=5, num_threads=2):
    import time
    import concurrent.futures

    # Warmup
    for _ in range(num_warmups):
        result = ml_client.online_endpoints.invoke(
            endpoint_name=endpoint_name,
            deployment_name=deployment_name,
            request_file=sample_data_path,
        )        
                
    latencies = []

    # Thread task: Each of these thread tasks executes in a serial loop for a single model.
    #              Multiple of these threads are launched to achieve parallelism.
    def task(model):
        for _ in range(num_infers):
            start = time.time()
            result = ml_client.online_endpoints.invoke(
                endpoint_name=endpoint_name,
                deployment_name=deployment_name,
                request_file=sample_data_path,
            )   
            finish = time.time()
            latencies.append(finish - start)
            
    # Submit tasks
    begin = time.time()
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
        for i in range(num_threads):
            pool.submit(task, model)
    end = time.time()

    # Compute metrics
    duration = end - begin
    inferences = len(latencies)
    throughput = inferences / duration
    avg_latency = sum(latencies) / len(latencies)
    
    # Compute run statistics
    time_avg_sec = np.mean(latencies)
    time_std_sec = np.std(latencies)
    time_p95_sec = np.percentile(latencies, 95)
    time_p99_sec = np.percentile(latencies, 99)
    
    time_std_sec = np.std(latencies)
    time_p95_sec = np.percentile(latencies, 95)
    time_p99_sec = np.percentile(latencies, 99)

    # Metrics
    metrics = {
        'threads': num_threads,
        'duration': duration,
        'throughput': throughput,
        'avg_sec': avg_latency,
        'std_sec': time_std_sec,        
        'p95_sec': time_p95_sec,
        'p99_sec': time_p99_sec    
    }
    
    return metrics

In [34]:
benchmark_result = benchmark_latency(azure_endpoint_name, azure_deployment_name, sample_data_path, num_warmups=1, num_infers=5)

Measuring latency for Endpoint 'phi4-endpoint-2024-11-05' and Deployment 'phi4-blue', num_infers=5


In [35]:
print(benchmark_result)

{'duration': 36.699119329452515, 'avg_sec': 7.560433123994153, 'std_sec': 0.3585406471364, 'p95_sec': 7.946338906389428, 'p99_sec': 7.946960451669293}


## Clean up


In [None]:
!rm -rf {test_src_dir}

In [None]:
# ml_client.online_endpoints.begin_delete(azure_endpoint_name) #不del，后续promptflow要用到