# LOGIN AND FETCH TOKEN

In [2]:
import globus_sdk
import requests
import json

# Globus identifiers and scope
# ============================

# Resource-server side: client ID of the inference gateway and the scope
# built from it. These values are publicly available to users.
vllm_client_id = "681c10cc-f684-4540-bcd7-0b4df3bc26ef"
vllm_scope = f"https://auth.globus.org/scopes/{vllm_client_id}/action_all"

# User side: ID of your own Globus thick client, created (or re-used) at
# https://app.globus.org/settings/developers
my_client_id = "99bc7f15-ebed-4d6b-b783-8f7ae8735135"

# Login flow
# ==========

# Native-app auth client restricted to the vLLM scope defined above
auth_client = globus_sdk.NativeAppAuthClient(my_client_id)
auth_client.oauth2_start_flow(requested_scopes=vllm_scope)

# Show the URL where the user logs in with their Globus account
authorize_url = auth_client.oauth2_get_authorize_url()
print(f"Please go to this URL and login:\n\n{authorize_url}\n")


Please go to this URL and login:

https://auth.globus.org/v2/oauth2/authorize?client_id=99bc7f15-ebed-4d6b-b783-8f7ae8735135&redirect_uri=https%3A%2F%2Fauth.globus.org%2Fv2%2Fweb%2Fauth-code&scope=https%3A%2F%2Fauth.globus.org%2Fscopes%2F681c10cc-f684-4540-bcd7-0b4df3bc26ef%2Faction_all&state=_default&response_type=code&code_challenge=zE_P0JAqC7agqFGOZOkSSDdrnLC13KceVDgWG7j6fh0&code_challenge_method=S256&access_type=online



In [3]:
# Collect access token to vLLM service
# The authorization code comes from the login page opened via the URL printed
# above. It is read interactively rather than hardcoded so that no code is
# committed with the notebook and the cell can be re-run with a fresh one.
auth_code = input("Please enter the authorization code here: ").strip()

# Exchange the code for tokens and keep the one issued for the
# inference-gateway resource server.
token_response = auth_client.oauth2_exchange_code_for_tokens(auth_code)
access_token = token_response.by_resource_server[vllm_client_id]["access_token"]

# LIST ENDPOINTS

In [4]:
# URL to the inference gateway (needs to end with forward slash /)
#url = "https://data-portal-dev-vmw-01.cels.anl.gov/resource_server/polaris/"
url = "http://localhost:8000/resource_server/list-endpoints/"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}
# verify=False only matters for HTTPS targets (e.g. a self-signed dev
# certificate). The timeout prevents the cell from hanging forever if the
# gateway is unreachable.
response = requests.get(url, headers=headers, verify=False, timeout=30)
# Stop on HTTP errors (e.g. an expired token) instead of iterating over an
# error payload as if it were the endpoint list.
response.raise_for_status()
for endpoint in response.json():
    print(endpoint)
          

{'endpoint_url': '/resource_server/polaris/vllm/completions/', 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}
{'endpoint_url': '/resource_server/polaris/vllm/completions/', 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'}
{'endpoint_url': '/resource_server/polaris/vllm/completions/', 'model_name': 'mistralai/Mistral-7B-Instruct-v0.3'}
{'endpoint_url': '/resource_server/polaris/llama-cpp/completions/', 'model_name': 'meta-llama-3-8b-instruct'}
{'endpoint_url': '/resource_server/polaris/llama-cpp/completions/', 'model_name': 'mistral-7b-instruct-v03'}
{'endpoint_url': '/resource_server/polaris/llama-cpp/completions/', 'model_name': 'meta-llama-3-70b-instruct'}


# INFERENCE USING VLLM

In [24]:
# URL to the inference gateway (needs to end with forward slash /)
#url = "https://data-portal-dev-vmw-01.cels.anl.gov/resource_server/polaris/"
url = "http://localhost:8000/resource_server/polaris/vllm/completions/"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
        "model": "meta-llama/Meta-Llama-3-8B-Instruct",
        "temperature": 0.2,
        "max_tokens": 150,
        "messages":[{"role": "user", "content": "List all proteins that interact with RAD51"}],
        'logprobs':True
}

# Convert data into JSON
data_json = json.dumps(data)

# Send the post request to the relay server
# verify=False is temporary since I use a self-signed certificate
# timeout=(connect, read) in seconds keeps the cell from blocking forever;
# raise the read limit if model cold-starts take longer.
response = requests.post(
    url, data=data_json, headers=headers, verify=False, timeout=(10, 300)
)

# Print inference response
print(response.status_code)
if response.ok:
    print(response.json())
else:
    # Error bodies are not guaranteed to be JSON, so show the raw text
    # instead of failing inside response.json().
    print(response.text)

200
{'server_response': '{\n    "id": "cmpl-d538c498e13b406098e33b535e369aec",\n    "choices": [\n        {\n            "finish_reason": "length",\n            "index": 0,\n            "logprobs": {\n                "content": [\n                    {\n                        "token": "RAD",\n                        "bytes": [\n                            82,\n                            65,\n                            68\n                        ],\n                        "logprob": -0.6931474208831787,\n                        "top_logprobs": []\n                    },\n                    {\n                        "token": "51",\n                        "bytes": [\n                            53,\n                            49\n                        ],\n                        "logprob": 0.0,\n                        "top_logprobs": []\n                    },\n                    {\n                        "token": " is",\n                        "bytes": [\n                 

In [29]:
# Chat-style request body for the 70B Llama 3 model, with token log-probabilities
data = {
    "model": "meta-llama/Meta-Llama-3-70B-Instruct",
    "temperature": 0.2,
    "max_tokens": 150,
    "messages": [
        {"role": "user", "content": "List all proteins that interact with RAD51"}
    ],
    "logprobs": True,
}

# Serialize the body to a JSON string
data_json = json.dumps(data)

# vLLM completions route of the inference gateway (trailing "/" is required)
#url = "https://data-portal-dev-vmw-01.cels.anl.gov/resource_server/polaris/"
url = "http://localhost:8000/resource_server/polaris/vllm/completions/"

# Bearer token plus JSON content type
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json",
}

# POST to the relay server.
# verify=False is temporary: a self-signed certificate is in use.
response = requests.post(url, headers=headers, data=data_json, verify=False)

# Show the HTTP status followed by the decoded JSON body
print(response.status_code)
print(response.json())

200
{'server_response': '{\n    "id": "cmpl-1d10ece726d94fa39067157ff9a429ce",\n    "choices": [\n        {\n            "finish_reason": "length",\n            "index": 0,\n            "logprobs": {\n                "content": [\n                    {\n                        "token": "A",\n                        "bytes": [\n                            65\n                        ],\n                        "logprob": -0.8619948029518127,\n                        "top_logprobs": []\n                    },\n                    {\n                        "token": " great",\n                        "bytes": [\n                            32,\n                            103,\n                            114,\n                            101,\n                            97,\n                            116\n                        ],\n                        "logprob": -0.0001501924270996824,\n                        "top_logprobs": []\n                    },\n                    {\n   

In [34]:
# URL to the inference gateway (needs to end with forward slash /)
#url = "https://data-portal-dev-vmw-01.cels.anl.gov/resource_server/polaris/"
import time
url = "http://localhost:8000/resource_server/polaris/vllm/completions/"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
        "model": "mistralai/Mistral-7B-Instruct-v0.3",
        "temperature": 0.2,
        "max_tokens": 150,
        "messages":[{"role": "user", "content": "List all proteins that interact with RAD51"}],
        'logprobs':True
}

# Convert data into JSON
data_json = json.dumps(data)

# Send the post request to the relay server
# verify=False is temporary since I use a self-signed certificate
# Only the request itself is timed, using a monotonic clock, so the figure is
# the gateway round trip and excludes request setup and output printing.
start_time = time.perf_counter()
response = requests.post(url, data=data_json, headers=headers, verify=False)
elapsed = time.perf_counter() - start_time

# Print inference response
print(response.status_code)
print(response.json())
print("Time for globus to submit and respond", elapsed)

200
{'server_response': '{\n    "id": "cmpl-3a2497d565464a9b8d554a619922707f",\n    "choices": [\n        {\n            "finish_reason": "length",\n            "index": 0,\n            "logprobs": {\n                "content": [\n                    {\n                        "token": " R",\n                        "bytes": [\n                            32,\n                            82\n                        ],\n                        "logprob": 0.0,\n                        "top_logprobs": []\n                    },\n                    {\n                        "token": "AD",\n                        "bytes": [\n                            65,\n                            68\n                        ],\n                        "logprob": 0.0,\n                        "top_logprobs": []\n                    },\n                    {\n                        "token": "5",\n                        "bytes": [\n                            53\n                        ],\n         

## FINDINGS
* Globus adds an average of 6 seconds to the latency


# INFERENCE USING LLAMACPP

In [None]:
# URL to the inference gateway (needs to end with forward slash /)
#url = "https://data-portal-dev-vmw-01.cels.anl.gov/resource_server/polaris/"
url = "http://localhost:8000/resource_server/polaris/llama-cpp/completions/"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
# The model name is spelled exactly as the list-endpoints route reports it
# (all lowercase: "mistral-7b-instruct-v03"). The previous "7B" spelling did
# not match that listing.
# NOTE(review): assumes the gateway matches model names case-sensitively — confirm.
data = {
        "model": "mistral-7b-instruct-v03",
        "temperature": 0.2,
        "prompt": "List all proteins that interact with RAD51",
        #"n_probs":1
}

# Convert data into JSON
data_json = json.dumps(data)

# Send the post request to the relay server
# verify=False is temporary since I use a self-signed certificate
response = requests.post(url, data=data_json, headers=headers, verify=False)

# Print inference response
print(response.status_code)
print(response.json())

In [None]:
# Prompt-style request body for the llama-cpp build of Llama 3 70B
data = {
    "model": "meta-llama-3-70b-instruct",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51",
}

# Serialize the body to a JSON string
data_json = json.dumps(data)

# llama-cpp completions route of the inference gateway (trailing "/" is required)
#url = "https://data-portal-dev-vmw-01.cels.anl.gov/resource_server/polaris/"
url = "http://localhost:8000/resource_server/polaris/llama-cpp/completions/"

# Bearer token plus JSON content type
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json",
}

# POST to the relay server.
# verify=False is temporary: a self-signed certificate is in use.
response = requests.post(url, headers=headers, data=data_json, verify=False)

# Show the HTTP status followed by the decoded JSON body
print(response.status_code)
print(response.json())