# LOGIN AND FETCH TOKEN

In [1]:
import globus_sdk
import requests
import json

# --- Globus identifiers ---------------------------------------------------
# Client ID handed to the native-app auth client that drives the login flow.
auth_client_id = "58fdd3bc-e1c3-4ce5-80ea-8d6b87cfb944"

# Client ID of the inference-gateway resource server; the login flow below
# requests its "action_all" scope.
# This will be publicly available to users
gateway_client_id = "681c10cc-f684-4540-bcd7-0b4df3bc26ef"
gateway_scope = "https://auth.globus.org/scopes/{}/action_all".format(gateway_client_id)

# --- Start the login flow -------------------------------------------------
# Create the native-app auth client and begin an OAuth2 flow that asks for
# the gateway scope only.
auth_client = globus_sdk.NativeAppAuthClient(auth_client_id)
auth_client.oauth2_start_flow(requested_scopes=gateway_scope)

# Show the URL where the user signs in with their Globus account.
authorize_url = auth_client.oauth2_get_authorize_url()
print("Please go to this URL and login:\n\n{}\n".format(authorize_url))


Please go to this URL and login:

https://auth.globus.org/v2/oauth2/authorize?client_id=58fdd3bc-e1c3-4ce5-80ea-8d6b87cfb944&redirect_uri=https%3A%2F%2Fauth.globus.org%2Fv2%2Fweb%2Fauth-code&scope=https%3A%2F%2Fauth.globus.org%2Fscopes%2F681c10cc-f684-4540-bcd7-0b4df3bc26ef%2Faction_all&state=_default&response_type=code&code_challenge=Ky86GIVIRXzNLzxjKl-pj7JrPhwkr8aeVcRT-RyNUWg&code_challenge_method=S256&access_type=online



In [2]:
# Trade the authorization code shown after login for tokens, then keep the
# access token that was issued for the inference gateway.
auth_code = "..paste.your.auth.code.here.."
token_response = auth_client.oauth2_exchange_code_for_tokens(auth_code)
gateway_tokens = token_response.by_resource_server[gateway_client_id]
access_token = gateway_tokens["access_token"]

# LIST ENDPOINTS

In [4]:
# Ask the inference gateway which endpoints/models it exposes.
import requests
import json
url = "https://data-portal-dev.cels.anl.gov/resource_server/list-endpoints"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}
# A timeout keeps the cell from hanging indefinitely if the gateway is
# unreachable (requests has no timeout by default).
response = requests.get(url, headers=headers, timeout=60)
# Fail loudly on an HTTP error (e.g. an expired token) instead of iterating
# over an error payload.
response.raise_for_status()
for endpoint in response.json():
    print(endpoint)
          

{'completion_endpoint_url': '/resource_server/sophia/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/sophia/vllm/v1/chat/completions/', 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'}
{'completion_endpoint_url': '/resource_server/sophia/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/sophia/vllm/v1/chat/completions/', 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}
{'completion_endpoint_url': '/resource_server/sophia/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/sophia/vllm/v1/chat/completions/', 'model_name': 'mistralai/Mistral-7B-Instruct-v0.3'}
{'completion_endpoint_url': '/resource_server/polaris/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/polaris/vllm/v1/chat/completions/', 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}
{'completion_endpoint_url': '/resource_server/polaris/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/polaris/vllm/v1/chat/completions/', 'model_name': 'meta-llama/Meta-Lla

## INFERENCE USING VLLM

### CHAT COMPLETIONS

In [5]:
# Chat-completions route of the inference gateway.
# NOTE(review): the gateway's endpoint listing shows this path with a trailing
# slash; confirm whether the slash is required.
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/chat/completions"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "max_tokens": 150,
    "messages": [{"role": "user", "content": "List all proteins that interact with RAD51"}],
}

# Send the post request to the relay server.
# json=... lets requests serialize the payload; the timeout stops the cell
# from blocking forever if the gateway never answers.
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print inference response
print(response.status_code)
print(response.json())

200
{'server_response': '{\n    "id": "cmpl-364f2af1491a4a3ebf4d670ac55867e9",\n    "object": "chat.completion",\n    "created": 1721315660,\n    "model": "meta-llama/Meta-Llama-3-8B-Instruct",\n    "choices": [\n        {\n            "index": 0,\n            "message": {\n                "role": "assistant",\n                "content": "RAD51 is a key protein involved in homologous recombination (HR) and DNA repair. It has been shown to interact with numerous proteins to regulate its activity and function. Here is a list of some of the proteins that interact with RAD51:\\n\\n1. BRCA1: RAD51 interacts with BRCA1, a tumor suppressor protein, to promote HR and DNA repair.\\n2. BRCA2: RAD51 also interacts with BRCA2, another tumor suppressor protein, to facilitate HR and DNA repair.\\n3. PALB2: PALB2, a protein involved in HR and DNA repair, interacts with RAD51 and BRCA2 to regulate their activity.\\n4. FANCD2: FANCD2,",\n                "tool_calls": []\n            },\n            "lo

### CHAT COMPLETIONS USING PYTHON OPENAI API

In [None]:
from openai import OpenAI

# Point the OpenAI client at the gateway's vLLM route and authenticate with
# the Globus access token.
# For a local server, use e.g. "http://localhost:8000/resource_server/polaris/vllm/v1"
openai_api_key = access_token
openai_api_base = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Request a chat completion together with the top log-probability per token.
chat_response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    logprobs=True,
    top_logprobs=1,
    messages=[
        {
            "role": "user",
            # Adjacent string literals are joined at compile time. A backslash
            # continuation inside a single literal would instead embed the
            # next line's indentation in the prompt.
            "content": (
                "A detailed description of the biochemical "
                "function 5-(hydroxymethyl)furfural/furfural transporter is"
            ),
        },
    ],
    max_tokens=2056
)
print("Chat response:", chat_response)

### LEGACY COMPLETIONS

In [None]:
# Legacy (prompt-based) completions route of the inference gateway.
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/completions"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51",
    "logprobs": 1,
}

# Send the post request to the relay server.
# json=... lets requests serialize the payload; the timeout stops the cell
# from blocking forever if the gateway never answers.
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print inference response
print(response.status_code)
print(response.json())

### meta-llama/Meta-Llama-3-70B-Instruct

In [None]:
# Chat-completions route of the inference gateway.
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/chat/completions"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-70B-Instruct",
    "max_tokens": 150,
    "messages": [{"role": "user", "content": "List all proteins that interact with RAD51"}],
}

# Send the post request to the relay server.
# json=... lets requests serialize the payload; the timeout stops the cell
# from blocking forever if the gateway never answers.
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print inference response
print(response.status_code)
print(response.json())

In [None]:
# Legacy (prompt-based) completions route of the inference gateway.
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/completions"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-70B-Instruct",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51",
}

# Send the post request to the relay server.
# json=... lets requests serialize the payload; the timeout stops the cell
# from blocking forever if the gateway never answers.
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print inference response
print(response.status_code)
print(response.json())

### Mistral-7B-Instruct-v0.3

In [None]:
import time
# perf_counter() is a monotonic clock meant for measuring elapsed intervals.
start_time = time.perf_counter()
# Legacy (prompt-based) completions route of the inference gateway.
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/completions"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "mistralai/Mistral-7B-Instruct-v0.3",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51",
    "logprobs": 1,
}

# Send the post request to the relay server.
# json=... lets requests serialize the payload; the timeout stops the cell
# from blocking forever if the gateway never answers.
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print inference response
print(response.status_code)
print(response.json())
# Report the round-trip time, matching the chat-completions timing cell.
print("Time for globus to submit and respond", time.perf_counter() - start_time)

In [None]:
import time
# perf_counter() is a monotonic clock meant for measuring elapsed intervals.
start_time = time.perf_counter()
# Chat-completions route of the inference gateway.
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/chat/completions"
# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "mistralai/Mistral-7B-Instruct-v0.3",
    "temperature": 0.2,
    "max_tokens": 150,
    "messages": [{"role": "user", "content": "List all proteins that interact with RAD51"}],
    "logprobs": True,
}

# Send the post request to the relay server.
# json=... lets requests serialize the payload; the timeout stops the cell
# from blocking forever if the gateway never answers.
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print inference response
print(response.status_code)
print(response.json())
print("Time for globus to submit and respond", time.perf_counter() - start_time)

## FINDINGS
* Globus adds about 5 seconds of latency on average.
