# LOGIN AND FETCH TOKEN

In [None]:
import globus_sdk
import requests
import json
from inference_auth_token import get_access_token

# Authenticate and collect the access token.
# If tokens already exist, this will reuse them and refresh the access token if necessary.
access_token = get_access_token()

# LIST ENDPOINTS

In [3]:
# URL of the inference gateway's endpoint-listing route
url = "https://data-portal-dev.cels.anl.gov/resource_server/list-endpoints"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Bound the wait so an unresponsive gateway cannot hang the kernel forever
response = requests.get(url, headers=headers, timeout=60)

# Stop on an HTTP error (e.g. a rejected token): an error payload is not an
# endpoint list, and iterating over it would print misleading output
response.raise_for_status()

# Print one entry per line; each entry names a model and its endpoint URLs
for endpoint in response.json():
    print(endpoint)

{'completion_endpoint_url': '/resource_server/sophia/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/sophia/vllm/v1/chat/completions/', 'embedding_endpoint_url': '/resource_server/sophia/vllm/v1/embeddings/', 'model_name': 'meta-llama/Meta-Llama-3-70B-Instruct'}
{'completion_endpoint_url': '/resource_server/sophia/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/sophia/vllm/v1/chat/completions/', 'embedding_endpoint_url': '/resource_server/sophia/vllm/v1/embeddings/', 'model_name': 'meta-llama/Meta-Llama-3-8B-Instruct'}
{'completion_endpoint_url': '/resource_server/sophia/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/sophia/vllm/v1/chat/completions/', 'embedding_endpoint_url': '/resource_server/sophia/vllm/v1/embeddings/', 'model_name': 'mistralai/Mistral-7B-Instruct-v0.3'}
{'completion_endpoint_url': '/resource_server/polaris/vllm/v1/completions/', 'chat_endpoint_url': '/resource_server/polaris/vllm/v1/chat/completions/', 'embedding_endpoint_

## INFERENCE USING VLLM

### CHAT COMPLETIONS

In [5]:
# URL to the inference gateway (chat completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/chat/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "max_tokens": 150,
    "messages": [{"role": "user", "content": "List all proteins that interact with RAD51"}],
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself, and the timeout keeps a
# stalled request from hanging the kernel indefinitely (raise the value if the
# model legitimately needs longer to answer).
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print the HTTP status followed by the decoded JSON body
print(response.status_code)
print(response.json())

200
{'server_response': '{\n    "id": "cmpl-364f2af1491a4a3ebf4d670ac55867e9",\n    "object": "chat.completion",\n    "created": 1721315660,\n    "model": "meta-llama/Meta-Llama-3-8B-Instruct",\n    "choices": [\n        {\n            "index": 0,\n            "message": {\n                "role": "assistant",\n                "content": "RAD51 is a key protein involved in homologous recombination (HR) and DNA repair. It has been shown to interact with numerous proteins to regulate its activity and function. Here is a list of some of the proteins that interact with RAD51:\\n\\n1. BRCA1: RAD51 interacts with BRCA1, a tumor suppressor protein, to promote HR and DNA repair.\\n2. BRCA2: RAD51 also interacts with BRCA2, another tumor suppressor protein, to facilitate HR and DNA repair.\\n3. PALB2: PALB2, a protein involved in HR and DNA repair, interacts with RAD51 and BRCA2 to regulate their activity.\\n4. FANCD2: FANCD2,",\n                "tool_calls": []\n            },\n            "lo

### CHAT COMPLETIONS USING PYTHON OPENAI API

In [None]:
from openai import OpenAI

# Point the OpenAI client at the gateway's vLLM route; the access token is
# passed as the API key.
# localurl = "http://localhost:8000/resource_server/polaris/vllm/v1"
openai_api_key = access_token
openai_api_base = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1"

client = OpenAI(
    api_key=openai_api_key,
    base_url=openai_api_base,
)

# Request a chat completion and ask for per-token log-probabilities
# (logprobs=True, with the single most likely alternative per position).
chat_response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-8B-Instruct",
    logprobs=True,
    top_logprobs=1,
    messages=[
        {
            "role": "user",
            # Adjacent string literals are joined by the parser. A backslash
            # continuation inside one literal would instead embed the next
            # line's indentation in the prompt sent to the model.
            "content": (
                "A detailed description of the biochemical "
                "function 5-(hydroxymethyl)furfural/furfural transporter is"
            ),
        },
    ],
    max_tokens=2056
)
print("Chat response:", chat_response)

### LEGACY COMPLETIONS

In [None]:
# URL to the inference gateway (legacy text completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-8B-Instruct",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51",
    "logprobs": 1
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself, and the timeout keeps a
# stalled request from hanging the kernel indefinitely (raise the value if the
# model legitimately needs longer to answer).
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print the HTTP status followed by the decoded JSON body
print(response.status_code)
print(response.json())

### meta-llama/Meta-Llama-3-70B-Instruct

In [None]:
# URL to the inference gateway (chat completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/chat/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-70B-Instruct",
    "max_tokens": 150,
    "messages": [{"role": "user", "content": "List all proteins that interact with RAD51"}]
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself, and the timeout keeps a
# stalled request from hanging the kernel indefinitely (raise the value if the
# model legitimately needs longer to answer).
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print the HTTP status followed by the decoded JSON body
print(response.status_code)
print(response.json())

In [None]:
# URL to the inference gateway (legacy text completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "meta-llama/Meta-Llama-3-70B-Instruct",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51"
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself, and the timeout keeps a
# stalled request from hanging the kernel indefinitely (raise the value if the
# model legitimately needs longer to answer).
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print the HTTP status followed by the decoded JSON body
print(response.status_code)
print(response.json())

### Mistral-7B-Instruct-v0.3

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run
start_time = time.perf_counter()

# URL to the inference gateway (legacy text completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "mistralai/Mistral-7B-Instruct-v0.3",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51",
    "logprobs": 1
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself, and the timeout keeps a
# stalled request from hanging the kernel indefinitely (raise the value if the
# model legitimately needs longer to answer).
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print the HTTP status, the decoded JSON body, and the wall-clock round trip
print(response.status_code)
print(response.json())
print("Time for globus to submit and respond", time.perf_counter() - start_time)

In [None]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run
start_time = time.perf_counter()

# URL to the inference gateway (chat completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/chat/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "mistralai/Mistral-7B-Instruct-v0.3",
    "temperature": 0.2,
    "max_tokens": 150,
    "messages": [{"role": "user", "content": "List all proteins that interact with RAD51"}],
    "logprobs": True
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself, and the timeout keeps a
# stalled request from hanging the kernel indefinitely (raise the value if the
# model legitimately needs longer to answer).
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print the HTTP status, the decoded JSON body, and the wall-clock round trip
print(response.status_code)
print(response.json())
print("Time for globus to submit and respond", time.perf_counter() - start_time)

### Mixtral-8x22B-Instruct-v0.1

In [17]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run
start_time = time.perf_counter()

# URL to the inference gateway (legacy text completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request
data = {
    "model": "mistralai/Mixtral-8x22B-Instruct-v0.1",
    "temperature": 0.2,
    "max_tokens": 150,
    "prompt": "List all proteins that interact with RAD51",
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself, and the timeout keeps a
# stalled request from hanging the kernel indefinitely (raise the value if the
# model legitimately needs longer to answer).
response = requests.post(url, json=data, headers=headers, timeout=300)

# Print the HTTP status, the decoded JSON body, and the wall-clock round trip
print(response.status_code)
print(response.json())
print("Time for globus to submit and respond", time.perf_counter() - start_time)

200
{'server_response': '{\n    "id": "cmpl-1fb607f8a5b648ecbf28b9d82088d3aa",\n    "object": "text_completion",\n    "created": 1722571829,\n    "model": "mistralai/Mixtral-8x22B-Instruct-v0.1",\n    "choices": [\n        {\n            "index": 0,\n            "text": ".\\n\\nTo find all proteins that interact with RAD51, you can use various protein-protein interaction databases such as BioGRID, IntAct, and STRING. Here are some of the proteins that interact with RAD51, according to these databases:\\n\\n1. BRCA1\\n2. BRCA2\\n3. PALB2\\n4. RAD51B\\n5. RAD51C\\n6. RAD51D\\n7. RAD52\\n8. RAD54\\n9. RAD54B\\n10. RAD54L\\n11. XRCC2\\n12. XR",\n            "logprobs": null,\n            "finish_reason": "length",\n            "stop_reason": null\n        }\n    ],\n    "usage": {\n        "prompt_tokens": 12,\n        "total_tokens": 162,\n        "completion_tokens": 150\n    },\n    "response_time": 7.473295450210571,\n    "throughput_tokens_per_second": 21.677183924989265\n}'}
Time for

In [15]:
import time

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run
start_time = time.perf_counter()

# URL to the inference gateway (chat completions route)
url = "https://data-portal-dev.cels.anl.gov/resource_server/sophia/vllm/v1/chat/completions"

# Add access token to the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json"
}

# Prepare the inference request (no max_tokens is set for this request)
data = {
    "model": "mistralai/Mixtral-8x22B-Instruct-v0.1",
    "temperature": 0.2,
    "messages": [{"role": "user", "content": "List all proteins that interact with RAD51"}],
}

# Send the POST request to the relay server.
# `json=` lets requests serialize the payload itself. The timeout keeps a
# stalled request from hanging the kernel indefinitely; it is set generously
# here because the reply length is not capped by max_tokens.
response = requests.post(url, json=data, headers=headers, timeout=600)

# Print the HTTP status, the decoded JSON body, and the wall-clock round trip
print(response.status_code)
print(response.json())
print("Time for globus to submit and respond", time.perf_counter() - start_time)

200
{'server_response': '{\n    "id": "chat-4fc6c8569f5a43e78b967714e65c5ab1",\n    "object": "chat.completion",\n    "created": 1722571630,\n    "model": "mistralai/Mixtral-8x22B-Instruct-v0.1",\n    "choices": [\n        {\n            "index": 0,\n            "message": {\n                "role": "assistant",\n                "content": " RAD51 is a key protein involved in the homologous recombination (HR) repair of DNA double-strand breaks (DSBs). It interacts with several other proteins to carry out its function. Here are some proteins that interact with RAD51:\\n\\n1. BRCA1 (Breast cancer type 1 susceptibility protein): BRCA1 interacts with RAD51 and promotes its nuclear localization and accumulation at sites of DNA damage.\\n\\n2. BRCA2 (Breast cancer type 2 susceptibility protein): BRCA2 plays a crucial role in the HR repair pathway by mediating the recruitment of RAD51 to DSBs.\\n\\n3. PALB2 (Partner and localizer of BRCA2): PALB2 interacts with BRCA2 and RAD51 to facilitate t

## FINDINGS
* Globus adds an average of 5 seconds to the latency
