# Example notebooks
Source notebooks: <https://github.com/NVIDIA/GenerativeAIExamples/tree/main/notebooks>

----

In [11]:
import os

# Host (IP address or DNS name) of the Triton server; later cells append a port
# to this value to build the URL handed to each client.
# Set the TRITON_ADDRESS environment variable to target a different server
# without editing the notebook. An unset or empty variable falls back to the
# host this notebook was originally written against.
triton_address = os.environ.get("TRITON_ADDRESS") or "128.55.84.158"

## 00-llm-non-streaming-nemotron.ipynb

In [12]:
from triton_trt_llm import HttpTritonClient

In [13]:
# Step 1: Structure the Query in a Prompt Template

# Three-part chat layout: a System turn, a User turn, and an open Assistant
# turn left for the model to complete. Each marker and each placeholder sits
# on its own line.
NEMOTRON_PROMPT_TEMPLATE = (
    "<extra_id_0>System\n"
    "{system}\n"
    "<extra_id_1>User\n"
    "{prompt}\n"
    "<extra_id_1>Assistant\n"
)

# System instruction substituted into the {system} slot.
system = (
    "You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Please ensure that your responses are positive in nature."
)

# The raw question gets its own name; `prompt` ends up holding the fully
# rendered template, which is the value the later cells use.
user_question = "What is the fastest land animal?"
prompt = NEMOTRON_PROMPT_TEMPLATE.format(system=system, prompt=user_question)

In [14]:
# Step 2: Create the Triton Client

# The client is addressed as "<host>:<port>".
# NOTE(review): port 8000 is presumably the server's HTTP endpoint, given the
# HttpTritonClient class name; confirm against the deployment.
TRITON_HTTP_PORT = 8000
triton_url = f"{triton_address}:{TRITON_HTTP_PORT}"
client = HttpTritonClient(triton_url)

In [15]:
# Request payload for the non-streaming call; it is unpacked as keyword
# arguments into client.request(model_name, **pload).
pload = dict(
    prompt=[[prompt]],  # rendered prompt string wrapped in a nested list
    tokens=64,
    temperature=1.0,
    top_k=1,
    top_p=0,
    beam_width=1,
    repetition_penalty=1.0,
    length_penalty=1.0,
)

In [16]:
# Step 3: Load the Model and Generate Response

model_name = "ensemble"
try:
    client.load_model(model_name)
    val = client.request(model_name, **pload)
except ConnectionRefusedError:
    # Nothing accepted the connection at the client's address. Print an
    # actionable hint naming the target, then re-raise so the cell still fails
    # visibly with the original exception type and traceback.
    print(
        f"Could not connect to the Triton server at {triton_url!r}. "
        "Check that the server is running and that the address and port are reachable."
    )
    raise
print(val)

ConnectionRefusedError: [Errno 111] Connection refused

----

# Notebook 1: LLM Streaming Client

In [24]:
# Chat prompt layout: the system text sits between <<SYS>> tags inside the
# first [INST] block, followed by a context segment, then the user's question
# in a second [INST] block.
LLAMA_PROMPT_TEMPLATE = (
    "<s>[INST] <<SYS>>{system_prompt}<</SYS>>[/INST] "
    "{context} </s>"
    "<s>[INST] {question} [/INST]"
)

system_prompt = (
    "You are a helpful, respectful and honest assistant. "
    "Always answer as helpfully as possible, while being safe. "
    "Please ensure that your responses are positive in nature."
)
context = ""  # no extra context is supplied for this query
question = "What is NERSC?"  # alternative to try: 'What is the fastest land animal?'

prompt = LLAMA_PROMPT_TEMPLATE.format(
    question=question,
    context=context,
    system_prompt=system_prompt,
)

In [25]:
# from langchain_nvidia_trt.llms import TritonTensorRTLLM
from langchain_nvidia_trt_llms import TritonTensorRTLLM

# NOTE(review): this cell targets port 8001, not the 8000 used by the HTTP
# client earlier; presumably a different server endpoint, confirm against the
# deployment.
triton_url = f"{triton_address}:8001"

# Connection target and generation settings, all handed to TritonTensorRTLLM
# as keyword arguments.
pload = dict(
    tokens=300,
    server_url=triton_url,
    model_name="ensemble",
    temperature=1.0,
    top_k=1,
    top_p=0,
    beam_width=1,
    repetition_penalty=1.0,
    length_penalty=1.0,
)
client = TritonTensorRTLLM(**pload)

In [26]:
import time
import random

# Time the streamed generation with perf_counter(): it is a monotonic,
# high-resolution clock meant for measuring intervals, whereas time.time()
# follows the wall clock and can jump if the system time is adjusted mid-run.
start_time = time.perf_counter()
tokens_generated = 0

# Every item yielded by client.stream() is counted as one token and echoed
# immediately, so the response appears incrementally.
for val in client.stream(prompt):
    tokens_generated += 1
    print(val, end="", flush=True)

total_time = time.perf_counter() - start_time
# Guard the rate so a zero elapsed time cannot raise ZeroDivisionError.
tokens_per_sec = tokens_generated / total_time if total_time > 0 else 0.0
print(f"\n--- Generated {tokens_generated} tokens in {total_time} seconds ---")
print(f"--- {tokens_per_sec} tokens/sec")

  NERSC (National Energy Research Scientific Computing) is a scientific computing center that provides high-performance computing (HPC) resources and expertise to researchers in the United States. It is supported by the U.S. Department of Energy's Office of Science and is located at the Lawrence Berkeley National Laboratory.

NERSC's mission is to accelerate scientific discovery and innovation by providing state-of-the-art computing resources, software, and expertise to researchers in fields such as physics, chemistry, materials science, and biology. The center offers a range of computing resources, including supercomputers, clusters, and storage systems, as well as a variety of software tools and libraries to support scientific simulations and data analysis.

Some of the key research areas that NERSC supports include:

1. Climate modeling and weather forecasting
2. Materials science and nanotechnology
3. Biology and genomics
4. Energy storage and production
5. Advanced manufacturing a