## Setup OGA (ONNX GenAI Runtime for NPU)
This notebook launches an LLM server using the TurnkeyML LLM tool.

In [None]:
# Install TurnkeyML with the `llm-oga-dml` extra (%pip targets the kernel's environment).
%pip install turnkeyml[llm-oga-dml]

In [1]:
import torch
import turnkeyml.llm.cache as cache
from turnkeyml.llm.tools.chat import Serve
from turnkeyml.llm.tools.huggingface_load import HuggingfaceLoad
from turnkeyml.llm.tools.ort_genai.oga import OgaLoad
from turnkeyml.state import State

def launch_llm_server(backend, checkpoint, device, dtype, max_new_tokens):
    """Load an LLM checkpoint with the selected backend and serve it.

    Args:
        backend: "hf" to load with HuggingfaceLoad, "oga" to load with OgaLoad.
        checkpoint: Model identifier, passed to the loader as ``input``.
        device: Target device; one of "cpu", "npu" or "igpu".
        dtype: Data type passed to the loader. The string "bfloat16" is
            converted to ``torch.bfloat16``; any other value is passed through
            unchanged.
        max_new_tokens: Forwarded to ``Serve().run``.

    Raises:
        AssertionError: If ``device`` or ``backend`` is not a supported value.
    """
    supported_devices = ("cpu", "npu", "igpu")
    supported_backends = ("hf", "oga")

    # Validate before touching any loader so a typo fails fast, and keep the
    # messages in sync with the values that are actually accepted.
    assert device in supported_devices, (
        f"ERROR: {device} not supported, please select 'cpu', 'npu' or 'igpu'."
    )
    assert backend in supported_backends, (
        f"ERROR: {backend} not supported, please select 'hf' or 'oga'."
    )

    runtime = HuggingfaceLoad if backend == "hf" else OgaLoad
    dtype = torch.bfloat16 if dtype == "bfloat16" else dtype

    # The build name is formatted from the already-converted dtype, so a
    # "bfloat16" request appears in the name as str(torch.bfloat16).
    state = State(
        cache_dir=cache.DEFAULT_CACHE_DIR,
        build_name=f"{checkpoint}_{device}_{dtype}",
    )
    state = runtime().run(
        state,
        input=checkpoint,
        device=device,
        dtype=dtype,
    )
    Serve().run(state, max_new_tokens=max_new_tokens)

In [2]:
# Handle asyncio event loops in Jupyter: patch asyncio so an event loop can be
# run while another one is already running.
# NOTE(review): presumably needed because the server started by Serve().run
# drives its own event loop inside the kernel's loop — confirm.
import nest_asyncio
nest_asyncio.apply()

In [None]:
# Run the Hugging Face ("hf") backend on CPU with bfloat16 weights
launch_llm_server(
    backend="hf",
    checkpoint="meta-llama/Meta-Llama-3-8B",
    device="cpu",
    dtype="bfloat16",
    max_new_tokens=100,
)

In [3]:
# Launch the server with the OGA backend on the NPU, using an int4 model
launch_llm_server(
    backend="oga",
    checkpoint="meta-llama/Meta-Llama-3-8B",
    device="npu",
    dtype="int4",
    max_new_tokens=100,
)