# Sampling with CacheSaver: Bringing the power of seeding to LLM Inference

In [None]:
%load_ext autoreload
%autoreload 2

from diskcache import Cache

from openai import AsyncOpenAI

from cachesaver.pipelines import OnlineAPI

from src.models import OnlineLLM, API
from src.typedefs import DecodingParameters

## Hyperparams etc.

In [None]:
# --- Pipeline settings (forwarded to OnlineAPI in get_cachesaver_client) ---
batch_size = 1
timeout = 2
allow_batch_overflow = 1
correctness = 1  # converted with bool() before it reaches the pipeline

# Not referenced by any of the cells visible in this notebook.
ns_ratio = 0
value_cache = True

# Directory handed to diskcache.Cache when the CacheSaver client is built.
cache_path = "caches/developping"

# Experiment identifiers; not referenced by the cells visible in this notebook.
benchmark = "hotpotqa"
method = "tot_bfs"
split = "mini"

# --- Sampling settings ---
# NOTE(review): the example cells pass their own literal model names rather
# than reading `model` -- confirm whether that is intended.
model = "gpt-4.1"
temperature = 1.5
max_completion_tokens = 10000
top_p = 1.0
stop = None
logprobs = None

# Bundle the sampling settings into the object the CacheSaver request takes.
params = DecodingParameters(
    temperature=temperature,
    max_completion_tokens=max_completion_tokens,
    top_p=top_p,
    stop=stop,
    logprobs=logprobs,
)

# The single prompt used by both the OpenAI and the CacheSaver examples.
prompt = "Give me the name of a random city from all over the world (only the name, no other text)."

## Client Initializers
Helper functions that initialize the clients. Once the `cachesaver-core` API is updated, this setup will happen in the background.

In [None]:
def get_cachesaver_client():
    """
    Assemble a CacheSaver API client backed by the on-disk cache.

    Opens the diskcache store at ``cache_path``, wraps an ``OnlineLLM`` for the
    "openai" provider in an ``OnlineAPI`` pipeline configured from the
    notebook's module-level settings, and returns an ``API`` object bound to
    that pipeline.
    """
    disk_cache = Cache(cache_path)
    llm = OnlineLLM(provider="openai")

    # Batching/consistency knobs come straight from the hyperparameter cell;
    # `correctness` is stored as an int there, hence the bool() conversion.
    pipeline_settings = dict(
        batch_size=batch_size,
        timeout=timeout,
        allow_batch_overflow=allow_batch_overflow,
        correctness=bool(correctness),
    )
    api_pipeline = OnlineAPI(model=llm, cache=disk_cache, **pipeline_settings)

    return API(pipeline=api_pipeline, model="gpt-4.1")

def get_openai_client():
    """
    Create and return a new ``AsyncOpenAI`` client built with no arguments.
    """
    return AsyncOpenAI()

## Examples with OpenAI

In [None]:
for n in [3, 2, 5]:
    OpenAI = get_openai_client()
    response = await OpenAI.chat.completions.create(
        model="gpt-4.1-nano",
        messages=[
            {"role": "user", "content": prompt}
        ],
        n=n,
        temperature=temperature,
        max_completion_tokens=max_completion_tokens,
        top_p=top_p,
        stop=stop,
        logprobs=logprobs
    )
    print(f"Asking for {n} output samples: ", [choice.message.content for choice in response.choices])

Asking for 3 output samples:  ['Brisbane', 'Lviv', 'Seoul']
Asking for 2 output samples:  ['Portland', 'Kyoto']
Asking for 5 output samples:  ['Tokyo', 'Istanbul', 'Venice', 'Kyoto', 'Cadagaution']


## Examples with CacheSaver

In [None]:
for n in [3, 2, 5]:
    CacheSaver = get_cachesaver_client()
    response = await CacheSaver.request(
        prompt = prompt,
        params = params,
        n = n,
        request_id = f"sth",
        namespace="cities_experiment",
    )
    print(f"Asking for {n} output samples: ", [content for content in response])

Asking for 3 output samples:  ['Valparaíso', 'Dakar', 'Varna']
Asking for 2 output samples:  ['Valparaíso', 'Dakar']
Asking for 5 output samples:  ['Valparaíso', 'Dakar', 'Varna', 'Copenhagen', 'Mendoza']
