# CacheSaver-Core

In [1]:
import os
import asyncio
from diskcache import Cache
from typing import List, Any
from dataclasses import dataclass

from together import AsyncTogether

from cachesaver.pipelines import OnlineAPI
from cachesaver.typedefs import Request, Batch, Response, SingleRequestModel, BatchRequestModel

import sys
sys.path.append('..')

## Online LLM

First, an online LLM class has to be defined following CacheSaver's conventions. It is essentially the last layer in CacheSaver's pipeline.

In [7]:
class OnlineLLM(SingleRequestModel, BatchRequestModel):
    """Adapter that serves requests through an async chat-completions client.

    Args:
        client: Object exposing an awaitable ``chat.completions.create``.
        model: Model identifier forwarded with every completion call.
    """

    def __init__(self, client: Any, model: str):
        self.client = client
        self.model = model

    @staticmethod
    def _option(request: Request, name: str, default: Any = None) -> Any:
        """Return ``request.<name>``, or ``default`` when it is missing or ``None``.

        Only ``None`` triggers the fallback, so explicit falsy values such as
        ``temperature=0`` or ``seed=0`` are passed through unchanged.
        """
        value = getattr(request, name, None)
        return default if value is None else value

    async def request(self, request: Request) -> Response:
        """Send one prompt as a single user message and collect the completions.

        Args:
            request: Supplies ``prompt`` and ``n``; the optional decoding
                fields are read if present on the object.

        Returns:
            A ``Response`` whose ``data`` lists the message content of every
            returned choice.
        """
        option = self._option
        completion = await self.client.chat.completions.create(
            messages=[{"role": "user", "content": request.prompt}],
            model=self.model,
            n=request.n,
            # Falsy values are still normalised to None/False for these
            # fields, exactly as before.
            max_tokens=option(request, "max_completion_tokens") or None,
            stop=option(request, "stop") or None,
            logprobs=option(request, "logprobs") or False,
            top_logprobs=option(request, "top_logprobs") or None,
            # Zero is a meaningful value here, so fall back on None only.
            temperature=option(request, "temperature", 1),
            top_p=option(request, "top_p", 1),
            seed=option(request, "seed"),
        )
        return Response(
            data=[choice.message.content for choice in completion.choices]
        )

    async def batch_request(self, batch: Batch) -> List[Response]:
        """Run every request of ``batch`` concurrently.

        Returns:
            The responses in the same order as ``batch.requests``.
        """
        pending = [self.request(item) for item in batch.requests]
        return await asyncio.gather(*pending)



# Model served by the Together endpoint used throughout this notebook.
model_name = "meta-llama/Llama-3.3-70B-Instruct-Turbo-Free"

# The API key is read from the environment (None when the variable is unset).
client = AsyncTogether(api_key=os.getenv('TOGETHER_API_KEY_PERS'))

# Wrap the raw client so it can be called with Request/Batch objects.
model = OnlineLLM(client=client, model=model_name)

### Making requests

Just a simple request.

In [None]:
request = Request(
    prompt = "What is the meaning of life?",
    n = 1,
    request_id = "sth1",
    namespace="sth",
)

await model.request(request)

To pass additional decoding parameters (such as `temperature`) to the model, you have to redefine the `Request` object with the extra fields.

In [None]:
from typing import Optional

# Bind the library class under its own name so that re-running this cell
# always subclasses the original, not a previous redefinition of `Request`.
from cachesaver.typedefs import Request as _BaseRequest


@dataclass(frozen=True)
class Request(_BaseRequest):
    """Request extended with decoding parameters.

    Adds two optional fields on top of the base request:
    ``temperature`` (defaults to 1) and ``max_completion_tokens``
    (defaults to ``None``).
    """

    temperature: float = 1
    max_completion_tokens: Optional[int] = None

request = Request(
    prompt = "What is the meaning of life?",
    n = 1,
    request_id = "sth1",
    namespace="sth",
    max_completion_tokens = 30,
    temperature = 0.5,
)
response = await model.request(request)
response

### Parsing the Response

The `Response` object retains data and metadata of the whole cachesaver process.

- `Response.data` holds whatever the LLM returned.
- `Response.cached` indicates whether each sample returned from the LLM was retrieved from the cache or not. It is only set if `cachesaver.AsyncCacher` is included in the pipeline; otherwise it defaults to `None`.
- `Response.duplicated` indicates whether each sample was duplicated or not. It is only set if `cachesaver.AsyncDeduplicator` is included in the pipeline; otherwise it defaults to `None`.

In [None]:
print(f"{response.data=}\n")
print(f"{response.cached=}\n")
print(f"{response.duplicated=}")

## Integrating the model to the Cachesaver pipeline

In [3]:
from typing import Optional

# Bind the library class under its own name so that re-running this cell
# always subclasses the original, not a previous redefinition of `Request`.
from cachesaver.typedefs import Request as _BaseRequest


@dataclass(frozen=True)
class Request(_BaseRequest):
    """Request extended with decoding parameters.

    Adds two optional fields on top of the base request:
    ``temperature`` (defaults to 1) and ``max_completion_tokens``
    (defaults to ``None``).
    """

    temperature: float = 1
    max_completion_tokens: Optional[int] = None

In [8]:
# Pipeline
request1 = Request(prompt="What is the meaning of life?", n=2, request_id="sth1", namespace="sth", max_completion_tokens=10)
request2 = Request(prompt="Cats or dogs?", n=1, request_id="sth2", namespace="sth", max_completion_tokens=10)
batch = Batch(requests=[request1, request2])

cache = Cache("../caches/developping")

pipeline = OnlineAPI(
    model = model,
    cache=cache,
    batch_size = 2,
    timeout = 1
)

pipeline_request_response = await pipeline.request(request1)
print(f"{pipeline_request_response=}\n")

pipeline_batch_response = await pipeline.batch_request(batch)
print(f"{pipeline_batch_response=}")

pipeline_request_response=Response(data=['The meaning of life is a question that has puzzled', 'The meaning of life is a question that has puzzled'], cached=[True, True], duplicated=[False, True])

pipeline_batch_response=[Response(data=['The meaning of life is a question that has puzzled', 'The question of the meaning of life has puzzled philosophers'], cached=[True, True], duplicated=[False, True]), Response(data=["As a neutral AI, I don't have personal"], cached=[False], duplicated=[False])]
