In [None]:
!pip install litserve torch transformers -q

import litserve as ls
import torch
from transformers import pipeline
import time
from typing import List

In [None]:
class TextGeneratorAPI(ls.LitAPI):
    def setup(self, device):
        self.model = pipeline("text-generation", model="distilgpt2", device=0 if device == "cuda" and torch.cuda.is_available() else -1)
        self.device = device
    def decode_request(self, request):
        return request["prompt"]
    def predict(self, prompt):
        result = self.model(prompt, max_length=100, num_return_sequences=1, temperature=0.8, do_sample=True)
        return result[0]['generated_text']
    def encode_response(self, output):
        return {"generated_text": output, "model": "distilgpt2"}

class BatchedSentimentAPI(ls.LitAPI):
    def setup(self, device):
        self.model = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", device=0 if device == "cuda" and torch.cuda.is_available() else -1)
    def decode_request(self, request):
        return request["text"]
    def batch(self, inputs: List[str]) -> List[str]:
        return inputs
    def predict(self, batch: List[str]):
        results = self.model(batch)
        return results
    def unbatch(self, output):
        return output
    def encode_response(self, output):
        return {"label": output["label"], "score": float(output["score"]), "batched": True}

In [None]:
class StreamingTextAPI(ls.LitAPI):
    def setup(self, device):
        self.model = pipeline("text-generation", model="distilgpt2", device=0 if device == "cuda" and torch.cuda.is_available() else -1)
    def decode_request(self, request):
        return request["prompt"]
    def predict(self, prompt):
        words = ["Once", "upon", "a", "time", "in", "a", "digital", "world"]
        for word in words:
            time.sleep(0.1)
            yield word + " "
    def encode_response(self, output):
        for token in output:
            yield {"token": token}

In [None]:
class MultiTaskAPI(ls.LitAPI):
    def setup(self, device):
        self.sentiment = pipeline("sentiment-analysis", device=-1)
        self.summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-6-6", device=-1)
        self.device = device
    def decode_request(self, request):
        return {"task": request.get("task", "sentiment"), "text": request["text"]}
    def predict(self, inputs):
        task = inputs["task"]
        text = inputs["text"]
        if task == "sentiment":
            result = self.sentiment(text)[0]
            return {"task": "sentiment", "result": result}
        elif task == "summarize":
            if len(text.split()) < 30:
                return {"task": "summarize", "result": {"summary_text": text}}
            result = self.summarizer(text, max_length=50, min_length=10)[0]
            return {"task": "summarize", "result": result}
        else:
            return {"task": "unknown", "error": "Unsupported task"}
    def encode_response(self, output):
        return output

In [None]:
class CachedAPI(ls.LitAPI):
    def setup(self, device):
        self.model = pipeline("sentiment-analysis", device=-1)
        self.cache = {}
        self.hits = 0
        self.misses = 0
    def decode_request(self, request):
        return request["text"]
    def predict(self, text):
        if text in self.cache:
            self.hits += 1
            return self.cache[text], True
        self.misses += 1
        result = self.model(text)[0]
        self.cache[text] = result
        return result, False
    def encode_response(self, output):
        result, from_cache = output
        return {"label": result["label"], "score": float(result["score"]), "from_cache": from_cache, "cache_stats": {"hits": self.hits, "misses": self.misses}}

In [5]:
def test_apis_locally():
    print("=" * 70)
    print("Testing APIs Locally (No Server)")
    print("=" * 70)

    api1 = TextGeneratorAPI(); api1.setup("cpu")
    decoded = api1.decode_request({"prompt": "Artificial intelligence will"})
    result = api1.predict(decoded)
    encoded = api1.encode_response(result)
    print(f"✓ Result: {encoded['generated_text'][:100]}...")

    api2 = BatchedSentimentAPI(); api2.setup("cpu")
    texts = ["I love Python!", "This is terrible.", "Neutral statement."]
    decoded_batch = [api2.decode_request({"text": t}) for t in texts]
    batched = api2.batch(decoded_batch)
    results = api2.predict(batched)
    unbatched = api2.unbatch(results)
    for i, r in enumerate(unbatched):
        encoded = api2.encode_response(r)
        print(f"✓ '{texts[i]}' -> {encoded['label']} ({encoded['score']:.2f})")

    api3 = MultiTaskAPI(); api3.setup("cpu")
    decoded = api3.decode_request({"task": "sentiment", "text": "Amazing tutorial!"})
    result = api3.predict(decoded)
    print(f"✓ Sentiment: {result['result']}")

    api4 = CachedAPI(); api4.setup("cpu")
    test_text = "LitServe is awesome!"
    for i in range(3):
        decoded = api4.decode_request({"text": test_text})
        result = api4.predict(decoded)
        encoded = api4.encode_response(result)
        print(f"✓ Request {i+1}: {encoded['label']} (cached: {encoded['from_cache']})")

    print("=" * 70)
    print("✅ All tests completed successfully!")
    print("=" * 70)

test_apis_locally()

Testing APIs Locally (No Server)

1️⃣ Testing Text Generator...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


✓ Result: Artificial intelligence will be a real threat to the security of the world's security as it develops...

2️⃣ Testing Batched Sentiment...




config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


✓ 'I love Python!' -> POSITIVE (1.00)
✓ 'This is terrible.' -> NEGATIVE (1.00)
✓ 'Neutral statement.' -> NEGATIVE (0.99)

3️⃣ Testing Multi-Task API...


config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

Device set to use cpu


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/460M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/460M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

Device set to use cpu
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


✓ Sentiment: {'label': 'POSITIVE', 'score': 0.9998732805252075}

4️⃣ Testing Cached API...


Device set to use cpu


✓ Request 1: POSITIVE (cached: False)
✓ Request 2: POSITIVE (cached: True)
✓ Request 3: POSITIVE (cached: True)

✅ All tests completed successfully!
