## Turn our Fine-Tuned Model into a DEMO

In [1]:
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

In [2]:
# Local fine-tuning checkpoint used for the demo.
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists.
MODEL_PATH = '/Users/berkeruveyik/pythonDersleri/finetune-llm/checkpoint_models/checkpoint-426'

# Load the causal-LM weights from the checkpoint directory.
loaded_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, dtype='auto', device_map='auto', attn_implementation='eager'
)

# The tokenizer is read from the same checkpoint directory as the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Bundle model + tokenizer into the text-generation pipeline used by the following cells.
loaded_model_pipeline = pipeline('text-generation', model=loaded_model, tokenizer=tokenizer)

The tokenizer you are loading from '/Users/berkeruveyik/pythonDersleri/finetune-llm/checkpoint_models/checkpoint-426' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.
Device set to use mps


In [3]:
# Smoke test: the input is a raw string rather than a list of chat messages.
loaded_model_pipeline("What is the capital of Turkey?")

[{'generated_text': 'What is the capital of Turkey? The capital of Turkey is Ankara.'}]

In [18]:
def format_message(input):
    """Wrap a raw user string in a single-turn chat message list.

    Args:
        input: Text typed by the user.

    Returns:
        A one-element list holding a ``{'role': 'user', 'content': input}`` dict.
    """
    user_turn = dict(role='user', content=input)
    return [user_turn]

# Example user text used to compare raw-string prompting with chat-formatted prompting.
input_message = 'bug√ºn evde salata balƒ±k ve rakƒ±dan olu≈üan enfis bir yemek yedim'

# Wrap it as a one-message conversation; the bare name on the last line displays the result.
input_formatted = format_message(input_message)
input_formatted

[{'role': 'user',
  'content': 'bug√ºn evde salata balƒ±k ve rakƒ±dan olu≈üan enfis bir yemek yedim'}]

In [19]:
# Feed only the message text (the 'content' string of the first message) to the pipeline.
loaded_model_pipeline(input_formatted[0]['content'])

[{'generated_text': 'bug√ºn evde salata balƒ±k ve rakƒ±dan olu≈üan enfis bir yemek yedim.'}]

In [20]:
# Render the conversation into a prompt string using the tokenizer's chat template.
input_prompt = loaded_model_pipeline.tokenizer.apply_chat_template(
    conversation=input_formatted,
    tokenize=False,  # keep the result as text instead of token ids
    add_generation_prompt=True  # end the prompt with the opening of the model's turn
)

# Display the rendered prompt.
input_prompt

'<bos><start_of_turn>user\nbug√ºn evde salata balƒ±k ve rakƒ±dan olu≈üan enfis bir yemek yedim<end_of_turn>\n<start_of_turn>model\n'

In [21]:
# Generate a completion for the chat-formatted prompt (at most 256 new tokens).
loaded_model_outputs = loaded_model_pipeline(text_inputs=input_prompt, max_new_tokens=256)

# Show the prompt, then only the text that follows it: the first len(input_prompt)
# characters of 'generated_text' are sliced off before printing.
print(f" [INFO] Input: \n{input_prompt}\n")
print(f" [INFO] Output: \n{loaded_model_outputs[0]['generated_text'][len(input_prompt):]}")

 [INFO] Input: 
<bos><start_of_turn>user
bug√ºn evde salata balƒ±k ve rakƒ±dan olu≈üan enfis bir yemek yedim<end_of_turn>
<start_of_turn>model


 [INFO] Output: 
{'is_food_or_drink': True, 'tags': ['re', 'fi'], 'food_items': ['salata', 'balƒ±k'], 'drink_items': []}


In [None]:
# Errors in ML are good, it shows you where to improve the model
# There will *always* be errors in ML, since the whole science is probabilistic anyway

## Turn our pipeline into a demo

Our demo is simple:
* Text in, formatted LLM text out
* Also want to time how long it takes per sample

In [27]:
import time 

def pred_on_text(input_text, max_new_tokens=256):
    """Run the fine-tuned pipeline on one user message and time the call.

    Args:
        input_text: Raw user text; it is sent as a single ``user`` chat message.
        max_new_tokens: Upper bound on generated tokens (defaults to the original 256).

    Returns:
        Tuple ``(generated_text, raw_output, total_time)``: the ``content`` of the last
        message in the returned conversation, the unmodified pipeline result, and the
        elapsed seconds rounded to 4 decimal places.
    """
    messages = [{'role': 'user', 'content': input_text}]

    # perf_counter() is monotonic, so the duration cannot be skewed by wall-clock adjustments.
    start_time = time.perf_counter()
    raw_output = loaded_model_pipeline(text_inputs=messages,
                                       max_new_tokens=max_new_tokens,
                                       disable_compile=True)
    total_time = round(time.perf_counter() - start_time, 4)

    # The returned conversation ends with the model's reply; take the final message
    # instead of a hard-coded index so the lookup does not depend on the prompt length.
    generated_text = raw_output[0]['generated_text'][-1]['content']

    return generated_text, raw_output, total_time

# Quick manual check of pred_on_text on a sample sentence; the displayed result is the
# (generated_text, raw_output, total_time) tuple it returns.
pred_on_text("bug√ºn evde k√∂fte patates yedim yanƒ±ndada kola i√ßtim √ßok g√ºzeldi") 

("{'is_food_or_drink': True, 'tags': ['fi'], 'food_items': ['k√∂fte', 'patates'], 'drink_items': []}",
 [{'generated_text': [{'role': 'user',
     'content': 'bug√ºn evde k√∂fte patates yedim yanƒ±ndada kola i√ßtim √ßok g√ºzeldi'},
    {'role': 'assistant',
     'content': "{'is_food_or_drink': True, 'tags': ['fi'], 'food_items': ['k√∂fte', 'patates'], 'drink_items': []}"}]}],
 3.4828)

### Gradio

In [31]:
import gradio as gr
import json

def parse_generated_text(text):
    """Turn the model's generated text into a Python object when possible.

    Parsing is attempted in this order:
      1. strict JSON via ``json.loads``;
      2. a Python literal via ``ast.literal_eval`` when the stripped text is wrapped in
         ``{...}`` (the model's outputs in this notebook are single-quoted dict reprs
         using ``True``/``False``, which are not valid JSON).

    Args:
        text: Text produced by the model.

    Returns:
        The parsed object, or ``{"raw_output": text}`` when neither parser accepts it.
    """
    # Imported locally so the function works without touching the notebook's import cell.
    import ast

    try:
        # Try to parse as JSON if it's JSON formatted
        return json.loads(text)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError subclass
        pass

    if not isinstance(text, str):
        # Nothing textual to inspect (e.g. None): hand it back in the fallback shape.
        return {"raw_output": text}

    text = text.strip()
    if text.startswith('{') and text.endswith('}'):
        try:
            # SECURITY: this text is model output derived from user input, so it must never
            # reach eval(); literal_eval accepts only literals and cannot execute code.
            return ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass
    return {"raw_output": text}

def gradio_predict(input_text):
    """Gradio callback: run the model on ``input_text`` and return display-ready strings.

    Args:
        input_text: Text from the input textbox.

    Returns:
        Tuple ``(output_json, time_info)``: a pretty-printed JSON string containing the
        input, the parsed model response and the processing time, plus a
        ``"<seconds> seconds"`` label. Blank or missing input yields a prompt message
        and ``"0 seconds"`` without calling the model.
    """
    # Guard against None as well as blank text so the callback never fails on .strip().
    if input_text is None or not input_text.strip():
        return "Please enter some text.", "0 seconds"

    generated_text, _raw_output, total_time = pred_on_text(input_text)

    # Parse the generated text
    parsed_output = parse_generated_text(generated_text)

    # Format output as pretty JSON. default=str keeps the callback from raising if the
    # parsed value contains something JSON cannot encode (e.g. a set literal).
    output_json = json.dumps(
        {
            "input": input_text,
            "model_response": parsed_output,
            "processing_time": total_time,
        },
        indent=2,
        ensure_ascii=False,
        default=str,
    )

    return output_json, f"{total_time} seconds"

# Gradio interface: a single text input wired to gradio_predict, whose two return values
# (pretty-printed JSON string, timing label) fill the two output components in order.
demo = gr.Interface(
    fn=gradio_predict,
    inputs=gr.Textbox(
        label="Input Text",
        placeholder="Enter your text here...",
        lines=3
    ),
    outputs=[
        gr.Code(label="Model Output (JSON)", language="json"),
        gr.Textbox(label="Processing Time")
    ],
    title="ü§ñ Fine-Tuned LLM Demo",
    description="Test your fine-tuned model. Enter text and see the model's response.",
    # Clickable example inputs; each inner list holds the value for the single input box.
    examples=[
        ["Today I ate meatballs and potatoes at home with cola, it was delicious"],
        ["What is the capital of Turkey?"],
        ["Hello, how are you?"],
        ["British Breakfast with baked beans, fried eggs, black pudding, sausages, bacon, mushrooms, a cup of tea and toast and fried tomatoes"],
    ],
    theme=gr.themes.Soft()
)

# Launch the demo; with share=False it is served on a local URL only (no public link).
demo.launch(share=False)

  super().__init__(


  super().__init__(


* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.




In [34]:
# Two-letter tag codes (such as the 're' and 'fi' seen in the model's 'tags' output)
# mapped to their long-form names.
tags_dict = dict(
    np='nutrition_panel',
    il='ingredient list',
    me='menu',
    re='recipe',
    fi='food_items',
    di='drink_items',
    fa='food_advertistment',
    fp='food_packaging',
)

## Upload our model to Hugging Face

In [None]:
from huggingface_hub import HfApi, create_repo

# Hub API client; reused by the upload cell that follows.
api = HfApi()
# Target model repository on the Hugging Face Hub.
repo_id = 'berkeruveyik/food-nutrition-analyzer-gemma3-270m'

# Create the repo (exist_ok=True: an already existing repo is not treated as an error)
create_repo(repo_id=repo_id, repo_type='model', exist_ok=True)

RepoUrl('https://huggingface.co/berkeruveyik/food-nutrition-analyzer-gemma3-270m', endpoint='https://huggingface.co', repo_type='model', repo_id='berkeruveyik/food-nutrition-analyzer-gemma3-270m')

In [45]:
# Upload the contents of the local checkpoint folder to the model repo `repo_id`.
# NOTE(review): the absolute checkpoint path is repeated here as a literal and is
# machine-specific — confirm it matches the checkpoint loaded at the top of the notebook.
api.upload_folder(
    folder_path='/Users/berkeruveyik/pythonDersleri/finetune-llm/checkpoint_models/checkpoint-426',
    repo_id=repo_id,
    repo_type='model',
)

Processing Files (7 / 7): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.65GB / 1.65GB, 62.1MB/s  
New Data Upload: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.61GB / 1.61GB, 62.1MB/s  


Processing Files (7 / 7): 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.65GB / 1.65GB, 62.1MB/s  
New Data Upload: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1.61GB / 1.61GB, 62.1MB/s  


CommitInfo(commit_url='https://huggingface.co/berkeruveyik/food-nutrition-analyzer-gemma3-270m/commit/b26bd55aa28b8779847dca4ed3c2210ea74e3dfd', commit_message='Upload folder using huggingface_hub', commit_description='', oid='b26bd55aa28b8779847dca4ed3c2210ea74e3dfd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/berkeruveyik/food-nutrition-analyzer-gemma3-270m', endpoint='https://huggingface.co', repo_type='model', repo_id='berkeruveyik/food-nutrition-analyzer-gemma3-270m'), pr_revision=None, pr_num=None)

## Upload our demo to Hugging Face

To make our demo, we need:
* 'app.py' - Entry point for our app
* 'README.md' - Tells people what our app does
* 'requirements.txt' - Tells Hugging Face Spaces what our app requires
    * only need: torch, transformers, gradio (the requirements.txt written below also lists accelerate)

In [69]:
import gradio as gr
import json
import time
import spaces
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# Hugging Face Hub id of the fine-tuned model that the app loads.
MODEL_PATH = 'berkeruveyik/food-nutrition-analyzer-gemma3-270m'

# Load model and tokenizer
loaded_model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    dtype='auto',  # `torch_dtype` is deprecated in favour of `dtype` (see the warning this cell used to emit)
    device_map='auto',
    attn_implementation='eager'
)

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# Text-generation pipeline shared by every request handled by the app.
loaded_model_pipeline = pipeline(
    'text-generation',
    model=loaded_model,
    tokenizer=tokenizer
)

@spaces.GPU
def pred_on_text(input_text, max_new_tokens=256):
    """Generate a prediction for one user message and time the call.

    Args:
        input_text: Raw user text; it is sent as a single ``user`` chat message.
        max_new_tokens: Upper bound on generated tokens (defaults to the original 256).

    Returns:
        Tuple ``(generated_text, raw_output, total_time)``: the ``content`` of the last
        message in the returned conversation, the unmodified pipeline result, and the
        elapsed seconds rounded to 4 decimal places.
    """
    messages = [{'role': 'user', 'content': input_text}]

    # perf_counter() is monotonic, so the duration cannot be skewed by wall-clock adjustments.
    start_time = time.perf_counter()
    raw_output = loaded_model_pipeline(
        text_inputs=messages,
        max_new_tokens=max_new_tokens
    )
    total_time = round(time.perf_counter() - start_time, 4)

    # Read the final message of the returned conversation instead of hard-coding index 1,
    # so the lookup does not depend on how many messages were sent.
    generated_text = raw_output[0]['generated_text'][-1]['content']

    return generated_text, raw_output, total_time

def parse_generated_text(text):
    """Turn the model's generated text into a Python object when possible.

    Parsing is attempted in this order:
      1. strict JSON via ``json.loads``;
      2. a Python literal via ``ast.literal_eval`` when the stripped text is wrapped in
         ``{...}`` (single-quoted dict reprs using ``True``/``False`` are not valid JSON).

    Args:
        text: Text produced by the model.

    Returns:
        The parsed object, or ``{"raw_output": text}`` when neither parser accepts it.
    """
    # Imported locally so the function does not depend on the module's import block.
    import ast

    try:
        return json.loads(text)
    except (TypeError, ValueError):  # json.JSONDecodeError is a ValueError subclass
        pass

    if not isinstance(text, str):
        # Nothing textual to inspect (e.g. None): hand it back in the fallback shape.
        return {"raw_output": text}

    text = text.strip()
    if text.startswith('{') and text.endswith('}'):
        try:
            # SECURITY: this text is model output derived from user input, so it must never
            # reach eval(); literal_eval accepts only literals and cannot execute code.
            return ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            pass
    return {"raw_output": text}

def format_output(input_text, parsed_output, total_time):
    """Format the prediction as readable text with each field on its own line.

    Args:
        input_text: The text the user submitted.
        parsed_output: Result of ``parse_generated_text``; normally a dict with any of
            ``is_food_or_drink``, ``tags``, ``food_items``, ``drink_items``, or the
            ``{"raw_output": ...}`` fallback. Any other value is shown as raw output.
        total_time: Processing time in seconds.

    Returns:
        A newline-joined string: input line, separator, one line per available field,
        separator, and the processing-time line.
    """
    # A non-dict value (e.g. a bare number parsed from JSON) cannot be probed by key;
    # show it as raw output instead of raising.
    if not isinstance(parsed_output, dict):
        parsed_output = {"raw_output": parsed_output}

    separator = "‚îÅ" * 50
    output_lines = [f"üìù Input: {input_text}", "", separator, ""]

    # (key, icon) pairs in display order; only keys present in the parsed output are shown.
    labelled_fields = (
        ("is_food_or_drink", "üçΩÔ∏è"),
        ("tags", "üè∑Ô∏è"),
        ("food_items", "üçî"),
        ("drink_items", "ü•§"),
    )
    for key, icon in labelled_fields:
        if key in parsed_output:
            output_lines.append(f"{icon} {key}: {parsed_output[key]}")

    # When parsing failed, still surface what the model said rather than an empty body.
    if "raw_output" in parsed_output:
        output_lines.append(f"raw_output: {parsed_output['raw_output']}")

    output_lines.append("")
    output_lines.append(separator)
    output_lines.append(f"‚è±Ô∏è processing_time: {total_time} seconds")

    return "\n".join(output_lines)

def gradio_predict(input_text):
    """Gradio callback: run the model on ``input_text`` and return formatted text.

    Args:
        input_text: Text from the input textbox.

    Returns:
        The multi-line string built by ``format_output``, or a prompt message when the
        input is missing or blank (the model is not called in that case).
    """
    # Guard against None as well as blank text so the callback never fails on .strip().
    if input_text is None or not input_text.strip():
        return "Please enter some text."

    generated_text, _raw_output, total_time = pred_on_text(input_text)
    parsed_output = parse_generated_text(generated_text)

    return format_output(input_text, parsed_output, total_time)

# Gradio interface: one text input wired to gradio_predict, whose single string return
# value is shown in a multi-line textbox.
demo = gr.Interface(
    fn=gradio_predict,
    inputs=gr.Textbox(
        label="Input Text",
        placeholder="Enter your text here...",
        lines=3
    ),
    outputs=gr.Textbox(
        label="Model Output",
        lines=12
    ),
    title="üçî Food & Nutrition Analyzer",
    description="Enter text describing food or drinks to extract structured nutrition information using a fine-tuned Gemma3 model.",
    # Clickable example inputs; each inner list holds the value for the single input box.
    examples=[
        ["Today I ate meatballs and potatoes at home with cola, it was delicious"],
        ["British Breakfast with baked beans, fried eggs, black pudding, sausages, bacon, mushrooms, a cup of tea and toast"],
        ["I had a chicken salad with olive oil dressing and sparkling water"],
        ["For lunch I ordered pizza margherita with extra cheese and a glass of lemonade"],
        ["Grilled salmon with steamed vegetables and white wine for dinner"],
        ["My morning started with oatmeal, fresh berries, honey and a cup of black coffee"],
        ["We shared nachos with guacamole, sour cream, and margaritas at the Mexican restaurant"],
        ["Japanese ramen with pork belly, soft boiled egg, nori and green tea"],
        ["Homemade pasta carbonara with parmesan cheese and a bottle of red wine"],
        ["Smoothie bowl with banana, mango, chia seeds, granola and almond milk"],
    ],
    theme=gr.themes.Soft()
)

# Start the app only when this code runs as the main module (not when it is imported).
if __name__ == "__main__":
    demo.launch()

`torch_dtype` is deprecated! Use `dtype` instead!
Device set to use mps
  super().__init__(


* Running on local URL:  http://127.0.0.1:7864
* To create a public link, set `share=True` in `launch()`.


In [62]:
%%writefile demo/FoodExtractApp/README.md
---
title: Food & Nutrition Analyzer
emoji: üçî
colorFrom: green
colorTo: yellow
sdk: gradio
sdk_version: 4.0.0
app_file: app.py
pinned: false
---

# Food & Nutrition Analyzer üçî             

A fine-tuned Gemma3-270M model for extracting structured nutrition information from food descriptions.

## Description

This demo uses a fine-tuned language model to analyze text descriptions of food and drinks, extracting structured information about nutrition and ingredients.

## Usage

Simply enter a description of what you ate or drank, and the model will extract relevant information in a structured JSON format.

## Examples

- "Today I ate meatballs and potatoes at home with cola, it was delicious"
- "I had a chicken salad with olive oil dressing and sparkling water"

## Model

- **Base Model**: Gemma3-270M
- **Fine-tuned on**: Food and nutrition data
- **Model ID**: berkeruveyik/food-nutrition-analyzer-gemma3-270m

## License

Please check the model license on the Hugging Face model page.

Overwriting demo/FoodExtractApp/README.md


In [63]:
%%writefile demo/FoodExtractApp/requirements.txt
transformers
torch
gradio
accelerate

Overwriting demo/FoodExtractApp/requirements.txt


### upload our demo to the Hugging Face Hub

In [64]:
from huggingface_hub import create_repo, get_full_repo_name, upload_folder

# 1. Define the parameters for upload
# NOTE(review): README.md and requirements.txt are written into this folder by earlier
# %%writefile cells, but no cell visible here writes app.py — confirm it exists before uploading.
LOCAL_DEMO_FOLDER_PATH = "demo/FoodExtractApp/"
HF_SPACE_NAME = "food-nutrition-analyzer"
HF_REPO_TYPE = "space"
HF_SPACE_SDK = "gradio"

# 2. Create a Space repository on Hugging Face Hub
#    (exist_ok=True: an already existing Space is not treated as an error)
print(f"[INFO] Creating repo on Hugging Face Hub with name: {HF_SPACE_NAME}")
create_repo(
    repo_id=HF_SPACE_NAME,
    repo_type=HF_REPO_TYPE,
    private=False,
    space_sdk=HF_SPACE_SDK,
    exist_ok=True
)

# 3. Get the full repository name (the "<username>/<space-name>" form used for the upload and URL)
full_repo_name = get_full_repo_name(model_id=HF_SPACE_NAME)
print(f"[INFO] Full Hugging Face Hub repo name: {full_repo_name}")

# 4. Upload demo folder to Hugging Face Space
print(f"[INFO] Uploading {LOCAL_DEMO_FOLDER_PATH} to repo: {full_repo_name}")
upload_folder(
    folder_path=LOCAL_DEMO_FOLDER_PATH,
    repo_id=full_repo_name,
    repo_type=HF_REPO_TYPE
)

print(f"[INFO] Upload complete! View your Space at: https://huggingface.co/spaces/{full_repo_name}")

[INFO] Creating repo on Hugging Face Hub with name: food-nutrition-analyzer
[INFO] Full Hugging Face Hub repo name: berkeruveyik/food-nutrition-analyzer
[INFO] Uploading demo/FoodExtractApp/ to repo: berkeruveyik/food-nutrition-analyzer
[INFO] Upload complete! View your Space at: https://huggingface.co/spaces/berkeruveyik/food-nutrition-analyzer


## Speeding up our inference time with batched inference
Right now our model can predict on a single sample in about 0.3 → 1.0s.

However, if we wanted to run this at scale on say 100M+ samples, this would take far too long.

So we need a way to speed up our model's inference.

One way to do that is batched inference.

In batched inference mode, your model performs predictions on a number of samples at once; this can dramatically improve sample throughput.

The number of samples you can predict on at once will depend on a few factors:

* The size of your model (e.g. if your model is quite large, it may only be able to predict on 1 sample at a time)
* Your compute VRAM (e.g. if your compute VRAM is already saturated, adding multiple samples at a time may result in errors)
* The size of your samples (if one of your samples is 100x the size of others, this may cause errors with batched inference)


To find an optimal batch size for our setup, we can run an experiment:
* Loop through different batch sizes and measure the throughput for each batch size.
    * Why do we do this?
        * It's hard to tell the ideal batch size ahead of time.
        * So we experiment from say 1, 2, 4, 16, 32, 64 batch sizes and see which performs best.
        * Just because we may get a speed up from using batch size 8, doesn't mean 64 will be better.

In [None]:
# Step 1: Need to turn our samples into batches
# Step 2: Need to perform batched inference
# Step 3: Unwind batched samples and prediction outputs to view them as normal

In [65]:
from datasets import load_dataset

# Load the "mrdbourke/FoodExtract-1k" dataset by its Hub id.
dataset = load_dataset("mrdbourke/FoodExtract-1k")

# Report how many rows the 'train' split holds before any splitting.
print(f"[INFO] Number of samples in the dataset: {len(dataset['train'])}")

def sample_to_conversation(sample):
    """Convert one dataset row into a two-message conversation.

    Args:
        sample: Mapping with a ``"sequence"`` entry (the input text) and a
            ``"gpt-oss-120b-label-condensed"`` entry (the generated label).

    Returns:
        ``{"messages": [user_message, label_message]}``.
    """
    # Input text from the dataset becomes the user turn.
    user_turn = {"role": "user", "content": sample["sequence"]}
    # The gpt-oss-120b generated label becomes the second turn.
    # NOTE(review): it is stored under the "system" role rather than an assistant/model
    # role — confirm this is intended.
    label_turn = {"role": "system", "content": sample["gpt-oss-120b-label-condensed"]}
    return {"messages": [user_turn, label_turn]}

# Map our sample_to_conversation function to dataset, one sample per call (batched=False);
# the returned "messages" entry shows up as a new column in the output below
dataset = dataset.map(sample_to_conversation,
                      batched=False)

# Create a train/test split (20% of the rows go to test)
# NOTE(review): with shuffle=False the rows are presumably split in their stored order and
# the seed has no effect — confirm against the datasets documentation.
dataset = dataset["train"].train_test_split(test_size=0.2,
                                            shuffle=False,
                                            seed=42)

# Number #1 rule in machine learning
# Always train on the train set and test on the test set
# This gives us an indication of how our model will perform in the real world
dataset

[INFO] Number of samples in the dataset: 1420


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1420/1420 [00:00<00:00, 11781.29 examples/s]


[INFO] Number of samples in the dataset: 1420


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1420/1420 [00:00<00:00, 11781.29 examples/s]


DatasetDict({
    train: Dataset({
        features: ['sequence', 'image_url', 'class_label', 'source', 'char_len', 'word_count', 'syn_or_real', 'uuid', 'gpt-oss-120b-label', 'gpt-oss-120b-label-condensed', 'target_food_names_to_use', 'caption_detail_level', 'num_foods', 'target_image_point_of_view', 'messages'],
        num_rows: 1136
    })
    test: Dataset({
        features: ['sequence', 'image_url', 'class_label', 'source', 'char_len', 'word_count', 'syn_or_real', 'uuid', 'gpt-oss-120b-label', 'gpt-oss-120b-label-condensed', 'target_food_names_to_use', 'caption_detail_level', 'num_foods', 'target_image_point_of_view', 'messages'],
        num_rows: 284
    })
})

In [66]:
# Step 1: Need to turn our samples into batches (e.g. lists of samples)
print(f"[INFO] Formatting test samples into list prompts...")
# One prompt string per test sample. Only the first message (the user turn) is kept via
# [:1], so the label stored in the second message never appears in the prompt.
test_input_prompts = [
    loaded_model_pipeline.tokenizer.apply_chat_template(
        item["messages"][:1],
        tokenize=False,  # keep the prompt as text
        add_generation_prompt=True  # end the prompt with the opening of the model's turn
    )
    for item in dataset["test"]
]
print(f"[INFO] Number of test sample prompts: {len(test_input_prompts)}")
# Display the first formatted prompt as a spot check.
test_input_prompts[0]

[INFO] Formatting test samples into list prompts...
[INFO] Number of test sample prompts: 284


'<bos><start_of_turn>user\nLiving Planet Goat Milk Whole Milk, 1 Litre, GMO Free, Australian Dairy, 8.75g Protein Per Serve, Good Source of Calcium.<end_of_turn>\n<start_of_turn>model\n'

In [None]:
# Step 2: Need to perform batched inference and time each step
import time
from tqdm.auto import tqdm

# Predictions from the most recent batch-size run (one entry per test prompt, in prompt order).
all_outputs = []

# Let's write a list of batch sizes to test
chunk_sizes_to_test = [1, 4, 8, 16, 32, 64, 128]
timing_dict = {}  # batch size -> total seconds to predict on every test prompt

num_prompts = len(test_input_prompts)

# Loop through each batch size and time the inference
for CHUNK_SIZE in chunk_sizes_to_test:
    print(f"[INFO] Making predictions with batch size: {CHUNK_SIZE}")

    # Start from an empty list for every batch size; otherwise the list would end up holding
    # the predictions of all runs concatenated and could not be lined up with the prompts.
    all_outputs = []

    # perf_counter() is monotonic, so the measurement is unaffected by wall-clock adjustments.
    start_time = time.perf_counter()

    # Walk the prompts by start offset so the final (possibly shorter) batch is included.
    # Using round(len / CHUNK_SIZE) as the batch count skipped the tail whenever the remainder
    # was under half a batch (e.g. 284 prompts at batch size 64 covered only 256 of them),
    # which made the larger batch sizes look faster than they are.
    for chunk_start in tqdm(range(0, num_prompts, CHUNK_SIZE)):
        batched_inputs = test_input_prompts[chunk_start: chunk_start + CHUNK_SIZE]
        batched_outputs = loaded_model_pipeline(text_inputs=batched_inputs,
                                                batch_size=CHUNK_SIZE,
                                                max_new_tokens=256,
                                                disable_compile=True)

        all_outputs += batched_outputs

    total_time = time.perf_counter() - start_time
    timing_dict[CHUNK_SIZE] = total_time

    # Every batch size must have predicted on the same number of samples for the timings
    # to be comparable.
    assert len(all_outputs) == num_prompts, (
        f"expected {num_prompts} predictions, got {len(all_outputs)}"
    )

    print()
    print(f"[INFO] Total time for batch size {CHUNK_SIZE}: {total_time:.2f}s")
    print("="*80 + "\n\n")

In [None]:
import matplotlib.pyplot as plt

# Benchmark results: batch size -> total inference time in seconds.
data = timing_dict

# Number of test samples, used as the numerator for throughput.
total_samples = len(dataset["test"])

# Parallel lists in the dict's insertion order: batch sizes, their total times, and
# the derived throughput (samples / second).
batch_sizes = list(data)
inference_times = [data[bs] for bs in batch_sizes]
samples_per_second = [total_samples / elapsed for elapsed in inference_times]

# Create side-by-side plots
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 5))

# --- Left plot: Total Inference Time ---
# Bars use the batch sizes as string categories, so bar i sits at x position i.
ax1.bar([str(bs) for bs in batch_sizes], inference_times, color='steelblue')
ax1.set_xlabel('Batch Size')
ax1.set_ylabel('Total Inference Time (s)')
ax1.set_title('Inference Time by Batch Size')

# Value label just above each bar.
for i, v in enumerate(inference_times):
    ax1.text(i, v + 1, f'{v:.1f}', ha='center', fontsize=9)

# --- ARROW LOGIC (Left) ---
# 1. Identify Start (Slowest) and End (Fastest)
start_val = max(inference_times)
end_val = min(inference_times)
start_idx = inference_times.index(start_val)
end_idx = inference_times.index(end_val)

# How many times faster the fastest run is than the slowest.
speedup = start_val / end_val

# 2. Draw Arrow (No Text)
# connectionstyle "rad=-0.3" arcs the arrow upwards
# NOTE(review): the +10 and 0.5*end_val offsets look hand-tuned for one set of timings —
# confirm the arrow still lands sensibly for other results.
ax1.annotate("",
             xy=(end_idx, end_val+(0.5*end_val)),
             xytext=(start_idx+0.25, start_val+10),
             arrowprops=dict(arrowstyle="->", color='green', lw=1.5, connectionstyle="arc3,rad=-0.3"))

# 3. Place Text at Midpoint
mid_x = (start_idx + end_idx) / 2
# Place text slightly above the highest point of the two bars
text_y = max(start_val, end_val) + (max(inference_times) * 0.1)

# NOTE(review): the fixed -150 shift is presumably in seconds (y-axis units); if the slowest
# run takes much less than that, the label would fall below the axis — confirm.
ax1.text(mid_x+0.5, text_y-150, f"{speedup:.1f}x speedup",
         ha='center', va='bottom', fontweight='bold',
         bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="none", alpha=0.8))

ax1.set_ylim(0, max(inference_times) * 1.35) # Increase headroom for text


# --- Right plot: Samples per Second ---
# Bars use the batch sizes as string categories, so bar i sits at x position i.
ax2.bar([str(bs) for bs in batch_sizes], samples_per_second, color='coral')
ax2.set_xlabel('Batch Size')
ax2.set_ylabel('Samples per Second')
ax2.set_title('Throughput by Batch Size')

# Value label just above each bar.
for i, v in enumerate(samples_per_second):
    ax2.text(i, v + 0.05, f'{v:.2f}', ha='center', fontsize=9)

# --- ARROW LOGIC (Right) ---
# 1. Identify Start (Slowest) and End (Fastest)
start_val_t = min(samples_per_second)
end_val_t = max(samples_per_second)
start_idx_t = samples_per_second.index(start_val_t)
end_idx_t = samples_per_second.index(end_val_t)

# Ratio of the highest throughput to the lowest.
speedup_t = end_val_t / start_val_t

# 2. Draw Arrow (No Text)
# NOTE(review): the +0.6 offset looks hand-tuned for one set of throughput values —
# confirm the arrow still lands sensibly for other results.
ax2.annotate("",
             xy=(end_idx_t-(0.05*end_idx_t), end_val_t+(0.025*end_val_t)),
             xytext=(start_idx_t, start_val_t+0.6),
             arrowprops=dict(arrowstyle="->", color='green', lw=1.5, connectionstyle="arc3,rad=-0.3"))

# 3. Place Text at Midpoint
mid_x_t = (start_idx_t + end_idx_t) / 2
text_y_t = max(start_val_t, end_val_t) + (max(samples_per_second) * 0.1)

# NOTE(review): the fixed -4.5 shift is presumably in samples/second (y-axis units); with a
# peak throughput below that, the label would fall below the axis — confirm.
ax2.text(mid_x_t-0.5, text_y_t-4.5, f"{speedup_t:.1f}x speedup",
         ha='center', va='bottom', fontweight='bold',
         bbox=dict(boxstyle="round,pad=0.3", fc="white", ec="none", alpha=0.8))

ax2.set_ylim(0, max(samples_per_second) * 1.35) # Increase headroom

# Name the device the pipeline actually runs on instead of hard-coding a hardware name:
# the fixed "NVIDIA DGX Spark" title did not match the "Device set to use mps" log that
# the model-loading cells print in this notebook.
benchmark_device = str(loaded_model_pipeline.device)
plt.suptitle(f"Inference with Fine-Tuned Gemma 3 270M on {benchmark_device}")
plt.tight_layout()
# Save the figure to disk before showing it.
plt.savefig('inference_benchmark.png', dpi=150)
plt.show()

In [None]:
# Throughput at the fastest batch size. Keep full precision for the per-day extrapolation
# and round only when printing: rounding to 2 decimals first can shift the daily figure by
# up to ~432 samples (0.005 * 86,400).
samples_per_second = len(dataset["test"]) / min(timing_dict.values())
seconds_in_a_day = 86_400
samples_per_day = seconds_in_a_day * samples_per_second

print(f"[INFO] Number of samples per second: {samples_per_second:.2f} | Number of samples per day: {samples_per_day:,.0f}")

