In [1]:
%pip install pandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.11 -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

In [3]:
def json_to_matrix_df(data):
    # Convert each metric's dictionary to a pandas Series
    series_dict = {
        metric: pd.Series(stats) / 1_000 if metric != "chunkCount" else pd.Series(stats)
        for metric, stats in data.items()
    }
    
    # Combine all series into a DataFrame
    df = pd.DataFrame(series_dict).T
    
    return df

In [4]:
import json

def compare_and_display_data(source1: str, source2: str):
    with open(source1) as file:
        d1 = json.loads(file.read())

    with open(source2) as file:
        d2 = json.loads(file.read())
    
    for key in d1:
        print("-" * 5, key, "-" * 5)
        display(json_to_matrix_df(d1[key]))
        display(json_to_matrix_df(d2[key]))

### Ollama vs vLLM

1. **concurrent, cold-start:** 10 concurrent requests when container is _NOT_ up 
2. **concurrent, warm:** 10 concurrent requests after container is already up **(warm)**
3.  **sequential, cold-start:** 10 _sequential_ requests, when the container is NOT up

In [5]:
compare_and_display_data("./ollama/stats.json", "./vllm/stats.json")

----- concurrent-10-same-prompt-cold-start -----


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
timeToFirstChunk,10.226645,25.438313,16.485572,18.036188,18.531193,25.382757,25.427202,25.438313
totalTime,17.76459,32.602925,23.893794,25.32923,25.864102,32.005623,32.483465,32.602925
chunkCount,303.0,444.0,361.2,356.5,377.0,419.25,439.05,444.0
averageChunksPerSecond,0.010967,0.020995,0.015874,0.013759,0.020418,0.020844,0.020965,0.020995


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
timeToFirstChunk,66.337837,66.653838,66.620191,66.651594,66.653132,66.653663,66.653803,66.653838
totalTime,74.204132,79.332479,76.766023,76.854729,77.470559,78.673945,79.200772,79.332479
chunkCount,257.0,435.0,345.0,348.5,370.25,410.7,430.14,435.0
averageChunksPerSecond,0.003463,0.005483,0.004485,0.004529,0.004788,0.005218,0.00543,0.005483


----- concurrent-10-same-prompt-warm -----


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
timeToFirstChunk,2.452957,16.46163,7.562445,8.465916,9.485647,15.183736,16.206051,16.46163
totalTime,7.81995,21.55972,14.301585,14.989873,18.00039,21.08079,21.463934,21.55972
chunkCount,251.0,425.0,323.1,323.5,346.0,392.6,418.52,425.0
averageChunksPerSecond,0.013915,0.034972,0.025003,0.021904,0.033514,0.034747,0.034927,0.034972


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
timeToFirstChunk,0.568239,0.617344,0.59366,0.593684,0.594649,0.608836,0.615642,0.617344
totalTime,8.443688,13.016415,10.833495,10.78835,11.460038,12.890034,12.991139,13.016415
chunkCount,267.0,424.0,348.0,346.0,369.25,419.5,423.1,424.0
averageChunksPerSecond,0.031621,0.032574,0.032088,0.032071,0.032228,0.032544,0.032568,0.032574


----- sequential-same-prompt -----


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
timeToFirstChunk,0.472128,10.196388,1.46701,0.503713,0.519377,5.843355,9.325782,10.196388
totalTime,4.511455,14.493526,6.119197,5.294502,5.494137,10.559759,13.706773,14.493526
chunkCount,313.0,404.0,360.7,363.5,375.75,398.6,402.92,404.0
averageChunksPerSecond,0.023252,0.070899,0.065328,0.069923,0.070198,0.070723,0.070863,0.070899


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
timeToFirstChunk,0.446023,63.891905,6.837025,0.49556,0.535088,35.399704,58.193465,63.891905
totalTime,8.470826,74.389394,16.393501,10.149277,11.081401,45.995497,68.710614,74.389394
chunkCount,291.0,393.0,348.5,352.0,378.75,391.65,392.73,393.0
averageChunksPerSecond,0.005135,0.034804,0.03169,0.034678,0.034697,0.034798,0.034803,0.034804


## Ollama vs vLLM: batched concurrent requests: 1, 2, ... 10

In [6]:
import json
import pandas as pd
from typing import Dict, Any
from pathlib import Path

def batched_json_to_df(file_path: str, field: str) -> pd.DataFrame:
    """
    Convert a nested JSON file containing performance metrics into a 2D pandas DataFrame.
    
    Parameters:
    -----------
    file_path : str
        Path to the JSON file containing nested performance metrics
    field : str
        The metric field to extract (e.g., "timeToFirstChunk", "totalTime")
        
    Returns:
    --------
    pd.DataFrame
        A DataFrame where:
        - Index: Number of concurrent requests (1, 2, 3, ...)
        - Columns: Statistics (min, max, average, p50, ...)
        - Values: The corresponding metric values
        
    Example:
    --------
    >>> df = json_to_df("metrics.json", "timeToFirstChunk")
    >>> print(df)
              min       max   average       p50       p75       p95       p99      p100
    1  542.062209 542.06221 542.06221 542.06221 542.06221 542.06221 542.06221 542.06221
    2  369.981792 644.45421 507.21800 507.21800 575.83610 630.73059 641.70948 644.45421
    """
    
    # Validate and read JSON file
    try:
        with open(file_path, 'r') as f:
            data: Dict[str, Any] = json.load(f)
    except FileNotFoundError:
        raise FileNotFoundError(f"Could not find file: {file_path}")
    except json.JSONDecodeError:
        raise ValueError(f"Invalid JSON format in file: {file_path}")
    
    # Validate field exists
    if not any(field in entry for entry in data.values()):
        raise ValueError(f"Field '{field}' not found in data")
    
    # Define stat columns in desired order
    stat_columns = ['min', 'max', 'average', 'p50', 'p75', 'p95', 'p99', 'p100']
    
    # Extract data into dictionary for DataFrame
    df_data = {}
    for req_num, metrics in data.items():
        if field in metrics:
            df_data[int(req_num)] = {
                stat: metrics[field][stat]
                for stat in stat_columns
                if stat in metrics[field]
            }
    
    # Create DataFrame and sort index
    df = pd.DataFrame.from_dict(df_data, orient='index')
    df.sort_index(inplace=True)
    
    return df

## Ollama vs vLLM: batched concurrent requests: 1, 2, ... 10

In [7]:
print("----- time to first chunk (seconds) -----")
print("ollma:")
display(batched_json_to_df("./ollama/batched-concurrent-1-10-same-prompt-warm/stats.json", "timeToFirstChunk") / 1_000)
print("vLLM:")
display(batched_json_to_df("./vllm/batched-concurrent-1-10-same-prompt-warm/stats.json", "timeToFirstChunk") / 1_000)

----- time to first chunk (seconds) -----
ollma:


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
1,0.635483,0.635483,0.635483,0.635483,0.635483,0.635483,0.635483,0.635483
2,0.343561,0.499343,0.421452,0.421452,0.460398,0.491554,0.497785,0.499343
3,0.350625,0.504209,0.411276,0.378995,0.441602,0.491688,0.501705,0.504209
4,0.43106,0.505022,0.450045,0.432048,0.450763,0.49417,0.502851,0.505022
5,0.355785,7.517658,1.806057,0.386096,0.386708,6.091468,7.23242,7.517658
6,0.350514,8.97655,2.850625,0.499497,4.837467,8.303116,8.841863,8.97655
7,0.344395,8.467391,3.587836,0.5175,7.436944,8.177896,8.409492,8.467391
8,0.349951,8.905904,4.152758,3.745244,7.53283,8.76531,8.877785,8.905904
9,0.346472,15.153808,5.659753,8.038291,8.45667,12.853918,14.69383,15.153808
10,0.35437,15.297733,6.2067,7.267466,8.127731,14.974463,15.233079,15.297733


vLLM:


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
1,0.542062,0.542062,0.542062,0.542062,0.542062,0.542062,0.542062,0.542062
2,0.369982,0.644454,0.507218,0.507218,0.575836,0.630731,0.641709,0.644454
3,0.359018,0.489078,0.407376,0.37403,0.431554,0.477573,0.486777,0.489078
4,0.354277,0.480627,0.391681,0.365909,0.395155,0.463533,0.477208,0.480627
5,0.361354,0.495664,0.422386,0.38299,0.495014,0.495534,0.495638,0.495664
6,0.35505,0.463725,0.400752,0.374868,0.440305,0.463296,0.463639,0.463725
7,0.363855,0.501545,0.42847,0.379763,0.499517,0.501363,0.501509,0.501545
8,0.351668,0.465507,0.40169,0.386588,0.406433,0.465042,0.465414,0.465507
9,0.361868,0.48269,0.412114,0.384279,0.473506,0.479588,0.48207,0.48269
10,0.364537,0.515742,0.415856,0.39498,0.396548,0.514997,0.515593,0.515742


In [8]:
print("----- tokens per second -----")
print("ollma:")
display(batched_json_to_df("./ollama/batched-concurrent-1-10-same-prompt-warm/stats.json", "averageChunksPerSecond"))
print("vLLM:")
display(batched_json_to_df("./vllm/batched-concurrent-1-10-same-prompt-warm/stats.json", "averageChunksPerSecond"))

----- tokens per second -----
ollma:


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
1,63.69926,63.69926,63.69926,63.69926,63.69926,63.69926,63.69926,63.69926
2,56.466506,57.223739,56.845122,56.845122,57.034431,57.185877,57.216167,57.223739
3,46.263104,49.889997,48.441111,49.170233,49.530115,49.818021,49.875602,49.889997
4,43.625138,48.814938,45.466923,44.713808,45.932112,48.238373,48.699625,48.814938
5,29.539593,45.959037,41.115805,43.730573,44.030917,45.573413,45.881912,45.959037
6,22.424792,44.574557,37.472731,43.114258,44.155058,44.527403,44.565126,44.574557
7,21.534279,44.524877,34.585958,42.978923,43.664167,44.465176,44.512937,44.524877
8,20.816513,44.44313,33.445156,34.047287,44.002529,44.372355,44.428975,44.44313
9,13.15186,44.479134,30.753927,23.99555,44.220978,44.420851,44.467477,44.479134
10,17.413757,44.130455,30.245062,23.932957,43.767398,44.024654,44.109295,44.130455


vLLM:


Unnamed: 0,min,max,average,p50,p75,p95,p99,p100
1,34.703613,34.703613,34.703613,34.703613,34.703613,34.703613,34.703613,34.703613
2,33.822076,34.388686,34.105381,34.105381,34.247034,34.360356,34.38302,34.388686
3,33.908592,34.119917,34.027472,34.053906,34.086912,34.113316,34.118597,34.119917
4,33.005715,34.307448,33.728006,33.79943,34.045246,34.255008,34.29696,34.307448
5,33.026098,34.023151,33.392229,33.219605,33.476973,33.913915,34.001304,34.023151
6,32.809783,33.800736,33.373115,33.410089,33.562994,33.744727,33.789534,33.800736
7,32.74445,33.53908,33.099907,33.002321,33.277219,33.525108,33.536286,33.53908
8,31.97884,33.388816,32.953494,33.113384,33.225179,33.358438,33.382741,33.388816
9,32.263658,33.304383,32.749525,32.607271,33.046268,33.20495,33.284496,33.304383
10,32.287576,32.787016,32.543437,32.533002,32.606683,32.760491,32.781711,32.787016
