In [None]:
import pandas as pd
import utils as ut
import re

import plotly.express as px

## List of models

The models tested range from 7B to 70B parameters; all are quantized versions.  
Models that are not directly available in a quantized version were converted with the koboldcpp tools.  

In [24]:
# (short alias, quantized GGUF file name) for every benchmarked model.
# The order of the pairs is the insertion order of the resulting dict.
_MODEL_FILES = (
    ("airoboros-7B", "airoboros-mistral2.2-7b.Q4_K_S.gguf"),
    ("llama3-70B-inst", "Meta-Llama-3.1-70B-Instruct-IQ4_XS.gguf"),
    ("llama3-8B-inst", "Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf"),
    ("mistral-small-24B", "mistralai_Mistral-Small-3.2-24B-Instruct-2506-IQ4_XS.gguf"),
    ("deepseek-8B", "DeepSeek-R1-0528-Qwen3-8B-Q4_K_S.gguf"),
    ("deepseek-8B-UD", "DeepSeek-R1-0528-Qwen3-8B-UD-IQ1_S.gguf"),
    ("olmo-7B", "OLMo-2-1124-7B-Instruct-Q4_K_S.gguf"),
    ("qwen2.5-coder-7B", "Qwen2.5.1-Coder-7B-Instruct-Q4_K_S.gguf"),
    ("qwen2.5-coder-32B", "Qwen2.5-Coder-32B-Instruct-Q4_K_S.gguf"),
    ("nemotron-70B", "Llama-3.1-Nemotron-70B-Instruct-HF-IQ4_XS.gguf"),
)

# Alias -> file name lookup used by the rest of the notebook.
models_dic = dict(_MODEL_FILES)


Models quick summary

In [None]:
# One summary row per model: alias, GGUF file name and the value returned by
# ut.size(alias) (parameter count in billions, judging by the saved output - TODO confirm).
model_rows = []
for alias, gguf_file in models_dic.items():
    model_rows.append({"name": alias, "full_name": gguf_file, "size": ut.size(alias)})

df_models = pd.DataFrame(model_rows)

In [49]:
# Largest models first; the sorted frame is rebound to the same name and displayed.
df_models = df_models.sort_values(by="size", ascending=False)
df_models

Unnamed: 0,name,full_name,size
1,llama3-70B-inst,Meta-Llama-3.1-70B-Instruct-IQ4_XS.gguf,70.0
9,nemotron-70B,Llama-3.1-Nemotron-70B-Instruct-HF-IQ4_XS.gguf,70.0
8,qwen2.5-coder-32B,Qwen2.5-Coder-32B-Instruct-Q4_K_S.gguf,32.0
3,mistral-small-24B,mistralai_Mistral-Small-3.2-24B-Instruct-2506-...,24.0
5,deepseek-8B-UD,DeepSeek-R1-0528-Qwen3-8B-UD-IQ1_S.gguf,8.0
2,llama3-8B-inst,Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf,8.0
4,deepseek-8B,DeepSeek-R1-0528-Qwen3-8B-Q4_K_S.gguf,8.0
0,airoboros-7B,airoboros-mistral2.2-7b.Q4_K_S.gguf,7.0
7,qwen2.5-coder-7B,Qwen2.5.1-Coder-7B-Instruct-Q4_K_S.gguf,7.0
6,olmo-7B,OLMo-2-1124-7B-Instruct-Q4_K_S.gguf,7.0


Note that each model belongs to a model family and is sometimes the result of several fine-tuning steps; e.g. the Nemotron-70B used here is derived from Llama 3.1.  
**Base model** : meta-llama/Llama-3.1-70B  
-> **Finetuned** : meta-llama/Llama-3.1-70B-Instruct  
--> **Finetuned** : nvidia/Llama-3.1-Nemotron-70B-Instruct-HF  
---> **Quantized** : Llama-3.1-Nemotron-70B-Instruct-HF-IQ4_XS.gguf (this model)

## Steps
 - Launch GGUF model
 - Eval test prompt with *perf.py*
 - Stats


In [None]:
# repeated for each model 
# 1 - to launch KoboldCPP with the specified model
model = "../airoboros-mistral2.2-7b.Q4_K_S.gguf"
!../koboldcpp --model $model --launch
# 2 - koboldcpp eval, works directly with latest inference
python3 ./perf.py 

## Results


Once launched, we interact with the model using the KoboldCPP interface.  
Prompts: the speeds below are averages over 5 runs of the same 2 prompts on different topics (general nutrition and code).

In [None]:
# Read the raw benchmark log and let the project helper turn it into a DataFrame.
with open('bench.log') as log_file:
    raw_log = log_file.read()
df_logs = ut.log_to_df(raw_log)

# Drop the ' Speed' suffix from the column names.
df_logs.columns = [col.replace(' Speed', '') for col in df_logs.columns]
df_logs

Unnamed: 0,Model,Processing,Generation,Total Tokens per Second
0,airoboros-mistral2.2-7b.Q4_K_S,424.66,106.57,98.52
1,DeepSeek-R1-0528-Qwen3-8B-Q4_K_S.gguf,1105.26,82.43,589.11
2,DeepSeek-R1-0528-Qwen3-8B-UD-IQ1_S.gguf,210.0,88.6,111.93
3,LLama-3.1-Nemotron-70B-Instruct-HF-IQ4_XS.gguf,589.53,15.46,633.74
4,Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf,1074.07,90.7,581.07
5,Meta-Llama-3.1-70B-Instruct-IQ4_XS.gguf,315.22,15.61,170.53
6,mistralai_Mistral-Small-3.2-24B-Instruct-2506-...,1157.89,40.86,618.32
7,OLMo-2-1124-7B-Instruct-Q4_K_S.gguf,2000.0,91.14,1096.0
8,Qwen2.5-Coder-32B-Instruct-Q4_K_S.gguf,1333.33,29.06,720.0
9,Qwen2.5.1-Coder-7B-Instruct-Q4_K_S.gguf,2000.0,97.17,544.0


In [None]:
# Rank the models by overall throughput, fastest first.
total_col = 'Total Tokens per Second'
df_speed = df_logs.sort_values(by=total_col, ascending=False)

# Bar chart of total speed; bar colour encodes the same value, colour bar hidden.
fig1 = px.bar(
    df_speed,
    x='Model',
    y=total_col,
    color=total_col,
    color_continuous_scale='Viridis',
    title='LLM Benchmark: Total Tokens per Second',
    labels={total_col: 'Total Speed (tokens/sec)'},
)
fig1.update_layout(
    xaxis={'tickangle': -45},
    coloraxis={'showscale': False},
    hovermode='x unified',
)
fig1.show()

In [80]:
# Processing and generation speeds are rates (tokens/sec) of two different phases.
# They are drawn side by side: stacking would add them into a bar height that
# does not correspond to any measured quantity.
fig2 = px.bar(df_speed,
             x='Model',
             y=['Processing', 'Generation'],
             title='Processing vs Generation Speed Breakdown',
             labels={'value': 'Speed (tokens/sec)'},
             color_discrete_map={'Processing':'#636EFA', 'Generation':'#EF553B'},
             barmode='group')

fig2.update_layout(xaxis_tickangle=-45,
                 hovermode='x unified')
fig2.show()

In [83]:
# NOTE(review): the saved output of this cell already shows a 'Size' column, but that
# column is only created by the cell that follows (saved with an earlier execution
# count). On a fresh top-to-bottom run this display will not include 'Size'.
df_speed

Unnamed: 0,Model,Processing,Generation,Total Tokens per Second,Size
7,OLMo-2-1124-7B-Instruct-Q4_K_S.gguf,2000.0,91.14,1096.0,7.0
8,Qwen2.5-Coder-32B-Instruct-Q4_K_S.gguf,1333.33,29.06,720.0,32.0
3,LLama-3.1-Nemotron-70B-Instruct-HF-IQ4_XS.gguf,589.53,15.46,633.74,70.0
6,mistralai_Mistral-Small-3.2-24B-Instruct-2506-...,1157.89,40.86,618.32,24.0
1,DeepSeek-R1-0528-Qwen3-8B-Q4_K_S.gguf,1105.26,82.43,589.11,8.0
4,Meta-Llama-3.1-8B-Instruct-IQ4_XS.gguf,1074.07,90.7,581.07,8.0
9,Qwen2.5.1-Coder-7B-Instruct-Q4_K_S.gguf,2000.0,97.17,544.0,7.0
5,Meta-Llama-3.1-70B-Instruct-IQ4_XS.gguf,315.22,15.61,170.53,70.0
2,DeepSeek-R1-0528-Qwen3-8B-UD-IQ1_S.gguf,210.0,88.6,111.93,8.0
0,airoboros-mistral2.2-7b.Q4_K_S,424.66,106.57,98.52,


In [82]:
# Extract model size from names
df_speed['Size'] = df_speed['Model'].str.extract(r'(\d+)B').astype(float)

fig3 = px.scatter(df_speed,
                 x='Size',
                 y='Total Tokens per Second',
                 size='Processing',
                 color='Generation',
                 hover_name='Model',
                 title='Gen Performance vs Model Size',
                 labels={'Size':'Model Size (B)',
                        'Total Tokens per Second':'Total Speed',
                        'Processing':'Processing Speed',
                        'Generation':'Generation Speed'},
                 log_x=True)

fig3.update_traces(marker=dict(opacity=0.7, line=dict(width=1, color='DarkSlateGrey')))
fig3.show()

## Direct benchmark results
koboldcpp comes with a direct benchmark option (Feb '24 update); here are the results:


In [71]:
# add size
df_bench = pd.read_csv("../models/bench.csv", index_col=False)
df_bench.head(3)


Length of header or names does not match length of data. This leads to a loss of data with index_col=False.



Unnamed: 0,Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags
0,2025-06-25 07:47:04.689613+00:00,koboldcpp_cublas.so,35,airoboros-mistral2.2-7b.Q4_K_S,4096,100,0.92,4362.45,1.25,80.13,2.16,1 1 1 1,NoAVX2=False Threads=17 HighPriority=False Cub...
1,2025-06-25 07:47:17.604407+00:00,koboldcpp_cublas.so,39,DeepSeek-R1-0528-Qwen3-8B-Q4_K_S,4096,100,0.99,4056.85,1.53,65.4,2.51,1 1 1 1,NoAVX2=False Threads=17 HighPriority=False Cub...
2,2025-06-25 07:47:28.277415+00:00,koboldcpp_cublas.so,39,DeepSeek-R1-0528-Qwen3-8B-UD-IQ1_S,4096,100,1.14,3511.42,1.22,82.24,2.35,1 1 1 1,NoAVX2=False Threads=17 HighPriority=False Cub...


In [72]:
# Build the join key: the GGUF file name without its ".gguf" extension (and without
# an optional numeric suffix after it), to line up with df_bench's "Model" column.
gguf_suffix = r"\.gguf(\.\d+)?$"
df_models["Model"] = df_models["full_name"].str.replace(gguf_suffix, "", regex=True)

# Left join keeps every benchmark row; the naming columns from df_models are then
# removed so that only "size" is added to the benchmark data.
df_merged = (
    df_bench
    .merge(df_models, on="Model", how="left")
    .drop(columns=["full_name", "name"])
)
df_merged.head(3)

Unnamed: 0,Timestamp,Backend,Layers,Model,MaxCtx,GenAmount,ProcessingTime,ProcessingSpeed,GenerationTime,GenerationSpeed,TotalTime,Output,Flags,size
0,2025-06-25 07:47:04.689613+00:00,koboldcpp_cublas.so,35,airoboros-mistral2.2-7b.Q4_K_S,4096,100,0.92,4362.45,1.25,80.13,2.16,1 1 1 1,NoAVX2=False Threads=17 HighPriority=False Cub...,7.0
1,2025-06-25 07:47:17.604407+00:00,koboldcpp_cublas.so,39,DeepSeek-R1-0528-Qwen3-8B-Q4_K_S,4096,100,0.99,4056.85,1.53,65.4,2.51,1 1 1 1,NoAVX2=False Threads=17 HighPriority=False Cub...,8.0
2,2025-06-25 07:47:28.277415+00:00,koboldcpp_cublas.so,39,DeepSeek-R1-0528-Qwen3-8B-UD-IQ1_S,4096,100,1.14,3511.42,1.22,82.24,2.35,1 1 1 1,NoAVX2=False Threads=17 HighPriority=False Cub...,8.0


In [None]:
# Generation speed against model size (log x axis); marker size follows the
# "Layers" column and each model gets its own colour.
size_scatter_args = dict(
    x="size",
    y="GenerationSpeed",
    size="Layers",
    color="Model",
    hover_name="Model",
    log_x=True,
    title="Generation Speed vs Model Size",
    labels={
        "GenerationSpeed": "Generation Speed (tokens/sec)",
        "size": "Model Size (B)",
    },
)
fig = px.scatter(df_merged, **size_scatter_args)

# Slightly transparent markers with a thin dark outline, on a light grey background.
marker_style = {"opacity": 0.8, "line": {"width": 1, "color": 'DarkSlateGrey'}}
fig.update_traces(marker=marker_style)
fig.update_layout(plot_bgcolor='rgba(240,240,240,0.9)')
fig.show()

In [68]:
# Stacked bars: processing time plus generation time for each model.
time_columns = ["ProcessingTime", "GenerationTime"]
fig = px.bar(
    df_merged,
    x="Model",
    y=time_columns,
    title="Time Breakdown per Model",
    labels={"value": "Time (seconds)"},
    color_discrete_sequence=['#636EFA', '#EF553B'],
)
fig.update_layout(barmode='stack', xaxis={'tickangle': -45})
fig.show()

In [None]:
# Sorted variant of the time breakdown. A new frame is built so df_merged stays
# untouched; TotalTime is recomputed as processing + generation time and the rows
# are ordered from slowest to fastest.
df_sorted = (
    df_merged
    .assign(TotalTime=df_merged['ProcessingTime'] + df_merged['GenerationTime'])
    .sort_values(by='TotalTime', ascending=False)
)

time_columns = ["ProcessingTime", "GenerationTime"]
fig = px.bar(
    df_sorted,
    x="Model",
    y=time_columns,
    title="Time Breakdown per Model (High to Low)",
    labels={"value": "Time (seconds)"},
    color_discrete_sequence=['#636EFA', '#EF553B'],
)
# 'total descending' orders the x categories by the height of the stacked bars.
fig.update_layout(
    barmode='stack',
    xaxis={'tickangle': -45, 'categoryorder': 'total descending'},
)
fig.show()

In [70]:
# Processing and generation speed against layer count, one facet per metric.
fig = px.scatter(df_merged, 
                 x="Layers", 
                 y=["ProcessingSpeed", "GenerationSpeed"],
                 facet_col="variable",
                 color="Model",
                 hover_name="Model",
                 title="Speed Metrics vs Layer Count",
                 labels={"value": "Speed (tokens/sec)"})
fig.update_traces(marker=dict(size=12))
# The two metrics differ by well over an order of magnitude in the benchmark data
# (thousands vs. tens of tokens/sec). With the default shared y axis the
# GenerationSpeed facet collapses into a flat line, so each facet gets its own
# y range and its own tick labels.
fig.update_yaxes(matches=None, showticklabels=True)
fig.show()

In [84]:
# 3D view: model size (x), layer count (y) and generation speed (z);
# marker size follows TotalTime and each model gets its own colour.
fig = px.scatter_3d(
    df_merged,
    x='size',
    y='Layers',
    z='GenerationSpeed',
    color='Model',
    size='TotalTime',
    hover_name='Model',
    title="3D Performance Landscape",
)

# Human-readable axis titles for the 3D scene.
scene_titles = {
    'xaxis_title': 'Model Size (B)',
    'yaxis_title': 'Layers',
    'zaxis_title': 'Gen Speed (tokens/sec)',
}
fig.update_layout(scene=scene_titles)
fig.show()