## Measuring latency and retrieving input/output tokens

First, we run task 1 to measure latency. Only Llama 3.2 (1B) failed on this task, raising an exception 9 times; all other models completed it without raising an exception.

We will start with the ollama models.

In [None]:
import os
from dotenv import load_dotenv
from langchain_ollama import ChatOllama
from langchain_groq import ChatGroq

# Load all the API keys from the .env file in the parent directory.
load_dotenv(dotenv_path="../.env", override=True)

# Ollama models: every handle uses the same local base_url and temperature 0.1.
# NOTE(review): the latency cell further down calls `ollama.chat` directly with
# model tags, so these LangChain handles look unused there -- confirm whether
# other cells need them.
llama1 = ChatOllama(model="llama3.2:1b", temperature=0.1, base_url="http://localhost:11434")
llama3 = ChatOllama(model="llama3.2:3b", temperature=0.1, base_url="http://localhost:11434")
mistral7 = ChatOllama(model="mistral:7b", temperature=0.1, base_url="http://localhost:11434")
qwen3_1 = ChatOllama(model="qwen3:1.7b", temperature=0.1, base_url="http://localhost:11434")
qwen3_4 = ChatOllama(model="qwen3:4b", temperature=0.1, base_url="http://localhost:11434")

# Models on Groq, configured with the same temperature as the Ollama models.
llama8 = ChatGroq(model_name="llama-3.1-8b-instant", temperature=0.1)
GPTOSS20 = ChatGroq(model_name="openai/gpt-oss-20b", temperature=0.1)
qwen3_32 = ChatGroq(model_name="qwen/qwen3-32b", temperature=0.1)
llama70 = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.1)
GPTOSS120 = ChatGroq(model_name="openai/gpt-oss-120b", temperature=0.1)


In [None]:

import time, json
import pandas as pd
import ollama
from pydantic import BaseModel, Field
from typing import Literal

# Name of the DataFrame column whose text is inserted into the prompt.
CLAIM_COL = "claim"

# Short notebook alias -> Ollama model tag passed to `ollama.chat`.
MODEL_MAP = dict(
    llama1="llama3.2:1b",
    llama3="llama3.2:3b",
    mistral7="mistral:7b",
    qwen3_1="qwen3:1.7b",
    qwen3_4="qwen3:4b",
)

# Aliases to benchmark, in the mapping's insertion order.
models_to_test = list(MODEL_MAP)

class CheckResult(BaseModel):
    """Shape of the JSON object that the classification prompt asks for."""

    # Exactly one of the two labels named in the prompt's classification logic.
    checkable: Literal["POTENTIALLY CHECKABLE", "UNCHECKABLE"]
    # Free-text justification; falls back to an empty string when omitted.
    explanation: str = Field(default="")

# Prompt template for the checkability task. `{claim}` is the only placeholder
# and is filled with `PROMPT.format(claim=...)`; the doubled braces around the
# JSON example are literal `{` / `}` escaped for `str.format`. The trailing
# `.strip()` removes the newlines that open and close the triple-quoted literal.
PROMPT = """
### Role
Neutral Fact-Checking Analyst.

### Inputs
Claim: {claim}

### Task
Classify the claim and determine if it can be fact-checked.

### Classification Logic
- UNCHECKABLE: Opinion, value judgment, or prediction.
- POTENTIALLY CHECKABLE: Factual claims about the past or present.

### Output
Return ONLY valid JSON (no markdown, no code fences):
{{
  "checkable": "POTENTIALLY CHECKABLE",
  "explanation": "Brief justification for the classification."
}}
""".strip()

def run_model(df: pd.DataFrame, model_name: str, model_tag: str) -> pd.DataFrame:
    """Time one Ollama chat request per claim and attach the timings to a copy of *df*.

    Each value in ``df[CLAIM_COL]`` is cast to ``str``, inserted into ``PROMPT``
    and sent to ``ollama.chat`` as a single user message at temperature 0.1.
    Only wall-clock latency is recorded; the reply text is not stored, so no
    parsing or validation of the reply is performed here.

    Args:
        df: Input frame; must contain the column named by ``CLAIM_COL``.
            It is not modified.
        model_name: Short alias used to name the output column.
        model_tag: Ollama model tag passed as ``model=`` to ``ollama.chat``.

    Returns:
        A copy of *df* with one extra column, ``"{model_name}_latency_ms"``,
        holding the per-claim request duration in milliseconds.

    Raises:
        KeyError: If *df* has no ``CLAIM_COL`` column.
        Any exception raised by ``ollama.chat`` propagates unchanged.
    """
    out = df.copy()
    latency_ms = []

    for claim in out[CLAIM_COL].astype(str):
        # Build the request before starting the clock so that only the model
        # call itself is measured, not the prompt formatting.
        messages = [{"role": "user", "content": PROMPT.format(claim=claim)}]

        started = time.perf_counter()
        ollama.chat(
            model=model_tag,
            messages=messages,
            options={"temperature": 0.1},
        )
        # perf_counter() returns seconds; convert to milliseconds.
        latency_ms.append((time.perf_counter() - started) * 1000)

    out[f"{model_name}_latency_ms"] = latency_ms

    return out

In [None]:
# Start from the validated reference claims; each pass through the loop
# returns a copy carrying one more "<name>_latency_ms" column.
original_df = pd.read_csv("validated_reference_data.csv", encoding="utf-8")

for name in models_to_test:
    print(f"Running model: {name}")
    original_df = run_model(original_df, name, MODEL_MAP[name])


In [30]:
# Show the column index to confirm that one "<model>_latency_ms" column exists per model.
original_df.columns

Index(['url', 'claim', 'rating', 'translated', 'year', 'checkable',
       'explanation', 'details_text', 'alerts', 'question', 'user_answer',
       'confirmed', 'llama1_latency_ms', 'llama3_latency_ms',
       'mistral7_latency_ms', 'qwen3_1_latency_ms', 'qwen3_4_latency_ms'],
      dtype='object')

In [31]:
# Mean request latency per Ollama model, in milliseconds, averaged over all rows.
original_df[['llama1_latency_ms', 'llama3_latency_ms',
       'mistral7_latency_ms', 'qwen3_1_latency_ms', 'qwen3_4_latency_ms']].mean()

llama1_latency_ms        379.519268
llama3_latency_ms        414.826887
mistral7_latency_ms      826.366297
qwen3_1_latency_ms      2144.206554
qwen3_4_latency_ms     12382.667738
dtype: float64

Next, the Groq logs are analysed to obtain the average latency (`time_to_completion`) as well as the average number of input and output tokens per model.

In [36]:
# NOTE(review): this rebinds `original_df`, replacing the Ollama latency frame
# built in the earlier cells; re-running those latency summaries after this
# cell would fail on the missing "*_latency_ms" columns. Consider a separate
# name if no later cell depends on this one.
original_df = pd.read_csv("groq-logs.csv", encoding="utf-8")
# Per-model mean of completion time and of input/output token counts.
original_df.groupby("model")[["time_to_completion","input_tokens","output_tokens"]].mean()

Unnamed: 0_level_0,time_to_completion,input_tokens,output_tokens
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
llama-3.1-8b-instant,0.211944,463.778523,64.838926
llama-3.3-70b-versatile,0.25138,463.778523,65.279642
openai/gpt-oss-120b,0.534279,508.805369,219.107383
openai/gpt-oss-20b,0.394004,508.805369,293.143177
qwen/qwen3-32b,1.016569,438.067114,402.422819
