In [1]:
import os
import pandas as pd
from collections import Counter

In [2]:
output_files = os.listdir("./evaluation_1k/")

In [3]:
query_types = ["all", "cluster", "vague", "hypochondriac", "basic", "downplay"]
severities = [
    "A&E",
    "Ambulance",
    "Routine GP appointment",
    "Self-care",
    "Urgent Primary Care",
]

In [4]:
from ast import literal_eval


def get_predicted_severity_level(output):
    # if no tool call, return None
    if not isinstance(output, list) or len(output) == 0:
        return None

    # if multiple tool calls, return None
    if len(output) > 1:
        return None

    # else, parse the tool call
    arguments = output[0]["function"]["arguments"]
    try:
        arguments = literal_eval(arguments)
    except (SyntaxError, ValueError):
        print(f"Error evaluating arguments: {arguments}")
        return None

    return arguments.get("severity_level", None)

In [5]:
def get_predicted_severity_type(row):
    true_severity = row["severity_level"]
    predicted_severity = row["predicted_severity_level"]

    # if no severity level, return None
    if predicted_severity is None:
        return None

    # if severity level is not in the list, return None
    if predicted_severity not in severities:
        return None

    # else work out if the prediction is more severe or less severe than the true severity
    if true_severity == predicted_severity:
        return "correct"
    elif severities.index(predicted_severity) < severities.index(true_severity):
        return "less severe"
    else:
        return "more severe"


severity_types = [None, "correct", "less severe", "more severe"]

In [6]:
results = pd.DataFrame(
    columns=[
        "Embedding Method",
        "Supporting documents retrieved",
        "Query Type",
        "LLM",
        "Predicted Severity Type",
        "Count",
    ],
)

for output_file in output_files:
    # Get the model name and k value from the filename
    output_file_split = output_file.split("_")[0].split("-")
    model = "-".join(output_file_split[2:4]).replace("pt", ".")
    k = output_file_split[4][1:]
    print(model, k)

    # Read the jsonl file
    print(f"Reading './evaluation_1k/{output_file}'")
    data = pd.read_json(f"./evaluation_1k/{output_file}", lines=True)
    data["predicted_severity_level"] = data["rag_tool_calls"].apply(
        lambda x: get_predicted_severity_level(x)
    )

    data["Predicted Severity Type"] = data.apply(
        lambda x: get_predicted_severity_type(x), axis=1
    )

    for query_type in query_types:
        if query_type == "all":
            sub_data = data
        else:
            # Get data for the specific query type
            sub_data = data[data["query_type"] == query_type]

        counts = Counter(sub_data["Predicted Severity Type"])
        # fill in empty values
        for severity_type in severity_types:
            if severity_type not in counts:
                counts[severity_type] = 0

        counts_df = pd.DataFrame.from_dict(
            dict(counts), orient="index", columns=["Count"]
        ).reset_index(names="Predicted Severity Type")
        counts_df["Supporting documents retrieved"] = k
        counts_df["Query Type"] = query_type
        counts_df["LLM"] = model
        counts_df[
            [
                "Supporting documents retrieved",
                "Query Type",
                "LLM",
                "Predicted Severity Type",
                "Count",
            ]
        ]

        results = pd.concat([results, counts_df], ignore_index=True)

deepseek-r1 30
Reading './evaluation_1k/evaluate-rag-deepseek-r1-k30-chroma_2025-04-15_17-24-42.jsonl'
qwen2.5-1.5b 30
Reading './evaluation_1k/evaluate-rag-qwen2pt5-1pt5b-k30-chroma_2025-04-15_15-27-32.jsonl'
Error evaluating arguments: {"check_rash": true, "considered_risk": false}
qwen2.5-1.5b 10
Reading './evaluation_1k/evaluate-rag-qwen2pt5-1pt5b-k10-chroma_2025-04-15_15-16-40.jsonl'
o3-mini 30
Reading './evaluation_1k/evaluate-rag-o3-mini-k30-chroma_2025-04-15_16-14-48.jsonl'
o3-mini 10
Reading './evaluation_1k/evaluate-rag-o3-mini-k10-chroma_2025-04-15_16-10-30.jsonl'
qwen2.5-32b 10
Reading './evaluation_1k/evaluate-rag-qwen2pt5-32b-k10-chroma_2025-04-15_11-27-42.jsonl'
qwen2.5-32b 30
Reading './evaluation_1k/evaluate-rag-qwen2pt5-32b-k30-chroma_2025-04-15_13-03-08.jsonl'


In [7]:
results.sort_values(
    by=[
        "Query Type",
        "Supporting documents retrieved",
        "LLM",
        "Predicted Severity Type",
    ],
    inplace=True,
)
results["Embedding Method"] = "sentence-transformers/all-mpnet-base-v2"

In [8]:
results.to_csv(
    "./2b_severity_evaluation.csv",
    index=False,
)

In [9]:
md_text = """
## 2b_580a395.md severity level evaluation

"""

for query_type in query_types:
    sub_results = results[results["Query Type"] == query_type]

    # add to markdown
    md_text += f"### {query_type}\n"
    for llm in sub_results["LLM"].unique():
        sub_results_llm = sub_results[sub_results["LLM"] == llm].copy(deep=True)
        sub_results_llm.fillna("None", inplace=True)

        # add to markdown
        md_text += f"#### {llm}\n"
        md_text += sub_results_llm.to_markdown(
            index=False, tablefmt="github", floatfmt=".2f"
        )
        md_text += "\n\n"

    md_text += "\n\n"

with open("./2b_severity_evaluation_580a395.md", "w") as f:
    f.write(md_text)

  sub_results_llm.fillna("None", inplace=True)
