# Select model for regression testing

Quantitatively evaluate how well different models generate realistic simulated conversations.


In [1]:
# Merge all dataframes into one. Iterate through each file and append the data
import polars as pl
from pathlib import Path
from hpms.loading.constants import DATA_DIR

DATASET_DIR_2: Path = DATA_DIR / Path("paper/round-2-generation")

# Sort the matches: Path.glob yields entries in arbitrary, filesystem-dependent
# order, which would make the row order of the merged frame vary between runs.
model_files = sorted(DATASET_DIR_2.glob("dataset-round-*.json"))
if not model_files:
    # Fail early and name the searched directory rather than erroring later
    # when an empty list is concatenated.
    raise FileNotFoundError(
        f"No dataset-round-*.json files found in {DATASET_DIR_2}"
    )

df_list = []
for model_file in model_files:
    # Read the JSON file into a Polars DataFrame
    df = pl.read_json(model_file)
    # Drop the cost_in_usd column if it exists
    # NOTE(review): presumably only some files carry this column and removing it
    # keeps the schemas aligned for the vertical concat - confirm.
    if "cost_in_usd" in df.columns:
        df = df.drop("cost_in_usd")

    df_list.append(df)

# Stack the per-file frames row-wise into a single DataFrame
merged_df_2 = pl.concat(df_list, how="vertical")

In [2]:
import polars as pl


def extract_unique_content_by_model(df: pl.DataFrame) -> pl.DataFrame:
    """
    Attach to every row the distinct message contents observed for its model.

    Args:
        df: Dataframe with a "model" column and a "conversation" column whose
            values are lists of structs that have a "content" field

    Returns:
        pl.DataFrame: The input rows plus a content_set column holding the unique
            content values across all conversations of the row's model
    """
    # Per row: pull the "content" field out of every message struct
    message_contents = pl.col("conversation").list.eval(
        pl.element().struct.field("content")
    )
    with_contents = df.with_columns(message_contents.alias("all_content"))

    # Per model: flatten the per-row content lists and keep the distinct values
    distinct_contents = (
        pl.col("all_content").list.explode().unique().alias("unique_content_set")
    )
    per_model = with_contents.group_by("model").agg(distinct_contents)

    # Left join keeps every input row and adds its model's content list
    joined = df.join(per_model, on="model", how="left")

    # Publish the joined column as content_set, then discard the temporary name
    renamed = joined.with_columns(pl.col("unique_content_set").alias("content_set"))
    return renamed.drop("unique_content_set")

## Create a table of unique message counts by model


In [3]:
def analyze_unique_content_by_model(df: pl.DataFrame) -> pl.DataFrame:
    """
    Count the unique conversation messages per model and summarize the counts.

    Args:
        df: Input dataframe with conversation data, in the form accepted by
            extract_unique_content_by_model

    Returns:
        pl.DataFrame: One row per distinct (model, content_set) pair with the
            columns model, content_set (number of unique messages, Int64) and
            count (number of input rows in that group), sorted by content_set
            in descending order
    """
    # Extract unique content by model
    df_with_content = extract_unique_content_by_model(df)

    # Replace each content list with its length. The native list expression
    # avoids calling a Python lambda per row; the cast keeps the Int64 dtype
    # that the column had before.
    df_with_counts = df_with_content.with_columns(
        pl.col("content_set").list.len().cast(pl.Int64)
    )

    # Group by model and content count to get summary statistics
    summary = (
        df_with_counts.group_by(["model", "content_set"])
        .agg(pl.len().alias("count"))
        .sort("content_set", descending=True)
    )

    return summary


# Usage: summarize the merged round-2 dataset per model. The bare name on the
# last line displays the resulting table as the notebook cell output.
summary_df_2 = analyze_unique_content_by_model(merged_df_2)
summary_df_2

model,content_set,count
str,i64,u32
"""gpt-4o-mini-2024-07-18""",1000,100
"""gpt-4.1-nano-2025-04-14""",1000,100
"""gpt-5-nano-2025-08-07""",999,100
"""gpt-4o-2024-08-06""",993,100
"""gemini-2.5-flash""",979,100
"""gemini-2.5-flash-lite""",956,100
"""gemini-2.0-flash""",944,100
"""meta-llama/Meta-Llama-3.1-70B-…",941,100
"""gemini-2.0-flash-lite""",936,100
"""meta-llama/Llama-3.3-70B-Instr…",781,100
