# Testing methods for understanding whether the language model has seen a dataset before

In [1]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from llm_elicited_priors.utils import load_nested_dict_to_pandas

In [2]:
# Directory holding the table-test result files; created if it is missing.
result_save_path = Path("results") / "table_tests"
result_save_path.mkdir(parents=True, exist_ok=True)

In [3]:
# loading results
# NOTE: the file lists are not sorted, so the row order of the concatenated
# frames below follows whatever order the glob yields the files in.
header_test_files = list(result_save_path.glob("header_test_*.json"))
row_test_files = list(result_save_path.glob("row_test_*.json"))

print(
    "Loading header test results:", 
    "\n", 
    "\n ".join([str(s) for s in header_test_files])
)

print(
    "Loading row test results:", 
    "\n", 
    "\n ".join([str(s) for s in row_test_files])
)


def _read_json(path):
    """Parse a single JSON file and return its content.

    The file is opened in a ``with`` block so the handle is closed as soon as
    parsing finishes, instead of being left open until garbage collection.
    """
    with open(path, "r") as json_file:
        return json.load(json_file)


# one frame per result file, stacked into a single frame per test type
header_test_df = pd.concat(
    [
        load_nested_dict_to_pandas(
            _read_json(f), 
            level_names=["dataset"],
        )
        for f in header_test_files
    ]
)

row_test_df = pd.concat(
    [
        load_nested_dict_to_pandas(
            _read_json(f), 
            level_names=["dataset", "run"],
        )
        for f in row_test_files
    ]
)

Loading header test results: 
 results/table_tests/header_test_diabetes.json
 results/table_tests/header_test_wine_quality.json
 results/table_tests/header_test_fake_data.json
 results/table_tests/header_test_breast_cancer.json
 results/table_tests/header_test_heart_disease.json
 results/table_tests/header_test_california_housing.json
 results/table_tests/header_test_hypothyroid.json
Loading row test results: 
 results/table_tests/row_test_wine_quality.json
 results/table_tests/row_test_diabetes.json
 results/table_tests/row_test_hypothyroid.json
 results/table_tests/row_test_california_housing.json
 results/table_tests/row_test_fake_data.json
 results/table_tests/row_test_breast_cancer.json
 results/table_tests/row_test_heart_disease.json


In [4]:
# Maps the dataset keys used in the result files to table column titles.
dataset_rename = {
    "fake_data": r"$y = 2 x_1 - x_2 + x_3$",
    "breast_cancer": "Breast Cancer",
    "california_housing": "California Housing",
    "wine_quality": "Wine Quality",
    "heart_disease": "Heart Disease",
    "diabetes": "Diabetes",
    "hypothyroid": "Hypothyroid",
}

# Columns shown in the table, in display order. The synthetic dataset,
# California Housing and Wine Quality are left out of the table.
dataset_order = [
    "Heart Disease",
    "Diabetes",
    "Hypothyroid",
    "Breast Cancer",
]

# Levenshtein score divided by the length of the longer of the two strings
# (true header completion vs. LLM completion), rendered with two decimals.
header_scores = header_test_df.assign(
    normalised_levenshtein_score=lambda df: (
        df["levenshtein_score"]
        / np.maximum(
            df["header_completion"].map(len),
            df["llm_completion"].map(len),
        )
    ).map("{:.2f}".format)
)

# Reshape into a single-row table with one column per dataset.
header_levenshtein_score = (
    header_scores
    .loc[:, ["dataset", "normalised_levenshtein_score"]]
    .set_index("dataset")
    .rename_axis("")
    .T
    .reset_index(drop=True)
    .rename(columns=dataset_rename)
    .loc[:, dataset_order]
)
header_levenshtein_score

Unnamed: 0,Heart Disease,Diabetes,Hypothyroid,Breast Cancer
0,0.0,0.21,0.22,0.0


In [5]:
# Render the header-test table as LaTeX, omitting the index column.
header_latex_table = header_levenshtein_score.to_latex(index=False)
print(header_latex_table)

\begin{tabular}{llll}
\toprule
Heart Disease & Diabetes & Hypothyroid & Breast Cancer \\
\midrule
0.00 & 0.21 & 0.22 & 0.00 \\
\bottomrule
\end{tabular}



In [6]:
# Show the true header and the LLM completion from the last row of the
# header-test results as a worked example.
header_example = header_test_df.iloc[-1]
header_completion = header_example["header_completion"]
llm_completion = header_example["llm_completion"]

print("Example completion:\n")
print("True header:\n", header_completion)
print("LLM completion:\n", llm_completion)

Example completion:

True header:
 ,0,0,0,0,0,0,0,0,0,0.00025,0.023,0.128,0.104,0.121,3
0.48,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00208,0.02,0.086,0.078,0.11,3
0.67,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0013,0.024,0.087,0.109,0.08,3
0.76,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0001,0.029,0.124,0.128,0.097,3
0.62,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0.011,0.008,0.073,0.074,0.098,2
0.18,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0.0001,0.023,0.098,0.085,0.115,3
0.59,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00
LLM completion:
 ,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00025,0.015,0.1,0.08,0.12,3
0.64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00025,0.024,0.076,0.07,0.109,3
0.55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00025,0.015,0.114,0.081,0.141,3
0.78,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00025,0.015,0.1,0.085,0.118,3
0.7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00025,0.015,0.1,0.09,0.111,3
0.68,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.00025,0.015,0.1,0.09,0.111,3
0.5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [7]:
# Maps the dataset keys used in the result files to table column titles.
dataset_rename = {
    "fake_data": r"$y = 2 x_1 - x_2 + x_3$",
    "breast_cancer": "Breast Cancer",
    "california_housing": "California Housing",
    "wine_quality": "Wine Quality",
    "heart_disease": "Heart Disease",
    "diabetes": "Diabetes",
    "hypothyroid": "Hypothyroid",
}

# Columns shown in the table, in display order. The synthetic dataset,
# California Housing and Wine Quality are left out of the table.
dataset_order = [
    "Heart Disease",
    "Diabetes",
    "Hypothyroid",
    "Breast Cancer",
]


def _format_mean_std(scores):
    """Render a group of scores as "mean ± std", each with two decimals."""
    return f"{np.mean(scores):.2f} ± {np.std(scores):.2f}"


# Levenshtein score divided by the length of the longer of the two strings
# (true row vs. LLM completion).
row_scores = row_test_df.assign(
    normalised_levenshtein_score=lambda df: (
        df["levenshtein_score"]
        / np.maximum(
            df["row_completion"].map(len),
            df["llm_completion"].map(len),
        )
    )
)

# Summarise per dataset, then reshape into a single-row table with one
# column per dataset.
row_levenshtein_score = (
    row_scores
    .groupby("dataset")["normalised_levenshtein_score"]
    .apply(_format_mean_std)
    .to_frame()
    .rename_axis("")
    .T
    .reset_index(drop=True)
    .rename(columns=dataset_rename)
    .loc[:, dataset_order]
)
row_levenshtein_score

Unnamed: 0,Heart Disease,Diabetes,Hypothyroid,Breast Cancer
0,0.33 ± 0.05,0.38 ± 0.05,0.20 ± 0.05,0.43 ± 0.13


In [8]:
# Render the row-test table as LaTeX, omitting the index column.
row_latex_table = row_levenshtein_score.to_latex(index=False)
print(row_latex_table)

\begin{tabular}{llll}
\toprule
Heart Disease & Diabetes & Hypothyroid & Breast Cancer \\
\midrule
0.33 ± 0.05 & 0.38 ± 0.05 & 0.20 ± 0.05 & 0.43 ± 0.13 \\
\bottomrule
\end{tabular}



In [9]:
# Show the true row and the LLM completion from the last row of the
# row-test results as a worked example.
row_example = row_test_df.iloc[-1]
row_completion = row_example["row_completion"]
llm_completion = row_example["llm_completion"]

print("Example completion:\n")
print("True row:\n", row_completion)
print("LLM completion:\n", llm_completion)

Example completion:

True row:
 47,1,3,130,253,0,0,179,0,0.0,1,0.0,3.0,0
LLM completion:
 63,1,4,150,407,0,2,154,0,4.0,2,3.0,7.0,3
