# Testing methods for understanding whether the language model has seen a dataset before

In [1]:
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from my_code.utils import load_nested_dict_to_pandas

In [2]:
result_save_path = Path("results", "table_tests")
result_save_path.mkdir(parents=True, exist_ok=True)

In [3]:
# loading results
header_test_files = list(result_save_path.glob("header_test_*.json"))
row_test_files = list(result_save_path.glob("row_test_*.json"))

print(
    "Loading header test results:", 
    "\n", 
    "\n ".join([str(s) for s in header_test_files])
)

print(
    "Loading row test results:", 
    "\n", 
    "\n ".join([str(s) for s in row_test_files])
)


header_test_df = pd.concat(
    [
        load_nested_dict_to_pandas(
            json.load(open(f, "r")), 
            level_names=["dataset"],
        )
        for f in header_test_files
    ]
)

row_test_df = pd.concat(
    [
        load_nested_dict_to_pandas(
            json.load(open(f, "r")), 
            level_names=["dataset", "run"],
        )
        for f in row_test_files
    ]
)

Loading header test results: 
 results/table_tests/header_test_wine_quality.json
 results/table_tests/header_test_fake_data.json
 results/table_tests/header_test_breast_cancer.json
 results/table_tests/header_test_heart_disease.json
 results/table_tests/header_test_california_housing.json
Loading row test results: 
 results/table_tests/row_test_wine_quality.json
 results/table_tests/row_test_california_housing.json
 results/table_tests/row_test_fake_data.json
 results/table_tests/row_test_breast_cancer.json
 results/table_tests/row_test_heart_disease.json


In [4]:
dataset_rename = {
    "fake_data": r"$y = 2 x_1 - x_2 + x_3$",
    "breast_cancer": "Breast Cancer",
    "california_housing": "California Housing",
    "wine_quality": "Wine Quality",
    "heart_disease": "Heart Disease",
}

dataset_order = [
    r"$y = 2 x_1 - x_2 + x_3$",
    "Breast Cancer",
    "California Housing",
    "Wine Quality",
    "Heart Disease",
]

header_levenshtein_score = (
    header_test_df
    # normalise the levenshtein score
    .assign(
        normalised_levenshtein_score = lambda x: 
        x["levenshtein_score"] / np.maximum(
            x["header_completion"].apply(len), x["llm_completion"].apply(len)
        )
    )
    [["dataset", "normalised_levenshtein_score"]]
    .assign(
        normalised_levenshtein_score = lambda x: 
        x["normalised_levenshtein_score"].apply(lambda y: f"{y:.2f}")
    )
    .set_index("dataset")
    .rename_axis("")
    .transpose()
    .reset_index(drop=True)
    .rename(columns=dataset_rename)
    [dataset_order]
    
)
header_levenshtein_score

Unnamed: 0,$y = 2 x_1 - x_2 + x_3$,Breast Cancer,California Housing,Wine Quality,Heart Disease
0,0.7,0.0,0.15,0.21,0.0


In [5]:
print(header_levenshtein_score.to_latex(index=False))

\begin{tabular}{lllll}
\toprule
$y = 2 x_1 - x_2 + x_3$ & Breast Cancer & California Housing & Wine Quality & Heart Disease \\
\midrule
0.70 & 0.00 & 0.15 & 0.21 & 0.00 \\
\bottomrule
\end{tabular}



In [6]:
dataset_rename = {
    "fake_data": r"$y = 2 x_1 - x_2 + x_3$",
    "breast_cancer": "Breast Cancer",
    "california_housing": "California Housing",
    "wine_quality": "Wine Quality",
    "heart_disease": "Heart Disease",
}

dataset_order = [
    r"$y = 2 x_1 - x_2 + x_3$",
    "Breast Cancer",
    "California Housing",
    "Wine Quality",
    "Heart Disease",
]

row_levenshtein_score = (
    row_test_df
    # normalise the levenshtein score
    .assign(
        normalised_levenshtein_score = lambda x: 
        x["levenshtein_score"] / np.maximum(
            x["row_completion"].apply(len), x["llm_completion"].apply(len)
        )
    )
    [["dataset", "normalised_levenshtein_score"]]
    .groupby("dataset")
    ["normalised_levenshtein_score"]
    .apply(
        lambda x: (
            f"{np.mean(x):.2f}"
            + " ± "
            f"{np.std(x):.2f}"
        )
    )
    .to_frame()
    .rename_axis("")
    .transpose()
    .reset_index(drop=True)
    .rename(columns=dataset_rename)
    [dataset_order]
)
row_levenshtein_score

Unnamed: 0,$y = 2 x_1 - x_2 + x_3$,Breast Cancer,California Housing,Wine Quality,Heart Disease
0,0.74 ± 0.04,0.43 ± 0.13,0.58 ± 0.04,0.31 ± 0.04,0.33 ± 0.05


In [7]:
print(row_levenshtein_score.to_latex(index=False))

\begin{tabular}{lllll}
\toprule
$y = 2 x_1 - x_2 + x_3$ & Breast Cancer & California Housing & Wine Quality & Heart Disease \\
\midrule
0.74 ± 0.04 & 0.43 ± 0.13 & 0.58 ± 0.04 & 0.31 ± 0.04 & 0.33 ± 0.05 \\
\bottomrule
\end{tabular}



In [11]:
row_test_df.iloc[0][["row_completion", "llm_completion"]].values

array(['7.1,0.13,0.38,1.8,0.046,14.0,114.0,0.9925,3.32,0.9,11.7,6,white',
       '6.5,0.34,0.28,1.8,0.041,43.0,188.0,0.9928,3.13,0.37,9.6,6,white'],
      dtype=object)