In [None]:
%load_ext autoreload
%autoreload 2

import json
import os
import warnings

import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns

from setlexsem.analyze.hypothesis_testing_utils import (
    add_nl,
    add_text,
    agg,
    concat_sets,
    create_fig_path,
    create_filtered_df_for_hypothesis,
    get_config,
    get_stats,
    save_config_and_data,
)
from setlexsem.analyze.visualize import create_violin_agg, viz_barplot

# import matplotlib.pyplot as plt
from setlexsem.constants import (
    PATH_HYPOTHESIS_CONFIG_ROOT,
    PATH_POSTPROCESS,
    PATH_ROOT,
    STUDY2DECEPTIVE_WORD_SAMPLER,
    STUDY2MODEL,
    TOKEN_ORDER,
)
from setlexsem.utils import convert_model_name, load_processed_data, make_nice

# Globally suppress every warning raised from this point on.
warnings.filterwarnings("ignore")

# Folder name passed as `folder=` to `create_fig_path` by the figure cells
# below, and interpolated into the save path of the unique-words bar plot.
FOLDER_NAME = "camera_ready_figures"
# Directory handed to the plotting/saving helpers as `supp_root` /
# `SUPPLEMENTARY_ROOT`.
SUPPLEMENTARY_ROOT = os.path.join(
    PATH_ROOT, "manuscript/supplementary_materials_camera_ready/experiments"
)
# Create the directory up front; no error if it already exists.
os.makedirs(SUPPLEMENTARY_ROOT, exist_ok=True)

In [None]:
# Load the post-processed CSV of every study listed in STUDY2MODEL and stack
# them into a single results frame.
STUDY_LIST = list(STUDY2MODEL.keys())

# Collect the per-study frames first and concatenate ONCE. Concatenating
# inside the loop re-copies all accumulated rows on every iteration, and
# concatenating onto an empty DataFrame is deprecated in recent pandas.
study_frames = []
for study_name in STUDY_LIST:
    csv_path = os.path.join(PATH_POSTPROCESS, f"{study_name}.csv")
    study_frames.append(load_processed_data(csv_path))

# `pd.concat` raises on an empty list, so fall back to an empty frame when no
# study is configured (this matches the result of the previous loop).
# `ignore_index=True` is equivalent to the former `.reset_index(drop=True)`.
df_results = (
    pd.concat(study_frames, ignore_index=True)
    if study_frames
    else pd.DataFrame()
)

# `make_nice` is a project helper returning a 2-tuple: the frame used by all
# cells below and a second value kept as `ugly_map`.
# NOTE(review): presumably it renames columns to display names and returns the
# reverse mapping - confirm against setlexsem.utils.
df_results, ugly_map = make_nice(df_results)

## Hypothesis Testing and Visualization

In [None]:
# Read the per-hypothesis settings; each figure cell below looks up its own
# entry in `hypothesis_configs` by hypothesis title.
hypothesis_config_path = os.path.join(
    PATH_HYPOTHESIS_CONFIG_ROOT, "hypothesis.json"
)
with open(hypothesis_config_path, "r") as config_file:
    hypothesis_configs = json.load(config_file)

## Hypothesis: Prompt Language Comparison

In [None]:
# Look up the settings for this hypothesis and the name used for its outputs.
hypothesis = "Prompt Language Comparison"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

# Restrict the full results to the rows selected by the hypothesis config.
df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Plot options are gathered first so the call itself stays short; the figure
# path is resolved here, before the plotting helper runs.
violin_options = dict(
    x_name="Demonstration phrasing",
    figure_size=(14, 4),
    fontsize=18,
    split_bar=False,
    save_fig=create_fig_path(hypo_name, folder=FOLDER_NAME),
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)
fig = create_violin_agg(df_new, **violin_options)

# Store the filtered data under the supplementary root for this hypothesis.
save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

## Hypothesis: Operation Type

In [None]:
# Settings and output name for the set-operation comparison.
hypothesis = "Operation Type Comparison"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

# Rows of the full results that belong to this hypothesis.
df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Resolve the figure path up front, then plot accuracy per set operation.
figure_path = create_fig_path(hypo_name, folder=FOLDER_NAME)
fig = create_violin_agg(
    df_new,
    x_name="Set operation",
    split_bar=False,
    figure_size=(7, 4),
    fontsize=21,
    save_fig=figure_path,
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

## Hypothesis: Numbers vs Words

In [None]:
# Select the "Numbers vs Words" hypothesis.
# The key must be assigned to `hypothesis` (not `hypo_name`): otherwise the
# lookup below silently reuses whatever `hypothesis` the previous cell left
# behind and this cell re-plots (and overwrites) that other hypothesis.
# NOTE(review): assumes "Numbers vs Words" is a key of hypothesis.json - confirm.
hypothesis = "Numbers vs Words"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

# Rows of the full results that belong to this hypothesis.
df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Accuracy per set operation, split by token type (hue).
fig = create_violin_agg(
    df_new,
    x_name="Set operation",
    hue="Token type",
    figure_size=(8, 4),
    save_fig=create_fig_path(hypo_name, folder=FOLDER_NAME),
    legend_loc="lower right",
    fontsize=18,
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

## Hypothesis: Operand Size (filtered to include all LLMs besides Haiku)

In [None]:
# Settings for the operand-size hypothesis.
hypothesis = "Operand Size"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Pass every operand-size label through `add_nl` before plotting.
# NOTE(review): presumably this inserts a line break for tick labels - confirm.
operand_labels = df_new["Operand size"].apply(add_nl)
df_new["Operand size"] = operand_labels

# The output name from the config is deliberately replaced by a fixed one
# for both the figure and the saved data.
hypo_name = "operand_size"
violin_options = dict(
    x_name="Operand size",
    figure_size=(8, 4),
    fontsize=18,
    split_bar=False,
    save_fig=create_fig_path(hypo_name, folder=FOLDER_NAME),
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)
fig = create_violin_agg(df_new, **violin_options)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

## Hypothesis: LLM Comparison

In [None]:
# Settings and output name for the per-LLM comparison.
hypothesis = "LLM Comparison"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# One violin per LLM; the figure path is resolved before plotting.
figure_path = create_fig_path(hypo_name, folder=FOLDER_NAME)
fig = create_violin_agg(
    df_new,
    x_name="LLM",
    split_bar=False,
    figure_size=(12, 4),
    fontsize=14,
    save_fig=figure_path,
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

## Hypothesis: Token Frequency

In [None]:
# Settings and output name for the token-frequency hypothesis.
hypothesis = "Token Frequency"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Relabel the frequency values with `add_text`, then turn the column into an
# ordered categorical following TOKEN_ORDER so the hue levels plot in that
# order. Labels absent from TOKEN_ORDER become NaN via `set_categories`.
df_new["Token frequency"] = (
    df_new["Token frequency"]
    .apply(add_text)
    .astype("category")
    .cat.set_categories(TOKEN_ORDER, ordered=True)
)

# Accuracy per set operation, one hue level per token-frequency category.
violin_options = dict(
    x_name="Set operation",
    hue="Token frequency",
    figure_size=(12, 4),
    split_bar=False,
    legend_loc="outer right",
    save_fig=create_fig_path(hypo_name, folder=FOLDER_NAME),
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)
fig = create_violin_agg(df_new, **violin_options)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

### Deciles: Distribution Plots & Table


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Histograms of the accuracy difference (token length 5 minus token length 3)
# for each token-frequency category, plus a summary table of those
# differences. This cell reads `df_new` as left by the Token Frequency cell.

# Columns that must agree for a length-3 row and a length-5 row to be paired.
pairing_columns = [
    "Token type",
    "Set operation",
    "Demonstration phrasing",
    "Prompting method",
    "Operand size",
    "Number of demonstrations",
    "Max Value",
    "Token similarity",
    "Token frequency",
    "Relationship between sets A and B",
    "N Samples",
    "LLM",
]

# 2 x 4 grid of panels, flattened so panels can be addressed by loop index.
fig, axes = plt.subplots(nrows=2, ncols=4, figsize=(8, 4), dpi=600)
axes = axes.flatten()

full_accuracy = []
# The first entry of TOKEN_ORDER is skipped.
for i, token_freq in enumerate(TOKEN_ORDER[1:]):
    in_category = df_new["Token frequency"] == token_freq
    length_3_rows = df_new[(df_new["Token length"] == 3) & in_category]
    length_5_rows = df_new[(df_new["Token length"] == 5) & in_category]

    # Pair rows that share every experimental setting; the non-key columns
    # are kept twice, suffixed by the token length they come from.
    paired = length_3_rows.merge(
        length_5_rows,
        on=pairing_columns,
        how="inner",
        suffixes=("_tl_3", "_tl_5"),
    )
    paired["Accuracy"] = (
        paired["Avg Accuracy_tl_5"] - paired["Avg Accuracy_tl_3"]
    )

    accuracy_diff = paired["Accuracy"]
    full_accuracy.append(
        {
            "decile": token_freq,
            "accuracy_diff_mean": accuracy_diff.mean(),
            "accuracy_diff_std": accuracy_diff.std(),
            "accuracy_diff_min": accuracy_diff.min(),
            "accuracy_diff_max": accuracy_diff.max(),
            "n_comparisons": len(paired),
        }
    )

    # Stop once the grid is full. The summary row above has already been
    # recorded for this category before the loop ends.
    if i >= len(axes):
        break

    panel = axes[i]
    sns.histplot(data=paired, x="Accuracy", ax=panel, kde=True, bins=20)
    panel.set_title(f"{token_freq}")
    # Identical limits on every panel.
    panel.set_xlim(-82, 82)
    panel.set_ylim(0, 50)
    if i not in (0, 4):
        # Only the first panel of each row keeps its y ticks and label.
        panel.set_yticks([])
        panel.set_ylabel("")
    if i >= 4:
        panel.set_xticks([-60, -30, 0, 30, 60])
    else:
        # Top-row panels carry no x ticks or label.
        panel.set_xticks([])
        panel.set_xlabel("")

# Adjust the spacing between subplots
plt.tight_layout()

# Titles and axis labels use the larger size, tick labels the smaller one.
label_fontsize, tick_fontsize = 18, 14
for ax in fig.axes:
    for text_item in (ax.title, ax.xaxis.label, ax.yaxis.label):
        text_item.set_fontsize(label_fontsize)
    for tick_label in ax.get_xticklabels() + ax.get_yticklabels():
        tick_label.set_fontsize(tick_fontsize)

# Display the plot, then write it out as a PDF.
plt.show()
fig.savefig(
    create_fig_path("deciles_token_length", folder=FOLDER_NAME),
    bbox_inches="tight",
    backend="pdf",
)

# Summary table of the accuracy differences, rounded to two decimals.
df_deciles = pd.DataFrame(full_accuracy).round(2)

## Hypothesis: Deceptiveness

In [None]:
# Settings and output name for the swapping-effect hypothesis.
hypothesis = "Deceptiveness Swapping Effect"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Replace the 0/1 codes with readable labels and fix the category order so
# "disjoint" comes before "intermingled". `reorder_categories` raises if the
# column does not contain exactly these two categories.
relationship_col = "Relationship between sets A and B"
relationship_labels = {
    0: "Semantically disjoint",
    1: "Semantically intermingled",
}
df_new[relationship_col] = (
    df_new[relationship_col]
    .replace(relationship_labels)
    .astype("category")
    .cat.reorder_categories(list(relationship_labels.values()))
)

# Accuracy per set operation, split by the relationship between the sets.
figure_path = create_fig_path(hypo_name, folder=FOLDER_NAME)
fig = create_violin_agg(
    df_new,
    x_name="Set operation",
    hue="Relationship between sets A and B",
    split_bar=True,
    figure_size=(12, 6),
    save_fig=figure_path,
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

In [None]:
# Re-plot the data of the previous cell, colouring by "Token similarity"
# instead of the original set-relationship column. The similarity column is
# relabelled and takes over the "Relationship between sets A and B" name so
# the same hue argument can be used.
#
# The work is done on a derived frame and `df_new` is left untouched. The
# earlier in-place drop/rename made this cell fail with a KeyError on
# "Token similarity" when it was run a second time.
relationship_col = "Relationship between sets A and B"
df_token_similarity = df_new.drop(columns=[relationship_col])
df_token_similarity["Token similarity"] = df_token_similarity[
    "Token similarity"
].replace(
    {
        0: "No semantic manipulation",
        1: "Intermingled sets with similar semantic words",
    }
)
df_token_similarity = df_token_similarity.rename(
    columns={"Token similarity": relationship_col}
)

# NOTE(review): `hypo_name` is still the one set by the previous cell, so the
# figure path and the saved data for that name are presumably overwritten
# here - confirm this is intended.
fig = create_violin_agg(
    df_token_similarity,
    x_name="Set operation",  # "Operand size",
    hue="Relationship between sets A and B",
    figure_size=(12, 6),
    save_fig=create_fig_path(hypo_name, folder=FOLDER_NAME),
    split_bar=True,
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)

save_config_and_data(df_token_similarity, hypo_name, SUPPLEMENTARY_ROOT)

In [None]:
# Settings and output name for the k-shot deceptiveness hypothesis.
hypothesis = "Deceptiveness Effect K Shots"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Replace the 0/1 codes with readable labels and store them as a categorical.
relationship_col = "Relationship between sets A and B"
relationship_labels = {
    0: "Semantically disjoint",
    1: "Semantically intermingled",
}
readable_relationship = df_new[relationship_col].replace(relationship_labels)
df_new[relationship_col] = readable_relationship.astype("category")

# Accuracy per number of demonstrations, split by set relationship.
violin_options = dict(
    x_name="Number of demonstrations",
    hue="Relationship between sets A and B",
    figure_size=(12, 6),
    split_bar=True,
    save_fig=create_fig_path(hypo_name, folder=FOLDER_NAME),
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)
fig = create_violin_agg(df_new, **violin_options)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

## Word Source (Deceptive Words for Word Sampler: Random Baseline vs Overlap Sampler vs Non-Overlap Sampler)

In [None]:
# Settings and output name for the word-sampler comparison.
hypothesis = "Deceptiveness Effect For Word and Overlap Samplers"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Translate each study name into its sampler label. A study name missing
# from STUDY2DECEPTIVE_WORD_SAMPLER raises KeyError.
sampler_labels = df_new["Study Name"].apply(
    lambda study: STUDY2DECEPTIVE_WORD_SAMPLER[study]
)
df_new["Word sampler sampling method"] = sampler_labels

# NOTE(review): unlike the other figure cells, no `folder=FOLDER_NAME` is
# passed to `create_fig_path`, so its default folder applies - confirm this
# is intended.
figure_path = create_fig_path(hypo_name)
fig = create_violin_agg(
    df_new,
    x_name="Word sampler sampling method",
    split_bar=False,
    figure_size=(12, 6),
    save_fig=figure_path,
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

In [None]:
# Same sampler comparison as above, broken down by set operation.
hypothesis = "Deceptiveness Effect For Word and Overlap Samplers For Each Set Operation"
hypothesis_config = hypothesis_configs[hypothesis]
hypo_name = hypothesis_config["hypo_name"]

df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

# Translate each study name into its sampler label. A study name missing
# from STUDY2DECEPTIVE_WORD_SAMPLER raises KeyError.
sampler_labels = df_new["Study Name"].apply(
    lambda study: STUDY2DECEPTIVE_WORD_SAMPLER[study]
)
df_new["Word sampler sampling method"] = sampler_labels

# NOTE(review): no `folder=FOLDER_NAME` is passed to `create_fig_path` here
# either, so its default folder applies - confirm this is intended.
violin_options = dict(
    x_name="Set operation",
    hue="Word sampler sampling method",
    figure_size=(14, 8),
    split_bar=False,
    save_fig=create_fig_path(hypo_name),
    save_raw_experiment=1,
    supp_root=SUPPLEMENTARY_ROOT,
)
fig = create_violin_agg(df_new, **violin_options)

save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

## Hypothesis: Overlap Effect

In [None]:
# The overlap hypothesis is evaluated twice with identical plotting steps:
# first for numbers, then for words. For each one, two figures are produced
# and saved: accuracy per set operation split by token type, and an
# "_overall" figure with one violin per token type.
for hypothesis in (
    "Is Overlap Important in Numbers?",
    "Is Overlap Important in Words?",
):
    hypothesis_config = hypothesis_configs[hypothesis]
    hypo_name = hypothesis_config["hypo_name"]

    df_new = create_filtered_df_for_hypothesis(df_results, hypothesis_config)

    # Per-operation view, token type as hue.
    fig = create_violin_agg(
        df_new,
        x_name="Set operation",
        hue="Token type",
        split_bar=True,
        figure_size=(8, 4),
        fontsize=18,
        save_fig=create_fig_path(hypo_name, folder=FOLDER_NAME),
        save_raw_experiment=1,
        supp_root=SUPPLEMENTARY_ROOT,
    )
    save_config_and_data(df_new, hypo_name, SUPPLEMENTARY_ROOT)

    # Overall view: token type on the x axis, same data.
    overall_name = hypo_name + "_overall"
    fig = create_violin_agg(
        df_new,
        x_name="Token type",
        split_bar=False,
        figure_size=(6, 4),
        fontsize=18,
        save_fig=create_fig_path(overall_name, folder=FOLDER_NAME),
        save_raw_experiment=1,
        supp_root=SUPPLEMENTARY_ROOT,
    )
    save_config_and_data(df_new, overall_name, SUPPLEMENTARY_ROOT)

### Deciles: Count 

In [None]:
import re
from collections import Counter

import matplotlib
import pandas as pd

from setlexsem.constants import PATH_DATA_ROOT

# Label attached to every row of the unique-word table built further down.
llm_name = "Claude Haiku"

# Directory holding the CSV files; the file NAME carries a "Decile-<n>" and
# an "L-<n>" marker.
data_path = os.path.join(PATH_DATA_ROOT, "decile_words")

# Same ranges the original nested loops iterated over.
decile_range = range(1, 10)
token_length_range = range(1, 6)

# Parse the numbers with a regex so the FULL number is read. A plain substring
# test would file "Decile-10" under "Decile-1" (and "L-12" under "L-1").
decile_pattern = re.compile(r"Decile-(\d+)")
length_pattern = re.compile(r"L-(\d+)")

# Maps "Decile-<i>_L-<j>" to the list of DataFrames read for that bucket.
dfs_by_deciles = {}

# Sorted so the bucket order (and everything derived from it) does not depend
# on the arbitrary order returned by os.listdir.
for file in sorted(os.listdir(data_path)):
    if not file.endswith(".csv"):
        continue

    decile_match = decile_pattern.search(file)
    length_match = length_pattern.search(file)
    if decile_match is None or length_match is None:
        continue

    decile_idx = int(decile_match.group(1))
    length_idx = int(length_match.group(1))
    if decile_idx not in decile_range or length_idx not in token_length_range:
        continue

    # Read the CSV only once the file is known to belong to a bucket.
    df = pd.read_csv(os.path.join(data_path, file))
    key = f"Decile-{decile_idx}_L-{length_idx}"
    dfs_by_deciles.setdefault(key, []).append(df)

# Number of distinct words per bucket, pooled over all files of that bucket.
unique_words_dict = {}
for key, dfs in dfs_by_deciles.items():
    bucket_words = set()
    for df in dfs:
        # `concat_sets` is applied row-wise; each result is iterated as a
        # collection of words.
        # NOTE(review): presumably it merges the two set columns of a row -
        # confirm against hypothesis_testing_utils.
        combined_sets = df.apply(concat_sets, axis=1).tolist()
        for word_set in combined_sets:
            bucket_words.update(word_set)
    unique_words_dict[key] = len(bucket_words)


def get_decile_and_length(s):
    """Extract the decile and token-length numbers from a bucket key.

    Args:
        s: String containing "Decile-<digits>" and "L-<digits>",
            e.g. "Decile-3_L-5".

    Returns:
        Tuple ``(decile, length)`` of the two digit groups, as strings.

    Raises:
        AttributeError: If either marker is missing from ``s``.
    """
    decile_match = re.search(r"Decile-(\d+)", s)
    length_match = re.search(r"L-(\d+)", s)
    return decile_match.group(1), length_match.group(1)


# One table row per "Decile-<i>_L-<j>" bucket with its unique-word count.
data = []

for key, value in unique_words_dict.items():
    decile, L = get_decile_and_length(key)
    data.append(
        {
            "Token frequency": decile,
            "Token length": L,
            "Unique Words": value,
            "LLM": llm_name,
        }
    )

df_deciles_L = pd.DataFrame(data)

# Deciles and lengths are digit STRINGS, so order them numerically; a plain
# string sort would place "10" before "2". A fixed hue order also makes the
# bar order independent of the order in which the buckets were collected.
decile_order = sorted(df_deciles_L["Token frequency"].unique(), key=int)
length_order = sorted(df_deciles_L["Token length"].unique(), key=int)
df_deciles_L["Token frequency"] = pd.Categorical(
    df_deciles_L["Token frequency"],
    categories=decile_order,
)

# Apply the seaborn theme FIRST (font scale 1.5, i.e. 50% larger text).
# `sns.set` resets matplotlib's rcParams, including `font.family`, so the
# font settings below must come after it or they are silently discarded.
sns.set(font_scale=1.5)

matplotlib.rcParams["mathtext.rm"] = "Bitstream Vera Sans"
matplotlib.rcParams["mathtext.it"] = "Bitstream Vera Sans:italic"
matplotlib.rcParams["mathtext.bf"] = "Bitstream Vera Sans:bold"
matplotlib.rcParams["mathtext.fontset"] = "stix"
matplotlib.rcParams["font.family"] = "STIXGeneral"

# Explicit figure/axes so every call below targets this plot.
fig, ax = plt.subplots(figsize=(12, 6))

# Unique-word count per decile, one bar per token length.
sns.barplot(
    data=df_deciles_L,
    x="Token frequency",
    y="Unique Words",
    hue="Token length",
    hue_order=length_order,
    ax=ax,
)

# White background with light horizontal grid lines only.
ax.set_facecolor("white")
ax.grid(axis="y", color="lightgray")

# Make the left and bottom axis lines clearly visible.
for side in ("left", "bottom"):
    ax.spines[side].set_color("black")
    ax.spines[side].set_linewidth(1.5)

# Set labels and (empty) title
ax.set_xlabel("Deciles")
ax.set_ylabel("Count")
ax.set_title("")

# Save the figure as a PDF
fig.savefig(
    os.path.join(
        PATH_ROOT, f"manuscript/{FOLDER_NAME}/deciles_unique_words.pdf"
    )
)

# Show the plot
plt.show()

## END