In [1]:
%load_ext autoreload 
%autoreload 2

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import json
import pandas as pd
from matplotlib.colors import LinearSegmentedColormap
import numpy as np
from chatbot_personalization.utils.helper_functions import get_base_dir_path

ModuleNotFoundError: No module named 'matplotlib'

# Generate Figure 1

In [None]:
def create_heatmap(pivot_table, title_name, save_path, vmin=None, vmax=None):
    """Draw an annotated heatmap of ``pivot_table`` and save it as a PDF.

    Args:
        pivot_table: Table handed to ``sns.heatmap``; each cell is annotated
            with its value formatted to two decimals.
        title_name: Figure title. Nothing is drawn when it is falsy.
        save_path: Output location without extension; ".pdf" is appended.
        vmin: Lower colour limit. Defaults to ``pivot_table.min().min()``.
        vmax: Upper colour limit. Defaults to ``pivot_table.max().max()``.
    """
    # Fall back to the table's own extremes when no colour limits are given.
    vmin = pivot_table.min().min() if vmin is None else vmin
    vmax = pivot_table.max().max() if vmax is None else vmax

    # Two-stop gradient from near-white to blue.
    light_to_blue = LinearSegmentedColormap.from_list(
        "custom_cmap", ["#F8F8FF", "#3753A5"]
    )
    heatmap_options = dict(
        annot=True,
        cmap=light_to_blue,
        linewidths=2.0,
        fmt=".2f",
        cbar=False,  # the colour bar is intentionally left out of the figure
        vmin=vmin,
        vmax=vmax,
        annot_kws={"size": 20},
    )

    plt.figure(figsize=(10, 9))
    sns.heatmap(pivot_table, **heatmap_options)

    # Tick styling: y axis first, then x axis, both horizontal.
    plt.tick_params(axis="both", which="major", labelsize=20)
    ticks_font = 30
    for style_ticks in (plt.yticks, plt.xticks):
        style_ticks(fontsize=ticks_font, rotation=0)

    # Axis titles.
    plt.xlabel("Distribution of Predicted Ratings", fontsize=32, labelpad=15)
    plt.ylabel("Participant Ratings", fontsize=32)

    # Optional figure title.
    if title_name:
        plt.title(title_name, fontsize=25)

    # Write the figure next to ``save_path`` with a .pdf suffix.
    plt.savefig(f"{save_path}.pdf", bbox_inches="tight")

In [None]:
def create_histo_heatmap(
    pivot_table, title_name, save_path, vmin=None, vmax=None, bin_edges=None
):
    """Draw an annotated heatmap whose columns are expected-value bins, and save it as a PDF.

    Args:
        pivot_table: Table handed to ``sns.heatmap``; each cell is annotated
            with its value formatted to two decimals. Its columns are labelled
            positionally with the bin intervals, so it should have one column
            per bin, in bin order.
        title_name: Figure title. Nothing is drawn when it is falsy.
        save_path: Output location without extension; ".pdf" is appended.
        vmin: Lower colour limit. Defaults to ``pivot_table.min().min()``.
        vmax: Upper colour limit. Defaults to ``pivot_table.max().max()``.
        bin_edges: Increasing sequence of bin edges used to build the x tick
            labels ("[lo, hi)" for each consecutive pair). Defaults to nine
            evenly spaced edges over [0, 4], i.e. eight half-unit bins.
    """
    # Create the custom color map
    color_map = LinearSegmentedColormap.from_list("custom_cmap", ["#F8F8FF", "#3753A5"])

    # Determine vmin and vmax if not provided
    if vmin is None:
        vmin = pivot_table.min().min()
    if vmax is None:
        vmax = pivot_table.max().max()

    # Default edges reproduce the original fixed labels "[0, 0.5)" ... "[3.5, 4)".
    if bin_edges is None:
        bin_edges = np.linspace(0, 4, num=9)

    # Set the figure size
    plt.figure(figsize=(15, 9))

    sns.heatmap(
        pivot_table,
        annot=True,
        cmap=color_map,
        linewidths=2.0,
        fmt=".2f",
        cbar=False,
        vmin=vmin,
        vmax=vmax,
        annot_kws={"size": 20},
    )

    # Adjust the size of the ticks
    plt.tick_params(axis="both", which="major", labelsize=20)
    ticks_font = 30
    plt.yticks(fontsize=ticks_font, rotation=0)

    # Adjust the font size for the heatmap and axis titles
    plt.xlabel("Expected Value of Predicted Ratings", fontsize=30, labelpad=15)
    plt.ylabel("Participant Ratings", fontsize=32)

    # Set the title if provided
    if title_name:
        plt.title(title_name, fontsize=25)

    # Build one "[lo, hi)" label per bin; ":g" drops trailing zeros (1.0 -> "1").
    bin_intervals = [
        f"[{lower:g}, {upper:g})"
        for lower, upper in zip(bin_edges[:-1], bin_edges[1:])
    ]
    # Heatmap cells span [k, k + 1) on the x axis, so k + 0.5 centres each label.
    tick_positions = np.arange(len(bin_intervals)) + 0.5
    plt.xticks(
        ticks=tick_positions, labels=bin_intervals, rotation=0, ha="center", fontsize=23
    )

    # Save the heatmap figure in PDF format without the color bar
    plt.savefig(f"{save_path}.pdf", bbox_inches="tight")

In [None]:
def get_probabilities(*, completion, token_idx):
    """Return the top-token probabilities at one position of a GPT completion.

    Args:
        completion: Completion object (dict-like) with exactly one entry under
            "choices", whose ["logprobs"]["top_logprobs"] is a sequence of
            {token: logprob} mappings, one per generated position.
        token_idx: Position in "top_logprobs" to read.

    Returns:
        A float ``pd.Series`` indexed by token holding ``exp(logprob)``, in
        the order the tokens appear in the mapping. When ``token_idx`` is past
        the end of "top_logprobs", an empty float Series is returned so that
        callers can use Series attributes such as ``.values`` unconditionally.
    """
    choices = completion["choices"]
    assert len(choices) == 1
    try:
        logprobs = choices[0]["logprobs"]["top_logprobs"][token_idx]
    except IndexError:
        # Keep the return type uniform: an empty Series rather than a bare list.
        return pd.Series(dtype=float)
    # Explicit float dtype keeps an empty mapping from producing an object Series.
    return np.exp(pd.Series(logprobs, dtype=float))


def expected_value(array):
    """Return the position-weighted sum of ``array``: sum of ``i * array[i]``.

    When ``array`` holds probabilities over the outcomes 0, 1, 2, ... this is
    the expected outcome. An empty ``array`` gives 0.
    """
    weighted_terms = [position * weight for position, weight in enumerate(array)]
    return sum(weighted_terms)

# Load data

In [None]:
# Read the logged queries and attach, per row, the array of probabilities
# found at position 1 of the completion's top_logprobs.
filename = get_base_dir_path() / "data/validate_disc_query_logs.csv"

df = pd.read_csv(filename, index_col=0)
# np.asarray accepts whatever get_probabilities returns (a Series, or an empty
# list-like when the completion has no token at that position), whereas
# ``.values`` would raise AttributeError on a plain list.
df["probs"] = df["completion"].apply(
    lambda x: np.asarray(get_probabilities(completion=json.loads(x), token_idx=1))
)

In [None]:
# Row i of freq_table is the element-wise mean of the "probs" arrays over all
# rows whose correct_choice equals i.
freq_table = pd.DataFrame(columns=range(0, 5), index=range(0, 5)).astype(float)
for choice in range(5):
    probs_for_choice = df[df["correct_choice"] == choice]["probs"]
    # Expand each array into columns, then average column-wise.
    mean_probs = probs_for_choice.apply(pd.Series).mean()
    freq_table.loc[choice] = mean_probs.tolist()

# Generate Figure 1a

In [None]:
# Figure 1a: heatmap of freq_table, written to <save_path>.pdf with the colour
# scale capped at 0.5 and no title.
save_path = get_base_dir_path() / "plots/fig1_disc_query_eval_avg"
create_heatmap(
    pivot_table=freq_table,
    title_name="",
    save_path=save_path,
    vmax=0.5,
)

In [None]:
# Number of bin EDGES: nine edges over [0, 4] give bins - 1 = 8 half-unit bins.
bins = 9

# Work on a copy and keep the existing columns (notably "probs") so this cell
# can be re-run without a KeyError.
df = df.copy()
df["expected_value"] = df["probs"].apply(expected_value)

# Define bin edges
bins_edges = np.linspace(0, 4, num=bins)
# Bin the 'expected_value' column into integer bin ids 0 .. bins - 2.
# NOTE(review): pd.cut is right-closed by default, so a value exactly on an
# edge falls in the lower bin, while the figure's tick labels read "[lo, hi)".
df["bins"] = pd.cut(
    df["expected_value"], bins=bins_edges, labels=False, include_lowest=True
)
# Count occurrences
count_df = df.groupby(["correct_choice", "bins"]).size().reset_index(name="count")
# Create the pivot table. Reindex the columns to every bin id so that a bin
# with no observations still gets an (all-zero) column; otherwise the columns
# would shift under the fixed per-bin tick labels used when plotting.
pivot_table = (
    count_df.pivot(index="correct_choice", columns="bins", values="count")
    .reindex(columns=range(bins - 1))
    .fillna(0)
)
# Normalize the counts to get fractions (each row sums to 1)
pivot_table = pivot_table.div(pivot_table.sum(axis=1), axis=0)

# Generate Figure 1b

In [None]:
# Figure 1b: heatmap of the binned expected-value fractions, written to
# <save_path2>.pdf with the colour scale capped at 0.5 and no title.
save_path2 = get_base_dir_path() / "plots/fig1_disc_query_eval_ev"
create_histo_heatmap(
    pivot_table=pivot_table,
    title_name="",
    save_path=save_path2,
    vmax=0.5,
)