This notebook produces figures related to the counterfactual-inspired experiment for the appendix.

# Imports

In [None]:
# to import from mturk folder
import os, sys, inspect

In [None]:
# Locate the directory of the currently executing code and, two levels above it,
# the "mturk" folder, so that `from mturk import ...` can be resolved.
_this_file = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(_this_file)
_two_levels_up = os.path.dirname(os.path.dirname(currentdir))
mturkdir = os.path.join(_two_levels_up, "mturk")
# Prepend (instead of append) so this folder wins over other entries on the path.
sys.path.insert(0, mturkdir)

In [None]:
from mturk import RepeatedTaskResult
import numpy as np
from matplotlib import pyplot as plt
import pickle
from glob import glob
import pandas as pd
import seaborn as sns
import json


In [None]:
import utils_figures as utf
import utils_figures_helper as utf_helper
import utils_MTurk_figures as utf_mturk
import utils_data as utd

# Parameters

In [None]:
# path to the folder containing the pkl files generated by the experiment's code
# (the trial-structure json files are loaded from this folder as well)
raw_results_folder = "data/counterfactual_experiment"

# if True, the processed dataframes are written as csv files into raw_results_folder
save_csv = False
# if True, the "*_with_baselines.json" trial structures are loaded instead of the plain ones
include_baselines = True

# START for figures
# if True, the figures folder is created; the flag is also passed to the plotting helpers
save_fig = False
# name of the folder in ./figures/ where all resulting figures will be saved
exp_str = "counterfactual_experiment"
# instruction conditions to load and plot; their order defines the order of `labels`
instr_type_list = ["optimized", "natural", "mixed", "blur", "none"]
# NOTE(review): the next two lists are not referenced in the visible cells of this notebook
branches_labels_list = ["3x3", "pool"]
kernel_size_list = ["1", "3"]
# END for figures

# START for payment
# NOTE(review): payment settings are not referenced in the visible cells of this notebook;
# units (presumably USD per HIT) should be confirmed against the experiment's documentation
mturk_payment_one_HIT = 2.34
mturk_payment_one_HIT_none = 0.84
repetition_factor_due_to_exclusion = 1.35
expected_distinct_workers = 50
# END for payment

In [None]:
# Display names used in the figures for each instruction condition.
instruction_labels = dict(
    optimized="Synthetic",
    natural="Natural",
    mixed="Mixed",
    none="None",
    blur="Blur",
)
# Display names in the same order as the configured instruction conditions.
labels = [instruction_labels[condition] for condition in instr_type_list]

# Load data

In [None]:
"""
Check if the `calculate_relative_activation_difference.py` script was already run for all json configurations
If not, do so.
to add the query activation information to the structure
save the resulting files with the filenames shown below
"""

if include_baselines:
    structure_json_map = {
        "natural": "natural_with_baselines.json",
        "optimized": "optimized_with_baselines.json",
        "mixed": "mixed_with_baselines.json",
        "blur": "natural_blur_with_baselines.json",
        "none": "natural_with_baselines.json",
    }
else:
    structure_json_map = {
        "natural": "natural.json",
        "optimized": "optimized.json",
        "mixed": "mixed.json",
        "blur": "natural_blur.json",
        "none": "natural.json",
    }

trial_structures = utd.load_and_parse_trial_structure(
    raw_results_folder, [structure_json_map[it] for it in instr_type_list]
)
trial_structures = {k: v for k, v in zip(instr_type_list, trial_structures)}
df, df_checks, df_feedback = utd.load_and_parse_all_results(
    raw_results_folder, instr_type_list
)

Add a column to the result df indicating whether the row belongs to an excluded or included response

In [None]:
def get_map_excluded_responses(column_name="passed_checks", checks_df=None):
    """Build a row-wise mapper that tells whether a trial belongs to an excluded response.

    Parameters
    ----------
    column_name : str
        Boolean column of the checks frame; a response counts as excluded when
        this column is falsy for it.
    checks_df : pandas.DataFrame, optional
        Frame with the columns ``task_id``, ``response_index`` and ``column_name``.
        If omitted, the notebook-level ``df_checks`` is used; it is looked up when
        a row is mapped, not when the mapper is created.

    Returns
    -------
    callable
        Function taking a row (anything indexable by ``"task_id"`` and
        ``"response_index"``) and returning ``True`` if the response is excluded.

    Raises
    ------
    ValueError
        Raised by the returned mapper if the checks frame does not contain
        exactly one entry for the row's task/response combination.
    """

    def map_excluded_responses(row):
        checks = df_checks if checks_df is None else checks_df
        matches = checks.loc[
            (checks["task_id"] == row["task_id"])
            & (checks["response_index"] == row["response_index"]),
            column_name,
        ]
        # Exactly one check entry per response is expected; say which response is
        # affected instead of failing with the generic `.item()` size error.
        if len(matches) != 1:
            raise ValueError(
                f"expected exactly one check entry for task_id={row['task_id']!r}, "
                f"response_index={row['response_index']!r}, found {len(matches)}"
            )
        return not matches.item()

    return map_excluded_responses


# Flag every trial row whose response did not pass the exclusion checks.
excluded_flags = df.apply(get_map_excluded_responses("passed_checks"), axis=1)
df["excluded_response"] = excluded_flags

Create a unique column based on task id and response id (unique within each task)

In [None]:
# Both frames are replaced by the versions returned by the helper.
# NOTE(review): the inputs are overwritten, so re-running this cell passes the already
# processed frames back in — confirm that utd.add_task_response_id tolerates this.
df, df_checks = utd.add_task_response_id(df, df_checks)

In [None]:
# Partition the trials into main, catch and demo trials, each with a fresh 0..n-1 index.
# `reset_index(drop=True)` discards the old index directly instead of first inserting it
# as a column and dropping it again, which would fail if `df` already had a column
# called "index" or a differently named index.
not_demo = df["is_demo"] == False
df_main = df[(df["catch_trial"] == False) & not_demo].reset_index(drop=True)
df_catch_trials = df[(df["catch_trial"] == True) & not_demo].reset_index(drop=True)
df_demo_trials = df[df["is_demo"] == True].reset_index(drop=True)

Append structure information such as layer, kernel size, etc. to the dataframe

In [None]:
# Enrich the main trials first, then the catch trials, with their trial-structure information.
df_main, df_catch_trials = (
    utd.append_trial_structure_to_results(trials, trial_structures)
    for trials in (df_main, df_catch_trials)
)

Split data up in trials belonging to excluded responses, and those that passed the exclusion criteria

In [None]:
def _split_by_exclusion(trials):
    """Return ``(excluded, not_excluded)`` row subsets based on the ``excluded_response`` column."""
    excluded_mask = trials["excluded_response"]
    return trials[excluded_mask], trials[~excluded_mask]


df_main_excluded, df_main_not_excluded = _split_by_exclusion(df_main)

df_catch_trials_excluded, df_catch_trials_not_excluded = _split_by_exclusion(
    df_catch_trials
)

df_demo_trials_excluded, df_demo_trials_not_excluded = _split_by_exclusion(
    df_demo_trials
)

Calculate how often the demo trials had to be repeated

In [None]:
# df_checks is replaced by the helper's return value, computed from the demo trials.
# NOTE(review): which column(s) the helper adds is defined in utils_data — verify there.
df_checks = utd.checks_add_demo_trial_repetitions(df_demo_trials, df_checks)

In [None]:
# Both frames are replaced by the values returned by utd.process_checks.
# NOTE(review): df_main / df_catch_trials / df_demo_trials were derived from `df` in an
# earlier cell and are not refreshed here — confirm that they do not need anything this
# step adds to `df`.
df, df_checks = utd.process_checks(df, df_checks)

In [None]:
# Built from ALL catch trials (not only the non-excluded ones) together with the checks.
# NOTE(review): judging by the name, this presumably keeps catch trials of responses that
# pass the exclusion criteria when the catch-trial criterion itself is ignored — verify in
# utils_data. The result is not used in the visible cells of this notebook.
df_catch_trials_not_excluded_ignoring_catch_trials = utd.get_catch_trials_as_main_data(
    df_catch_trials, df_checks
)

In [None]:
# Split the per-response checks by whether the response passed all exclusion criteria.
passed_mask = df_checks["passed_checks"]
df_checks_not_excluded = df_checks[passed_mask]
df_checks_excluded = df_checks[~passed_mask]

In [None]:
if save_csv:
    # (dataframe, file name) pairs; written next to the raw results
    csv_exports = (
        (df_checks, "df_exclusion_criteria.csv"),
        (df, "df_trials.csv"),
    )
    for frame, csv_name in csv_exports:
        frame.to_csv(os.path.join(raw_results_folder, csv_name))

# Plot it!

In [None]:
# All figures of this notebook belong into ./figures/<exp_str>/
figures_folder = os.path.join("figures", exp_str)

# The folder is only created when figures are actually going to be written.
if save_fig:
    os.makedirs(figures_folder, exist_ok=True)
    print("Saving results to", figures_folder)

# Figure 8: Overlap between labels of query and reference images

Please see the following file to produce this figure: `tools/data-generation/causal-occlusion/C_get_labels_of_natural_reference_and_default_images.ipynb`.

# Figure 12: Confidence

In [None]:
# Confidence ratings of the included main trials: once over all trials (None), then
# restricted via conditioned_on="correctness" and conditioned_on="falseness".
for confidence_condition in (None, "correctness", "falseness"):
    utf.make_plot_natural_are_better_wrt_confidence(
        df_main_not_excluded,
        results_folder=figures_folder,
        save_fig=save_fig,
        instr_type_list=instr_type_list,
        conditioned_on=confidence_condition,
        labels=labels,
    )

# Figure 13: Reaction Times

In [None]:
# Reaction times of the included main trials: once over all trials (None), then
# restricted via conditioned_on="correctness" and conditioned_on="falseness".
# The notebook-level `save_fig` flag is honoured (instead of always saving), because the
# figures folder is only created when that flag is set.
for reaction_time_condition in (None, "correctness", "falseness"):
    utf.make_plot_natural_are_better_wrt_reaction_time(
        df_main_not_excluded,
        results_folder=figures_folder,
        save_fig=save_fig,
        instr_type_list=instr_type_list,
        conditioned_on=reaction_time_condition,
        labels=labels,
    )

# Figure 14: Analyze performance for each unique batch of data of selected units

In [None]:
# Units analysed per image set, keyed by difficulty. "layer" and "kernel_size" are
# strings because they are compared against the string-valued dataframe columns.
unit_dict = {
    "easy": {"layer": "7", "kernel_size": "3", "marker": "x"},
    "intermediate": {"layer": "6", "kernel_size": "3", "marker": "+"},
    "difficult": {"layer": "1", "kernel_size": "3", "marker": "3"},
}

In [None]:
def generate_dfs_grouped_by_batch(df, batch_ids):
    """Lazily yield, for each id in ``batch_ids``, the rows of ``df`` belonging to that batch.

    A copy of ``df`` is taken when iteration starts, so the yielded subsets are
    selected from that snapshot. Ids without matching rows yield an empty frame.
    """
    snapshot = df.copy()
    batch_column = snapshot["batch"]
    yield from (snapshot[batch_column == batch_id] for batch_id in batch_ids)

In [None]:
# One figure per selected unit: a grid of bar plots (one panel per image set / batch)
# showing the proportion of correct responses for every instruction condition.
n_cols = 5  # panels per row

for difficulty_i, unit_spec_i in unit_dict.items():
    print(difficulty_i)

    # Deliberately NOT called `df`: rebinding `df` here would silently replace the
    # notebook-level trial dataframe with this subset.
    df_unit = df_main_not_excluded[
        (df_main_not_excluded["layer"] == unit_spec_i["layer"])
        & (df_main_not_excluded["kernel_size"] == unit_spec_i["kernel_size"])
    ]

    batch_ids = sorted(df_unit["batch"].unique().tolist())

    # Two rows for up to ten image sets; additional rows are added when there are more,
    # so that no panel is drawn over another one.
    n_rows = max(2, (len(batch_ids) + n_cols - 1) // n_cols)
    fig, axes = plt.subplots(n_rows, n_cols)
    fig.set_size_inches((5.4992 * 2, 2.5 * n_rows))

    for batch_idx, batch_df in enumerate(
        generate_dfs_grouped_by_batch(df_unit, batch_ids)
    ):
        row_i, col_i = divmod(batch_idx, n_cols)
        ax = axes[row_i, col_i]

        # accuracy per instruction condition (NaN if the condition has no trials)
        accuracies = []
        for reference_type_i in instr_type_list:
            df_factor_i = batch_df[batch_df["mode"] == reference_type_i]
            if len(df_factor_i) == 0:
                accuracies.append(np.nan)
            else:
                accuracies.append((df_factor_i["correct"] == True).mean())

        ax.bar(
            range(len(accuracies)),
            accuracies,
            color=[utf.colors[it] for it in instr_type_list],
        )
        ax.axhline(0.5, color="k", linestyle="--", linewidth=1, label="Chance")

        ax.set_ylim(0, 1)
        ax.set_title(f"Image Set {batch_idx}")
        if col_i == 0:
            ax.set_ylabel("Proportion Correct")
        else:
            # only the first panel of each row carries the y axis
            ax.spines["left"].set_visible(False)
            ax.set_yticks([])
            ax.set_yticklabels([])
        ax.set_xticks([])
        ax.set_xticklabels([])

        # no axis on top and right
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)

    fig.tight_layout()

    if save_fig:
        plot_name = "accuracy_per_batch"
        # use the first version number whose file does not exist yet
        for version in range(100):
            file_name = os.path.join(
                figures_folder, f"{plot_name}_{difficulty_i}_{version}.pdf"
            )
            if not os.path.exists(file_name):
                break
        else:
            print(f"all version numbers are taken, overwriting {file_name}")
        fig.savefig(file_name, bbox_inches="tight")
        print(f"figure saved under {file_name}")

    plt.show()

# Figure 15: Cohen's kappa per batch

In [None]:
# Load expert data
df_expert_baseline = pd.read_csv("data/baselines2/df_main_trials.csv")
df_expert_baseline["expert_baseline"] = True
# Vectorized column operations replace the row-wise `apply` calls: same values, but they
# also work on an empty frame and avoid a Python-level loop over all rows.
df_expert_baseline["mode_extended"] = "e_" + df_expert_baseline["mode"]  # e for expert
# layer and kernel size are handled as strings in this notebook
df_expert_baseline["kernel_size"] = df_expert_baseline["kernel_size"].astype(str)
df_expert_baseline["layer"] = df_expert_baseline["layer"].astype(str)

# extend worker df with new columns (on a copy, the original frame stays untouched)
df_main_not_excluded_copy = df_main_not_excluded.copy()
df_main_not_excluded_copy["expert_baseline"] = False
df_main_not_excluded_copy["mode_extended"] = (
    "w_" + df_main_not_excluded_copy["mode"]  # w for worker
)
df_main_not_excluded_with_expert_baseline = pd.concat(
    (df_expert_baseline, df_main_not_excluded_copy)
).reset_index(drop=True)

# load primary object baseline (optional: skipped with a message if the csv is absent)
primary_object_baseline_csv = "data/baselines2/df_primary_object_baseline.csv"
if os.path.exists(primary_object_baseline_csv):
    df_primary_object_baseline = pd.read_csv(primary_object_baseline_csv)

    def parse_primary_object_baseline(row):
        """Look up the primary-object baseline choice for one trial row.

        The baseline entry is matched on ``batch``, ``layer`` and ``kernel_size``.
        If there is not exactly one match, the affected row is reported; with
        several matches the first one is used, with no match ``NaN`` is returned
        (instead of failing on the empty selection).
        """
        # NOTE(review): `layer` / `kernel_size` are strings in the trial frame, whereas
        # read_csv may have parsed them as integers here — confirm that these
        # comparisons can actually match.
        mask = (
            (df_primary_object_baseline["batch"] == row["batch"])
            & (df_primary_object_baseline["layer"] == row["layer"])
            & (df_primary_object_baseline["kernel_size"] == row["kernel_size"])
        )
        selected_rows = df_primary_object_baseline[mask]
        if len(selected_rows) != 1:
            print(
                "missing information for row:",
                row[["batch", "trial_index", "mode", "task_number"]],
            )
            print()
        if len(selected_rows) == 0:
            return np.nan
        return selected_rows.iloc[0]["primary_object_choice"]

    df_main_not_excluded_with_expert_baseline[
        "primary_object_baseline_choice"
    ] = df_main_not_excluded_with_expert_baseline.apply(
        axis=1, func=parse_primary_object_baseline
    )

    # clean up
    del df_primary_object_baseline
else:
    print(
        "Could not find objects baselines csv and, thus, cannot append this information to the dataframe"
    )

In [None]:
# Per-trial correctness flags of the simple baselines, computed column-wise
# (comparisons involving NaN evaluate to False, as in a row-wise comparison).
df_main_not_excluded_with_expert_baseline["correct_center"] = (
    df_main_not_excluded_with_expert_baseline["max_query_center_distance"]
    > df_main_not_excluded_with_expert_baseline["min_query_center_distance"]
)
df_main_not_excluded_with_expert_baseline["correct_std"] = (
    df_main_not_excluded_with_expert_baseline["max_query_patch_std"]
    < df_main_not_excluded_with_expert_baseline["min_query_patch_std"]
)
# The primary-object choice is only present if the optional baseline csv was found;
# skip this flag instead of failing with a KeyError when it is missing.
if (
    "primary_object_baseline_choice"
    in df_main_not_excluded_with_expert_baseline.columns
):
    df_main_not_excluded_with_expert_baseline["correct_primary"] = (
        df_main_not_excluded_with_expert_baseline["primary_object_baseline_choice"]
        == 1
    )
else:
    print(
        "Column primary_object_baseline_choice is not available; skipping correct_primary"
    )
df_main_not_excluded_with_expert_baseline["correct_saliency"] = (
    df_main_not_excluded_with_expert_baseline["max_query_patch_saliency"]
    < df_main_not_excluded_with_expert_baseline["min_query_patch_saliency"]
)

In [None]:
# Conditions that are compared with each other: worker responses per instruction
# condition (prefix "w_") followed by the baselines (prefix "b_").
worker_modes = [
    "w_" + condition
    for condition in ("optimized", "natural", "mixed", "blur", "none")
]
# "b_primary" is currently disabled
baseline_modes = ["b_center", "b_std", "b_saliency"]
extended_mode_list = worker_modes + baseline_modes

cohens_kappa = utf_helper.get_cohens_kappa_all_conditions_with_each_other(
    df_main_not_excluded_with_expert_baseline,
    extended_mode_list,
    "mode_extended",
)

In [None]:
# Display names of the compared conditions ("b_primary" / "Object" is currently disabled).
extended_mode_label_dict = {
    "w_optimized": "Synthetic",
    "w_natural": "Natural",
    "w_mixed": "Mixed",
    "w_blur": "Blur",
    "w_none": "None",
    "b_center": "Center",
    "b_std": "Variance",
    "b_saliency": "Saliency",
}

# Honour the notebook-level `save_fig` flag (instead of always saving): the figures
# folder is only created when that flag is set.
utf_mturk.sub_plot_cohens_kappa_by_batch(
    cohens_kappa,
    extended_mode_list,
    extended_mode_label_dict,
    figures_folder,
    exp_str,
    save_fig=save_fig,
)

# Figure 16: Relative Activation Differences

In [None]:
# One plot per kernel size occurring in the included main trials.
# The notebook-level `save_fig` flag is honoured (instead of always saving), because the
# figures folder is only created when that flag is set.
for kernel_size_i in sorted(df_main_not_excluded["kernel_size"].unique()):
    utf_mturk.plot_binned_accuracy_vs_relative_activation_difference(
        df_main_not_excluded[df_main_not_excluded["kernel_size"] == kernel_size_i],
        figures_folder,
        save_fig=save_fig,
        fig_name_suffix=f"_kernel_size{kernel_size_i}",
    )

# Figure 17: Exclusion Criteria (Distribution over results)

In [None]:
# Plotted from all responses (included and excluded). The notebook-level `save_fig`
# flag is honoured (instead of always saving), because the figures folder is only
# created when that flag is set.
utf_mturk.plot_exclusion_criteria(
    df_checks, proportion=False, results_folder=figures_folder, save_fig=save_fig
)

In [None]:
# Plotted from all responses (included and excluded). The notebook-level `save_fig`
# flag is honoured (instead of always saving), because the figures folder is only
# created when that flag is set.
utf_mturk.plot_task_postings(
    df_checks, proportion=False, results_folder=figures_folder, save_fig=save_fig
)

# Figure 18: Exclusion Criteria (Distribution over values)

### Included Data

In [None]:
# Detail plots of the individual exclusion criteria for the responses that passed.
# Every helper gets its own copy of the checks and True as second positional argument
# (presumably the "included data" switch — verify in utils_MTurk_figures).
included_detail_plots = (
    utf_mturk.plot_instruction_time_details_extracted,
    utf_mturk.plot_total_response_time_details_extracted,
    utf_mturk.plot_catch_trials_details_ratio_exctracted,
    utf_mturk.plot_row_variability_details_upper_extracted,
)
for plot_details in included_detail_plots:
    plot_details(df_checks_not_excluded.copy(), True, figures_folder, save_fig)

### Excluded Data

In [None]:
# Detail plots of the individual exclusion criteria for the responses that were excluded.
# Every helper gets its own copy of the checks and False as second positional argument
# (presumably the "included data" switch — verify in utils_MTurk_figures).
excluded_detail_plots = (
    utf_mturk.plot_instruction_time_details_extracted,
    utf_mturk.plot_total_response_time_details_extracted,
    utf_mturk.plot_catch_trials_details_ratio_exctracted,
    utf_mturk.plot_row_variability_details_upper_extracted,
)
for plot_details in excluded_detail_plots:
    plot_details(df_checks_excluded.copy(), False, figures_folder, save_fig)

In [None]:
# Responses of the "none" condition are left out of this plot.
# The notebook-level `save_fig` flag is honoured (instead of always saving), because the
# figures folder is only created when that flag is set.
utf_mturk.plot_practice_trials_attempts(
    df_checks[df_checks["mode"] != "none"],
    proportion=False,
    results_folder=figures_folder,
    save_fig=save_fig,
)