This notebook creates the plots for the main paper.

# Imports

In [None]:
# to import from mturk folder
import os, sys, inspect

In [None]:
# Resolve the notebook's directory from the working directory (Jupyter starts the
# kernel in the notebook's folder by default). inspect.getfile(inspect.currentframe())
# is not reliable inside a notebook: it returns the pseudo/temporary filename of the
# cell (e.g. a file in a temp directory with ipykernel >= 6), so the path derived
# from it may not point at the repository at all.
currentdir = os.getcwd()
# the mturk folder is expected two directory levels above this notebook
mturkdir = os.path.join(os.path.dirname(os.path.dirname(currentdir)), "mturk")
# keep the cell idempotent: re-running it must not stack duplicate sys.path entries
if mturkdir not in sys.path:
    sys.path.insert(0, mturkdir)

In [None]:
from mturk import RepeatedTaskResult
import numpy as np
from matplotlib import pyplot as plt
import pickle
from glob import glob
import pandas as pd
import seaborn as sns
import json

In [None]:
import utils_figures as utf
import utils_figures_helper as utf_helper
import utils_MTurk_figures as utf_mturk
import utils_data as utd

# Parameters

In [None]:
# ---- input data ----
# folder with the pkl files written by the experiment's code
raw_results_folder = "data/counterfactual_experiment"

# ---- switches ----
save_csv = True
include_baselines = True
# When True, main-trial rows matching an entry of duplicate_tasks.csv are marked as
# excluded. Keep at False for data that contains no duplicate participants.
ignore_duplicate_participants = False

# ---- figures ----
save_fig = False
# sub-folder of ./figures/ created for the resulting figures
exp_str = "counterfactual_experiment"
instr_type_list = [
    "optimized",
    "natural",
    "mixed",
    "blur",
    "none",
]
branches_labels_list = [
    "3x3",
    "pool",
]
kernel_size_list = [
    "1",
    "3",
]

# ---- payment ----
mturk_payment_one_HIT = 2.34
mturk_payment_one_HIT_none = 0.84
repetition_factor_due_to_exclusion = 1.35
expected_distinct_workers = 50

In [None]:
# display name used in the figures for each instruction type
instruction_labels = dict(
    [
        ("optimized", "Synthetic"),
        ("natural", "Natural"),
        ("mixed", "Mixed"),
        ("none", "None"),
        ("blur", "Blur"),
    ]
)
# display names in the same order as instr_type_list
labels = list(map(instruction_labels.__getitem__, instr_type_list))

In [None]:
# create ./figures/<exp_str> only when figures are going to be written to disk
if save_fig:
    os.makedirs(os.path.join("figures", exp_str), exist_ok=True)

# Load data & preprocess

In [None]:
# Prerequisite: the `calculate_relative_activation_difference.py` script must have been
# run for all json configurations (if not, run it first). It adds the query activation
# information to the structure; its output files must be saved under the filenames
# listed below.

# structure file per instruction type, once for each value of include_baselines
_structure_json_maps = {
    True: {
        "natural": "natural_with_baselines.json",
        "optimized": "optimized_with_baselines.json",
        "mixed": "mixed_with_baselines.json",
        "blur": "natural_blur_with_baselines.json",
        "none": "natural_with_baselines.json",
    },
    False: {
        "natural": "natural.json",
        "optimized": "optimized.json",
        "mixed": "mixed.json",
        "blur": "natural_blur.json",
        "none": "natural.json",
    },
}
structure_json_map = _structure_json_maps[bool(include_baselines)]

# one structure file per instruction type, in the order of instr_type_list
_structure_files = [structure_json_map[instr_type] for instr_type in instr_type_list]
_parsed_structures = utd.load_and_parse_trial_structure(
    raw_results_folder, _structure_files
)
# key the parsed structures by instruction type
trial_structures = dict(zip(instr_type_list, _parsed_structures))

df, df_checks, df_feedback = utd.load_and_parse_all_results(
    raw_results_folder, instr_type_list
)

Add a column to the result df indicating whether the row belongs to an excluded or included response

In [None]:
def get_map_excluded_responses(column_name="passed_checks"):
    """Build a row mapper that tells whether a trial row belongs to an excluded response.

    Args:
        column_name: boolean column of the module-level ``df_checks`` frame to read.

    Returns:
        A function ``row -> bool`` that looks up the ``df_checks`` entry with the same
        ``task_id`` and ``response_index`` as ``row`` and returns the negation of its
        ``column_name`` value.
    """

    def map_excluded_responses(row):
        same_task = df_checks["task_id"] == row["task_id"]
        same_response = df_checks["response_index"] == row["response_index"]
        # .item() demands exactly one matching entry and raises otherwise
        check_value = df_checks.loc[same_task & same_response, column_name].item()
        return not check_value

    return map_excluded_responses


# True for every trial row whose response has passed_checks == False in df_checks.
# NOTE(review): performs one df_checks lookup per row of df, which is slow on large data.
df["excluded_response"] = df.apply(get_map_excluded_responses("passed_checks"), axis=1)

Create a unique column based on task id and response id (unique within each task)

In [None]:
# Both frames are replaced by the helper's outputs.
# NOTE(review): presumably adds a combined task/response id column to each frame — confirm in utils_data.
df, df_checks = utd.add_task_response_id(df, df_checks)

In [None]:
def _select_trials(mask):
    """Return the rows of df selected by mask with a fresh index.

    The previous index is materialised as an "index" column by reset_index() and
    then dropped again.
    """
    return df[mask].reset_index().drop(columns="index")


_not_demo = df["is_demo"] == False

# regular trials: neither catch trial nor demo
df_main = _select_trials((df["catch_trial"] == False) & _not_demo)
# catch trials outside of the demo phase
df_catch_trials = _select_trials((df["catch_trial"] == True) & _not_demo)
# all demo trials
df_demo_trials = _select_trials(df["is_demo"] == True)

Append structure information such as layer, kernel size, etc. to the dataframe

In [None]:
# Attach the parsed trial-structure information to the main and catch-trial frames.
# df_demo_trials is not extended here.
df_main = utd.append_trial_structure_to_results(df_main, trial_structures)
df_catch_trials = utd.append_trial_structure_to_results(
    df_catch_trials, trial_structures
)

In [None]:
if ignore_duplicate_participants:
    _duplicates_csv = os.path.join(raw_results_folder, "duplicate_tasks.csv")
    df_duplicate_tasks = pd.read_csv(_duplicates_csv)

    def _excluded_or_duplicate(row):
        """Return True if the row is already excluded or listed as a duplicate task.

        A row counts as duplicate when df_duplicate_tasks has at least one entry with
        the same "mode" and "task_number".
        """
        if row["excluded_response"]:
            return True
        same_mode = df_duplicate_tasks["mode"] == row["mode"]
        same_task_number = df_duplicate_tasks["task_number"] == row["task_number"]
        return len(df_duplicate_tasks[same_mode & same_task_number]) > 0

    # only df_main is updated; the catch and demo trial frames keep their flags
    df_main["excluded_response"] = df_main.apply(_excluded_or_duplicate, axis=1)

Split the data into trials that belong to excluded responses and trials whose responses passed the exclusion criteria

In [None]:
def _split_by_exclusion(frame):
    """Return (excluded rows, non-excluded rows) of frame using its excluded_response column."""
    excluded_mask = frame["excluded_response"]
    return frame[excluded_mask], frame[~excluded_mask]


df_main_excluded, df_main_not_excluded = _split_by_exclusion(df_main)
df_catch_trials_excluded, df_catch_trials_not_excluded = _split_by_exclusion(
    df_catch_trials
)
df_demo_trials_excluded, df_demo_trials_not_excluded = _split_by_exclusion(
    df_demo_trials
)

Calculate how often the demo trials had to be repeated

In [None]:
# All demo trials are passed in (excluded and non-excluded alike); df_checks is replaced
# by the helper's result.
# NOTE(review): presumably stores the number of demo-trial repetitions per response — confirm in utils_data.
df_checks = utd.checks_add_demo_trial_repetitions(df_demo_trials, df_checks)

In [None]:
# Both frames are replaced by the helper's outputs. The frames sliced from df earlier
# (df_main, df_catch_trials, df_demo_trials) are not refreshed by this step.
# NOTE(review): what process_checks computes is defined in utils_data — not visible here.
df, df_checks = utd.process_checks(df, df_checks)

In [None]:
# Built from all catch trials (not the pre-filtered ones) together with df_checks.
# NOTE(review): judging by the names, exclusion is re-evaluated without the catch-trial
# criterion — TODO confirm in utils_data.
df_catch_trials_not_excluded_ignoring_catch_trials = utd.get_catch_trials_as_main_data(
    df_catch_trials, df_checks
)

In [None]:
# split the per-response checks by whether the response passed them
_passed_checks = df_checks["passed_checks"]
df_checks_not_excluded = df_checks[_passed_checks]
df_checks_excluded = df_checks[~_passed_checks]

In [None]:
if save_csv:
    # write the processed frames next to the raw results
    _csv_exports = (
        (df_checks, "df_exclusion_criteria.csv"),
        (df, "df_trials.csv"),
    )
    for _frame, _csv_name in _csv_exports:
        _frame.to_csv(os.path.join(raw_results_folder, _csv_name))

In [None]:
# figures go to ./figures/<name of the resolved raw results folder>
_results_dirname = os.path.basename(os.path.realpath(raw_results_folder))
figures_folder = os.path.join("figures", _results_dirname)

if save_fig:
    os.makedirs(figures_folder, exist_ok=True)
    print("Saving results to", figures_folder)

# Figure 1C

In [None]:
# baseline trials from data/baselines, flagged with expert_baseline=True
df_expert_baseline = pd.read_csv("data/baselines/df_main_trials.csv").assign(
    expert_baseline=True
)
# non-excluded worker trials, flagged with expert_baseline=False (new frame, the
# original df_main_not_excluded stays untouched)
df_main_not_excluded_copy = df_main_not_excluded.assign(expert_baseline=False)
# stack both sources and renumber the rows
df_main_not_excluded_with_expert_baseline = pd.concat(
    [df_expert_baseline, df_main_not_excluded_copy], ignore_index=True
)

# Figure 1 variant: three conditions only, experts not included
utf.make_plot_workers_understood_task(
    df_main_not_excluded_with_expert_baseline,
    figures_folder,
    exp_str,
    ["optimized", "natural", "none"],
    ["Synthetic", "Natural", "None"],
    fig_1=True,
    include_experts=False,
    save_fig=save_fig,
)

# Figure 3A: Performance

In [None]:
# performance plot for every instruction type in instr_type_list (fig_1=False)
utf.make_plot_workers_understood_task(
    df_main_not_excluded_with_expert_baseline,
    figures_folder,
    exp_str,
    instr_type_list,
    labels,
    fig_1=False,
    save_fig=save_fig
)


# Drop the combined frames once the plot is made.
# NOTE: because of this del, re-running this cell raises NameError until the frames
# are built again.
del df_main_not_excluded_with_expert_baseline, df_main_not_excluded_copy

# Figure 3B: Reaction Time

In [None]:
# reaction-time plot on the non-excluded main trials, one entry per instruction type
utf.make_plot_natural_are_better_wrt_reaction_time(
    df_main_not_excluded,
    results_folder=figures_folder,
    save_fig=save_fig,
    instr_type_list=instr_type_list,
    labels=labels
)

# Figure 4A: Baseline Accuracies

In [None]:
# Load expert data
df_expert_baseline = pd.read_csv("data/baselines/df_main_trials.csv")
df_expert_baseline["expert_baseline"] = True
df_expert_baseline["mode_extended"] = "e_" + df_expert_baseline["mode"]  # e for expert
# kernel_size and layer are stored as strings for the expert rows
df_expert_baseline["kernel_size"] = df_expert_baseline["kernel_size"].astype(str)
df_expert_baseline["layer"] = df_expert_baseline["layer"].astype(str)

# extend worker df with new columns (on a copy, df_main_not_excluded stays untouched)
df_main_not_excluded_copy = df_main_not_excluded.copy()
df_main_not_excluded_copy["expert_baseline"] = False
df_main_not_excluded_copy["mode_extended"] = (
    "w_" + df_main_not_excluded_copy["mode"]  # w for worker
)
df_main_not_excluded_with_expert_baseline = pd.concat(
    (df_expert_baseline, df_main_not_excluded_copy)
).reset_index(drop=True)

# load primary object baseline
# One path is used for both the existence check and the read. Previously the check
# looked in data/baselines/ while the read used data/baselines2/, so a present file
# could still fail to load.
primary_object_baseline_csv = "data/baselines/df_primary_object_baseline.csv"
if os.path.exists(primary_object_baseline_csv):
    df_primary_object_baseline = pd.read_csv(primary_object_baseline_csv)

    def parse_primary_object_baseline(row):
        """Return the primary_object_choice of the baseline entry matching the row.

        Entries are matched on "batch", "layer" and "kernel_size". A message is
        printed when the match is not unique; the first match is returned, and an
        IndexError is raised when there is no match at all.
        """
        # NOTE(review): layer/kernel_size were cast to str for the expert rows above,
        # while the baseline csv columns are compared as read — confirm the dtypes agree.
        mask = (
            (df_primary_object_baseline["batch"] == row["batch"])
            & (df_primary_object_baseline["layer"] == row["layer"])
            & (df_primary_object_baseline["kernel_size"] == row["kernel_size"])
        )
        selected_rows = df_primary_object_baseline[mask]
        if len(selected_rows) != 1:
            print(
                "missing information for row:",
                row[["batch", "trial_index", "mode", "task_number"]],
            )
            print()
        return selected_rows.iloc[0]["primary_object_choice"]

    df_main_not_excluded_with_expert_baseline[
        "primary_object_baseline_choice"
    ] = df_main_not_excluded_with_expert_baseline.apply(
        parse_primary_object_baseline, axis=1
    )

    # clean up
    del df_primary_object_baseline
else:
    print(
        "Could not find objects baselines csv and, thus, cannot append this information to the dataframe"
    )

In [None]:
# Alias for brevity: the columns below are added to the combined frame itself.
_trials = df_main_not_excluded_with_expert_baseline

# Each correct_* flag is a plain column-wise comparison (vectorised rather than
# evaluated row by row).
_trials["correct_center"] = (
    _trials["max_query_center_distance"] > _trials["min_query_center_distance"]
)
_trials["correct_std"] = (
    _trials["max_query_patch_std"] < _trials["min_query_patch_std"]
)
# primary_object_baseline_choice only exists when the primary object baseline csv
# was found while loading the data, so guard against a KeyError here.
if "primary_object_baseline_choice" in _trials.columns:
    _trials["correct_primary"] = _trials["primary_object_baseline_choice"] == 1
else:
    print(
        "Column primary_object_baseline_choice is missing; correct_primary is not computed"
    )
_trials["correct_saliency"] = (
    _trials["max_query_patch_saliency"] < _trials["min_query_patch_saliency"]
)

In [None]:
# only the rows coming from workers (expert_baseline is False)
_worker_rows = df_main_not_excluded_with_expert_baseline[
    ~df_main_not_excluded_with_expert_baseline["expert_baseline"]
]


def _worker_accuracy(column):
    """Mean of a boolean correct_* column over the worker rows."""
    return _worker_rows[column].mean()


baseline_accuracies = {
    "Center": _worker_accuracy("correct_center"),
    "Variance": _worker_accuracy("correct_std"),
    # NOTE(review): hard-coded value, not derived in this notebook — confirm its source
    "Object": 0.6344107407407408,
    "Saliency": _worker_accuracy("correct_saliency"),
}

# NOTE(review): hard-coded as well; only the "Object" baseline has an error entry
baseline_sems = {"Object": 0.006435863163504224}

utf.plot_baseline_accuracy(
    baseline_accuracies,
    sems=baseline_sems,
    results_folder=figures_folder,
    save_fig=save_fig,
    label_order=["Center", "Object", "Variance", "Saliency"],
)

# Figure 4 B and C: Cohen's Kappa

In [None]:
# worker conditions ("w_" prefix) followed by the baselines ("b_" prefix)
extended_mode_list = [
    "w_optimized", "w_natural", "w_mixed", "w_blur", "w_none",
    "b_center", "b_std", "b_saliency",
]

# pairwise Cohen's kappa between all listed conditions
cohens_kappa = utf_helper.get_cohens_kappa_all_conditions_with_each_other(
    df_main_not_excluded_with_expert_baseline,
    extended_mode_list,
    "mode_extended",
)

# arrange the pairwise results as matrices (values, std, sem)
_kappa_matrices = utf_helper.get_cohens_kappa_matrix_all_conditions_with_each_other(
    extended_mode_list, cohens_kappa
)
cohens_kappa_matrix, cohens_kappa_std_matrix, cohens_kappa_sem_matrix = _kappa_matrices

In [None]:
# conditions shown in the matrix and their tick labels (same order in both lists)
extended_mode_list = [
    "w_optimized", "w_natural", "w_mixed", "w_blur", "w_none",
    "b_center", "b_std", "b_saliency",
]
extended_mode_label_list = [
    "Synthetic", "Natural", "Mixed", "Blur", "None",
    "Center\nBaseline", "Variance\nBaseline", "Saliency\nBaseline",
]

# kappa and two times its sem, expressed in percent as integers
# NOTE(review): astype(int) truncates towards zero after rounding to 2 decimals
# (e.g. 45.99 -> 45) — confirm that truncation rather than rounding is intended.
_kappa_percent = np.round(cohens_kappa_matrix * 100, 2).astype(int)
_kappa_two_sem_percent = np.round(2 * cohens_kappa_sem_matrix * 100, 2).astype(int)

utf_mturk.plot_worker_baseline_consistency_matrix(
    _kappa_percent,
    _kappa_two_sem_percent,
    extended_mode_list,
    extended_mode_label_list,
    figures_folder,
    save_fig=save_fig,
    vmin=cohens_kappa_matrix.min() * 100,
    vmax=cohens_kappa_matrix.max() * 100,
)

In [None]:
# subset of conditions shown in the reduced matrix and their tick labels
# (extended_mode_list is rebound to this shorter list from here on)
extended_mode_list = [
    "w_optimized",
    "w_natural",
    "b_saliency",
]
extended_mode_label_list = [
    "Synthetic",
    "Natural",
    "Saliency\nBaseline",
]

_kappa_submatrices = utf_helper.get_cohens_kappa_matrix_all_conditions_with_each_other(
    extended_mode_list, cohens_kappa
)
(
    cohens_kappa_submatrix,
    cohens_kappa_std_submatrix,
    cohens_kappa_sem_submatrix,
) = _kappa_submatrices

# NOTE(review): astype(int) truncates towards zero after rounding to 2 decimals —
# confirm that truncation rather than rounding is intended.
_kappa_sub_percent = np.round(cohens_kappa_submatrix * 100, 2).astype(int)
_kappa_sub_two_sem_percent = np.round(
    2 * cohens_kappa_sem_submatrix * 100, 2
).astype(int)

# the colour range is taken from the full matrix, not from the sub-matrix
utf_mturk.plot_worker_baseline_consistency_matrix(
    _kappa_sub_percent,
    _kappa_sub_two_sem_percent,
    extended_mode_list,
    extended_mode_label_list,
    figures_folder,
    save_fig=save_fig,
    vmin=cohens_kappa_matrix.min() * 100,
    vmax=cohens_kappa_matrix.max() * 100,
)

# Figure 5A, B: Performance by Unit

In [None]:
# One accuracy-per-layer plot per kernel size. The kernel sizes are sorted so the
# panels are produced in a stable order; the order returned by .unique() alone follows
# the row order of the data. The Figure 7 loop iterates the same way.
for kernel_size_i in sorted(df_main_not_excluded["kernel_size"].unique()):
    print(f"kernel_size {kernel_size_i}")
    utf_mturk.plot_accuracy_per_layer(
        df_main_not_excluded[df_main_not_excluded["kernel_size"] == kernel_size_i],
        results_folder=figures_folder,
        save_fig=save_fig,
        instr_type_list=instr_type_list,
        title_prefix=f"For kernel size {kernel_size_i}: ",
        legend=False,
    )

    # include error bars by setting show_sem=True

# Figure 5C
### Stop using the catch trials as an exclusion criterion and plot the resulting unbiased performance on these trials

In [None]:
# accuracy per layer on the catch trials that were selected without applying the
# catch-trial criterion (all kernel sizes together, no legend)
utf_mturk.plot_accuracy_per_layer(
    df_catch_trials_not_excluded_ignoring_catch_trials,
    results_folder=figures_folder,
    save_fig=save_fig,
    instr_type_list=instr_type_list,
    legend=False,
)

# Figure 7: Relative Activation Difference

In [None]:
# one plot per kernel size, in ascending order
_kernel_sizes_sorted = sorted(df_main_not_excluded["kernel_size"].unique())
for kernel_size_i in _kernel_sizes_sorted:
    _is_current_kernel_size = df_main_not_excluded["kernel_size"] == kernel_size_i
    utf_mturk.plot_accuracy_vs_relative_activation_difference(
        df_main_not_excluded[_is_current_kernel_size],
        results_folder=figures_folder,
        save_fig=save_fig,
        # the kernel size is appended to the figure name
        fig_name_suffix=f"_kernel_size{kernel_size_i}",
    )