This notebook produces figures related to the intervention-prediction experiment shown in the appendix.

# Imports

In [None]:
# to import from mturk folder
import os, sys, inspect
# Resolve the directory this notebook lives in. Inside a Jupyter kernel there
# is no `__file__`, and recent ipykernel versions may compile cells under a
# temporary path, so the file name of the current frame is not a reliable
# anchor there; the kernel's working directory is used instead. When the code
# runs as a plain script, `__file__` points at the script itself.
try:
    currentdir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    currentdir = os.getcwd()
# The "mturk" folder is expected inside the directory two levels above.
mturkdir = os.path.join(os.path.dirname(os.path.dirname(currentdir)), "mturk")
# Put it first so its modules take precedence on import.
sys.path.insert(0, mturkdir)

In [None]:
from mturk import RepeatedTaskResult
import numpy as np
from matplotlib import pyplot as plt
import pickle
from glob import glob
import pandas as pd
import seaborn as sns
import json

In [None]:
import utils_ICLR_figures as ut
import utils_ICLR_figures_helper as ut_helper
import utils_MTurk_figures as ut_mturk

# Parameters

In [None]:
# Folder with the experiment outputs; the loaders below read its "natural" and
# "optimized" subfolders and its "natural.json" file.
results_folder = "data/intervention_experiment/"
# NOTE(review): not referenced in the visible part of this notebook —
# presumably the payment per HIT; confirm unit and use.
mturk_payment_one_HIT = 1.50
# NOTE(review): not referenced in the visible part of this notebook —
# presumably accounts for re-postings caused by excluded responses; confirm.
repetition_factor_due_to_exclusion = 1.2
# When True, the parsed dataframes are written as CSV files into results_folder.
save_csv = False
# Forwarded as the save_fig argument of the plotting helpers.
save_fig = True

# Load data

In [None]:
def load_results(data_folder):
    """Load the pickled experiment results stored in ``data_folder``.

    Every file matching ``result_task_*.pkl`` is unpickled. An unpickled
    object of length one is treated as a one-element container whose single
    entry is added to the output; any other object is added as-is.

    Files are visited in natural (numeric-aware) filename order, so
    ``result_task_2.pkl`` comes before ``result_task_10.pkl``. ``glob`` alone
    returns matches in arbitrary, filesystem-dependent order, which would make
    the position of each result in the returned list (used downstream as the
    task number) differ between machines.

    Args:
        data_folder: Directory containing the ``result_task_*.pkl`` files.

    Returns:
        list: The loaded result objects, ordered by filename. Empty if no
        file matches.
    """
    import re  # local import: only needed for the sort key below

    def natural_key(path):
        # "result_task_10.pkl" -> ["result_task_", 10, ".pkl"]. re.split with
        # a capturing group alternates text and digit runs (text first), so
        # keys of different files stay mutually comparable.
        pieces = re.split(r"(\d+)", os.path.basename(path))
        return [int(piece) if i % 2 else piece for i, piece in enumerate(pieces)]

    result_fns = sorted(
        glob(os.path.join(data_folder, "result_task_*.pkl")), key=natural_key
    )

    all_results = []
    for result_fn in result_fns:
        # NOTE: unpickling can execute arbitrary code; only point this at
        # result files you produced yourself.
        with open(result_fn, "rb") as f:
            result = pickle.load(f)
        if len(result) == 1:
            all_results += result
        else:
            all_results.append(result)

    return all_results

def parse_results(tasks, mode="natural"):
    """Convert a list of RepeatedTaskResult objects to one pandas dataframe.

    For every task only the last entry of ``task.responses`` is used (the
    accepted one); the records under its ``"main_data"`` key become rows.

    Args:
        tasks: Sequence of objects exposing a ``responses`` list whose entries
            are mappings with a ``"main_data"`` key.
        mode: Label written to the ``mode`` column of every row.

    Returns:
        pandas.DataFrame: The rows of all tasks on a fresh 0..n-1 index, with
        the extra columns ``task_number`` (position of the task in ``tasks``)
        and ``mode``.

    Raises:
        ValueError: If ``tasks`` is empty (raised by ``pd.concat``).
    """
    dfs = []
    for i_task, task_data in enumerate(tasks):
        # take the last response only, as this was the accepted one
        response_data = task_data.responses[-1]
        # if you want to look at the demo trials and other raw data, load
        # pd.DataFrame(response_data["raw_data"]) instead
        task_df = pd.DataFrame(response_data["main_data"])
        task_df["task_number"] = i_task
        dfs.append(task_df)

    # `axis` has to be passed by keyword: since pandas 2.0 every argument of
    # pd.concat except `objs` is keyword-only.
    df = pd.concat(dfs, axis=0)

    df["mode"] = mode
    df = df.reset_index().drop("index", axis=1)

    return df

def parse_check_results(tasks, mode="natural"):
    """Flatten the exclusion-check outcome of every raw response into a dataframe.

    One row is produced per entry of ``task.raw_responses``. Each check ``k``
    in the response's ``"check_results"`` contributes two columns:
    ``{k}_result`` (first element of the check entry) and ``{k}_details``
    (second element).

    Args:
        tasks: Iterable of task results offering ``task_id`` and
            ``raw_responses`` attributes as well as positional access; the
            worker id is read from ``task[1][response_idx]`` (presumably the
            list of parsed responses — confirm against RepeatedTaskResult).
        mode: Label written to the ``mode`` column of every row.

    Returns:
        pandas.DataFrame: One row per raw response.
    """
    rows = []
    for task in tasks:
        for response_idx, response in enumerate(task.raw_responses):
            checks = response["check_results"]
            row = dict(
                task_id=task.task_id,
                response_index=response_idx,
                passed_checks=response["passed_checks"],
                worker_id=task[1][response_idx]["worker_id"],
            )
            # all *_result columns first, then all *_details columns
            row.update((f"{name}_result", checks[name][0]) for name in checks)
            row.update((f"{name}_details", checks[name][1]) for name in checks)
            rows.append(row)

    checks_df = pd.DataFrame(rows)
    checks_df["mode"] = mode
    return checks_df

def load_and_parse_all_results(base_folder):
    """Load both conditions from ``base_folder`` and build the analysis frames.

    Results are read from the ``natural`` and ``optimized`` subfolders, parsed
    per condition and stacked (natural first) on a fresh index.

    Args:
        base_folder: Directory containing the two condition subfolders.

    Returns:
        tuple: ``(df, df_checks)`` where ``df`` holds the trial rows plus a
        ``corrected_trial_index`` column (``trial_index`` shifted so that its
        overall minimum is 0) and ``df_checks`` holds the exclusion-check rows.
    """
    raw_by_mode = {
        mode: load_results(os.path.join(base_folder, mode))
        for mode in ("natural", "optimized")
    }

    check_frames = [parse_check_results(raw, mode) for mode, raw in raw_by_mode.items()]
    df_checks = pd.concat(check_frames).reset_index().drop(columns="index")

    trial_frames = [parse_results(raw, mode) for mode, raw in raw_by_mode.items()]
    df = pd.concat(trial_frames).reset_index().drop(columns="index")

    smallest_trial_index = df["trial_index"].min()
    df["corrected_trial_index"] = df["trial_index"] - smallest_trial_index

    return df, df_checks

In [None]:
def load_and_parse_trial_structure(folder):
    """Read ``natural.json`` from ``folder`` and extract per-trial descriptors.

    The file must contain a ``"tasks"`` list. Every task item has an
    ``"index"`` entry; each of its other entries is a list of trials whose
    ``"queries"`` path is decomposed into ``batch``, ``channel``,
    ``kernel_size`` and ``layer`` strings.

    Args:
        folder: Directory containing ``natural.json``.

    Returns:
        dict: ``{task index: {entry name: [descriptor dict per trial]}}``.
    """
    def describe_trial(trial):
        # Each descriptor is the text after the last "_" of one path
        # component, counted from the end of the query path.
        components = trial["queries"].split("/")

        def suffix(position):
            return components[position].split("_")[-1]

        return dict(
            batch=suffix(-1),
            channel=suffix(-3),
            kernel_size=suffix(-4),
            layer=suffix(-5),
        )

    json_path = os.path.join(folder, "natural.json")
    with open(json_path, "r") as f:
        raw_structure = json.load(f)

    structure = {}
    for item in raw_structure["tasks"]:
        per_entry = {}
        for key in item:
            if key == "index":
                continue
            per_entry[key] = [describe_trial(trial) for trial in item[key]]
        structure[item["index"]] = per_entry

    return structure

def append_trial_structure_to_results(df, structure):
    """Return a copy of ``df`` with the stimulus descriptors of every trial.

    For each row the descriptor is looked up as
    ``structure[task_number + 1]["trials"][corrected_trial_index]`` and its
    ``batch``, ``channel``, ``kernel_size`` and ``layer`` values are written
    to columns of the same names.

    Rows are processed by position, so the frame may carry any index (e.g.
    after filtering); the input frame is left untouched.

    Args:
        df: Trial dataframe with ``task_number`` and ``corrected_trial_index``
            columns.
        structure: Mapping as returned by ``load_and_parse_trial_structure``.

    Returns:
        pandas.DataFrame: Deep copy of ``df`` with the four added columns.
    """
    df = df.copy(deep=True)

    fields = ("batch", "channel", "kernel_size", "layer")
    collected = {field: [] for field in fields}

    # Walk the column values positionally instead of looking rows up by the
    # labels 0..n-1, which only exist on a default RangeIndex.
    for task_number, trial_number in zip(df["task_number"], df["corrected_trial_index"]):
        # task_number is 0-based, the structure is keyed by task_number + 1
        info = structure[task_number + 1]["trials"][trial_number]
        for field in fields:
            collected[field].append(info[field])

    for field in fields:
        df[field] = collected[field]

    return df

In [None]:
# Load the trial responses and the exclusion-check outcomes of both conditions.
df, df_checks = load_and_parse_all_results(results_folder)
# Parse the per-trial stimulus descriptors from natural.json in the same folder.
trial_structure = load_and_parse_trial_structure(results_folder)
# Attach batch / channel / kernel_size / layer columns to every trial row.
df = append_trial_structure_to_results(df, trial_structure)

In [None]:
# Split the trials into three disjoint subsets: regular trials, catch trials
# (both outside the demo phase) and demo trials.
df_main = df[df["catch_trial"].eq(False) & df["is_demo"].eq(False)]
df_catch_trials = df[df["catch_trial"].eq(True) & df["is_demo"].eq(False)]
df_demo_trials = df[df["is_demo"].eq(True)]

In [None]:
# Pull the "total_time" entry out of each response's instruction-time details.
# Mapping over the single column avoids materialising every row as a Series.
df_checks["instruction_time_details_extracted"] = df_checks["instruction_time_details"].map(
    lambda details: details["total_time"]
)

In [None]:
# Pull the "total_time" entry out of each response's total-response-time details.
# Mapping over the single column avoids materialising every row as a Series.
df_checks["total_response_time_details_extracted"] = df_checks["total_response_time_details"].map(
    lambda details: details["total_time"]
)

In [None]:
# Pull the "n_upper_row" entry out of each response's row-variability details.
# Mapping over the single column avoids materialising every row as a Series.
# (The doubled "details_details" in the column name is kept as-is: other code
# may refer to the column by this exact name.)
df_checks["row_variability_details_details_upper_extracted"] = df_checks["row_variability_details"].map(
    lambda details: details["n_upper_row"]
)

In [None]:
# Pull the "n_lower_row" entry out of each response's row-variability details.
# Mapping over the single column avoids materialising every row as a Series.
# (The doubled "details_details" in the column name is kept as-is: other code
# may refer to the column by this exact name.)
df_checks["row_variability_details_details_lower_extracted"] = df_checks["row_variability_details"].map(
    lambda details: details["n_lower_row"]
)

In [None]:
# Pull the "ratio" entry out of each response's catch-trial details.
# Mapping over the single column avoids materialising every row as a Series.
df_checks["catch_trials_details_ratio_extracted"] = df_checks["catch_trials_details"].map(
    lambda details: details["ratio"]
)

In [None]:
# Pull the "correctly_answered" entry out of each response's catch-trial details.
# Mapping over the single column avoids materialising every row as a Series.
df_checks["catch_trials_details_correctly_answered_extracted"] = df_checks["catch_trials_details"].map(
    lambda details: details["correctly_answered"]
)

In [None]:
if save_csv:
    # save dataframes to csv
    # df_checks: one row per raw response with its exclusion-check outcomes
    df_checks.to_csv(os.path.join(results_folder, "df_exclusion_criteria.csv"))
    # df: one row per trial, including the appended trial-structure columns
    df.to_csv(os.path.join(results_folder, "df_trials.csv"))

## Analyze Unique Workers

In [None]:
# NOTE(review): df_checks holds one row per raw response (see
# parse_check_results), so this is the number of responses; if a task can
# receive several responses it is not the number of distinct tasks — confirm
# which quantity is meant to be reported.
n_unique_tasks = df_checks.shape[0]
# Distinct worker ids across both conditions.
n_unique_workers = len(df_checks["worker_id"].unique())
print(f"We analyzed {n_unique_tasks} unique tasks")
print(f"We had {n_unique_workers} unique workers")

# Make data compatible with ICLR visualization code

In [None]:
# Work on a deep copy so that the column renames below leave df_main untouched.
df_main_for_ICLR_analysis = df_main.copy(deep=True)

In [None]:
# Map this notebook's column names onto the ones the ICLR code works with.
df_main_for_ICLR_analysis.rename(
    columns={
        "mode": "instr_type",
        "task_number": "subject_id",
        "confidence": "abs_conf_rating",
        "rt": "RT",
    },
    inplace=True,
)

# Plot it!

In [None]:
# Figures go to figures/<name of the results folder>. realpath() drops the
# trailing slash of results_folder; without it basename() would return "".
figures_folder = os.path.join("figures", os.path.basename(os.path.realpath(results_folder)))
# Create the folder if needed; an existing folder is not an error.
os.makedirs(figures_folder, exist_ok=True)

# Figure 19

## Accuracy

In [None]:
# Accuracy figure, drawn by the ICLR plotting helper from the main trials.
# NOTE(review): exp_str presumably tags the output file name and save_fig
# presumably controls writing into figures_folder — confirm in utils_ICLR_figures.
ut.make_plot_synthetic_imgs_are_helpful(
    df_main_for_ICLR_analysis, 
    figures_folder,
    exp_str="full_exp_ks_1",
    save_fig=save_fig
)

## Reaction Time

In [None]:
# Reaction-time figure with conditioned_on_correctness=True
# (presumably restricted to correctly answered trials — confirm in utils_ICLR_figures).
ut.make_plot_natural_are_better_wrt_reaction_time(
    df_main_for_ICLR_analysis, 
    figures_folder,
    exp_str="full_exp_ks_1",
    conditioned_on_correctness=True,
    save_fig=save_fig
)

In [None]:
# Reaction-time figure with neither conditioning flag set, i.e. with the
# helper's default conditioning.
ut.make_plot_natural_are_better_wrt_reaction_time(
    df_main_for_ICLR_analysis, 
    figures_folder,
    exp_str="full_exp_ks_1",
    save_fig=save_fig
)

In [None]:
# Reaction-time figure with conditioned_on_falseness=True
# (presumably restricted to incorrectly answered trials — confirm in utils_ICLR_figures).
ut.make_plot_natural_are_better_wrt_reaction_time(
    df_main_for_ICLR_analysis, 
    figures_folder,
    exp_str="full_exp_ks_1",
    conditioned_on_falseness=True,
    save_fig=save_fig
)

## Confidence

In [None]:
# Confidence figure with conditioned_on_correctness=True
# (presumably restricted to correctly answered trials — confirm in utils_ICLR_figures).
ut.make_plot_natural_are_better_wrt_confidence(
    df_main_for_ICLR_analysis, 
    figures_folder,
    exp_str="full_exp_ks_1",
    conditioned_on_correctness=True,
    save_fig=save_fig
)

In [None]:
# Confidence figure with neither conditioning flag set, i.e. with the
# helper's default conditioning.
ut.make_plot_natural_are_better_wrt_confidence(
    df_main_for_ICLR_analysis, 
    figures_folder,
    exp_str="full_exp_ks_1",
    save_fig=save_fig
)

In [None]:
# Confidence figure with conditioned_on_falseness=True
# (presumably restricted to incorrectly answered trials — confirm in utils_ICLR_figures).
ut.make_plot_natural_are_better_wrt_confidence(
    df_main_for_ICLR_analysis, 
    figures_folder,
    exp_str="full_exp_ks_1",
    conditioned_on_falseness=True,
    save_fig=save_fig
)

# Figure 20: Exclusion Criteria (Distribution over results)

## Postings

In [None]:
# Reload the helper module so that edits made to it while the kernel is
# running are picked up before plotting.
import importlib
importlib.reload(ut_mturk)

# Plot the task postings from the exclusion-check frame, with proportion=False.
ut_mturk.plot_task_postings(
    df_checks,
    proportion=False,
    results_folder=figures_folder,
    # follow the notebook-wide switch like every other plotting cell instead
    # of always saving
    save_fig=save_fig
)

## Analyze Exclusion Criteria

In [None]:
# Reload both helper modules so that edits made to them while the kernel is
# running are picked up before plotting.
import importlib
importlib.reload(ut_mturk)
importlib.reload(ut_helper)

# Plot the exclusion criteria from the exclusion-check frame, with proportion=False.
ut_mturk.plot_exclusion_criteria(
    df_checks,
    proportion=False,
    results_folder=figures_folder,
    # follow the notebook-wide switch like every other plotting cell instead
    # of always saving
    save_fig=save_fig
)

In [None]:
def _plot_pass_fail_bars(n_passed, n_total, title):
    """Show one two-bar chart with the failed and passed proportions."""
    fig, ax = plt.subplots(1)
    rects = ax.bar(np.arange(2), [
        (n_total - n_passed)/n_total,
        n_passed/n_total
    ])
    ut_helper.autolabel_counts(rects, ax)
    ax.set_xticks(np.arange(2))
    ax.set_xticklabels(["Failed", "Passed"])
    ax.set_title(title)
    ax.set_ylabel("Proportion of responses")
    # Fit the layout only once title and labels exist; tight_layout only
    # accounts for the decorations present at the time it is called.
    fig.tight_layout()
    plt.show()


def plot_exclusion_criteria(df_checks):
    """Show the proportion of responses failing / passing the exclusion checks.

    The first figure summarises the overall ``passed_checks`` column; one
    further figure is shown for every column whose name ends in ``_result``.
    Figures are displayed only, not saved.

    Args:
        df_checks: Dataframe with a ``passed_checks`` column and per-criterion
            ``*_result`` columns that can be summed to a number of passes.
    """
    n_total = len(df_checks)
    _plot_pass_fail_bars(df_checks["passed_checks"].sum(), n_total, "All Exclusion Criteria")

    for criterion in [c for c in df_checks.columns if c.endswith("_result")]:
        _plot_pass_fail_bars(
            df_checks[criterion].sum(),
            n_total,
            f"Exclusion Criterion: {criterion.replace('_result','').replace('_', ' ').title()}"
        )

plot_exclusion_criteria(df_checks)

# Figure 21: Exclusion Criteria (Distribution over values)

## Analyze Exclusion Criteria for Included Data

In [None]:
# Responses that passed all exclusion checks (the included data).
df_passed_checks = df_checks[df_checks["passed_checks"] == True]

In [None]:
# Instruction-time values of the responses that passed the checks.
# NOTE(review): the positional True presumably marks the subset as included
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_instruction_time_details_extracted(
    df_passed_checks, 
    True,
    figures_folder, 
    save_fig
)

In [None]:
# Total-response-time values of the responses that passed the checks.
# NOTE(review): the positional True presumably marks the subset as included
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_total_response_time_details_extracted(
    df_passed_checks, 
    True,
    figures_folder, 
    save_fig
)

In [None]:
# Catch-trial ratio values of the responses that passed the checks.
# NOTE(review): the positional True presumably marks the subset as included
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_catch_trials_details_ratio_extracted(
    df_passed_checks, 
    True,
    figures_folder, 
    save_fig
)

In [None]:
# Reload the helper module so that edits made to it while the kernel is
# running are picked up before plotting.
import importlib; importlib.reload(ut_mturk)

# Row-variability values (upper row) of the responses that passed the checks.
# NOTE(review): the positional True presumably marks the subset as included
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_row_variability_details_upper_extracted(
    df_passed_checks, 
    True,
    figures_folder, 
    save_fig
)

## Analyze Exclusion Criteria for Excluded Data

In [None]:
# Responses that failed at least one exclusion check (the excluded data).
df_failed_checks = df_checks[df_checks["passed_checks"] == False]

In [None]:
# Instruction-time values of the responses that failed the checks.
# NOTE(review): the positional False presumably marks the subset as excluded
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_instruction_time_details_extracted(
    df_failed_checks,
    False,
    figures_folder, 
    save_fig
)

In [None]:
# Total-response-time values of the responses that failed the checks.
# NOTE(review): the positional False presumably marks the subset as excluded
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_total_response_time_details_extracted(
    df_failed_checks, 
    False,
    figures_folder, 
    save_fig
)

In [None]:
# Catch-trial ratio values of the responses that failed the checks.
# NOTE(review): the positional False presumably marks the subset as excluded
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_catch_trials_details_ratio_extracted(
    df_failed_checks, 
    False,
    figures_folder, 
    save_fig
)

In [None]:
# Row-variability values (upper row) of the responses that failed the checks.
# NOTE(review): the positional False presumably marks the subset as excluded
# data — confirm against the helper's signature in utils_MTurk_figures.
ut_mturk.plot_row_variability_details_upper_extracted(
    df_failed_checks, 
    False,
    figures_folder, 
    save_fig
)