In [2]:
from src.plotsutils import *
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from src.utils import set_seed
import os


# Base directory of the notebook.
# NOTE(review): `__name__` is a module name, not a path, so dirname() presumably
# yields "" and this resolves to the current working directory -- confirm whether
# os.getcwd() was intended.
wdir = os.path.abspath(os.path.dirname(__name__))
# Folder every figure in this notebook is saved to (relative path).
img_fold = '../figs/'
# Matplotlib style sheet applied below.
style='seaborn-v0_8-whitegrid'
# plt.style.use('ggplot')
# style = "ggplot"
# Only consulted by the 'fivethirtyeight' branch further down.
white_back=False
# general settings
plt.style.use(style)
plt.rc('font', size=20)
plt.rc('legend', fontsize=20)
plt.rc('lines', linewidth=2)
plt.rc('axes', linewidth=2)
plt.rc('axes', edgecolor='k')
plt.rc('xtick.major', width=2)
plt.rc('xtick.major', size=12)
plt.rc('ytick.major', width=2)
plt.rc('ytick.major', size=12)
# Force white backgrounds and no grid; not taken with the style/flag values set above.
if style=='fivethirtyeight' and white_back==True:
    style+="WHITE"
    plt.rcParams['savefig.facecolor']='white'
    plt.rcParams['figure.facecolor'] = 'white'
    plt.rcParams['axes.facecolor'] = 'white'
    plt.rcParams['axes.grid'] = False

# Matplotlib marker used for each method name found in the results' "method" column.
markers_dict = {
        'CC':"s",
     'RS': "o",
                "ASM":"d",
 'DT': "X",
 'LCE': "^",
 'OVA': "H",
 'SP': "D"}

# One seaborn 'colorblind' colour per method, assigned in the key order of markers_dict.
palette = {q:sns.color_palette('colorblind')[i] for i,q in enumerate((markers_dict.keys()))}
seed = 42
# Project helper; which random generators it seeds is not visible here.
set_seed(seed)
# file results
filename = "../results/all_results.csv"
# Fail early with a hint on how to produce the results table.
if not os.path.exists(filename):
    raise FileNotFoundError("No results found. Run train.py and test.py first.")
else:
    df = pd.read_csv(filename)

Random seed 42 has been set.


In [3]:
palette

{'CC': (0.00392156862745098, 0.45098039215686275, 0.6980392156862745),
 'RS': (0.8705882352941177, 0.5607843137254902, 0.0196078431372549),
 'ASM': (0.00784313725490196, 0.6196078431372549, 0.45098039215686275),
 'DT': (0.8352941176470589, 0.3686274509803922, 0.0),
 'LCE': (0.8, 0.47058823529411764, 0.7372549019607844),
 'OVA': (0.792156862745098, 0.5686274509803921, 0.3803921568627451),
 'SP': (0.984313725490196, 0.6862745098039216, 0.8941176470588236)}

In [4]:
# Axis label (matplotlib mathtext allowed) for each metric name accepted by the
# plotting helpers. Raw strings keep the LaTeX backslashes literal; the previous
# non-raw literals contained "\m", which is an invalid escape sequence and
# triggers a warning on recent Python versions. The resulting text is unchanged.
metric_dict = {
    "coef_rob": r"$\hat{\tau}_{\mathtt{RDD}}$",
    "system_acc": "System Accuracy",
    "system_acc_v2": "System Accuracy",
    "ATT": r"$\hat{\tau}_{\mathtt{ATD}}$",
    "classifier_all_acc": "System Accuracy",
    "pv_rob": r"Estimated $\tau$",
    "acc_system": "System Accuracy",
}

def plot_single_classifier(df, classifier, figsize=(20, 12), title="", img_title="", metric="coef_rob", ylim = None):
    """Plot one metric against the coverage cutoff for a single method and save it as a PNG.

    The figure is written to
    ``{img_fold}/{img_title}_{classifier}_allcoeffs_{metric}_oneaxis.png`` and then closed.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table. The columns ``method``, ``data`` and ``target_coverage`` are always
        read, plus the columns required by ``metric`` (listed below).
    classifier : str
        Value of ``df["method"]`` to plot; must also be a key of ``palette`` and
        ``markers_dict``.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    title : str
        Axes title.
    img_title : str
        Prefix of the output file name.
    metric : str
        One of:

        * ``"coef_rob"`` -- error bars from ``coef_rob`` with ``ci_rob_l`` / ``ci_rob_u``;
        * ``"ATT"`` -- error bars from ``ATT`` with ``ci_l_ATT`` / ``ci_u_ATT``, restricted
          to the cutoffs 0, .1, ..., .9;
        * ``"system_acc"`` -- line of ``system_acc`` plus "Only Human" / "Only ML" markers
          taken from ``human_all_acc`` / ``classifier_all_acc``;
        * ``"acc_system"`` -- line of ``acc_system``;
        * ``"system_acc_v2"`` -- ``system_acc`` padded with the human accuracy at 0 and the
          classifier accuracy at 1.
    ylim : tuple, optional
        Y-axis limits; when ``None`` a per-metric default is used.

    Raises
    ------
    ValueError
        If ``metric`` is not one of the values above, or ``df`` has no row for
        ``classifier``. Both cases previously failed later with a NameError/IndexError,
        after a figure had been created and left open.

    Notes
    -----
    ``FormatStrFormatter`` is not imported explicitly in this notebook; presumably it is
    provided by ``from src.plotsutils import *`` -- TODO confirm.
    """
    # Default y-range per supported metric; the keys double as the list of valid metrics.
    default_ylim = {
        "coef_rob": (-.75, .75),
        "ATT": (-.20, .50),
        "system_acc": (.67, 1.01),
        "acc_system": (.67, 1.01),
        "system_acc_v2": (.67, 1.01),
    }
    if metric not in default_ylim:
        raise ValueError("Unsupported metric {!r}; expected one of {}.".format(metric, sorted(default_ylim)))
    db = df[df['method']==classifier].copy()
    if db.empty:
        raise ValueError("No rows found for method {!r}.".format(classifier))

    colour = palette[classifier]
    marker_kw = dict(marker=markers_dict[classifier], markersize=20, markeredgecolor='black', markeredgewidth=0.5)
    fig, ax = plt.subplots(1,1,figsize=figsize)

    if metric in ("coef_rob", "ATT"):
        if metric == "coef_rob":
            # Tick positions only; the rows themselves are not filtered for this metric.
            target_covs = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
            rows, ci_low, ci_high = db, "ci_rob_l", "ci_rob_u"
        else:
            target_covs = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
            rows = db[db["target_coverage"].isin(target_covs)]
            ci_low, ci_high = "ci_l_ATT", "ci_u_ATT"
        # Symmetric error bars: half the width of the confidence interval.
        (_, caps, _) = ax.errorbar(rows["target_coverage"], rows[metric], label=classifier, color=colour,
                                   yerr=(rows[ci_high]-rows[ci_low])/2, alpha=0.9, capsize=10,
                                   elinewidth=4, **marker_kw)
        for cap in caps:
            cap.set_markeredgewidth(8)
        ax.hlines(y=0, linestyles='dashed', colors='black', xmin=0, xmax=1, linewidth=4, label="Null Effect")
    elif metric == "system_acc_v2":
        target_covs = db["target_coverage"].unique()
        xs = sorted([0]+list(target_covs)+[1])
        ys = [db["human_all_acc"].max()]+list(db["system_acc"].values)+[db["classifier_all_acc"].max()]
        ax.plot(xs, ys, label=classifier, color=colour, alpha=0.9, **marker_kw)
    else:
        # "system_acc" and "acc_system": plain line of the metric column.
        target_covs = db["target_coverage"].unique()
        ax.plot(db["target_coverage"], db[metric], label=classifier, color=colour, alpha=0.9, **marker_kw)
        if metric == "system_acc":
            ax.plot([0], [db["human_all_acc"].max()], label="Only Human", color="black", marker="P",
                    markersize=20, markeredgecolor = 'black',markeredgewidth = 0.5)
            ax.plot([1], [db["classifier_all_acc"].max()], label="Only ML", color="black", marker="X",
                    markersize=20, markeredgecolor = 'black',markeredgewidth = 0.5)

    ax.set_ylim(ylim if ylim is not None else default_ylim[metric])
    # Every supported metric uses the same slightly padded x-range.
    ax.set_xlim(-0.04, 1.04)

    dataset = db["data"].unique()[0]
    # NOTE(review): "chestxray2" does not match the "xray-airspace" name used by the
    # plotting cells of this notebook -- confirm which dataset name is intended.
    legend_low = (dataset in ["hatespeech", "cifar10h", "chestxray2"]) and (metric in ["system_acc", "acc_system"])
    ax.legend(loc="lower left" if legend_low else "upper left", fontsize=24, markerscale=1, fancybox=True, shadow=False)
    ax.set_xlabel("Cutoff", fontdict={"fontsize": 28})
    ax.set_ylabel(metric_dict[metric], fontdict={"fontsize": 28})

    # set_yticklabels applies the font size to the tick labels; the label texts passed
    # here are superseded by the '%.2f' formatter installed further down (kept as in the
    # original so the rendered figure does not change).
    labels_y = ["{:.2f}".format(tick).replace("0.", ".") for tick in ax.get_yticks()]
    labels_y = [text if idx % 2 == 0 else "" for idx, text in enumerate(labels_y)]
    ax.set_yticklabels(labels_y, fontsize=24)

    # One x tick per cutoff, labelled k-bar_{cutoff}; every other label is left blank.
    ax.set_xticks(list(target_covs))
    labels_cov = ["{:.2f}".format(cov).replace("0.", ".") for cov in target_covs]
    xticks_lab = ["$\\bar{k}_{" + text + "}$" if idx % 2 == 0 else "" for idx, text in enumerate(labels_cov)]
    ax.set_xticklabels(xticks_lab, fontsize=24)

    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    ax.set_title(title, fontsize=20)
    fig.savefig("{}/{}_{}_allcoeffs_{}_oneaxis.png".format(img_fold, img_title, classifier, metric), bbox_inches='tight', dpi=300)
    plt.close(fig)



In [5]:
# Sort the results table in place by dataset, method and cutoff. The plotting helpers
# draw lines over "target_coverage" in row order, so rows need to be ordered by cutoff.
df.sort_values(by=["data", "method", "target_coverage"], inplace=True)

In [None]:
# Save the coef_rob, ATT and acc_system figures of every method on the "synth" data.
for data in ("synth",):
    data_rows = df[df["data"] == data]
    for classifier in ("RS", "SP", "CC", "DT", "OVA", "LCE", "ASM"):
        for single_metric in ("coef_rob", "ATT", "acc_system"):
            plot_single_classifier(data_rows, classifier, img_title=data, figsize=(8, 6), metric=single_metric)





In [None]:
# Save the coef_rob, ATT and acc_system figures of every method on the "galaxyzoo" data.
for data in ("galaxyzoo",):
    data_rows = df[df["data"] == data]
    for classifier in ("RS", "SP", "CC", "DT", "OVA", "LCE", "ASM"):
        for single_metric in ("coef_rob", "ATT", "acc_system"):
            plot_single_classifier(data_rows, classifier, img_title=data, figsize=(8, 6), metric=single_metric)

In [None]:
# Save the coef_rob, ATT and acc_system figures of every method on the "hatespeech" data.
for data in ("hatespeech",):
    data_rows = df[df["data"] == data]
    for classifier in ("RS", "SP", "CC", "DT", "OVA", "LCE", "ASM"):
        for single_metric in ("coef_rob", "ATT", "acc_system"):
            plot_single_classifier(data_rows, classifier, img_title=data, figsize=(8, 6), metric=single_metric)

In [None]:
# Save the coef_rob, ATT and acc_system figures of every method on the "xray-airspace" data.
for data in ("xray-airspace",):
    data_rows = df[df["data"] == data]
    for classifier in ("RS", "SP", "CC", "DT", "OVA", "LCE", "ASM"):
        for single_metric in ("coef_rob", "ATT", "acc_system"):
            plot_single_classifier(data_rows, classifier, img_title=data, figsize=(8, 6), metric=single_metric)

In [None]:
# Save the coef_rob, ATT and acc_system figures of every method on the "cifar10h" data.
for data in ("cifar10h",):
    data_rows = df[df["data"] == data]
    for classifier in ("RS", "SP", "CC", "DT", "OVA", "LCE", "ASM"):
        for single_metric in ("coef_rob", "ATT", "acc_system"):
            plot_single_classifier(data_rows, classifier, img_title=data, figsize=(8, 6), metric=single_metric)

In [None]:
# Illustration of the calibration step: density of the reject scores of one calibration
# run, split at the `target_cov` quantile of the scores.
df_cal = pd.read_csv("../resultsRAW/synth/GCresultsRAW_cal_synth_RS_42_0.1_0.3_0.2_ep50.csv")
target_cov = .60
# Reject-score value below which a fraction `target_cov` of the calibration scores falls.
threshold = np.quantile(df_cal["rej_score"], target_cov)
# Look the palette up once instead of once per row.
cb_palette = sns.color_palette("colorblind")
ml_color, human_color = cb_palette[0], cb_palette[1]
# Per-row colour (below / not below the threshold). Not used by the rest of this cell;
# kept in case a later cell relies on it.
colors = [ml_color if score < threshold else human_color for score in df_cal["rej_score"]]
x = df_cal["rej_score"].values
fig, ax = plt.subplots(1,1, figsize=(8, 4))
sns.kdeplot(x, fill=False, color='grey', ax=ax)
# Reuse the curve drawn by kdeplot to shade the area under it.
kdeline = ax.lines[0]
xs = kdeline.get_xdata()
ys = kdeline.get_ydata()
ax.set_title('Calibration procedure for target coverage ${c}$', fontsize=20)
ax.vlines(threshold, 0, np.interp(threshold, xs, ys), color='black', ls='--', lw=3)
# Whole area in the first colour, then the part at or above the threshold painted over.
ax.fill_between(xs, 0, ys, facecolor=ml_color, alpha=0.4)
ax.fill_between(xs, 0, ys, where=(threshold <= xs), interpolate=True, facecolor=human_color, alpha=.9)
# Ticks at -1, 0, 1 plus the threshold, which gets the symbolic label instead of its value.
tick_positions = sorted(list(np.linspace(-1,1, 3))+[threshold])
ax.set_xticks(tick_positions)
ax.set_xticklabels(["{:.2f}".format(tick) if tick!=threshold else "$\\bar{k}_{{c}}$" for tick in tick_positions], fontsize=20)
fig.text(0.44, 0.3, 'ML model\n${c}$', ha='center', fontsize=20)
fig.text(0.7, 0.3, 'Human \nExpert\n$(1-{c})$', ha='center', fontsize=20)
ax.set_ylabel("Reject Score Density", fontsize=20)
ax.set_xlabel("Reject Score", fontsize=20)
ax.set_yticklabels(["{:.2f}".format(tick) for tick in ax.get_yticks()], fontsize=20)
# The file name has a single placeholder; the extra `target_cov` argument was never used.
plt.savefig("{}/calibration_example.png".format(img_fold), bbox_inches='tight', dpi=300)
plt.show()

In [None]:
# Schematic "Scenario 1" figure: two synthetic outcome curves over the reject score,
# with the gap between them shaded on the right of the cutoff (drawn at 0).
x_1 = np.linspace(-1, 1, 1000)
y_1 = np.exp(-x_1**2 + x_1*.5 -.5)
y_2 = np.exp(-x_1**2/2)

# Use one mask per side for both x and y. The original sliced x with `<= 0` and y with
# `< 0`, which only has matching lengths because this grid contains no exact zero.
right = x_1 > 0
left = x_1 <= 0
cb_palette = sns.color_palette("colorblind")
first_color, second_color = cb_palette[0], cb_palette[1]

fig, ax = plt.subplots(1,1, figsize=(8, 4))
ax.plot(x_1[right], y_1[right], label="c=.60", color=first_color, lw=3)
ax.plot(x_1[right], y_2[right], label="c=.70", color=second_color, lw=3,)
ax.plot(x_1[left], y_1[left], label="c=.60", color=first_color, lw=3)
ax.plot(x_1[left], y_2[left], label="c=.70", color=second_color, lw=3, ls='--')
# Shade the area between the two curves on the right of the cutoff.
ax.fill_between(x_1[right], y_1[right], y_2[right], color=first_color, alpha=0.4)

ax.set_title('Scenario 1', fontsize=20)
# Ticks every 0.5; the tick at 0 carries the symbolic cutoff label.
tick_positions = sorted(list(np.linspace(-1,1, 5))+[0])
ax.set_xticks(tick_positions)
ax.set_xticklabels(["{:.2f}".format(tick) if tick!=0 else "$\\bar{k}_{{c}}$" for tick in tick_positions], fontsize=20)
# Raw strings avoid the invalid "\m" escape of the previous literals; text is unchanged.
fig.text(0.24, 0.34, r'$\mathbb{E}[T_0|K]$', ha='center', fontsize=20, color=first_color)
fig.text(0.74, 0.8, r'$\mathbb{E}[T_1|K]$', ha='center', fontsize=20, color=second_color)
fig.text(0.60, 0.672, r'$\tau_{i}$', ha='center', fontsize=20)
ax.set_ylabel("Potential Outcome", fontsize=20)
ax.set_xlabel("Reject Score", fontsize=20)
ax.set_yticklabels(["{:.2f}".format(tick) for tick in ax.get_yticks()], fontsize=20)
# No longer passes `target_cov` to format(): it was unused by the template and made this
# cell fail on a fresh kernel unless the calibration cell had been run first.
plt.savefig("{}/scenario1_example.png".format(img_fold), bbox_inches='tight', dpi=300)

In [None]:
# Schematic "Scenario 2" figure: the same two synthetic outcome curves, with the jump
# between them marked by a vertical segment at the cutoff (drawn at 0).
x_1 = np.linspace(-1, 1, 1000)
y_1 = np.exp(-x_1**2 + x_1*.5 -.5)
y_2 = np.exp(-x_1**2/2)

# Use one mask per side for both x and y. The original sliced x with `<= 0` and y with
# `< 0`, which only has matching lengths because this grid contains no exact zero.
right = x_1 > 0
left = x_1 <= 0
cb_palette = sns.color_palette("colorblind")
first_color, second_color = cb_palette[0], cb_palette[1]

fig, ax = plt.subplots(1,1, figsize=(8, 4))
ax.plot(x_1[right], y_1[right], label="c=.60", color=first_color, lw=3, ls='--')
ax.plot(x_1[right], y_2[right], label="c=.70", color=second_color, lw=3,)
ax.plot(x_1[left], y_1[left], label="c=.60", color=first_color, lw=3,)
ax.plot(x_1[left], y_2[left], label="c=.70", color=second_color, lw=3, ls='--')

# Vertical segment joining the two curves at the cutoff.
ax.vlines(0, np.interp(0, x_1, y_1), np.interp(0, x_1, y_2), color='black', ls='-.', lw=3)
ax.set_title('Scenario 2', fontsize=20)
# Ticks every 0.5; the tick at 0 carries the symbolic cutoff label.
tick_positions = sorted(list(np.linspace(-1,1, 5))+[0])
ax.set_xticks(tick_positions)
ax.set_xticklabels(["{:.2f}".format(tick) if tick!=0 else "$\\bar{k}_{{c}}$" for tick in tick_positions], fontsize=20)
# Raw strings avoid the invalid "\m" escape of the previous literals; text is unchanged.
fig.text(0.24, 0.34, r'$\mathbb{E}[T_0|K]$', ha='center', fontsize=20, color=first_color)
fig.text(0.74, 0.8, r'$\mathbb{E}[T_1|K]$', ha='center', fontsize=20, color=second_color)
fig.text(0.542, 0.672, r'$\tau_{\mathtt{RD}}$', ha='center', fontsize=20)
ax.set_ylabel("Potential Outcome", fontsize=20)
ax.set_xlabel("Reject Score", fontsize=20)
ax.set_yticklabels(["{:.2f}".format(tick) for tick in ax.get_yticks()], fontsize=20)
# No longer passes `target_cov` to format(): it was unused by the template and made this
# cell fail on a fresh kernel unless the calibration cell had been run first.
plt.savefig("{}/scenario2_example.png".format(img_fold), bbox_inches='tight', dpi=300)

In [None]:
# Column (left to right) of each real dataset in the multi-panel "best method" figure.
ax_dict = {"hatespeech": 2, "cifar10h": 0, "galaxyzoo": 1, "xray-airspace": 3}

# Method plotted for each real dataset in that figure.
best_classifier = {"hatespeech": "RS", "cifar10h": "CC", "galaxyzoo": "CC", "xray-airspace": "RS"}

def _best_panel_errorbars(axis, rows, value_col, low_col, high_col, method):
    """Draw ``value_col`` against the cutoff with error bars of half the CI width."""
    (_, caps, _) = axis.errorbar(rows["target_coverage"], rows[value_col], label=method, color=palette[method],
                                 yerr=(rows[high_col]-rows[low_col])/2, alpha=0.9, capsize=10, elinewidth=4,
                                 marker=markers_dict[method], markersize=20, markeredgecolor='black',
                                 markeredgewidth=0.5)
    for cap in caps:
        cap.set_markeredgewidth(8)


def _best_panel_cutoff_labels(coverages):
    """Return one mathtext k-bar label per cutoff, blanking every other label."""
    shown = []
    for idx, cov in enumerate(coverages):
        short = "{:.2f}".format(cov).replace("0.", ".")
        shown.append("$\\bar{k}_{" + short + "}$" if idx % 2 == 0 else "")
    return shown


def plot_all_real_best(df, datasets, figsize=(20, 12), title="", metric="coef_rob"):
    """Plot, for each dataset, the results of its ``best_classifier`` method in one figure.

    With ``metric="all"`` the figure has two rows (``ATT`` on top, ``acc_system`` below);
    otherwise a single row showing ``metric``. One column per dataset, ordered by
    ``ax_dict``. The figure is saved to
    ``{img_fold}/{title}_best_allcoeffs_{metric}_oneaxis.png`` and left open.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with the columns ``data``, ``method``, ``target_coverage`` and the
        columns of the plotted metric(s): ``ATT``/``ci_l_ATT``/``ci_u_ATT``,
        ``coef_rob``/``ci_rob_l``/``ci_rob_u``, ``acc_system`` (plus ``human_all_acc`` and
        ``classifier_all_acc`` in single-row ``acc_system`` mode).
    datasets : list of str
        Dataset names; each must be a key of ``ax_dict`` and ``best_classifier``.
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    title : str
        Prefix of the output file name.
    metric : str
        ``"all"``, ``"coef_rob"``, ``"acc_system"`` or ``"ATT"``.

    Raises
    ------
    ValueError
        If ``metric`` is not supported or a dataset has no rows for its best method
        (the latter previously surfaced as an IndexError).

    Notes
    -----
    Changes with respect to the previous implementation:

    * ``title`` and ``metric`` are no longer overwritten inside the loops, so the output
      file is named after the caller's arguments instead of the last dataset/metric drawn;
    * the "Cutoff" caption is drawn once instead of once per panel;
    * columns are assigned by ranking the given datasets by ``ax_dict`` (identical to
      ``ax_dict`` for the full list, and valid for subsets or a single dataset);
    * debugging ``print`` calls were removed.

    ``FormatStrFormatter`` is not imported explicitly in this notebook; presumably it is
    provided by ``from src.plotsutils import *`` -- TODO confirm.
    """
    single_metrics = ("coef_rob", "acc_system", "ATT")
    two_rows = metric == "all"
    if not two_rows and metric not in single_metrics:
        raise ValueError("Unsupported metric {!r}; expected 'all' or one of {}.".format(metric, single_metrics))

    # squeeze=False keeps `ax` two-dimensional even for one row or one dataset.
    if two_rows:
        fig, ax = plt.subplots(2, len(datasets), figsize=figsize, sharey="row", sharex=True, squeeze=False)
    else:
        fig, ax = plt.subplots(1, len(datasets), figsize=figsize, sharey=True, squeeze=False)
    fig.tight_layout()

    col_of = {name: idx for idx, name in enumerate(sorted(datasets, key=lambda name: ax_dict[name]))}
    att_covs = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
    line_kw = dict(alpha=0.9, markersize=20, markeredgecolor='black', markeredgewidth=0.5)

    if two_rows:
        for row, panel_metric in enumerate(["ATT", "acc_system"]):
            for data in datasets:
                method = best_classifier[data]
                db = df[(df["data"]==data) & (df['method']==method)]
                if panel_metric == "ATT":
                    db = db[db["target_coverage"].isin(att_covs)]
                if db.empty:
                    raise ValueError("No rows for dataset {!r} and method {!r}.".format(data, method))
                col = col_of[data]
                panel = ax[row, col]

                if panel_metric == "ATT":
                    _best_panel_errorbars(panel, db, "ATT", "ci_l_ATT", "ci_u_ATT", method)
                    if col == 0:
                        # The row shares its y-axis, so setting it on the first column is enough.
                        panel.set_ylim(-.20, .40)
                    panel.hlines(y=0, linestyles='dashed', colors='black', xmin=-0.04, xmax=1.04, label="Null Effect", linewidth=4)
                    panel.set_xlim(0, 1)
                else:
                    panel.plot(db["target_coverage"], db[panel_metric], label=method, color=palette[method],
                               marker=markers_dict[method], **line_kw)
                    panel.set_ylim(.67, 1.01)
                    panel.set_xlim(-0.04, 1.04)

                legend_low = (data in ["hatespeech", "cifar10h", "xray-airspace"]) and ("acc" in panel_metric)
                panel.legend(loc="lower left" if legend_low else "upper left", fontsize=30, markerscale=1, fancybox=True, shadow=False)
                if col == 0:
                    panel.set_ylabel(metric_dict[panel_metric], fontdict={"fontsize": 36})
                else:
                    panel.set_ylabel("")
                if row == 0:
                    panel.set_title(data, fontsize=40)
                panel.set_xlabel("")

                # set_yticklabels applies the font size; the label texts are superseded by
                # the '%.2f' formatter installed below (kept as in the original).
                labels_y = ["{:.2f}".format(tick).replace("0.", ".") for tick in panel.get_yticks()]
                labels_y = [text if idx % 2 == 0 else "" for idx, text in enumerate(labels_y)]
                panel.set_yticklabels(labels_y, fontsize=28)

                coverages = list(db["target_coverage"].unique())
                panel.set_xticks(coverages)
                panel.set_xticklabels(_best_panel_cutoff_labels(coverages), fontsize=28)
                panel.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
        fig.text(0.5, -0.04, "Cutoff", ha='center', fontsize=36)
    else:
        for data in datasets:
            method = best_classifier[data]
            db = df[(df["data"]==data) & (df['method']==method)]
            if db.empty:
                raise ValueError("No rows for dataset {!r} and method {!r}.".format(data, method))
            col = col_of[data]
            panel = ax[0, col]

            if metric == "acc_system":
                panel.plot(db["target_coverage"], db[metric], label=method, color=palette[method],
                           marker=markers_dict[method], **line_kw)
                panel.set_ylim(.67, 1.01)
                # Reference points: human-only accuracy at cutoff 0, model-only at cutoff 1.
                panel.plot([0], [db["human_all_acc"].max()], label="Only Human", color="black", marker="P",
                           markersize=20, markeredgecolor = 'black',markeredgewidth = 0.5)
                panel.plot([1], [db["classifier_all_acc"].max()], label="Only ML", color="black", marker="X",
                           markersize=20, markeredgecolor = 'black',markeredgewidth = 0.5)
                panel.set_xlim(-0.04, 1.04)
            else:
                if metric == "coef_rob":
                    _best_panel_errorbars(panel, db, "coef_rob", "ci_rob_l", "ci_rob_u", method)
                else:
                    _best_panel_errorbars(panel, db, "ATT", "ci_l_ATT", "ci_u_ATT", method)
                if col == 0:
                    panel.set_ylim(-.75, .75)
                panel.hlines(y=0, linestyles='dashed', colors='black', xmin=0, xmax=1, linewidth=4)
                panel.set_xlim(0, 1)

            # NOTE(review): "chestxray2" does not match the "xray-airspace" name used by
            # ax_dict and by the two-row branch above -- confirm which name is intended.
            legend_low = (data in ["hatespeech", "cifar10h", "chestxray2"]) and ("acc" in metric)
            panel.legend(loc="lower left" if legend_low else "upper left", fontsize=22, markerscale=1, fancybox=True, shadow=False)
            if col == 0:
                panel.set_ylabel(metric_dict[metric], fontdict={"fontsize": 20})
            else:
                panel.set_ylabel("")
            panel.set_title(data, fontsize=40)
            panel.set_xlabel("")
            panel.set_yticklabels(["{:.2f}".format(tick) for tick in panel.get_yticks()], fontsize=20)

            coverages = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9, 1]
            panel.set_xticks(coverages)
            panel.set_xticklabels(_best_panel_cutoff_labels(coverages), fontsize=24)
        fig.text(0.5, -0.04, "Cutoff", ha='center', fontsize=20)

    fig.savefig("{}/{}_{}_allcoeffs_{}_oneaxis.png".format(img_fold, title, "best", metric), bbox_inches='tight', dpi=300)



# Two-row figure (ATT on top, system accuracy below) for the best method of each of the
# four real datasets; missing values are replaced by 0 before plotting.
plot_all_real_best(df.fillna(0), ["hatespeech","cifar10h", "galaxyzoo", "xray-airspace"], figsize=(32, 12), metric="all", title="all_real")

In [None]:
# Show the result rows of the CC method on cifar10h.
df[(df["data"] == "cifar10h") & (df["method"] == "CC")]

In [None]:
# Show the result rows of the RS method on xray-airspace.
df[(df["data"] == "xray-airspace") & (df["method"] == "RS")]

In [None]:
# Show the result rows of the CC method on galaxyzoo.
df[(df["data"] == "galaxyzoo") & (df["method"] == "CC")]

In [None]:
# Show the result rows of the RS method on hatespeech.
df[(df["data"] == "hatespeech") & (df["method"] == "RS")]

In [None]:
# Show the result rows of the ASM method on synth.
df[(df["data"] == "synth") & (df["method"] == "ASM")]

In [None]:
# For each dataset, print the method(s) whose rows reach the highest "acc_system" value.
datasets = ["synth", "hatespeech", "cifar10h", "galaxyzoo", "xray-airspace"]
for dataset in datasets:
    print(dataset)
    in_dataset = df["data"] == dataset
    max_value = df.loc[in_dataset, "acc_system"].max()
    at_maximum = df["acc_system"] == max_value
    print(df.loc[in_dataset & at_maximum, "method"])

In [None]:
def plots_all_classifiers_fixed_cov(df, target_coverage, figsize=(20, 12), title="", img_title="", metric="ATT"):
    """Compare all methods at one fixed cutoff: one point with error bars per method.

    Methods are placed along the x-axis in sorted order. The figure is saved to
    ``{img_fold}/{img_title}_{target_coverage}_allBaselines.png``.

    Parameters
    ----------
    df : pandas.DataFrame
        Results table with the columns ``method``, ``target_coverage`` and either
        ``coef_rob``/``ci_rob_l``/``ci_rob_u`` or ``ATT``/``ci_l_ATT``/``ci_u_ATT``.
    target_coverage : float
        Cutoff whose rows are plotted (matched with ``==``).
    figsize : tuple
        Figure size passed to ``plt.subplots``.
    title : str
        Axes title.
    img_title : str
        Prefix of the output file name.
    metric : str
        ``"coef_rob"`` or ``"ATT"``. For ``"ATT"`` only the cutoffs 0, .1, ..., .9 are kept.

    Returns
    -------
    matplotlib.axes.Axes
        The axes that were drawn on.

    Raises
    ------
    ValueError
        If ``metric`` is neither ``"coef_rob"`` nor ``"ATT"`` (previously an empty figure
        was produced).

    Notes
    -----
    Changes with respect to the previous implementation:

    * tick labels now use the same sorted method order as the plotted points (they used
      the unsorted order of appearance, which only matches when ``df`` is already sorted
      by method);
    * a method without rows is handled for ``"ATT"`` exactly as it already was for
      ``"coef_rob"``: its name is printed and a zero placeholder is drawn.

    ``FormatStrFormatter`` is not imported explicitly in this notebook; presumably it is
    provided by ``from src.plotsutils import *`` -- TODO confirm.
    """
    if metric not in ("coef_rob", "ATT"):
        raise ValueError("Unsupported metric {!r}; expected 'coef_rob' or 'ATT'.".format(metric))
    # Sorted once and reused for both the x positions and the tick labels.
    ordered = sorted(df["method"].unique())
    db = df[df['target_coverage']==target_coverage].copy()
    fig, ax = plt.subplots(1,1,figsize=figsize)
    att_covs = [0, .1, .2, .3, .4, .5, .6, .7, .8, .9]
    bar_kw = dict(alpha=0.9, capsize=10, elinewidth=4, markersize=20, markeredgecolor='black', markeredgewidth=0.5)

    for i, c in enumerate(ordered):
        db_c = db[db["method"] == c]
        if metric == "ATT":
            db_c = db_c[db_c["target_coverage"].isin(att_covs)]
        if db_c.empty:
            # No estimate for this method at this cutoff: report it and draw a placeholder.
            print(c)
            (_, caps, _) = ax.errorbar(i, 0, label=c, color=palette[c], yerr=0, marker=markers_dict[c], **bar_kw)
        elif metric == "coef_rob":
            (_, caps, _) = ax.errorbar(i, db_c["coef_rob"], label=c, color=palette[c],
                                       yerr=(db_c["ci_rob_u"]-db_c["ci_rob_l"])/2, fmt='o',
                                       marker=markers_dict[c], **bar_kw)
        else:
            (_, caps, _) = ax.errorbar(i, db_c["ATT"], label=c, color=palette[c],
                                       yerr=(db_c["ci_u_ATT"]-db_c["ci_l_ATT"])/2,
                                       marker=markers_dict[c], **bar_kw)
        for cap in caps:
            cap.set_markeredgewidth(8)

    ax.set_ylim(-.50, .50)
    # x-range covering one slot per method, with a small margin on the left.
    limits_fig = (-.2, len(ordered))
    ax.hlines(y=0, linestyles='dashed', colors='black', xmin=limits_fig[0], xmax=limits_fig[1], linewidth=4, label="")
    ax.legend(loc="lower center", fontsize=24, markerscale=1, fancybox=True, shadow=False, ncols=3)
    ax.set_xlabel("Baselines", fontdict={"fontsize": 28})
    ax.set_ylabel(metric_dict[metric], fontdict={"fontsize": 28})
    # Applies the font size; the label texts come from the '%.2f' formatter set below.
    ax.set_yticklabels(["{:.2f}".format(tick) for tick in ax.get_yticks()], fontsize=20)
    ax.set_xticks(range(len(ordered)))
    ax.set_xticklabels(["{}".format(name) for name in ordered], fontsize=20, rotation=-45)
    ax.yaxis.set_major_formatter(FormatStrFormatter('%.2f'))
    ax.set_xlim(limits_fig[0], limits_fig[1])
    ax.set_title(title, fontsize=20)
    fig.savefig("{}/{}_{}_allBaselines.png".format(img_fold, img_title, target_coverage), bbox_inches='tight', dpi=300)
    return ax

In [None]:
# Compare the ATT estimates of all methods on the "synth" data at cutoff .9.
synth_results = df[df["data"] == "synth"]
plots_all_classifiers_fixed_cov(synth_results, .9, figsize=(8, 6), img_title="synth", metric="ATT")

## appendix Plot

In [None]:
# Method shown for each dataset in the appendix robustness-check figure.
dict_best = {"synth": "ASM", "hatespeech": "RS", "cifar10h": "CC", "galaxyzoo": "CC", "xray-airspace": "RS"}

# Column (left to right) of each dataset in that figure.
dict_cols = {"synth": 0, "hatespeech": 3, "cifar10h": 1, "galaxyzoo": 2, "xray-airspace": 4}


def plot_all_best_checks_placebo(ylim=None):
    """Draw the 4x5 grid of robustness checks for the best method of each dataset.

    Columns are the datasets (placed by ``dict_cols``), each showing its ``dict_best``
    method. Rows plot ``coef_rob`` with error bars of half the ``ci_rob_l``/``ci_rob_u``
    interval, read from four result files:

    0. ``../results/all_results.csv``
    1. ``../results/all_resultsPLACEBOCUTOFFS_LOW.csv``
    2. ``../results/all_resultsPLACEBOCUTOFFS_HIGH.csv``
    3. ``../results/all_resultsPLACEBO.csv``

    Missing values are replaced by 0 and only the cutoffs .1, ..., .9 are kept. The figure
    is saved to ``{img_fold}/all_best_checks_placebo.png`` and shown.

    Parameters
    ----------
    ylim : tuple, optional
        Y-axis limits. All panels share the y-axis, so the limits are set on the first
        column only.

    Notes
    -----
    Changes with respect to the previous implementation: the four copy-pasted row blocks
    are now one loop; x ticks are set on a fixed panel instead of one selected through
    the leftover loop variable; LaTeX labels are raw strings (the old literals contained
    the invalid escapes ``\\h`` and ``\\m``); and the "cutoff higher" label gained the
    leading space the other row labels already had.
    """
    # One source file per row of the grid, top to bottom.
    row_files = [
        "../results/all_results.csv",
        "../results/all_resultsPLACEBOCUTOFFS_LOW.csv",
        "../results/all_resultsPLACEBOCUTOFFS_HIGH.csv",
        "../results/all_resultsPLACEBO.csv",
    ]
    target_coverages = [.1, .2, .3, .4, .5, .6, .7, .8, .9]
    row_tables = []
    for path in row_files:
        table = pd.read_csv(path).fillna(0)
        row_tables.append(table[table["target_coverage"].isin(target_coverages)])

    fig, ax = plt.subplots(4,5, figsize=(28, 28), sharey=True, sharex=True)
    fig.tight_layout()

    for data in ["synth", "hatespeech", "cifar10h", "galaxyzoo", "xray-airspace"]:
        method = dict_best[data]
        col = dict_cols[data]
        for row, table in enumerate(row_tables):
            rows = table[(table["data"]==data) & (table["method"] == method)]
            panel = ax[row, col]
            (_, caps, _) = panel.errorbar(rows["target_coverage"], rows["coef_rob"], label=method, color=palette[method],
                                          yerr=(rows["ci_rob_u"]-rows["ci_rob_l"])/2, alpha=0.9, capsize=10,
                                          elinewidth=4, marker=markers_dict[method], markersize=20,
                                          markeredgecolor='black', markeredgewidth=0.5)
            for cap in caps:
                cap.set_markeredgewidth(8)
            panel.hlines(y=0, linestyles='dashed', colors='black', xmin=0, xmax=1, linewidth=4, label="Null Effect")
            if ylim is not None and col == 0:
                panel.set_ylim(ylim)
            if row == 0:
                panel.set_title(data, fontsize=36)
                panel.set_xlabel("")

    fig.text(0.5, -0.01, "Cutoff", ha='center', fontsize=36)
    # The x-axis is shared, so setting the tick positions on one panel applies to all.
    ax[3, 0].set_xticks(target_coverages)
    xticks = ["$"+"{:.2f}".format(cov).replace("0.",".")+"$" for cov in target_coverages]
    xticks = [text if idx % 2 == 0 else "" for idx, text in enumerate(xticks)]
    for col in range(5):
        ax[3, col].set_xticklabels(xticks, fontsize=28)

    # Legends on the top row only; the location differs per column.
    legend_locs = ["upper center", "lower center", "lower center", "lower center", "upper center"]
    for col, loc in enumerate(legend_locs):
        ax[0, col].legend(loc=loc, fontsize=32, markerscale=1, fancybox=True, shadow=False)

    estimate = r"$\hat{\tau}_{\mathtt{RD}}"
    ax[0,0].set_ylabel(estimate + "$", fontsize=32)
    ax[1,0].set_ylabel(estimate + "-$" + " cutoff lower", fontsize=32)
    ax[2,0].set_ylabel(estimate + "-$" + " cutoff higher", fontsize=32)
    ax[3,0].set_ylabel(estimate + "-$" + " placebo outcome", fontsize=32)
    fig.savefig("{}/all_best_checks_placebo.png".format(img_fold), bbox_inches='tight', dpi=300)
    plt.show()



In [None]:
# Draw and save the robustness-check grid with automatic y-limits.
plot_all_best_checks_placebo()