In [1]:
import itertools
import json
import re
import sys
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm 

import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.stats.proportion import proportions_ztest

from scipy.stats import pearsonr, spearmanr, mannwhitneyu, ttest_ind

In [None]:
def load_json(path):
    """Read the JSON document stored at ``path`` and return the decoded object.

    The file is opened explicitly as UTF-8 (the encoding JSON is exchanged in)
    so that non-ASCII content decodes the same way on every platform instead
    of depending on the locale's default encoding.
    """
    with open(path, encoding="utf-8") as infile:
        return json.load(infile)

def save_json(obj, path):
    """Serialize ``obj`` as JSON (two-space indent) and write it to ``path``.

    Any existing file at ``path`` is overwritten. Returns ``None``.
    """
    with open(path, 'w') as outfile:
        outfile.write(json.dumps(obj, indent=2))
    return None

In [None]:
# Location of the compiled data table, relative to this notebook.
DATA_DIR = "../data/human/all_data"
DATA_FILE = "all_data.csv"
# if true, keep only the rows whose `confidences_raw` equals 1
FILTER_ON_FAMILIARITY = False
DROP_NA = False # if true, rows with a missing metric are dropped; otherwise missing values are filled with 0
AVERAGE_ANNOTATIONS = False # average over human annotators before running regression
# NOTE(review): ITERS is not referenced by any cell visible in this notebook
# (the loops below use N_ITERS / GLOBAL_ITERS / LOCAL_ITERS) -- confirm before relying on it.
ITERS = 1_000

## Data compilation

In [2]:
# Columns of `task_data` holding the automated coherence metrics that are compared
# against the human scores. Names follow "<metric>_<reference>"; the commented-out
# entries are left out of every analysis that iterates over this list.
auto_metric_names = [
 'c_npmi_10_full',
 'c_npmi_10_nytimes_full',
 #'c_npmi_10_test',
 'c_npmi_10_train',
 'c_npmi_10_val',
 'c_npmi_10_wikitext_full',
 #'c_uci_full',
 'c_v_full',
 'c_v_nytimes_full',
 #'c_v_test',
 'c_v_train',
 'c_v_val',
 'c_v_wikitext_full',
 #'u_mass_full'
]

In [11]:
# load the compiled data table that every analysis below reads from
task_data = pd.read_csv(Path(DATA_DIR, DATA_FILE))

In [None]:
# complete the out-of-sample columns: wherever a corpus-specific reference score
# is missing, fall back to the value stored in the generic "<metric>_full" column
for dataset, metric in itertools.product(["wikitext", "nytimes"], ["c_npmi_10", "c_v"]):
    reference_col = f"{metric}_{dataset}_full"
    fallback_col = f"{metric}_full"
    task_data[reference_col] = task_data[reference_col].combine_first(task_data[fallback_col])

In [None]:
# add the constant (intercept) column for lin. reg; the regressions below
# select it by the name "const"
task_data = sm.add_constant(task_data)

In [None]:
# convert infinite values to nans; both signs are replaced so that -inf scores
# are handled by the same DROP_NA / fillna logic as every other missing value
task_data = task_data.replace([np.inf, -np.inf], np.nan)

In [None]:
# optionally keep only the rows whose `confidences_raw` value is 1
if FILTER_ON_FAMILIARITY:
    task_data = task_data.loc[task_data.confidences_raw == 1]

In [10]:
def find_max_coefs(coefs, lbs, ubs, contiguous=False):
    """Flag the coefficients that are not separated from the largest one.

    Coefficients are visited from largest to smallest. The scan stops at the
    first coefficient whose upper bound lies strictly below the lower bound of
    the reference coefficient, and only the coefficients ranked above that
    point are flagged. If no such gap is found, every position is flagged.

    Args:
        coefs: point estimates (NaNs are treated as 0).
        lbs: lower confidence bounds, aligned with ``coefs`` (NaN -> 0).
        ubs: upper confidence bounds, aligned with ``coefs`` (NaN -> 0).
        contiguous: if true, each coefficient is compared with the one ranked
            directly above it; otherwise always with the largest coefficient.

    Returns:
        Boolean numpy array of length ``len(coefs)``, in the input order.
    """
    n = len(coefs)
    coefs, lbs, ubs = (np.nan_to_num(arr, nan=0) for arr in (coefs, lbs, ubs))
    order = np.argsort(coefs)[::-1]  # positions, largest coefficient first
    for rank in range(1, n):
        current = order[rank]
        reference = order[rank - 1] if contiguous else order[0]
        if ubs[current] < lbs[reference]:
            # everything ranked above the gap belongs to the top group
            keep = np.zeros(n, dtype=bool)
            keep[order[:rank]] = True
            return keep
    return np.full(n, True)

## Bootstrapped correlations

In [9]:
# Settings for the bootstrap below. These reassign the notebook-level flags;
# AVERAGE_ANNOTATIONS, USE_OLS and ALPHA are not read anywhere in this cell.
AVERAGE_ANNOTATIONS = False
FILTER_ON_FAMILIARITY = False
DROP_NA = False
USE_OLS = False
ALPHA = 0.05
N_ITERS = 500  # number of bootstrap replicates
# number of rows drawn (with replacement) per topic, by task
NUM_ANNOTATORS = {"intrusions": 26, "ratings": 15}

np.random.seed(42)
rows = []  # one record per (iteration, task, dataset, metric)
for iteration in tqdm(range(N_ITERS), total=N_ITERS):
    for task in ["ratings", "intrusions"]:
        for dataset in ["wikitext", "nytimes", "all"]:
            # restrict to the task and, unless pooling ("all"), to one dataset
            data_df = task_data.loc[task_data.task == task]
            if dataset != "all":
                data_df = data_df.loc[data_df.dataset == dataset]
            if FILTER_ON_FAMILIARITY:
                data_df = data_df.loc[data_df.confidences_raw == 1]

            # compute the correlations for each metric
            for i, metric in enumerate(auto_metric_names):
                #print(f"\n===={task}, {dataset}, {metric}====\n")
                # skip metrics whose name contains the dataset name, e.g.
                # "c_v_wikitext_full" while dataset == "wikitext"
                if dataset in metric:
                    continue # don't re-do the internal
                if DROP_NA:
                    df = data_df.dropna(subset=[metric])
                else:
                    df = data_df.fillna(0)
                # resample rows with replacement within every topic, then
                # average the metric and the human score per topic
                df = (
                    df.groupby(["task", "dataset", "model", "topic_idx"])
                    .sample(NUM_ANNOTATORS[task], replace=True)
                    .groupby(["task", "dataset", "model", "topic_idx"])[[metric, "scores_raw"]]
                    .mean()
                )
                spear_rho, spear_p = spearmanr(df[metric].values, df["scores_raw"].values)
                pear_rho, pear_p = pearsonr(df[metric].values, df["scores_raw"].values)
                # split the column name into the metric family and the reference suffix
                metric_base = re.search("c_npmi_10|c_v|c_uci|u_mass", metric).group(0)
                row = {
                    "task": task,
                    "dataset": dataset,
                    "metric": metric_base,
                    "reference": metric.replace(f"{metric_base}_", ""),
                    "spear_rho": spear_rho,
                    "pear_rho": pear_rho,
                    "spear_p": spear_p,
                    "pear_p": pear_p,
                }
                rows.append(row)
correlations = pd.DataFrame(rows)
# persist the replicates so the table cells below can be re-run without the loop
correlations.to_csv("correlation_results.csv", index=False)

100%|██████████| 10/10 [01:17<00:00,  7.79s/it]


In [11]:
# reload the bootstrap replicates written by the cell above
correlations = pd.read_csv("correlation_results.csv")

In [12]:
# A bare "full" reference on a single-dataset row is relabelled "<dataset>_full"
# so that it shares a column with the corpus-specific references in the pivot below.
correlations.loc[(correlations.dataset=="wikitext") & (correlations.reference == "full"), "reference"] = "wikitext_full"
correlations.loc[(correlations.dataset=="nytimes") & (correlations.reference == "full"), "reference"] = "nytimes_full"

In [44]:
# column of `correlations` that is summarised and reported
VALUE_COL = "spear_rho"

def ci_lb(x):
    # lower end of the 95% percentile interval
    return np.quantile(x, 0.025)
def ci_ub(x):
    # upper end of the 95% percentile interval
    return np.quantile(x, 0.975)

# Summarise the replicates per (task, dataset, metric, reference): mean, standard
# deviation and percentile interval. The aggregated columns take the function
# names ("ci_lb"/"ci_ub") and are renamed afterwards.
correlations_grouped = (
    correlations.groupby(["task", "dataset", "metric", "reference"])[VALUE_COL]
               .agg(["mean", "std", ci_lb, ci_ub])
               .reset_index()
               .rename(columns={"mean": VALUE_COL, "ci_lb": "ci_0.025", "ci_ub": "ci_0.975"})
)

In [45]:
METRICS_TO_KEEP = ["c_v", "c_npmi_10"]
# exactly one of the two report formats below is active
REPORT_SD = False
REPORT_CI = not REPORT_SD
# rows: (task, dataset); columns: (statistic, metric, reference)
correlations_pivot = (
    correlations_grouped.loc[correlations_grouped.metric.isin(METRICS_TO_KEEP)]
               .sort_values(["task", "dataset", "metric", "reference"])
               #.pivot(index=["Metric", "Reference"], columns=["Task", "Dataset"], values="coef")
               .pivot(index=["task", "dataset"], columns=["metric", "reference"], values=[VALUE_COL, "std", "ci_0.025",  "ci_0.975"])
)
# fix the column order used in the printed table
correlations_pivot = correlations_pivot[[
    (v, m, r)
    for v in [VALUE_COL, "std", "ci_0.025",  "ci_0.975"]
    for m in ["c_npmi_10", "c_v"]
    for r in ["nytimes_full", "wikitext_full", "train", "val"] #NB: only these four references are kept
]]

In [46]:
if REPORT_SD:
    # Format every value as "mean_{std}" and bold the largest value of the row.
    for idx, row in correlations_pivot.iterrows():
        values = row[VALUE_COL]
        stds = row["std"]
        # The maximum is taken over the reported values only. Taking it over the
        # whole row would also include the std / CI columns, and the upper CI
        # bounds are never smaller than the values, so nothing would be bolded.
        best = values.max()
        newrow = []
        for i, x in enumerate(values):
            if np.isnan(x):
                val = "-"
            else:
                # positional lookup: `stds` is labelled by (metric, reference)
                std = stds.iloc[i]
                val = f"${x:0.3f}_{{{std:0.3f}}}$"
                if x == best:
                    val = r"\textbf{" + val + "}"
            newrow.append(val)
        correlations_pivot.at[idx, VALUE_COL] = newrow
    

In [47]:
if REPORT_CI:
    # Underline every value flagged by find_max_coefs (not separated from the
    # row's best by the confidence intervals) and additionally bold the best.
    for idx, row in correlations_pivot.iterrows():
        values = row[VALUE_COL]
        max_coefs = find_max_coefs(values, row["ci_0.025"], row["ci_0.975"])
        row_max = np.max(values)
        newrow = []
        for i, x in enumerate(values):
            if np.isnan(x):
                newrow.append("-")
                continue
            val = f"{x:0.2f}"
            if max_coefs[i]:
                val = r"\uline{" + val + "}"
                if x == row_max:
                    val = r"\textbf{" + val + "}"
            newrow.append(val)
        correlations_pivot.at[idx, VALUE_COL] = newrow

In [48]:
# make latex: keep the formatted value columns, in the row order used in the paper
correlations_pivot_values = correlations_pivot[VALUE_COL].loc[[
    ('intrusions',  'nytimes'),
    ('intrusions', 'wikitext'),
    ('intrusions',      'all'),
    (   'ratings',  'nytimes'),
    (   'ratings', 'wikitext'),
    (   'ratings',      'all'),
]]
latex = correlations_pivot_values.to_latex(escape=False, multicolumn_format='c', column_format="ll|rrrr|rrrr")
# Plain substring replacements, applied in insertion order. Keys that contain
# another key (e.g. "nytimes_full" / "nytimes") must therefore come first.
# Each key appears once: a repeated key in a dict literal is silently collapsed.
to_replace_in_latex={
    "c_v": r"$C_v$ (110-token window)",
    "test": "Test",
    "c_npmi_10": r"\abr{npmi} (10-token window)",
    "nytimes_full": r"\abr{nyt}",
    "wikitext_full": r"\abr{wiki}",
    "wikitext": r"\abr{wiki}",
    "nytimes": r"\abr{nyt}",
    "all": "Both",
    "full": "Full",
    "train": "Train",
    "val": "Val",
    "ratings": "Rating",
    "intrusions": "Intrusion",
    "metric": "",
    "dataset": r"Train Corpus $\downarrow$",
    "reference": r"Ref. Corpus $\rightarrow$",
    "task": "",
}
for to_replace, val in to_replace_in_latex.items():
    latex = latex.replace(to_replace, val)
print(latex)


\begin{tabular}{ll|rrrr|rrrr}
\toprule
        &  & \multicolumn{4}{c}{\abr{npmi} (10-token window)} & \multicolumn{4}{c}{$C_v$ (110-token window)} \\
        & Ref. Corpus $\rightarrow$ &  \abr{nyt} &          \abr{wiki} &                  Train &   Val &  \abr{nyt} &          \abr{wiki} &         Train &           Val \\
 & Train Corpus $\downarrow$ &               &                        &                        &       &               &                        &               &               \\
\midrule
Intrusion & \abr{nyt} &          0.27 &           \uline{0.43} &                   0.27 &  0.24 &          0.34 &  \textbf{\uline{0.45}} &          0.35 &          0.34 \\
        & \abr{wiki} &  \uline{0.34} &           \uline{0.36} &  \textbf{\uline{0.39}} &  0.17 &  \uline{0.31} &           \uline{0.34} &  \uline{0.34} &          0.20 \\
        & Both &          0.29 &           \uline{0.40} &                   0.32 &  0.17 &          0.32 &  \textbf{\uline{0.40}} &  \uline{0.35

# Simple Simulation

In [10]:
def select_random_topics(df, num_topics=50):
    """Bootstrap topics: draw ``num_topics`` topics per model, with replacement.

    Topics are sampled within each (task, dataset, model) group. Every draw
    gets a running number in ``rand_topic_idx`` and is joined back onto ``df``,
    so a topic drawn k times contributes k copies of its rows.

    Args:
        df: frame with at least the columns task, dataset, model, topic_idx.
        num_topics: number of draws per (task, dataset, model) group.

    Returns:
        The rows of ``df`` for the sampled topics, with an extra
        ``rand_topic_idx`` column.
    """
    model_keys = ["task", "dataset", "model"]
    topic_keys = model_keys + ["topic_idx"]
    # one row per distinct topic; the size column is named 0 after reset_index
    unique_topics = df.groupby(topic_keys).size().reset_index()
    topic_sample = unique_topics.groupby(model_keys).sample(num_topics, replace=True)
    topic_sample = topic_sample.drop(columns=0)
    topic_sample["rand_topic_idx"] = topic_sample.groupby(model_keys).cumcount()
    return df.merge(topic_sample, how='inner')

def intrusion_test(scores_a, scores_b, alternative="larger"):
    """Compare two sets of scores with a two-sample proportions z-test.

    The sum of each score vector is used as its success count and its length
    as the number of trials. Returns whatever ``proportions_ztest`` returns.
    """
    successes = [scores_a.sum(), scores_b.sum()]
    trials = [len(scores_a), len(scores_b)]
    return proportions_ztest(successes, trials, alternative=alternative)

def ratings_test(scores_a, scores_b, alternative="greater"):
    """Compare two sets of scores with a Mann-Whitney U test.

    Returns the result of ``mannwhitneyu``, which unpacks to
    ``(statistic, pvalue)``.
    """
    outcome = mannwhitneyu(scores_a, scores_b, alternative=alternative)
    return outcome

def auto_test(scores_a, scores_b, alternative="greater"):
    """Compare two sets of automated-metric scores with Welch's t-test.

    Variances are not assumed equal. Returns the result of ``ttest_ind``,
    which unpacks to ``(statistic, pvalue)``.
    """
    outcome = ttest_ind(scores_a, scores_b, equal_var=False, alternative=alternative)
    return outcome

def false_discovery_rate_sim(
    task,
    test_df,
    metric,
    alpha=0.05,
    beta=0.1,
    models=["mallet", "dvae", "etm"],
):
    """Compare automated-metric and human verdicts over every ordered model pair.

    For each ordered pair (model_a, model_b) two one-sided tests are run: a
    Welch t-test on the automated ``metric`` (one value per sampled topic) and
    the task-specific test on the human ``scores_raw``. A rejection by either
    test is discarded with probability ``alpha`` (simulated type I error), and
    the comparison against the other test is skipped with probability ``beta``
    (simulated type II error).

    Args:
        task: "intrusions" (proportions z-test) or "ratings" (Mann-Whitney U).
        test_df: frame with columns task, dataset, model, rand_topic_idx,
            scores_raw and ``metric``.
        metric: name of the automated-metric column.
        alpha: significance level, also the simulated type I error rate.
        beta: simulated type II error rate.
        models: model names to compare pairwise (never mutated).

    Returns:
        Tuple ``(false_pos_rate, false_positives, discoveries, false_neg_rate,
        false_negatives, omissions)``. A rate is NaN when its denominator is 0.

    Raises:
        ValueError: if ``task`` is not one of the two supported tasks.
    """
    if task == "intrusions":
        stat_test = intrusion_test
    elif task == "ratings":
        stat_test = ratings_test
    else:
        # previously fell through and failed later on the unbound `stat_test`
        raise ValueError(f"unknown task {task!r}; expected 'intrusions' or 'ratings'")

    false_positives, false_negatives = 0, 0
    discoveries, omissions = 0, 0
    # keep one row per sampled topic so each topic's metric value is counted once
    model_scores = test_df.groupby(['task', 'dataset', 'model', 'rand_topic_idx']).head(1)

    for model_a, model_b in itertools.permutations(models, 2):
        # run the tests for both human and auto metrics
        _, p_auto = auto_test(
            model_scores.loc[model_scores.model==model_a][metric],
            model_scores.loc[model_scores.model==model_b][metric],
        )
        _, p_human = stat_test(
            test_df.loc[test_df.model==model_a]["scores_raw"],
            test_df.loc[test_df.model==model_b]["scores_raw"],
        )

        if p_auto < alpha: # auto rejects the null
            if np.random.random() < alpha:
                # falsely rejected the null, type I error; note that this also
                # skips the human-side bookkeeping for the pair
                continue
            discoveries += 1
            if np.random.random() < beta: # human failed to detect effect. type II error
                continue
            false_positives += p_human > alpha # human fails to reject the null

        if p_human < alpha: # human rejects the null
            if np.random.random() < alpha: # falsely rejected the null, type I error
                continue
            omissions += 1
            if np.random.random() < beta: # model failed to detect effect. type II error
                continue
            false_negatives += p_auto > alpha # auto fails to reject null

    false_pos_rate = np.nan if discoveries == 0 else false_positives / discoveries
    false_neg_rate = np.nan if omissions == 0 else false_negatives / omissions
    return false_pos_rate, false_positives, discoveries, false_neg_rate, false_negatives, omissions

In [16]:
# Settings for the simple simulation. AVERAGE_ANNOTATIONS, USE_OLS and MODELS
# are assigned here but not read by this cell.
AVERAGE_ANNOTATIONS = False
FILTER_ON_FAMILIARITY = False
DROP_NA = False
USE_OLS = False
ALPHA = 0.05
BETA = 0.1
N_ITERS = 1000  # number of topic resamples
NUM_TOPICS = 50  # topics drawn per model in every resample
MODELS = ['mallet', 'dvae', 'etm']

np.random.seed(42)
rows = []  # one record per (iteration, task, dataset, metric)
for iteration in tqdm(range(N_ITERS), total=N_ITERS):
    for task in ["ratings", "intrusions"]:
        for dataset in ["wikitext", "nytimes", "all"]:
            # restrict to the task and, unless pooling ("all"), to one dataset
            data_df = task_data.loc[task_data.task == task]
            if dataset != "all":
                data_df = data_df.loc[data_df.dataset == dataset]
            if FILTER_ON_FAMILIARITY:
                data_df = data_df.loc[data_df.confidences_raw == 1]

            # run the simulation for each metric
            for i, metric in enumerate(auto_metric_names):
                #print(f"\n===={task}, {dataset}, {metric}====\n")
                # skip metrics whose name contains the dataset name
                if dataset in metric:
                    continue # don't re-do the internal
                if DROP_NA:
                    df = data_df.dropna(subset=[metric])
                else:
                    df = data_df.fillna(0)
                # sample at the topic level
                df = select_random_topics(df, NUM_TOPICS)
                fp_rate, fp, pos, fn_rate, fn, neg = false_discovery_rate_sim(
                    task,
                    df,
                    metric,
                    alpha=ALPHA,
                    beta=BETA,
                )
                # split the column name into the metric family and the reference suffix
                metric_base = re.search("c_npmi_10|c_v|c_uci|u_mass", metric).group(0)
                row = {
                    "task": task,
                    "dataset": dataset,
                    "metric": metric_base,
                    "reference": metric.replace(f"{metric_base}_", ""),
                    "fp_rate": fp_rate,
                    "false_positives": fp,
                    "discoveries": pos,
                    "fn_rate": fn_rate,
                    "false_negatives": fn,
                    "omissions": neg,
                }
                rows.append(row)
simulations = pd.DataFrame(rows)
# NOTE(review): the save is commented out although the next cell reads this
# file back -- confirm the file on disk matches this loop.
#simulations.to_csv("simple_simulation_results.csv", index=False)

100%|██████████| 50/50 [01:42<00:00,  2.05s/it]


In [35]:
# replace the in-memory results with the ones stored on disk
simulations = pd.read_csv("simple_simulation_results.csv")

In [36]:
# A bare "full" reference on a single-dataset row is relabelled "<dataset>_full"
# so that it shares a column with the corpus-specific references in the pivot below.
simulations.loc[(simulations.dataset=="wikitext") & (simulations.reference == "full"), "reference"] = "wikitext_full"
simulations.loc[(simulations.dataset=="nytimes") & (simulations.reference == "full"), "reference"] = "nytimes_full"

In [43]:
REPORT = "fnr"
# The pivot, formatting and LaTeX cells of this section index the table by
# VALUE_COL. Pin it here so that a fresh top-to-bottom run does not reuse the
# "spear_rho" value left over from the correlation section.
VALUE_COL = "fp_rate"

def ci_lb(x):
    # lower end of the 95% percentile interval
    return np.quantile(x, 0.025)
def ci_ub(x):
    # upper end of the 95% percentile interval
    return np.quantile(x, 0.975)

# Pool the counts over all iterations, then compute the rates from the pooled
# counts (rather than averaging the per-iteration rates).
simulations_grouped = (
    simulations.groupby(["task", "dataset", "metric", "reference"])[["false_positives", "discoveries", 'false_negatives', 'omissions']]
               .agg(["sum"])
               .reset_index()
)
simulations_grouped["fp_rate"] = simulations_grouped["false_positives"] / simulations_grouped["discoveries"]
simulations_grouped["fn_rate"] = simulations_grouped["false_negatives"] / simulations_grouped["omissions"]

In [52]:
METRICS_TO_KEEP = ["c_v", "c_npmi_10"]
# rows: (task, dataset); columns: (statistic, metric, reference)
# NOTE(review): VALUE_COL must name a column of `simulations_grouped` here. The
# cells below read "fp_rate", but the last assignment visible above this cell
# is "spear_rho" -- confirm VALUE_COL is set before running.
simulations_pivot = (
    simulations_grouped.loc[simulations_grouped.metric.isin(METRICS_TO_KEEP)]
               .sort_values(["task", "dataset", "metric", "reference"])
               #.pivot(index=["Metric", "Reference"], columns=["Task", "Dataset"], values="coef")
               .pivot(index=["task", "dataset"], columns=["metric", "reference"], values=[VALUE_COL, "fn_rate"])#, "std", "ci_0.025",  "ci_0.975"])
)
# fix the column order used in the printed table
simulations_pivot = simulations_pivot[[
    (v, m, r)
    for v in [VALUE_COL, "fn_rate"]#, "std", "ci_0.025",  "ci_0.975"]
    for m in ["c_npmi_10", "c_v"]
    for r in ["nytimes_full", "wikitext_full", "train"] #NB: test, val excluded
]]

In [53]:
if REPORT == "fnr":
    # Render every cell as "<fp %> / <fn %>" (the fn part in gray) and bold the
    # fp part of the cell with the best F1 in the row.
    for idx, row in simulations_pivot.iterrows():
        newrow = []
        # treat (1 - fp_rate) as precision and (1 - fn_rate) as recall
        prec = 1 - row["fp_rate"]
        rec = 1 - row["fn_rate"]
        f1s = 2 * ((prec * rec) / (prec + rec))
        for i, (fpr, fnr, f1) in enumerate(zip(row["fp_rate"], row["fn_rate"], f1s)):
            if np.isnan(fpr):
                val = "-"
            else:
                val = f"{fpr*100:0.0f}"
                if f1 == np.max(f1s):
                    val = r"\textbf{" + val + "}"
                # pad single-digit percentages so the digits line up
                if len(val) < 2:
                    val = r"\phantom{0}" + val
                fnr_str = f"{fnr*100:0.0f}"
                if len(fnr_str) < 2:
                    fnr_str = r"\phantom{0}" + fnr_str
                val += r" / \textcolor{gray}{" + fnr_str + "}"
            newrow.append(val)
        # overwrite the numeric fp_rate columns with the formatted strings
        simulations_pivot.at[idx, "fp_rate"] = newrow

In [49]:
if REPORT == "sd":
    # Render every value as a percentage with one decimal and bold the smallest
    # value of the row (the std subscript is currently disabled).
    for idx, row in simulations_pivot.iterrows():
        newrow = []
        for i, x in enumerate(row[VALUE_COL]):
            if np.isnan(x):
                val = "-"
            else:
                #std = row["std"][i]
                #val = f"${x*100:0.0f}_{{{std*100:0.0fx}}}$"
                val = f"{x*100:0.1f}"
                if x == np.min(row[VALUE_COL]):
                    val = r"\textbf{" + val + "}"
            newrow.append(val)
        simulations_pivot.at[idx, VALUE_COL] = newrow
    

In [50]:
if REPORT == "ci":
    # Underline the values flagged by find_max_coefs and bold the row minimum.
    # NOTE(review): this branch reads "ci_0.025" / "ci_0.975", but the pivot
    # built above only keeps VALUE_COL and "fn_rate" (the CI columns are
    # commented out there) -- confirm before switching REPORT to "ci".
    for idx, row in simulations_pivot.iterrows():
        max_coefs = find_max_coefs(row[VALUE_COL], row["ci_0.025"], row["ci_0.975"])
        newrow = []
        for i, x in enumerate(row[VALUE_COL]):
            if np.isnan(x):
                val = "-"
            elif not np.isnan(x) and max_coefs[i]:
                val = r"\uline{" + f"{x:0.2f}" + "}"
                if x == np.min(row[VALUE_COL]):
                    val = r"\textbf{" + val + "}"
            else:
                val = f"{x:0.2f}"
            newrow.append(val)
        simulations_pivot.at[idx, VALUE_COL] = newrow

In [54]:
# make latex: keep the formatted value columns, in the row order used in the paper
simulations_pivot_values = simulations_pivot[VALUE_COL].loc[[
    ('intrusions',  'nytimes'),
    ('intrusions', 'wikitext'),
    ('intrusions',      'all'),
    (   'ratings',  'nytimes'),
    (   'ratings', 'wikitext'),
    (   'ratings',      'all'),
]]
latex = simulations_pivot_values.to_latex(escape=False, multicolumn_format='c', column_format="ll|rrr|rrr")
# Plain substring replacements, applied in insertion order. Keys that contain
# another key (e.g. "nytimes_full" / "nytimes") must therefore come first.
# Each key appears once: a repeated key in a dict literal is silently collapsed.
to_replace_in_latex={
    "c_v": r"$C_v$ (110-token window)",
    "test": "Test",
    "c_npmi_10": r"\abr{npmi} (10-token window)",
    "nytimes_full": r"\abr{nyt}",
    "wikitext_full": r"\abr{wiki}",
    "wikitext": r"\abr{wiki}",
    "nytimes": r"\abr{nyt}",
    "all": "Both",
    "full": "Full",
    "train": "Train",
    "val": "Val",
    "ratings": "Rating",
    "intrusions": "Intrusion",
    "metric": "",
    "dataset": r"Train  $\downarrow$",
    "reference": r"Ref. $\rightarrow$",
    "task": "",
}
for to_replace, val in to_replace_in_latex.items():
    latex = latex.replace(to_replace, val)
print(latex)


\begin{tabular}{ll|rrr|rrr}
\toprule
        &  & \multicolumn{3}{c}{\abr{npmi} (10-token window)} & \multicolumn{3}{c}{$C_v$ (110-token window)} \\
        & Ref. $\rightarrow$ &               \abr{nyt} &                       \abr{wiki} &                                Train &                                  \abr{nyt} &                       \abr{wiki} &                                         Train \\
 & Train  $\downarrow$ &                            &                                     &                                      &                                               &                                     &                                               \\
\midrule
Intrusion & \abr{nyt} &  21 / \textcolor{gray}{10} &           65 / \textcolor{gray}{70} &            19 / \textcolor{gray}{11} &  \textbf{20} / \textcolor{gray}{\phantom{0}5} &           75 / \textcolor{gray}{78} &           21 / \textcolor{gray}{\phantom{0}5} \\
        & \abr{wiki} &  77 / \textcolor{gray}{75} &

## Regression 

Assess the per-topic standard deviation of the human scores. This may help determine whether averaging over annotators is acceptable.

In [None]:
# standard deviation of the human scores within every topic, shown as one
# histogram per task
topic_stds = task_data.groupby(['task', 'dataset', 'model', 'topic_idx']).agg({"scores_raw": "std"}).reset_index()
topic_stds["scores_raw"].hist(by=topic_stds["task"])

 First, simple linear regression (ignoring model effects)

In [55]:
# Settings for the regressions (these reassign the notebook-level flags).
AVERAGE_ANNOTATIONS = False
FILTER_ON_FAMILIARITY = False
DROP_NA = False
USE_OLS = False
ALPHA = 0.05
np.random.seed(42)
rows = []  # one record per (task, dataset, metric)
for task in ["ratings", "intrusions"]:
    for dataset in ["wikitext", "nytimes", "all"]:
        # restrict to the task and, unless pooling ("all"), to one dataset
        data_df = task_data.loc[task_data.task == task]
        if dataset != "all":
            data_df = data_df.loc[data_df.dataset == dataset]
        if FILTER_ON_FAMILIARITY:
            data_df = data_df.loc[data_df.confidences_raw == 1]

        # run regressions for each metric
        for metric in auto_metric_names:
            # skip metrics whose name contains the dataset name
            if dataset in metric:
                continue # don't re-do the internal
            if DROP_NA:
                df = data_df.dropna(subset=[metric])
            else:
                df = data_df.fillna(0)

            # Pick exactly one model. This is a single if/elif chain so that the
            # OLS on averaged scores is not overwritten by a later branch.
            if AVERAGE_ANNOTATIONS:
                # "dataset" is part of the key so that topics with the same index
                # from different datasets are not averaged together when the
                # datasets are pooled; numeric_only skips the string columns.
                df = (
                    df.groupby(["dataset", "model", "topic_idx"])
                      .mean(numeric_only=True)
                      .reset_index()
                )
                mod = sm.OLS(df["scores_raw"], df[["const", metric]])
            elif USE_OLS:
                # ratings are rescaled with (score - 1) / 2; intrusion scores are used as-is
                scores = df["scores_raw"] if task == "intrusions" else (df["scores_raw"] - 1) / 2
                mod = sm.OLS(scores, df[["const", metric]])
            elif task == "intrusions":
                mod = sm.Logit(df["scores_raw"], df[["const", metric]])
            elif task == "ratings":
                # no explicit constant is passed to the ordered model
                mod = OrderedModel(df["scores_raw"], df[[metric]], distr="probit")
            res = mod.fit(disp=0)
            metric_base = re.search("c_npmi_10|c_v|c_uci|u_mass", metric).group(0)
            # Local names chosen so that the notebook-level ci_lb / ci_ub quantile
            # helpers are not overwritten with floats.
            coef_ci_lb, coef_ci_ub = res.conf_int(alpha=ALPHA).loc[metric]
            row = {
                "task": task,
                "dataset": dataset,
                "metric": metric_base,
                "reference": metric.replace(f"{metric_base}_", ""),
                "coef": res.params[metric],
                "se": res.bse[metric],
                "p": res.pvalues[metric],
                "bic": res.bic,
                "ci_0.025": coef_ci_lb,
                "ci_0.975": coef_ci_ub,
            }
            rows.append(row)
regressions = pd.DataFrame(rows)

In [None]:
# only the default configuration (all annotations, no averaging) is written to disk
if not FILTER_ON_FAMILIARITY and not AVERAGE_ANNOTATIONS:
    regressions.to_csv("regression_results.csv", index=False)

### Make the latex table

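We underline the coefficients whose confidence interval is not separated from that of the largest coefficient (i.e., those not significantly smaller than the best one) and additionally bold the largest coefficient.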

In [None]:
def find_max_coefs(coefs, lbs, ubs, contiguous=False):
    """Flag the coefficients that are not separated from the largest one.

    Coefficients are visited from largest to smallest. The scan stops at the
    first coefficient whose upper bound lies strictly below the lower bound of
    the reference coefficient, and only the coefficients ranked above that
    point are flagged. If no such gap is found, every position is flagged.

    Args:
        coefs: point estimates (NaNs are treated as 0).
        lbs: lower confidence bounds, aligned with ``coefs`` (NaN -> 0).
        ubs: upper confidence bounds, aligned with ``coefs`` (NaN -> 0).
        contiguous: if true, each coefficient is compared with the one ranked
            directly above it; otherwise always with the largest coefficient.

    Returns:
        Boolean numpy array of length ``len(coefs)``, in the input order.
    """
    n = len(coefs)
    coefs, lbs, ubs = (np.nan_to_num(arr, nan=0) for arr in (coefs, lbs, ubs))
    order = np.argsort(coefs)[::-1]  # positions, largest coefficient first
    for rank in range(1, n):
        current = order[rank]
        reference = order[rank - 1] if contiguous else order[0]
        if ubs[current] < lbs[reference]:
            # everything ranked above the gap belongs to the top group
            keep = np.zeros(n, dtype=bool)
            keep[order[:rank]] = True
            return keep
    return np.full(n, True)

In [None]:
METRICS_TO_KEEP = ["c_v", "c_npmi_10"]
# A bare "full" reference on a single-dataset row is relabelled "<dataset>_full"
# so that it shares a column with the corpus-specific references in the pivot.
regressions.loc[(regressions.dataset=="wikitext") & (regressions.reference == "full"), "reference"] = "wikitext_full"
regressions.loc[(regressions.dataset=="nytimes") & (regressions.reference == "full"), "reference"] = "nytimes_full"
# rows: (task, dataset); columns: (statistic, metric, reference).
# The former `.replace({""})` step was dropped: it passed a set and no
# replacement value, which replaces nothing meaningful here and relies on
# deprecated pandas behaviour.
regressions_pivot = (
    regressions.loc[regressions.metric.isin(METRICS_TO_KEEP)]
               .sort_values(["task", "dataset", "metric", "reference"])
               #.pivot(index=["Metric", "Reference"], columns=["Task", "Dataset"], values="coef")
               .pivot(index=["task", "dataset"], columns=["metric", "reference"], values=["coef", "ci_0.025", "ci_0.975"])
)
# order the columns
regressions_pivot = regressions_pivot[[
    (v, m, r)
    for v in ["coef", "ci_0.025", "ci_0.975"]
    for m in ["c_npmi_10", "c_v"]
    for r in ["nytimes_full", "wikitext_full", "train", "val"] #NB: test, full excluded
]]

In [None]:
# Underline every coefficient flagged by find_max_coefs (not separated from the
# row's best by the confidence intervals) and additionally bold the best one.
for idx, row in regressions_pivot.iterrows():
    coef_values = row['coef']
    max_coefs = find_max_coefs(coef_values, row["ci_0.025"], row["ci_0.975"])
    row_max = np.max(coef_values)
    newrow = []
    for i, x in enumerate(coef_values):
        if np.isnan(x):
            newrow.append("-")
            continue
        val = f"{x:0.2f}"
        if max_coefs[i]:
            val = r"\uline{" + val + "}"
            if x == row_max:
                val = r"\textbf{" + val + "}"
        newrow.append(val)
    regressions_pivot.at[idx, "coef"] = newrow

In [None]:
# make latex: keep the formatted coefficient columns, in the row order used in the paper
regressions_pivot_coefs = regressions_pivot["coef"].loc[[
    ('intrusions',  'nytimes'),
    ('intrusions', 'wikitext'),
    ('intrusions',      'all'),
    (   'ratings',  'nytimes'),
    (   'ratings', 'wikitext'),
    (   'ratings',      'all'),
]]
latex = regressions_pivot_coefs.to_latex(escape=False, multicolumn_format='c', column_format="ll|rrrr|rrrr")
# Plain substring replacements, applied in insertion order. Keys that contain
# another key (e.g. "nytimes_full" / "nytimes") must therefore come first.
to_replace_in_latex={
    "c_v": r"$C_v$ (110-token window)",
    "test": "Test",
    "c_npmi_10": r"\abr{npmi} (10-token window)",
    "nytimes_full": r"\abr{nyt}",
    "wikitext_full": r"\abr{wiki}",
    "wikitext": r"\abr{wiki}",
    "nytimes": r"\abr{nyt}",
    "all": "Both",
    "full": "Full",
    "train": "Train",
    "val": "Val",
    "ratings": "Rating",
    "intrusions": "Intrusion",
    "metric": "",
    "dataset": r"Train Corpus $\downarrow$",
    "reference": r"Ref. Corpus $\rightarrow$",
    "task": "",
}
for to_replace, val in to_replace_in_latex.items():
    latex = latex.replace(to_replace, val)
print(latex)


Linear regression with model effects

Linear regression with familiarity effect

Best explanation of data

## Prediction Setup

In [None]:
def create_split(task_data, test_size):
    """Split the data into train and test parts at the topic level.

    Within every (task, dataset, model) group a fraction ``1 - test_size`` of
    the topics is drawn (without replacement) for training. All rows of a
    topic go to the same side of the split.

    Args:
        task_data: frame with at least the columns task, dataset, model and
            topic_idx. It is not modified.
        test_size: fraction of topics per group held out for testing.

    Returns:
        ``(task_data_train, task_data_test)``. Both frames carry an extra
        ``idx`` column holding the row position in ``task_data``.
    """
    model_keys = ["task", "dataset", "model"]
    topic_keys = model_keys + ["topic_idx"]
    topic_sample = (
        task_data.groupby(topic_keys)
                 .size()
                 .reset_index()
                 .groupby(model_keys)
                 .sample(frac=1-test_size)
                 .drop(columns=0)  # the size column is named 0 after reset_index
    )
    # Number the rows on a copy so the caller's frame is left untouched.
    indexed = task_data.assign(idx=np.arange(len(task_data)))
    task_data_train = indexed.merge(topic_sample, how='inner', on=topic_keys)
    task_data_test = indexed.loc[~indexed["idx"].isin(task_data_train["idx"])]
    return task_data_train, task_data_test

def random_choice_prob_index(a, axis=1):
    """Draw one category index per row (or column) of a 2-D probability table.

    Vectorized categorical draw (https://stackoverflow.com/a/47722393): one
    uniform number is drawn per slice and the first category whose cumulative
    probability exceeds it is selected.

    Args:
        a: 2-D array-like of probabilities that sum to 1 along ``axis``.
        axis: axis that enumerates the categories (0 or 1).

    Returns:
        1-D integer array with one sampled category index per slice.
    """
    a = np.asarray(a)
    r = np.expand_dims(np.random.rand(a.shape[1-axis]), axis=axis)
    exceeds = a.cumsum(axis=axis) > r
    draws = exceeds.argmax(axis=axis)
    # Rounding can leave the final cumulative sum marginally below 1. If the
    # uniform draw lands in that gap no entry exceeds it and argmax would
    # silently report category 0; the draw belongs to the last category.
    draws[~exceeds.any(axis=axis)] = a.shape[axis] - 1
    return draws

def fit_predict_intrusion(train_df, test_df, metric):
    """Fit a logit model of ``scores_raw`` on the train split and predict the test split.

    ``metric`` is a single column name or a list of column names; the "const"
    column is always included as the first regressor. Returns the model's
    predictions for the rows of ``test_df``.
    """
    if isinstance(metric, str):
        exog_cols = ["const", metric]
    else:
        exog_cols = ["const"] + metric
    fitted = sm.Logit(train_df["scores_raw"], train_df[exog_cols]).fit(disp=0)
    return fitted.model.predict(fitted.params, exog=test_df[exog_cols])

def fit_predict_ratings(train_df, test_df, metric):
    """Fit an ordered probit model of ``scores_raw`` on the train split and predict the test split.

    ``metric`` is a single column name or a list of column names; no constant
    column is added. The model is fitted with BFGS. Returns the model's
    predictions for the rows of ``test_df``.
    """
    if isinstance(metric, str):
        exog_cols = [metric]
    else:
        exog_cols = metric
    ordered = OrderedModel(train_df["scores_raw"], train_df[exog_cols], distr="probit")
    fitted = ordered.fit(method='bfgs', disp=0)
    return fitted.model.predict(fitted.params, exog=test_df[exog_cols])

def intrusion_test(scores_a, scores_b, alternative="larger"):
    """Compare two sets of scores with a two-sample proportions z-test.

    The sum of each score vector is used as its success count and its length
    as the number of trials. Returns whatever ``proportions_ztest`` returns.
    """
    successes = [scores_a.sum(), scores_b.sum()]
    trials = [len(scores_a), len(scores_b)]
    return proportions_ztest(successes, trials, alternative=alternative)

def ratings_test(scores_a, scores_b, alternative="greater"):
    """Compare two sets of scores with a Mann-Whitney U test.

    Returns the result of ``mannwhitneyu``, which unpacks to
    ``(statistic, pvalue)``.
    """
    outcome = mannwhitneyu(scores_a, scores_b, alternative=alternative)
    return outcome

def false_discovery_rate_sim(
    task,
    test_df,
    probs,
    n_iters=100,
    alpha=0.05,
    beta=0.1,
    models=["mallet", "dvae", "etm"],
):
    """Compare verdicts from model-sampled pseudo-scores with the human verdicts.

    In each of ``n_iters`` rounds, pseudo human scores are sampled from
    ``probs``. For every ordered model pair the task-specific one-sided test is
    run on the pseudo-scores ("auto") and compared with the same test on the
    real ``scores_raw`` ("human"). An auto rejection is discarded with
    probability ``alpha`` (simulated type I error), and its comparison against
    the human verdict is skipped with probability ``beta`` (simulated type II
    error).

    Args:
        task: "intrusions" (binary draws, proportions z-test) or "ratings"
            (categorical draws, Mann-Whitney U).
        test_df: frame with the columns ``model`` and ``scores_raw``.
        probs: predicted probabilities aligned with the rows of ``test_df``;
            one value per row for intrusions, one row of category
            probabilities per row for ratings.
        n_iters: number of sampling rounds.
        alpha: significance level, also the simulated type I error rate.
        beta: simulated type II error rate.
        models: model names to compare pairwise (never mutated).

    Returns:
        Tuple ``(false_pos_rate, discoveries, false_neg_rate, omissions,
        agree_rate)``. A rate is NaN when its denominator is 0.

    Raises:
        ValueError: if ``task`` is not one of the two supported tasks.
    """
    if task == "intrusions":
        stat_test = intrusion_test
    elif task == "ratings":
        stat_test = ratings_test
    else:
        # previously fell through and failed later on the unbound `stat_test`
        raise ValueError(f"unknown task {task!r}; expected 'intrusions' or 'ratings'")

    false_positives, false_negatives = 0, 0
    discoveries, omissions = 0, 0
    total = 0

    # The human test depends only on test_df, not on the sampled pseudo-scores,
    # so it is evaluated once per model pair instead of once per round.
    comparisons = []
    for model_a, model_b in itertools.permutations(models, 2):
        model_a_idxr = (test_df.model == model_a).to_numpy()
        model_b_idxr = (test_df.model == model_b).to_numpy()
        _, p_human = stat_test(
            test_df.loc[model_a_idxr, "scores_raw"],
            test_df.loc[model_b_idxr, "scores_raw"],
        )
        comparisons.append((model_a_idxr, model_b_idxr, p_human))

    for _ in range(n_iters):
        # sample pseudo human scores
        if task == "intrusions":
            preds = np.random.binomial(1, p=probs)
        else:
            preds = random_choice_prob_index(probs)

        # Calculate the false discovery rate
        for model_a_idxr, model_b_idxr, p_human in comparisons:
            _, p_auto = stat_test(preds[model_a_idxr], preds[model_b_idxr])

            if p_auto < alpha: # auto rejects the null
                if np.random.random() < alpha:
                    continue # falsely rejected the null, type I error
                discoveries += 1
                if np.random.random() < beta: # human failed to detect effect. type II error
                    continue
                false_positives += p_human > alpha # human fails to reject the null

            # TODO: this is wrong, need to correct
            if p_human < alpha: # human rejects the null
                false_negatives += p_auto > alpha # auto fails to reject null
                omissions += 1
            total += 1

    false_pos_rate = np.nan if discoveries == 0 else false_positives / discoveries
    false_neg_rate = np.nan if omissions == 0 else false_negatives / omissions
    # total stays 0 when n_iters is 0 or every comparison was skipped above
    agree_rate = np.nan if total == 0 else (total - (false_positives + false_negatives)) / total
    return false_pos_rate, discoveries, false_neg_rate, omissions, agree_rate

In [None]:
GLOBAL_ITERS = 50  # number of random train/test splits
LOCAL_ITERS = 50  # pseudo-score sampling rounds per split (passed as n_iters)
TEST_SIZE = 0.5  # fraction of topics per model held out for testing
FILTER_ON_FAMILIARITY = False
DROP_NA = False
ALPHA = 0.05
BETA = 0.1
# NOTE(review): NUM_ANNOTATORS is not read by the prediction cells visible below -- confirm.
NUM_ANNOTATORS = {"intrusions": 26, "ratings": 15}
# metric columns evaluated in the prediction setup; commented-out entries are skipped
METRICS_TO_USE = [
 'c_npmi_10_full',
 'c_npmi_10_nytimes_full',
 #'c_npmi_10_test',
 'c_npmi_10_train',
 'c_npmi_10_val',
 'c_npmi_10_wikitext_full',
 'c_v_full',
 'c_v_nytimes_full',
 #'c_v_test',
 'c_v_train',
 'c_v_val',
 'c_v_wikitext_full',
]

In [None]:
# Unless missing values are to be dropped, fill every NaN in the table with 0.
# This reassigns the notebook-level `task_data` used by the cells below.
if not DROP_NA:
    task_data = task_data.fillna(0)

In [None]:
rows = []  # one record per (iteration, task, dataset, metric)
np.random.seed(42)
for iteration in tqdm(range(GLOBAL_ITERS), total=GLOBAL_ITERS):
    # iterate through variants and estimate models
    # a fresh topic-level train/test split is drawn for every iteration
    task_data_train, task_data_test = create_split(task_data, TEST_SIZE)
    
    for task in ["intrusions", "ratings"]:
        for dataset in ["wikitext", "nytimes", "all"]:
            train_df = task_data_train.loc[task_data_train.task == task]
            test_df = task_data_test.loc[task_data_test.task == task]

            # unless pooling ("all"), restrict both splits to one dataset
            if dataset != "all":
                train_df = train_df.loc[train_df.dataset == dataset]
                test_df = test_df.loc[test_df.dataset == dataset]
            if FILTER_ON_FAMILIARITY:
                train_df = train_df.loc[train_df.confidences_raw == 1]
                test_df = test_df.loc[test_df.confidences_raw == 1]

            for i, metric in enumerate(METRICS_TO_USE):
                # estimate a model for the metric, then make predictions
                # (metrics whose name contains the dataset name are skipped)
                if dataset in metric:
                    continue # don't re-do the internal
                elif task == "intrusions":
                    probs = fit_predict_intrusion(train_df, test_df, metric)
                elif task == "ratings":
                    probs = fit_predict_ratings(train_df, test_df, metric)
                fp_rate, pos, fn_rate, neg, agree_rate = false_discovery_rate_sim(
                    task, test_df, probs, n_iters=LOCAL_ITERS, alpha=ALPHA, beta=BETA
                )
                # a non-string metric (a list of columns) is recorded as "combined"
                if isinstance(metric, str):
                    metric_base = re.search("c_npmi_10|c_v|c_uci|u_mass", metric).group(0)
                    reference = metric.replace(f"{metric_base}_", "") 
                else:
                    metric_base = "combined"
                    reference = "full"
                # "false_discoveries" / "false_omissions" hold the rates, while
                # "discoveries" / "omissions" hold the counts
                row = {
                    "iter": iteration,
                    "task": task,
                    "dataset": dataset,
                    "metric": metric_base,
                    "reference": reference,
                    "false_discoveries": fp_rate,
                    "discoveries": pos,
                    "false_omissions": fn_rate,
                    "omissions": neg,
                    "agree_rate": agree_rate, 
                }
                rows.append(row)
simulations = pd.DataFrame(rows)

In [162]:
# Saving the freshly simulated results is disabled (commented out). This cell
# replaces the in-memory `simulations` frame with whatever is stored in
# "simulation_results.csv", so every cell below works from the file on disk.
#simulations.to_csv("simulation_results.csv", index=False)
simulations = pd.read_csv("simulation_results.csv")

In [163]:
# For the single-dataset rows, rename the generic "full" reference to the
# dataset-specific "<dataset>_full" label (presumably because the "full" corpus
# for a single dataset is that dataset itself -- confirm against the simulation cell).
# Rows with dataset == "all" keep the plain "full" label.
for dataset_name in ("wikitext", "nytimes"):
    is_own_full_reference = (
        simulations["dataset"].eq(dataset_name) & simulations["reference"].eq("full")
    )
    simulations.loc[is_own_full_reference, "reference"] = f"{dataset_name}_full"

In [164]:
# Name of the `simulations` column that the summary cells below aggregate,
# pivot, and format into the LaTeX table.
VALUE_COL = "false_discoveries"

def ci_lb(x):
    """Return the 0.025 quantile of `x` (lower end of a central 95% interval)."""
    lower_q = 0.025
    return np.quantile(x, q=lower_q)
def ci_ub(x):
    """Return the 0.975 quantile of `x` (upper end of a central 95% interval)."""
    upper_q = 0.975
    return np.quantile(x, q=upper_q)

# One row per (task, dataset, metric, reference) with the mean, standard
# deviation, and 0.025 / 0.975 quantiles of VALUE_COL over the simulation rows.
# The mean is stored back under the VALUE_COL name.
simulations_grouped = (
    simulations
    .groupby(["task", "dataset", "metric", "reference"])[VALUE_COL]
    .agg(["mean", "std", ci_lb, ci_ub])
    .rename(columns={"mean": VALUE_COL, "ci_lb": "ci_0.025", "ci_ub": "ci_0.975"})
    .reset_index()
)

In [165]:
METRICS_TO_KEEP = ["c_v", "c_npmi_10"]

# Wide layout: one row per (task, dataset) and one column per
# (statistic, metric, reference), where statistic is the mean or its std.
simulations_pivot = (
    simulations_grouped[simulations_grouped["metric"].isin(METRICS_TO_KEEP)]
    .sort_values(by=["task", "dataset", "metric", "reference"])
    .pivot(index=["task", "dataset"], columns=["metric", "reference"], values=[VALUE_COL, "std"])
)

# Keep only the reported columns, in presentation order (statistic varies
# slowest, reference fastest). NB: the "test" reference is excluded.
simulations_pivot = simulations_pivot[
    list(
        itertools.product(
            [VALUE_COL, "std"],
            ["c_npmi_10", "c_v"],
            ["nytimes_full", "wikitext_full", "train", "val"],
        )
    )
]

In [166]:
# bold format the significant values
for idx, row in simulations_pivot.iterrows():
    newrow = []
    for i, x in enumerate(row[VALUE_COL]):
        if np.isnan(x):
            val = "-"
        else:
            std = row["std"][i] * 100
            val = f"${x*100:0.0f}_{{{std:0.0f}}}$"
        newrow.append(val)
    simulations_pivot.at[idx, VALUE_COL] = newrow

In [168]:
# Build the LaTeX table: take the formatted VALUE_COL block, put the rows in
# presentation order (intrusions first, then ratings; nytimes, wikitext, all).
simulations_pivot_values = simulations_pivot[VALUE_COL].loc[
    list(itertools.product(["intrusions", "ratings"], ["nytimes", "wikitext", "all"]))
]
latex = simulations_pivot_values.to_latex(
    escape=False,
    multicolumn_format='c',
    column_format="ll|rrrr|rrrr",
)

# Raw label -> display text. These are applied as plain substring replacements
# in insertion order, so a longer key must come before any key it contains
# (e.g. "nytimes_full" before "nytimes" and "full"), and a short key such as
# "all" will match anywhere it occurs in the LaTeX source.
to_replace_in_latex = {
    "c_v": r"$C_v$ (110-token window)",
    "test": "Test",
    "c_npmi_10": r"\abr{npmi} (10-token window)",
    "nytimes_full": r"\abr{nyt}",
    "wikitext_full": r"\abr{wiki}",
    "wikitext": r"\abr{wiki}",
    "nytimes": r"\abr{nyt}",
    "all": "Both",
    "full": "Full",
    "train": "Train",
    "val": "Val",
    "ratings": "Rating",
    "intrusions": "Intrusion",
    "metric": "",
    "dataset": r"Train $\downarrow$",
    "reference": r"Ref. $\rightarrow$",
    "task": "",
}
for raw_label, display_text in to_replace_in_latex.items():
    latex = latex.replace(raw_label, display_text)
print(latex)


\begin{tabular}{ll|rrrr|rrrr}
\toprule
        &  & \multicolumn{4}{c}{\abr{npmi} (10-token window)} & \multicolumn{4}{c}{$C_v$ (110-token window)} \\
        & Ref. $\rightarrow$ & \abr{nyt} & \abr{wiki} &      Train &        Val & \abr{nyt} & \abr{wiki} &      Train &        Val \\
 & Train $\downarrow$ &              &               &            &            &              &               &            &            \\
\midrule
Intrusion & \abr{nyt} &    $23_{22}$ &     $25_{24}$ &  $23_{22}$ &  $26_{22}$ &    $25_{22}$ &     $26_{22}$ &  $24_{21}$ &  $22_{22}$ \\
        & \abr{wiki} &    $39_{30}$ &     $40_{30}$ &  $38_{29}$ &  $75_{18}$ &    $40_{31}$ &     $40_{30}$ &  $40_{30}$ &  $84_{15}$ \\
        & Both &    $19_{16}$ &     $20_{17}$ &  $18_{15}$ &  $54_{19}$ &    $20_{16}$ &     $22_{16}$ &  $22_{16}$ &  $79_{15}$ \\
Rating & \abr{nyt} &    $43_{22}$ &     $47_{20}$ &  $40_{22}$ &  $30_{19}$ &    $40_{15}$ &     $43_{18}$ &  $38_{15}$ &  $31_{18}$ \\
        & \abr{wiki} &