In [1]:
import itertools
import json
import re
import sys
from pathlib import Path

import pandas as pd
import numpy as np
from tqdm import tqdm 

import statsmodels.api as sm
from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.stats.proportion import proportions_ztest

from scipy.stats import pearsonr, spearmanr, mannwhitneyu, ttest_ind

ModuleNotFoundError: No module named 'statsmodels.miscmodels.ordinal_model'

# Setup
## Load data

In [4]:
# Where the human-evaluation task data lives (relative to this notebook).
DATA_DIR = "../data/human/all_data"
DATA_FILE = "all_data.csv"

# When True, the analysis cells keep only rows with `confidences_raw == 1`.
# `familiar` is the suffix appended to the names of the result files.
FILTER_ON_FAMILIARITY = False
familiar = "" if not FILTER_ON_FAMILIARITY else "_familiar"

In [5]:
# Automated coherence metrics (columns of the task data) compared to the human scores.
# Names follow `<metric>_<reference corpus or split>`.
# Deliberately left out: `c_npmi_10_test`, `c_uci_full`, `c_v_test`, `u_mass_full`.
auto_metric_names = [
    # NPMI variants
    "c_npmi_10_full",
    "c_npmi_10_nytimes_full",
    "c_npmi_10_train",
    "c_npmi_10_val",
    "c_npmi_10_wikitext_full",
    # C_v variants
    "c_v_full",
    "c_v_nytimes_full",
    "c_v_train",
    "c_v_val",
    "c_v_wikitext_full",
]

In [6]:
# Read the task data used by every analysis below
task_data = pd.read_csv(Path(DATA_DIR, DATA_FILE))

# complete the out-of-sample columns: wherever the `<metric>_<dataset>_full`
# value is missing, fall back to the `<metric>_full` value of the same row
for dataset in ["wikitext", "nytimes"]:
    for metric in ["c_npmi_10", "c_v"]:
        external_col = f"{metric}_{dataset}_full"
        task_data[external_col] = task_data[external_col].combine_first(task_data[f"{metric}_full"])

# add the constant for lin. reg
task_data = sm.add_constant(task_data)

# convert infinite values to nans; both signs are covered, because a lone
# `np.inf` target leaves `-inf` entries in place
task_data = task_data.replace([np.inf, -np.inf], np.nan)

## Helper functions

In [7]:
def find_max_values(values, lbs, ubs):
    """
    Flag the values whose confidence interval reaches the CI of the maximum.

    `values`, `lbs` and `ubs` are equal-length sequences of point estimates and
    their lower/upper bounds; NaNs in any of them are treated as 0.

    The values are walked in descending order. The walk stops at the first value
    whose upper bound lies strictly below the lower bound of the maximum; only
    the values ranked before that point are flagged. If no such value exists,
    everything is flagged.

    Returns a boolean array aligned with `values`.
    """
    n = len(values)
    estimates = np.nan_to_num(values, nan=0)
    lower = np.nan_to_num(lbs, nan=0)
    upper = np.nan_to_num(ubs, nan=0)

    # positions of the estimates, largest first
    ranking = np.argsort(estimates)[::-1]
    threshold = lower[ranking[0]]

    # for every runner-up (in rank order): does its CI end below the top CI?
    falls_short = upper[ranking[1:]] < threshold
    if not falls_short.any():
        return np.full(n, True)

    # rank of the first runner-up that falls short; everything before it is kept
    cutoff = int(np.argmax(falls_short)) + 1
    keep = np.zeros(n, dtype=bool)
    keep[ranking[:cutoff]] = True
    return keep

def find_min_values(values, lbs, ubs):
    """
    Analogous to `find_max_values`, but for the minimum: returns a boolean array
    flagging the entries of `values` that `find_max_values` selects once every
    quantity is negated.

    Negation turns the minimum into the maximum and swaps the roles of the
    bounds, so the negated upper bounds are passed as lower bounds and vice versa.
    """
    # accept lists / Series alike; unary minus needs an array-like operand
    values = np.asarray(values, dtype=float)
    lbs = np.asarray(lbs, dtype=float)
    ubs = np.asarray(ubs, dtype=float)
    # `find_max_values` takes exactly (values, lbs, ubs) -- no extra keywords
    return find_max_values(-values, -ubs, -lbs)

def ci_lb(x):
    """Lower end of the central 95% interval of `x`, i.e. its 2.5% quantile."""
    lower_tail = 0.025
    return np.quantile(x, lower_tail)

def ci_ub(x):
    """Upper end of the central 95% interval of `x`, i.e. its 97.5% quantile."""
    upper_tail = 0.975
    return np.quantile(x, upper_tail)

In [8]:
# Raw label -> LaTeX label, applied to the generated tables with successive
# `str.replace` calls in insertion order. ORDER MATTERS: a key that contains
# another key must come first (e.g. "nytimes_full" before "nytimes" and "full"),
# otherwise the shorter key would rewrite part of the longer one.
# Each key appears exactly once ("test" used to be listed twice).
LATEX_CLEANUP = {
    "c_v": r"$C_v$ (110-token window)",
    "test": "Test",
    "c_npmi_10": r"\abr{npmi} (10-token window)",
    "nytimes_full": r"\abr{nyt}",
    "wikitext_full": r"\abr{wiki}",
    "wikitext": r"\abr{wiki}",
    "nytimes": r"\abr{nyt}",
    "all": "Concatenated",
    "full": "Full",
    "train": "Train",
    "val": "Val",
    "ratings": "Rating",
    "intrusions": "Intrusion",
    # index-level names of the pivot tables
    "metric": "",
    "dataset": r"Train Corpus $\downarrow$",
    "reference": r"Ref. Corpus $\rightarrow$",
    "task": "",
}

# Bootstrapped correlations

In [9]:
ALPHA = 0.05
N_ITERS = 500
# number of annotations resampled per topic, by task
NUM_ANNOTATORS = {"intrusions": 26, "ratings": 15}
# How rows with a missing metric are handled: dropped (True) or zero-filled (False).
# Defined here so this cell runs on a fresh kernel; previously the flag was only
# assigned in the regression cell further down (which uses the same value).
DROP_NA = False

np.random.seed(42)
rows = []
for iteration in tqdm(range(N_ITERS), total=N_ITERS):
    for task in ["ratings", "intrusions"]:
        for dataset in ["wikitext", "nytimes", "all"]:
            data_df = task_data.loc[task_data.task == task]
            if dataset != "all":
                data_df = data_df.loc[data_df.dataset == dataset]
            if FILTER_ON_FAMILIARITY:
                data_df = data_df.loc[data_df.confidences_raw == 1]

            # run correlations for each metric
            for metric in auto_metric_names:
                if dataset in metric:
                    continue # don't re-do the internal
                if DROP_NA:
                    df = data_df.dropna(subset=[metric])
                else:
                    df = data_df.fillna(0)
                # bootstrap: within every topic, draw NUM_ANNOTATORS[task] rows with
                # replacement, then average the metric and the human score per topic
                df = (
                    df.groupby(["task", "dataset", "model", "topic_idx"])
                      .sample(NUM_ANNOTATORS[task], replace=True)
                      .groupby(["task", "dataset", "model", "topic_idx"])[[metric, "scores_raw"]]
                      .mean()
                )
                spear_rho, spear_p = spearmanr(df[metric].values, df["scores_raw"].values)
                pear_rho, pear_p = pearsonr(df[metric].values, df["scores_raw"].values)
                # split e.g. "c_npmi_10_nytimes_full" into metric family and reference
                metric_base = re.search("c_npmi_10|c_v|c_uci|u_mass", metric).group(0)
                row = {
                    "task": task,
                    "dataset": dataset,
                    "metric": metric_base,
                    "reference": metric.replace(f"{metric_base}_", ""),
                    "spear_rho": spear_rho,
                    "pear_rho": pear_rho,
                    "spear_p": spear_p,
                    "pear_p": pear_p,
                }
                rows.append(row)
correlations = pd.DataFrame(rows)
correlations.to_csv(f"correlation_results{familiar}.csv", index=False)

100%|██████████| 10/10 [01:17<00:00,  7.79s/it]


## Create table

In [54]:
# Reload the bootstrap results written by the cell above
correlations = pd.read_csv(f"correlation_results{familiar}.csv")

# For a single training corpus, the plain "full" reference is that corpus itself:
# give it the corpus-specific label so it lands in the matching table column.
for corpus in ("wikitext", "nytimes"):
    own_full_reference = (correlations.dataset == corpus) & (correlations.reference == "full")
    correlations.loc[own_full_reference, "reference"] = f"{corpus}_full"

In [55]:
# Metric families shown in the table, and the correlation column that is summarised
METRICS_TO_KEEP = ["c_npmi_10", "c_v"]
VALUE_COL = "spear_rho"

In [56]:
# Summarise the bootstrap draws per (task, dataset, metric, reference):
# mean and std of the chosen correlation plus its 2.5% / 97.5% quantiles.
# Named aggregation assigns the final column labels directly.
correlations_grouped = (
    correlations.groupby(["task", "dataset", "metric", "reference"])[VALUE_COL]
               .agg(**{
                   VALUE_COL: "mean",
                   "std": "std",
                   "ci_0.025": ci_lb,
                   "ci_0.975": ci_ub,
               })
               .reset_index()
)

In [57]:
# One row per (task, dataset); columns are (statistic, metric, reference) triples
correlations_pivot = (
    correlations_grouped[correlations_grouped["metric"].isin(METRICS_TO_KEEP)]
    .sort_values(by=["task", "dataset", "metric", "reference"])
    .pivot(
        index=["task", "dataset"],
        columns=["metric", "reference"],
        values=[VALUE_COL, "ci_0.025",  "ci_0.975"],
    )
)
# fix the column order: statistic, then metric, then reference corpus
correlations_pivot = correlations_pivot[
    list(
        itertools.product(
            [VALUE_COL, "ci_0.025",  "ci_0.975"],
            METRICS_TO_KEEP,
            ["nytimes_full", "wikitext_full", "train", "val"],
        )
    )
]

In [58]:
for idx, row in correlations_pivot.iterrows():
    estimates = row[VALUE_COL]
    # which estimates belong to the group whose CI reaches the CI of the best one
    in_top_group = find_max_values(estimates, row["ci_0.025"], row["ci_0.975"])
    best_estimate = np.max(estimates)

    # render every cell: "-" when missing, underlined when in the top group,
    # additionally bold when it is the best value of the row
    formatted = []
    for x, overlaps_best in zip(estimates, in_top_group):
        if np.isnan(x):
            formatted.append("-")
            continue
        cell = f"{x:0.2f}"
        if overlaps_best:
            cell = r"\uline{" + cell + "}"
            if x == best_estimate:
                cell = r"\textbf{" + cell + "}"
        formatted.append(cell)
    correlations_pivot.at[idx, VALUE_COL] = formatted

In [59]:
# make latex: keep only the formatted values, rows ordered by task then corpus
correlations_pivot_values = correlations_pivot[VALUE_COL].loc[
    list(itertools.product(["intrusions", "ratings"], ["nytimes", "wikitext", "all"]))
]
latex = correlations_pivot_values.to_latex(
    escape=False,
    multicolumn_format="c",
    column_format="ll|rrrr|rrrr",
)
# swap the raw labels for their LaTeX counterparts, in dictionary order
for raw_label, latex_label in LATEX_CLEANUP.items():
    latex = latex.replace(raw_label, latex_label)
print(latex)


\begin{tabular}{ll|rrrr|rrrr}
\toprule
        &  & \multicolumn{4}{c}{\abr{npmi} (10-token window)} & \multicolumn{4}{c}{$C_v$ (110-token window)} \\
        & Ref. Corpus $\rightarrow$ &  \abr{nyt} &          \abr{wiki} &         Train &   Val &  \abr{nyt} &          \abr{wiki} &         Train &   Val \\
 & Train Corpus $\downarrow$ &               &                        &               &       &               &                        &               &       \\
\midrule
Intrusion & \abr{nyt} &          0.34 &           \uline{0.51} &          0.32 &  0.25 &          0.44 &  \textbf{\uline{0.56}} &          0.42 &  0.38 \\
        & \abr{wiki} &  \uline{0.39} &           \uline{0.39} &  \uline{0.40} &  0.14 &  \uline{0.38} &  \textbf{\uline{0.40}} &  \uline{0.39} &  0.13 \\
        & Both &          0.36 &           \uline{0.45} &          0.36 &  0.18 &          0.41 &  \textbf{\uline{0.48}} &          0.41 &  0.26 \\
Rating & \abr{nyt} &          0.45 &  \textbf{\uline{0.59}} &   

# Simple Simulation

In [29]:
def select_random_topics(df, num_topics=50):
    """
    Draw `num_topics` topics, with replacement, within every
    (task, dataset, model) group and return the rows of `df` for those draws.

    Each draw gets a running number `rand_topic_idx` (0 .. num_topics-1 within
    its group). A topic drawn k times contributes its rows k times, once per
    `rand_topic_idx`.
    """
    group_keys = ["task", "dataset", "model"]

    # one row per distinct topic (the size column, named 0, is only a by-product)
    distinct_topics = df.groupby(group_keys + ["topic_idx"]).size().reset_index()

    drawn = distinct_topics.groupby(group_keys).sample(num_topics, replace=True)
    drawn = drawn.drop(columns=0)
    drawn["rand_topic_idx"] = drawn.groupby(group_keys).cumcount()

    # joins on the columns the two frames share
    return df.merge(drawn, how="inner")

def intrusion_test(scores_a, scores_b, alternative="larger"):
    """
    Two-sample proportions z-test on intrusion scores.

    The number of successes in each sample is the sum of its scores and the
    number of trials is its length. Returns whatever `proportions_ztest` returns.
    """
    successes = [scores_a.sum(), scores_b.sum()]
    trials = [len(scores_a), len(scores_b)]
    return proportions_ztest(successes, trials, alternative=alternative)

def ratings_test(scores_a, scores_b, alternative="greater"):
    """
    Mann-Whitney U test comparing two samples of rating scores.

    Returns the result of `mannwhitneyu` unchanged.
    """
    outcome = mannwhitneyu(scores_a, scores_b, alternative=alternative)
    return outcome

def auto_test(scores_a, scores_b, alternative="greater"):
    """
    Unequal-variance (`equal_var=False`) t-test comparing two samples of
    automated scores.

    Returns the result of `ttest_ind` unchanged.
    """
    outcome = ttest_ind(
        scores_a,
        scores_b,
        equal_var=False,
        alternative=alternative,
    )
    return outcome

def false_discovery_rate_sim(
    task,
    test_df,
    metric,
    alpha=0.05,
    beta=0.1,
    models=("mallet", "dvae", "etm"),
):
    """
    Given a sample of data, determine the number of times
    that auto metrics imply a significant difference between models
    when the human scores do not (and vice versa).

    Parameters
    ----------
    task : "intrusions" or "ratings"; selects the test used for the human scores.
    test_df : DataFrame with columns `task`, `dataset`, `model`, `rand_topic_idx`,
        `scores_raw` and `metric`.
    metric : name of the automated-metric column.
    alpha : significance level; also the probability with which a significant
        result is randomly discarded before it is counted.
    beta : probability with which a counted result is randomly exempted from
        being checked for disagreement.
    models : model names; every ordered pair of distinct models is compared.

    Returns
    -------
    (false_pos_rate, false_positives, discoveries,
     false_neg_rate, false_negatives, omissions)
        The rates are NaN when their denominator is zero.

    Raises
    ------
    ValueError if `task` is not one of the two supported tasks.
    """
    if task == "intrusions":
        stat_test = intrusion_test
    elif task == "ratings":
        stat_test = ratings_test
    else:
        # previously fell through and failed later with an unbound `stat_test`
        raise ValueError(f"Unknown task {task!r}; expected 'intrusions' or 'ratings'")

    false_positives, false_negatives = 0, 0
    discoveries, omissions = 0, 0

    # model scores are the same across all the human ratings,
    # so a single row per sampled topic carries the automated score
    model_scores = test_df.groupby(['task', 'dataset', 'model', 'rand_topic_idx']).head(1)

    for model_a, model_b in itertools.permutations(models, 2):
        # run the tests for both human and auto metrics
        _, p_auto = auto_test(
            model_scores.loc[model_scores.model==model_a][metric],
            model_scores.loc[model_scores.model==model_b][metric],
        )
        _, p_human = stat_test(
            test_df.loc[test_df.model==model_a]["scores_raw"],
            test_df.loc[test_df.model==model_b]["scores_raw"],
        )

        # Count the disagreements while controlling for baseline
        # probabilities of type I and II errors.
        # NB: `and` short-circuits, so a random number is drawn only when the
        # preceding condition holds -- keep the order to keep runs reproducible.
        if p_auto < alpha and np.random.random() > alpha:
            discoveries += 1
            if np.random.random() > beta and p_human > alpha:
                false_positives += 1
        if p_human < alpha and np.random.random() > alpha:
            omissions += 1
            if np.random.random() > beta and p_auto > alpha:
                false_negatives += 1

    false_pos_rate = np.nan if discoveries == 0 else false_positives / discoveries
    false_neg_rate = np.nan if omissions == 0 else false_negatives / omissions
    return false_pos_rate, false_positives, discoveries, false_neg_rate, false_negatives, omissions

In [37]:
ALPHA = 0.05
BETA = 0.1
N_ITERS = 1000
NUM_TOPICS = 50
MODELS = ['mallet', 'dvae', 'etm']
SEED = 11235  # used for the RNG and as a tag in the output file name

# The (task, dataset) subsets are identical in every iteration, so build them once
# instead of copying and filtering the whole frame inside the simulation loop.
simulation_inputs = {}
for task in ["ratings", "intrusions"]:
    for dataset in ["wikitext", "nytimes", "all"]:
        data_df = task_data.loc[task_data.task == task]
        if dataset != "all":
            data_df = data_df.loc[data_df.dataset == dataset]
        if FILTER_ON_FAMILIARITY:
            data_df = data_df.loc[data_df.confidences_raw == 1]
        simulation_inputs[(task, dataset)] = data_df.fillna(0)

np.random.seed(SEED)
rows = []
for iteration in tqdm(range(N_ITERS), total=N_ITERS):
    # dict order == construction order, so random numbers are consumed in the
    # same (task, dataset, metric) sequence as before
    for (task, dataset), subset in simulation_inputs.items():
        # sample at the topic level
        df = select_random_topics(subset, NUM_TOPICS)

        for metric in auto_metric_names:
            if dataset in metric:
                continue # don't re-do the internal for `wikitext_full`, `nytimes_full`

            fp_rate, fp, pos, fn_rate, fn, neg = false_discovery_rate_sim(
                task,
                df,
                metric,
                alpha=ALPHA,
                beta=BETA,
                models=MODELS,
            )
            metric_base = re.search("c_npmi_10|c_v|c_uci|u_mass", metric).group(0)
            row = {
                "task": task,
                "dataset": dataset,
                "metric": metric_base,
                "reference": metric.replace(f"{metric_base}_", ""),
                "fp_rate": fp_rate,
                "false_positives": fp,
                "discoveries": pos,
                "fn_rate": fn_rate,
                "false_negatives": fn,
                "omissions": neg,
            }
            rows.append(row)
simulations = pd.DataFrame(rows)
simulations.to_csv(f"simple_simulation_results_{SEED}{familiar}.csv", index=False)

  2%|▏         | 21/1000 [00:41<31:54,  1.96s/it]


KeyboardInterrupt: 

## Create latex table

In [37]:
# Load the file the simulation cell above writes (its name carries the seed, 11235).
# The previous name, without the seed tag, is not produced anywhere in this notebook.
simulations = pd.read_csv(f"simple_simulation_results_11235{familiar}.csv")

In [23]:
# Inspect the simulation results loaded in the previous cell
simulations

Unnamed: 0,task,dataset,metric,reference,fp_rate,false_positives,discoveries,fn_rate,false_negatives,omissions
0,ratings,wikitext,c_npmi_10,full,1.000000,2,2,1.000000,3,3
1,ratings,wikitext,c_npmi_10,nytimes_full,0.333333,1,3,0.333333,1,3
2,ratings,wikitext,c_npmi_10,train,1.000000,2,2,0.000000,0,1
3,ratings,wikitext,c_npmi_10,val,0.666667,2,3,0.500000,1,2
4,ratings,wikitext,c_v,full,0.333333,1,3,0.333333,1,3
...,...,...,...,...,...,...,...,...,...,...
51995,intrusions,all,c_v,full,0.000000,0,3,0.000000,0,3
51996,intrusions,all,c_v,nytimes_full,0.000000,0,3,0.000000,0,3
51997,intrusions,all,c_v,train,0.000000,0,2,0.000000,0,2
51998,intrusions,all,c_v,val,1.000000,3,3,1.000000,3,3


In [19]:
# NOTE(review): `simulations_orig` is not assigned anywhere in the visible notebook,
# so this cell raises NameError on a fresh kernel -- confirm its origin or remove the cell.
simulations_orig

Unnamed: 0,task,dataset,metric,reference,fp_rate,false_positives,discoveries,fn_rate,false_negatives,omissions
0,ratings,wikitext,c_npmi_10,full,1.000000,2,2,1.000000,3,3
1,ratings,wikitext,c_npmi_10,nytimes_full,0.333333,1,3,0.333333,1,3
2,ratings,wikitext,c_npmi_10,train,1.000000,2,2,0.000000,0,1
3,ratings,wikitext,c_npmi_10,val,0.666667,2,3,0.500000,1,2
4,ratings,wikitext,c_v,full,0.333333,1,3,0.333333,1,3
...,...,...,...,...,...,...,...,...,...,...
51995,intrusions,all,c_v,full,0.333333,1,3,0.000000,0,1
51996,intrusions,all,c_v,nytimes_full,1.000000,2,2,0.666667,2,3
51997,intrusions,all,c_v,train,0.000000,0,3,0.000000,0,2
51998,intrusions,all,c_v,val,0.666667,2,3,0.500000,1,2


In [20]:
# Display the `simulations` frame again
simulations

Unnamed: 0,task,dataset,metric,reference,fp_rate,false_positives,discoveries,fn_rate,false_negatives,omissions
0,ratings,wikitext,c_npmi_10,wikitext_full,1.000000,2,2,1.000000,3,3
1,ratings,wikitext,c_npmi_10,nytimes_full,0.333333,1,3,0.333333,1,3
2,ratings,wikitext,c_npmi_10,train,1.000000,2,2,0.000000,0,1
3,ratings,wikitext,c_npmi_10,val,0.666667,2,3,0.500000,1,2
4,ratings,wikitext,c_v,wikitext_full,0.333333,1,3,0.333333,1,3
...,...,...,...,...,...,...,...,...,...,...
51995,intrusions,all,c_v,full,0.000000,0,3,0.000000,0,3
51996,intrusions,all,c_v,nytimes_full,0.500000,1,2,0.000000,0,2
51997,intrusions,all,c_v,train,0.000000,0,3,0.000000,0,3
51998,intrusions,all,c_v,val,0.500000,1,2,0.666667,2,3


In [None]:
# Display the `simulations` frame
simulations

In [None]:
# For a single training corpus, the plain "full" reference is that corpus itself:
# give it the corpus-specific label so it lands in the matching table column.
for corpus in ("wikitext", "nytimes"):
    own_full_reference = (simulations.dataset == corpus) & (simulations.reference == "full")
    simulations.loc[own_full_reference, "reference"] = f"{corpus}_full"

In [None]:
METRICS_TO_KEEP = ["c_npmi_10", "c_v"]

# Pool the raw counts over all simulation iterations, then derive the rates from
# the pooled counts. `.sum()` keeps the column labels flat; `.agg(["sum"])` would
# add a second column level ("sum"), which breaks the plain-label column
# arithmetic below and the later `sort_values` / `pivot` on flat column names.
simulations_grouped = (
    simulations.groupby(["task", "dataset", "metric", "reference"])[["false_positives", "discoveries", 'false_negatives', 'omissions']]
               .sum()
               .reset_index()
)
simulations_grouped["fp_rate"] = simulations_grouped["false_positives"] / simulations_grouped["discoveries"]
simulations_grouped["fn_rate"] = simulations_grouped["false_negatives"] / simulations_grouped["omissions"]

In [None]:
# One row per (task, dataset); columns are (rate, metric, reference) triples
simulations_pivot = (
    simulations_grouped[simulations_grouped["metric"].isin(METRICS_TO_KEEP)]
    .sort_values(by=["task", "dataset", "metric", "reference"])
    .pivot(
        index=["task", "dataset"],
        columns=["metric", "reference"],
        values=["fp_rate", "fn_rate"],
    )
)
# fix the column order: rate, then metric, then reference corpus
simulations_pivot = simulations_pivot[
    list(
        itertools.product(
            ["fp_rate", "fn_rate"],
            METRICS_TO_KEEP,
            ["nytimes_full", "wikitext_full", "train"],  # NB: test, val excluded
        )
    )
]

In [None]:
# format the values: each cell becomes "<FP %> / <FN % in gray>", with the FP
# percentage in bold for the column(s) achieving the row's best F1
for idx, row in simulations_pivot.iterrows():
    newrow = []
    # treat (1 - false-positive rate) as precision and (1 - false-negative rate) as recall
    prec = 1 - row["fp_rate"]
    rec = 1 - row["fn_rate"]
    f1s = 2 * ((prec * rec) / (prec + rec)) # calculate f1s
    best_f1 = np.max(f1s)

    # format each value in the row
    for fpr, fnr, f1 in zip(row["fp_rate"], row["fn_rate"], f1s):
        if np.isnan(fpr):
            val = "-"
        else:
            fpr_str = f"{fpr*100:0.0f}"
            # Decide on the padding from the bare number. Checking the length after
            # wrapping in \textbf{...} never pads bold single-digit values.
            padding = r"\phantom{0}" if len(fpr_str) < 2 else ""
            # bold the max value
            if f1 == best_f1:
                fpr_str = r"\textbf{" + fpr_str + "}"
            val = padding + fpr_str
            # include the false-negative rate
            fnr_str = f"{fnr*100:0.0f}"
            if len(fnr_str) < 2:
                fnr_str = r"\phantom{0}" + fnr_str
            # put it all together
            val += r" / \textcolor{gray}{" + fnr_str + "}"
        newrow.append(val)
    simulations_pivot.at[idx, "fp_rate"] = newrow

In [None]:
# make latex: keep only the formatted cells, rows ordered by task then corpus
simulations_pivot_values = simulations_pivot["fp_rate"].loc[
    list(itertools.product(["intrusions", "ratings"], ["nytimes", "wikitext", "all"]))
]
latex = simulations_pivot_values.to_latex(
    escape=False,
    multicolumn_format="c",
    column_format="ll|rrr|rrr",
)

# swap the raw labels for their LaTeX counterparts, in dictionary order
for raw_label, latex_label in LATEX_CLEANUP.items():
    latex = latex.replace(raw_label, latex_label)
print(latex)


# Regression

In [18]:
AVERAGE_ANNOTATIONS = False  # regress on per-topic means instead of single annotations
DROP_NA = False              # drop rows with a missing metric instead of zero-filling
USE_OLS = False              # force OLS instead of the task-specific model
ALPHA = 0.05

np.random.seed(42)
rows = []
for task in ["ratings", "intrusions"]:
    for dataset in ["wikitext", "nytimes", "all"]:
        data_df = task_data.loc[task_data.task == task]
        if dataset != "all":
            data_df = data_df.loc[data_df.dataset == dataset]
        if FILTER_ON_FAMILIARITY:
            data_df = data_df.loc[data_df.confidences_raw == 1]

        # run regressions for each metric
        for metric in auto_metric_names:
            if dataset in metric:
                continue # don't re-do the internal for `wikitext_full`, `nytimes_full`
            if DROP_NA:
                df = data_df.dropna(subset=[metric])
            else:
                df = data_df.fillna(0)
            if AVERAGE_ANNOTATIONS:
                # Same topic key as the other cells: include `dataset`, so that with
                # dataset == "all" topics of different corpora are not merged.
                # numeric_only skips non-numeric columns that cannot be averaged.
                df = (
                    df.groupby(["task", "dataset", "model", "topic_idx"])
                      .mean(numeric_only=True)
                      .reset_index()
                )

            # Pick exactly one model. The averaged-annotation OLS used to be built
            # and then unconditionally replaced by the chain below.
            if USE_OLS:
                scores = df["scores_raw"] if task == "intrusions" else (df["scores_raw"] - 1) / 2
                mod = sm.OLS(scores, df[["const", metric]])
            elif AVERAGE_ANNOTATIONS:
                mod = sm.OLS(df["scores_raw"], df[["const", metric]])
            elif task == "intrusions":
                mod = sm.Logit(df["scores_raw"], df[["const", metric]])
            elif task == "ratings":
                mod = OrderedModel(df["scores_raw"], df[[metric]], distr="probit")
            res = mod.fit(disp=0)
            metric_base = re.search("c_npmi_10|c_v|c_uci|u_mass", metric).group(0)
            # NB: do not call these `ci_lb` / `ci_ub` -- that would overwrite the
            # quantile helper functions of the same name defined earlier
            coef_ci_lb, coef_ci_ub = res.conf_int(alpha=ALPHA).loc[metric]
            row = {
                "task": task,
                "dataset": dataset,
                "metric": metric_base,
                "reference": metric.replace(f"{metric_base}_", ""),
                "coef": res.params[metric],
                "se": res.bse[metric],
                "p": res.pvalues[metric],
                "bic": res.bic,
                "ci_0.025": coef_ci_lb,
                "ci_0.975": coef_ci_ub,
            }
            rows.append(row)
regressions = pd.DataFrame(rows)
regressions.to_csv(f"regression_results{familiar}.csv", index=False)

## Make the latex table

In [None]:
# Reload the regression results written above. `index=False` is a `to_csv` option;
# `read_csv` does not accept it, and the file was written without an index column.
regressions = pd.read_csv(f"regression_results{familiar}.csv")

In [19]:
# For a single training corpus, the plain "full" reference is that corpus itself:
# give it the corpus-specific label so it lands in the matching table column.
for corpus in ("wikitext", "nytimes"):
    own_full_reference = (regressions.dataset == corpus) & (regressions.reference == "full")
    regressions.loc[own_full_reference, "reference"] = f"{corpus}_full"

In [20]:
METRICS_TO_KEEP = ["c_npmi_10", "c_v"]

# One row per (task, dataset); columns are (statistic, metric, reference) triples.
# The former `.replace({""})` step was dropped: it passed a set and no replacement
# value, so it never expressed a valid substitution.
regressions_pivot = (
    regressions.loc[regressions.metric.isin(METRICS_TO_KEEP)]
               .sort_values(["task", "dataset", "metric", "reference"])
               .pivot(index=["task", "dataset"], columns=["metric", "reference"], values=["coef", "ci_0.025", "ci_0.975"])
)
# order the columns
regressions_pivot = regressions_pivot[[
    (v, m, r)
    for v in ["coef", "ci_0.025", "ci_0.975"]
    for m in METRICS_TO_KEEP
    for r in ["nytimes_full", "wikitext_full", "train", "val"] #NB: test, full excluded
]]

In [21]:
# Mark the strongest coefficients of every row: underline those in the group whose
# CI reaches the CI of the largest coefficient, and bold the largest one itself
for idx, row in regressions_pivot.iterrows():
    coefs = row["coef"]
    in_top_group = find_max_values(coefs, row["ci_0.025"], row["ci_0.975"])
    largest_coef = np.max(coefs)

    formatted = []
    for x, overlaps_best in zip(coefs, in_top_group):
        if np.isnan(x):
            formatted.append("-")
            continue
        cell = f"{x:0.2f}"
        if overlaps_best:
            cell = r"\uline{" + cell + "}"
            if x == largest_coef:
                cell = r"\textbf{" + cell + "}"
        formatted.append(cell)
    regressions_pivot.at[idx, "coef"] = formatted

In [22]:
# make latex: keep only the formatted coefficients, rows ordered by task then corpus
regressions_pivot_coefs = regressions_pivot["coef"].loc[
    list(itertools.product(["intrusions", "ratings"], ["nytimes", "wikitext", "all"]))
]
latex = regressions_pivot_coefs.to_latex(
    escape=False,
    multicolumn_format="c",
    column_format="ll|rrrr|rrrr",
)

# swap the raw labels for their LaTeX counterparts, in dictionary order
for raw_label, latex_label in LATEX_CLEANUP.items():
    latex = latex.replace(raw_label, latex_label)
print(latex)


\begin{tabular}{ll|rrrr|rrrr}
\toprule
        &  & \multicolumn{4}{c}{\abr{npmi} (10-token window)} & \multicolumn{4}{c}{$C_v$ (110-token window)} \\
        & Ref. Corpus $\rightarrow$ &  \abr{nyt} &          \abr{wiki} &         Train &   Val & \abr{nyt} & \abr{wiki} & Train &   Val \\
 & Train Corpus $\downarrow$ &               &                        &               &       &              &               &       &       \\
\midrule
Intrusion & \abr{nyt} &          3.71 &  \textbf{\uline{7.14}} &          3.04 &  2.54 &         3.34 &          4.54 &  3.23 &  2.94 \\
        & \abr{wiki} &  \uline{5.87} &  \textbf{\uline{6.46}} &  \uline{6.19} &  0.85 &         3.23 &          3.59 &  3.39 &  0.42 \\
        & Both &          4.24 &  \textbf{\uline{6.81}} &          4.17 &  0.94 &         3.18 &          4.06 &  3.30 &  0.91 \\
Rating & \abr{nyt} &          4.40 &  \textbf{\uline{5.87}} &          3.85 &  3.93 &         3.97 &          4.44 &  4.03 &  3.89 \\
        & \abr{wiki}