# Independent Model analysis 

See Section 4.2 in the paper

In [1]:
import os
import sys
import pathlib
import warnings

import matplotlib.pyplot as plt
from magis_sigdial2020.hyper_params import HyperParameters
from magis_sigdial2020.settings import REPO_ROOT
from magis_sigdial2020.datasets.xkcd import XKCD
import numpy as np
import pandas as pd
import seaborn as sns

# All paths below are resolved under <REPO_ROOT>/lab/analyses.
LAB_SUBDIR_ROOT = pathlib.Path(REPO_ROOT).absolute() / "lab" / "analyses"
# Log directory of experiment E004 (evaluate on CIC), "published_version" run.
RESULTS_DIR = LAB_SUBDIR_ROOT / "logs" / "E004_evaluate_on_cic" / "published_version"
HPARAMS_YAML = RESULTS_DIR / "hparams.yaml"
RESULTS_CSV =  RESULTS_DIR / "results.csv"

# Put lab/analyses/src at the front of the import path before importing
# cic_results_lib (presumably that is where the module lives — confirm).
sys.path.insert(0, str(LAB_SUBDIR_ROOT / "src"))
import cic_results_lib

sns.set_style('whitegrid')
sns.set_context('notebook')
# NOTE(review): this installs an "ignore" filter for every warning category,
# so deprecation and numerical warnings from later cells are hidden as well.
warnings.filterwarnings("ignore")

In [2]:
%%time

# Hyper-parameters stored alongside the results being analysed.
hparams = HyperParameters.load(HPARAMS_YAML)
# XKCD dataset built from the project settings, using the "fft" coordinate system.
xkcd = XKCD.from_settings(coordinate_system="fft")
# NOTE(review): presumably a backoff word distribution derived from the
# dataset — confirm in cic_results_lib. It is passed to load_results below.
backoff_p_w = cic_results_lib.get_backoff_p_w(xkcd)

CPU times: user 6.66 s, sys: 445 ms, total: 7.11 s
Wall time: 7.11 s


In [3]:
%%time

# Load the evaluation results CSV; backoff_p_w and the "model_name" grouping
# key are forwarded to the loader. Every later cell reads from results_df.
results_df = cic_results_lib.load_results(RESULTS_CSV, backoff_p_w, grouping_keys=["model_name"])

CPU times: user 13.4 s, sys: 996 ms, total: 14.4 s
Wall time: 14.4 s


In [4]:
# Display per-model perplexity on the "train" split (one row per model_name).
cic_results_lib.make_longform_results_df(cic_results_lib.get_sorted_results(results_df, ["model_name"], "train"))

Unnamed: 0,model_name,perplexity
35,CB-2.0-S1,14.140706
59,CB-3.0-S1,14.193521
2,CB-1.0-S1,14.433539
62,CB-4.0-S1,14.459414
83,RSA-OOC-S1,14.621594
65,CB-5.0-S1,14.870386
68,CB-6.0-S1,15.383982
83,RSA-OOC-S0,15.502777
71,CB-7.0-S1,15.96565
80,RGC-S0,16.154358


In [53]:
# Display per-model perplexity on the "dev" split (one row per model_name).
cic_results_lib.make_longform_results_df(cic_results_lib.get_sorted_results(results_df, ["model_name"], "dev"))

Unnamed: 0,model_name,perplexity
57,CB-3.0-S1,13.489754
33,CB-2.0-S1,13.502091
60,CB-4.0-S1,13.664379
0,CB-1.0-S1,13.821334
63,CB-5.0-S1,13.969235
81,RSA-OOC-S1,14.049532
66,CB-6.0-S1,14.364849
69,CB-7.0-S1,14.823254
81,RSA-OOC-S0,14.879086
78,RGC-S0,15.034497


##  Perplexities for Tables in Paper

In [69]:
# Models that appear in the table, named as in the model_name column.
# NOTE(review): key_models is rebound to a list of (name, proba) tuples in the
# significance-testing section, so this cell and that one must not be run out
# of order relative to their consumers.
key_models = set([
    "CB-2.0-S1",
    "RGC-S0",
    "RSA-OOC-S1",
    "RSA-OOC-S0",
    "CB-15.0-S1"
])
# Build one "<split>_perplexity" column per split and join them on model_name.
merged = None
for split in ["train", "dev", "test"]:
    longform_df = cic_results_lib.make_longform_results_df(cic_results_lib.get_sorted_results(results_df, ["model_name"], split))
    # Round to two decimals for display.
    longform_df["perplexity"] = longform_df["perplexity"].map(lambda f: round(f, 2))
    # Keep only the table's models and give the value column a split-specific name.
    longform_df = longform_df[longform_df.model_name.isin(key_models)].rename(columns={"perplexity": f"{split}_perplexity"})
    if merged is None:
        merged = longform_df
    else:
        # merge() is used with its default join type, so a model missing from
        # any one split is dropped from the table.
        merged = merged.merge(right=longform_df, on="model_name")
# Relabel the RSA-OOC-S0 row as "S0" and fix the row order for display.
merged.set_index("model_name").rename({"RSA-OOC-S0": "S0"}).loc[["S0", "RGC-S0", "RSA-OOC-S1", "CB-2.0-S1", "CB-15.0-S1"]]

Unnamed: 0_level_0,train_perplexity,dev_perplexity,test_perplexity
model_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
S0,15.5,14.88,13.28
RGC-S0,16.15,15.03,13.32
RSA-OOC-S1,14.62,14.05,12.49
CB-2.0-S1,14.14,13.5,11.84
CB-15.0-S1,20.76,18.83,16.36


## Significance Testing

We report the Wilcoxon signed-rank test to measure the significance of perplexity differences.

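The statistics printed below lack context: for each pair of models, the three output lines come from `run0`, `run1`, and `run2` (in that order), all evaluated on the default `train` split. TODO: Cleanup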

In [6]:
from scipy.stats import wilcoxon

In [20]:
# (model_name, probability-column stem) pairs to compare pairwise.
# The stem selects the "<stem>_adjusted" and "log_<stem>" columns of results_df.
# NOTE(review): this rebinds key_models, which the perplexity-table cell
# defines as a set of plain model-name strings.
key_models = [
    ("RGC", "S0"),
    ("RSA-OOC", "S1"),
    ("CB-2.0", "S1"),
    ("CB-15.0", "S1"),
    ("RSA-OOC", "S0")
]

In [48]:
def run0(results_df, m0, m1, split="train"):
    """Print a Wilcoxon signed-rank comparison of two models' adjusted probabilities.

    Args:
        results_df: DataFrame with `model_name`, `split` and `row_indices`
            columns plus a `<proba>_adjusted` column for each requested
            probability name.
        m0: `(model_name, proba_name)` pair supplying the first sample.
        m1: `(model_name, proba_name)` pair supplying the second sample.
        split: value of the `split` column both samples are restricted to.

    Returns:
        None. The result is written to stdout on a single line.
    """
    def adjusted_values(model_name, proba_name):
        # Sorting by row_indices gives both samples the same row order; the
        # test pairs observations by position, so this assumes both models
        # cover the same row_indices.
        in_model = results_df.model_name == model_name
        in_split = results_df.split == split
        ordered = results_df[in_model & in_split].sort_values("row_indices")
        return ordered[f"{proba_name}_adjusted"].values

    (first_name, first_proba), (second_name, second_proba) = m0, m1
    first_sample = adjusted_values(first_name, first_proba)
    second_sample = adjusted_values(second_name, second_proba)
    print(f"{first_name}-{first_proba} vs {second_name}-{second_proba}: ", end="")
    print(f" {wilcoxon(first_sample, second_sample)}")
    
def run1(results_df, m0, m1, split="train"):
    """Print a Wilcoxon signed-rank comparison of two models' log-probabilities.

    Args:
        results_df: DataFrame with `model_name`, `split` and `row_indices`
            columns plus a `log_<proba>` column for each requested
            probability name.
        m0: `(model_name, proba_name)` pair supplying the first sample.
        m1: `(model_name, proba_name)` pair supplying the second sample.
        split: value of the `split` column both samples are restricted to.

    Returns:
        None. The result is written to stdout on a single line.
    """
    def log_values(model_name, proba_name):
        # Both samples are ordered by row_indices so that position i in one
        # lines up with position i in the other (assumes both models cover
        # the same row_indices).
        wanted = (results_df.model_name == model_name) & (results_df.split == split)
        by_row = results_df[wanted].sort_values("row_indices")
        return by_row[f"log_{proba_name}"].values

    lhs_name, lhs_proba = m0
    rhs_name, rhs_proba = m1
    lhs_log_probas = log_values(lhs_name, lhs_proba)
    rhs_log_probas = log_values(rhs_name, rhs_proba)
    print(f"{lhs_name}-{lhs_proba} vs {rhs_name}-{rhs_proba}: ", end="")
    print(f" {wilcoxon(lhs_log_probas, rhs_log_probas)}")

def run2(results_df, m0, m1, split="train"):
    """Compare two models via a one-sided Wilcoxon test on resampled perplexities.

    The `log_<proba>` values of both models are resampled with
    `bootstrap_sample` (500 draws of 2000 rows, seed 0), giving paired
    perplexity estimates. These are tested with `alternative='greater'`,
    i.e. whether the first model's perplexities tend to exceed the second's.

    Args:
        results_df: DataFrame with `model_name`, `split` and `row_indices`
            columns plus a `log_<proba>` column for each requested
            probability name.
        m0: `(model_name, proba_name)` pair supplying the first sample.
        m1: `(model_name, proba_name)` pair supplying the second sample.
        split: value of the `split` column both samples are restricted to.

    Returns:
        Tuple `(ppl_samples0, ppl_samples1)` of the resampled perplexities
        for the first and second model. The test result is also printed.
    """
    def log_values(model_name, proba_name):
        # Order by row_indices so the two arrays are aligned row for row
        # (assumes both models cover the same row_indices).
        keep = (results_df.model_name == model_name) & (results_df.split == split)
        return results_df[keep].sort_values("row_indices")[f"log_{proba_name}"].values

    (left_name, left_proba), (right_name, right_proba) = m0, m1
    left_log_probas = log_values(left_name, left_proba)
    right_log_probas = log_values(right_name, right_proba)
    resampled = bootstrap_sample(
        left_log_probas,
        right_log_probas,
        n_samples=500,
        sample_size=2000,
        seed=0,
    )
    left_ppl, right_ppl = resampled
    print(f"{left_name}-{left_proba} vs {right_name}-{right_proba}: ", end="")
    print(f" {wilcoxon(left_ppl, right_ppl, alternative='greater')}")
    return left_ppl, right_ppl
    
    
def bootstrap_sample(log_probas0, log_probas1, n_samples, sample_size, seed):
    """Draw paired perplexity estimates from random subsets of two aligned arrays.

    For each of `n_samples` draws, the same `sample_size` positions are
    picked from both inputs and each subset is turned into a perplexity,
    `exp(-mean(log_proba))`. Positions are drawn WITHOUT replacement, so
    strictly speaking this is subsampling rather than a classical bootstrap.

    Each draw uses its own generator seeded with `seed + sample_index`. This
    yields the same indices as reseeding the global NumPy generator would,
    but leaves the process-wide random state untouched.

    Args:
        log_probas0: 1-D array-like of log-probabilities for the first model.
        log_probas1: array-like of the same shape for the second model,
            aligned element-wise with `log_probas0`.
        n_samples: number of perplexity estimates to draw.
        sample_size: number of positions per draw; must not exceed the input
            length because sampling is without replacement.
        seed: base seed; draw `i` uses `seed + i`.

    Returns:
        Tuple `(ppl_samples0, ppl_samples1)` of float arrays with shape
        `(n_samples,)`; entry `i` of both comes from the same positions.

    Raises:
        AssertionError: if the two inputs differ in shape.
        ValueError: (from NumPy) if `sample_size` exceeds the input length.
    """
    log_probas0 = np.asarray(log_probas0)
    log_probas1 = np.asarray(log_probas1)
    assert log_probas0.shape == log_probas1.shape
    population_size = log_probas0.shape[0]
    ppl_samples0 = np.zeros(n_samples)
    ppl_samples1 = np.zeros(n_samples)
    for sample_index in range(n_samples):
        # A private generator avoids clobbering np.random's global state
        # while producing the same stream as np.random.seed(seed + i).
        rng = np.random.RandomState(seed + sample_index)
        indices = rng.choice(population_size, size=sample_size, replace=False)
        ppl_samples0[sample_index] = np.exp(-1 * log_probas0[indices].mean())
        ppl_samples1[sample_index] = np.exp(-1 * log_probas1[indices].mean())
    return ppl_samples0, ppl_samples1
        

In [49]:
# Compare every unordered pair of key models exactly once (m1 only ranges
# over entries after m0). Each pair prints three lines, from run0, run1 and
# run2 in that order, followed by a "--" separator. No split is passed, so
# all three use their default split="train".
for i, m0 in enumerate(key_models):
    for m1 in key_models[i+1:]:
        run0(results_df, m0, m1)
        run1(results_df, m0, m1)
        run2(results_df, m0, m1)
        print("--")

RGC-S0 vs RSA-OOC-S1:  WilcoxonResult(statistic=12494806.0, pvalue=2.760688792291463e-41)
RGC-S0 vs RSA-OOC-S1:  WilcoxonResult(statistic=13882841.0, pvalue=1.0285352148595537e-10)
RGC-S0 vs RSA-OOC-S1:  WilcoxonResult(statistic=125250.0, pvalue=6.323594743460744e-84)
--
RGC-S0 vs CB-2.0-S1:  WilcoxonResult(statistic=6478850.0, pvalue=0.0)
RGC-S0 vs CB-2.0-S1:  WilcoxonResult(statistic=10650828.0, pvalue=1.394143852046975e-114)
RGC-S0 vs CB-2.0-S1:  WilcoxonResult(statistic=125250.0, pvalue=6.323594743460744e-84)
--
RGC-S0 vs CB-15.0-S1:  WilcoxonResult(statistic=8821200.0, pvalue=2.6594196726044383e-224)
RGC-S0 vs CB-15.0-S1:  WilcoxonResult(statistic=11975350.0, pvalue=3.75913419640668e-58)
RGC-S0 vs CB-15.0-S1:  WilcoxonResult(statistic=0.0, pvalue=1.0)
--
RGC-S0 vs RSA-OOC-S0:  WilcoxonResult(statistic=12926989.0, pvalue=1.6481822134432864e-29)
RGC-S0 vs RSA-OOC-S0:  WilcoxonResult(statistic=13228041.0, pvalue=1.6271261042649062e-22)
RGC-S0 vs RSA-OOC-S0:  WilcoxonResult(statistic=