# Mixture Analysis

See Section 4.3 of the paper.

In [12]:
import os
import sys
import pathlib
import warnings

import matplotlib.pyplot as plt
from magis_sigdial2020.datasets.color_reference_2017.vectorized import make_or_load_cic
from magis_sigdial2020.hyper_params import HyperParameters
from magis_sigdial2020.settings import REPO_ROOT
from magis_sigdial2020.datasets.xkcd import XKCD
import numpy as np
import pandas as pd
import seaborn as sns

# All analysis inputs are addressed relative to <REPO_ROOT>/lab/analyses.
LAB_SUBDIR_ROOT = pathlib.Path(REPO_ROOT).absolute().joinpath("lab", "analyses")
RESULTS_DIR = LAB_SUBDIR_ROOT.joinpath("logs", "E004_evaluate_on_cic", "published_version")
HPARAMS_YAML = RESULTS_DIR.joinpath("hparams.yaml")
RESULTS_CSV = RESULTS_DIR.joinpath("results.csv")

# Put <LAB_SUBDIR_ROOT>/src at the front of sys.path *before* the two imports
# below, so they can be resolved from that directory.
sys.path.insert(0, str(LAB_SUBDIR_ROOT.joinpath("src")))
import cic_results_lib
import emlib

# Plot styling for the whole notebook.
sns.set_style('whitegrid')
sns.set_context('notebook')
# NOTE: this silences every warning category for the rest of the session.
warnings.filterwarnings("ignore")

In [13]:
%%time

# Load the hyperparameters saved next to the published results.
hparams = HyperParameters.load(HPARAMS_YAML)
xkcd = XKCD.from_settings(coordinate_system="fft")
# backoff_p_w is computed from the XKCD dataset; it is passed to
# load_results and em_experiment in the cells below.
backoff_p_w = cic_results_lib.get_backoff_p_w(xkcd)
# cic is handed to SignificanceTests in the analysis cells below.
cic = make_or_load_cic()

Loaded from disk
CPU times: user 15.8 s, sys: 940 ms, total: 16.7 s
Wall time: 16.7 s


In [3]:
%%time

results_df = cic_results_lib.load_results(RESULTS_CSV, backoff_p_w, grouping_keys=["model_name"])

CPU times: user 13.4 s, sys: 867 ms, total: 14.3 s
Wall time: 14.3 s


In [4]:
def _cb_numeric_suffix(model_name):
    """Return the number after the first "-" in a model name, e.g. "CB-2.0" -> 2.0."""
    return float(model_name.split("-")[1])


# Keep only the CB variants and order them numerically; a plain string sort
# would place "CB-10.0" before "CB-2.0".
CB_MODELS = sorted(
    (name for name in results_df.model_name.unique() if "CB" in name),
    key=_cb_numeric_suffix,
)
print(", ".join(CB_MODELS))

CB-1.0, CB-2.0, CB-3.0, CB-4.0, CB-5.0, CB-6.0, CB-7.0, CB-8.0, CB-9.0, CB-10.0, CB-11.0, CB-12.0, CB-13.0, CB-14.0, CB-15.0, CB-16.0, CB-17.0, CB-18.0, CB-19.0, CB-20.0, CB-21.0, CB-22.0, CB-23.0, CB-24.0, CB-25.0, CB-26.0


In [5]:
%%time 

results_for_comparing_cb_alphas = []

for cb_model in CB_MODELS:
    cb_names = [cb_model]
    rgc_names = ['RGC']
    rsa_names = ['RSA-OOC']
    results_for_comparing_cb_alphas.append(emlib.em_experiment(
        df=results_df, 
        s0_model_list=rgc_names + rsa_names + cb_names, 
        s1_model_list=rsa_names + cb_names, 
        model_name_column='model_name', 
        s0_column_name='S0', 
        s1_column_name='S1', 
        backoff_p_w=backoff_p_w,
        include_baselines=True,
        num_iterations=50000
    ))

CPU times: user 10.5 s, sys: 4.05 ms, total: 10.5 s
Wall time: 10.5 s


In [51]:
def _dev_perplexity(em_results):
    """Return the dev-split perplexity stored in one EM experiment result."""
    return em_results["dev"]["perplexity"]


# Keep the EM run with the lowest dev perplexity and show it on all three splits.
best_dev_result = min(results_for_comparing_cb_alphas, key=_dev_perplexity)
emlib.show_em_results(best_dev_result, splits_to_show=("train", "dev", "test"))

-------------------------- results -----------------------
	EM Posterior for CB-15.0_S0: 0.000011
	EM Posterior for RGC_S0: 0.332292
	EM Posterior for RSA-OOC_S0: 0.000011
	EM Posterior for CB-15.0_S1: 0.193866
	EM Posterior for RSA-OOC_S1: 0.463882
	EM Posterior for xkcd_baseline: 0.006139
	EM Posterior for random: 0.003797
	Perplexity on train: 13.466792979782523
	Perplexity on dev: 12.752243931313568
	Perplexity on test: 11.298261490041279
----------------------------------------------------------


## Matcher success by condition and dominant model

We report the matcher success rates partitioned by condition and dominant model. In other words, there are 9 buckets corresponding to a 3x3 design: which condition the data point comes from, and which model assigned the highest probability to the correct color phrase. In each bucket, we tally the matcher success rates.

In [40]:
from IPython.display import display

In [15]:
def get_proba(em_results_dict, model_name, split):
    """Return one model's column of the EM data for a given split.

    Args:
        em_results_dict: mapping with a 'model_names' sequence and, per split,
            a nested mapping holding a 2-D 'em_data' array.
        model_name: entry of ``em_results_dict['model_names']`` to select.
        split: key of the split to read (e.g. "train", "dev", "test").

    Returns:
        ``em_data[:, i]`` for that split, where ``i`` is the position of
        ``model_name`` in 'model_names'.

    Raises:
        ValueError: if ``model_name`` is not in 'model_names'.
        KeyError: if ``split`` (or 'em_data') is missing.
    """
    # The column position is resolved first, so an unknown model name is
    # reported before a missing split.
    column = em_results_dict['model_names'].index(model_name)
    split_em_data = em_results_dict[split]['em_data']
    return split_em_data[:, column]

In [41]:
split = "train"

# Look up the CB S1 column from the selected EM result instead of hard-coding
# "CB-15.0_S1": best_dev_result is chosen by dev perplexity, so the winning CB
# model can change on a re-run. Each EM run includes a single CB model in its
# S1 list, so exactly one name should match; the unpacking raises ValueError
# otherwise.
# NOTE(review): assumes column names look like "<model_name>_S1" (as in
# "RSA-OOC_S1") — confirm against emlib.
(cb_s1_name,) = [
    name for name in best_dev_result["model_names"]
    if "CB" in name and name.endswith("_S1")
]

sigtests = cic_results_lib.SignificanceTests(
    rgc_probas=get_proba(best_dev_result, "RGC_S0", split),
    rsa_probas=get_proba(best_dev_result, "RSA-OOC_S1", split),
    cb_probas=get_proba(best_dev_result, cb_s1_name, split),
    cic=cic,
    split=split
)
# Matcher success summarized by dominant model, by condition, and by both.
summary_dfs = sigtests.get_summary_dfs()
display(summary_dfs["model_only"])
display(summary_dfs["condition_only"])
display(summary_dfs["condition_model"])

Unnamed: 0_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,mean,count
case_0,Unnamed: 1_level_2,Unnamed: 2_level_2
CB,0.979973,4394
RGC,0.923987,2197
RGC == 0,0.504537,551
RSA,0.798762,646


Unnamed: 0_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,mean,count
condition,Unnamed: 1_level_2,Unnamed: 2_level_2
close,0.825515,1748
split,0.895449,2439
far,0.972785,3601


Unnamed: 0_level_0,Unnamed: 1_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
condition,case_0,Unnamed: 2_level_2,Unnamed: 3_level_2
close,CB,0.953425,730
close,RGC,0.842105,608
close,RGC == 0,0.46875,256
close,RSA,0.746753,154
split,CB,0.971878,1209
split,RGC,0.931489,759
split,RGC == 0,0.535088,228
split,RSA,0.740741,243
far,CB,0.991853,2455
far,RGC,0.977108,830


In [42]:
split = "dev"

# Look up the CB S1 column from the selected EM result instead of hard-coding
# "CB-15.0_S1": best_dev_result is chosen by dev perplexity, so the winning CB
# model can change on a re-run. Each EM run includes a single CB model in its
# S1 list, so exactly one name should match; the unpacking raises ValueError
# otherwise.
# NOTE(review): assumes column names look like "<model_name>_S1" (as in
# "RSA-OOC_S1") — confirm against emlib.
(cb_s1_name,) = [
    name for name in best_dev_result["model_names"]
    if "CB" in name and name.endswith("_S1")
]

sigtests = cic_results_lib.SignificanceTests(
    rgc_probas=get_proba(best_dev_result, "RGC_S0", split),
    rsa_probas=get_proba(best_dev_result, "RSA-OOC_S1", split),
    cb_probas=get_proba(best_dev_result, cb_s1_name, split),
    cic=cic,
    split=split
)
# Matcher success summarized by dominant model, by condition, and by both.
summary_dfs = sigtests.get_summary_dfs()
display(summary_dfs["model_only"])
display(summary_dfs["condition_only"])
display(summary_dfs["condition_model"])

Unnamed: 0_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,mean,count
case_0,Unnamed: 1_level_2,Unnamed: 2_level_2
CB,0.986806,4320
RGC,0.947549,2040
RGC == 0,0.517699,452
RSA,0.841667,600


Unnamed: 0_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,mean,count
condition,Unnamed: 1_level_2,Unnamed: 2_level_2
close,0.857324,1577
split,0.922014,2244
far,0.978558,3591


Unnamed: 0_level_0,Unnamed: 1_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
condition,case_0,Unnamed: 2_level_2,Unnamed: 3_level_2
close,CB,0.974468,705
close,RGC,0.893204,515
close,RGC == 0,0.479821,223
close,RSA,0.731343,134
split,CB,0.978147,1144
split,RGC,0.947814,709
split,RGC == 0,0.578035,173
split,RSA,0.816514,218
far,CB,0.994334,2471
far,RGC,0.981618,816


In [43]:
split = "test"

# Look up the CB S1 column from the selected EM result instead of hard-coding
# "CB-15.0_S1": best_dev_result is chosen by dev perplexity, so the winning CB
# model can change on a re-run. Each EM run includes a single CB model in its
# S1 list, so exactly one name should match; the unpacking raises ValueError
# otherwise.
# NOTE(review): assumes column names look like "<model_name>_S1" (as in
# "RSA-OOC_S1") — confirm against emlib.
(cb_s1_name,) = [
    name for name in best_dev_result["model_names"]
    if "CB" in name and name.endswith("_S1")
]

sigtests = cic_results_lib.SignificanceTests(
    rgc_probas=get_proba(best_dev_result, "RGC_S0", split),
    rsa_probas=get_proba(best_dev_result, "RSA-OOC_S1", split),
    cb_probas=get_proba(best_dev_result, cb_s1_name, split),
    cic=cic,
    split=split
)
# Matcher success summarized by dominant model, by condition, and by both.
summary_dfs = sigtests.get_summary_dfs()
display(summary_dfs["model_only"])
display(summary_dfs["condition_only"])
display(summary_dfs["condition_model"])

Unnamed: 0_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,mean,count
case_0,Unnamed: 1_level_2,Unnamed: 2_level_2
CB,0.987559,5064
RGC,0.935577,2437
RGC == 0,0.561776,518
RSA,0.819588,582


Unnamed: 0_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,mean,count
condition,Unnamed: 1_level_2,Unnamed: 2_level_2
close,0.872949,1889
split,0.916823,2657
far,0.977559,4055


Unnamed: 0_level_0,Unnamed: 1_level_0,matcher_succeeded,matcher_succeeded
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,count
condition,case_0,Unnamed: 2_level_2,Unnamed: 3_level_2
close,CB,0.974566,865
close,RGC,0.874411,637
close,RGC == 0,0.58498,253
close,RSA,0.753731,134
split,CB,0.981375,1396
split,RGC,0.92883,829
split,RGC == 0,0.567961,206
split,RSA,0.792035,226
far,CB,0.994649,2803
far,RGC,0.981462,971


## Statistical Analyses

We analyze the statistical significance of success rates by model and by condition.


- Q1: For each model, are the probabilities significantly different across conditions?
- Q2: For each condition, are the probabilities significantly different across models?
- Q3: For each model, are the success rates significantly different across conditions?
- Q4: For each condition, are the success rates significantly different across models?
- Q5: For each condition, are the success rates significantly different from the cases where RGC == 0?

### Q1: For each model, are the probabilities significantly different across condition

This question was not really discussed in the paper, so no analysis is shown for it yet.
**TODO:** put the nice-ified (cleaned-up) analysis functions in.

### Q2: For each condition, are the probabilities significantly different across model

In [77]:
split = "test"

# Look up the CB S1 column from the selected EM result instead of hard-coding
# "CB-15.0_S1": best_dev_result is chosen by dev perplexity, so the winning CB
# model can change on a re-run. Each EM run includes a single CB model in its
# S1 list, so exactly one name should match; the unpacking raises ValueError
# otherwise.
# NOTE(review): assumes column names look like "<model_name>_S1" (as in
# "RSA-OOC_S1") — confirm against emlib.
(cb_s1_name,) = [
    name for name in best_dev_result["model_names"]
    if "CB" in name and name.endswith("_S1")
]

sigtests = cic_results_lib.SignificanceTests(
    rgc_probas=get_proba(best_dev_result, "RGC_S0", split),
    rsa_probas=get_proba(best_dev_result, "RSA-OOC_S1", split),
    cb_probas=get_proba(best_dev_result, cb_s1_name, split),
    cic=cic,
    split=split
)
# Q2: per condition, compare utterance probabilities between models.
sigtests.show_signifcance_utterance_probabilities_per_condition_across_models()

at 0.01 w/ bonferonni correction of 9 comparisons: 1.11e-03
at 0.001 w/ bonferonni correction of 9 comparisons: 1.11e-04
at 0.0001 w/ bonferonni correction of 9 comparisons: 1.11e-05
Condition:  far
Sizes:  [(4055,), (4055,), (4055,)]
Totals:  [1445.2547867815201, 1409.6339307768133, 1410.3126151758597]
***RGC greater than RSA
	WilcoxonResult(statistic=7667385.0, pvalue=0.0)
RGC less than RSA
	WilcoxonResult(statistic=7667385.0, pvalue=1.0)
RGC greater than CB
	WilcoxonResult(statistic=2518353.5, pvalue=1.0)
***RGC less than CB
	WilcoxonResult(statistic=2518353.5, pvalue=2.7571628798674348e-101)
RSA greater than CB
	WilcoxonResult(statistic=1993439.0, pvalue=1.0)
***RSA less than CB
	WilcoxonResult(statistic=1993439.0, pvalue=6.975816243477174e-178)
----------------------------------------------------------------------------------------------------
Condition:  split
Sizes:  [(2657,), (2657,), (2657,)]
Totals:  [672.3486833776246, 601.0079694965343, 593.870790350605]
***RGC greater than

### Q3: For each model, are the success rates significantly different across condition

In [50]:
split = "test"

# Look up the CB S1 column from the selected EM result instead of hard-coding
# "CB-15.0_S1": best_dev_result is chosen by dev perplexity, so the winning CB
# model can change on a re-run. Each EM run includes a single CB model in its
# S1 list, so exactly one name should match; the unpacking raises ValueError
# otherwise.
# NOTE(review): assumes column names look like "<model_name>_S1" (as in
# "RSA-OOC_S1") — confirm against emlib.
(cb_s1_name,) = [
    name for name in best_dev_result["model_names"]
    if "CB" in name and name.endswith("_S1")
]

sigtests = cic_results_lib.SignificanceTests(
    rgc_probas=get_proba(best_dev_result, "RGC_S0", split),
    rsa_probas=get_proba(best_dev_result, "RSA-OOC_S1", split),
    cb_probas=get_proba(best_dev_result, cb_s1_name, split),
    cic=cic,
    split=split
)
# Q3: per model, compare matcher success between conditions.
sigtests.show_signicance_matcher_success_per_model_across_conditions()

at 0.01 w/ bonferonni correction of 9 comparisons: 1.11e-03
at 0.001 w/ bonferonni correction of 9 comparisons: 1.11e-04
at 0.0001 w/ bonferonni correction of 9 comparisons: 1.11e-05
Model:  RGC
Sizes:  [(971,), (829,), (637,)]
Totals:  [953, 770, 557]
***far vs split
	MannwhitneyuResult(statistic=381296.0, pvalue=1.9132631041346568e-08)
***far vs close
	MannwhitneyuResult(statistic=276156.5, pvalue=8.663763775508032e-19)
*split vs close
	MannwhitneyuResult(statistic=249668.0, pvalue=0.00021241598594875686)
----------------------------------------------------------------------------------------------------
Model:  RSA
Sizes:  [(222,), (226,), (134,)]
Totals:  [197, 179, 101]
far vs split
	MannwhitneyuResult(statistic=22694.0, pvalue=0.0030359944854402876)
*far vs close
	MannwhitneyuResult(statistic=12886.0, pvalue=0.00047838934784542105)
split vs close
	MannwhitneyuResult(statistic=14562.0, pvalue=0.19957489108510734)
--------------------------------------------------------------------

### Q4: For each condition, are the success rates significantly different across model. 

In [49]:
split = "test"

# Look up the CB S1 column from the selected EM result instead of hard-coding
# "CB-15.0_S1": best_dev_result is chosen by dev perplexity, so the winning CB
# model can change on a re-run. Each EM run includes a single CB model in its
# S1 list, so exactly one name should match; the unpacking raises ValueError
# otherwise.
# NOTE(review): assumes column names look like "<model_name>_S1" (as in
# "RSA-OOC_S1") — confirm against emlib.
(cb_s1_name,) = [
    name for name in best_dev_result["model_names"]
    if "CB" in name and name.endswith("_S1")
]

sigtests = cic_results_lib.SignificanceTests(
    rgc_probas=get_proba(best_dev_result, "RGC_S0", split),
    rsa_probas=get_proba(best_dev_result, "RSA-OOC_S1", split),
    cb_probas=get_proba(best_dev_result, cb_s1_name, split),
    cic=cic,
    split=split
)
# Q4: per condition, compare matcher success between models.
sigtests.show_signicance_matcher_success_per_condition_across_models()

at 0.01 w/ bonferonni correction of 9 comparisons: 1.11e-03
at 0.001 w/ bonferonni correction of 9 comparisons: 1.11e-04
at 0.0001 w/ bonferonni correction of 9 comparisons: 1.11e-05
Condition:  far
Sizes:  [(971,), (222,), (2803,)]
Totals:  [953, 197, 2788]
***RGC vs RSA
	MannwhitneyuResult(statistic=97641.5, pvalue=5.9671813930038724e-12)
**RGC vs CB
	MannwhitneyuResult(statistic=1342912.0, pvalue=7.149147074748736e-05)
***RSA vs CB
	MannwhitneyuResult(statistic=277760.5, pvalue=1.2531321626711524e-41)
----------------------------------------------------------------------------------------------------
Condition:  split
Sizes:  [(829,), (226,), (1396,)]
Totals:  [770, 179, 1370]
***RGC vs RSA
	MannwhitneyuResult(statistic=80862.5, pvalue=6.779631870722291e-10)
***RGC vs CB
	MannwhitneyuResult(statistic=548237.0, pvalue=2.0486901860474167e-10)
***RSA vs CB
	MannwhitneyuResult(statistic=127880.0, pvalue=1.9342142795304332e-37)
------------------------------------------------------------

### Q5: For each condition, are the success rates significantly different from the cases where RGC == 0?

In [80]:
split = "test"

# Look up the CB S1 column from the selected EM result instead of hard-coding
# "CB-15.0_S1": best_dev_result is chosen by dev perplexity, so the winning CB
# model can change on a re-run. Each EM run includes a single CB model in its
# S1 list, so exactly one name should match; the unpacking raises ValueError
# otherwise.
# NOTE(review): assumes column names look like "<model_name>_S1" (as in
# "RSA-OOC_S1") — confirm against emlib.
(cb_s1_name,) = [
    name for name in best_dev_result["model_names"]
    if "CB" in name and name.endswith("_S1")
]

sigtests = cic_results_lib.SignificanceTests(
    rgc_probas=get_proba(best_dev_result, "RGC_S0", split),
    rsa_probas=get_proba(best_dev_result, "RSA-OOC_S1", split),
    cb_probas=get_proba(best_dev_result, cb_s1_name, split),
    cic=cic,
    split=split
)
# Q5: per condition, compare each model's matcher success with the RGC == 0 cases.
sigtests.show_signicance_matcher_success_per_condition_vs_rgc0()

at 0.01 w/ bonferonni correction of 9 comparisons: 1.11e-03
at 0.001 w/ bonferonni correction of 9 comparisons: 1.11e-04
at 0.0001 w/ bonferonni correction of 9 comparisons: 1.11e-05
Condition:  far
Sizes:  [(971,), (222,), (2803,)]
Totals:  [953, 197, 2788]
***RGC vs RGC==0
	MannwhitneyuResult(statistic=13154.0, pvalue=2.2749755200555247e-77)
***RSA vs RGC==0
	MannwhitneyuResult(statistic=3623.5, pvalue=2.721411350433994e-14)
***CB vs RGC==0
	MannwhitneyuResult(statistic=36881.5, pvalue=4.576953809435273e-236)
----------------------------------------------------------------------------------------------------
Condition:  split
Sizes:  [(829,), (226,), (1396,)]
Totals:  [770, 179, 1370]
***RGC vs RGC==0
	MannwhitneyuResult(statistic=54573.5, pvalue=2.758459258644061e-40)
***RSA vs RGC==0
	MannwhitneyuResult(statistic=18062.0, pvalue=2.8339212460666715e-07)
***CB vs RGC==0
	MannwhitneyuResult(statistic=84344.0, pvalue=2.2305113733279624e-102)
--------------------------------------------