In [3]:
# so we can use packages from parent directory
import sys
sys.path.append("..")

In [10]:
# code copied from example experiments.py
import torch
import torch.nn as nn
import numpy as np
from monroe_data import MonroeData, MonroeDataEntry, Color # last two for reading pkl file
import caption_featurizers
from color_featurizers import ColorFeaturizer, color_phi_fourier
from models import LiteralListener, LiteralSpeaker, ImaginativeListener, CaptionEncoder, CaptionGenerator, ColorGenerator, ColorSelector, ColorOnlyBaseline
from evaluation import score_model, delta_e_dist, Speaker, Score
from experiment import FeatureHandler, evaluate_model

In [5]:
from functools import partial
from scipy import stats

In [18]:
import importlib
import experiment
importlib.reload(experiment)
from experiment import FeatureHandler, evaluate_model

In [7]:
prefix="../"
train_data = MonroeData(prefix + "data/csv/train_corpus_monroe.csv", prefix + "data/entries/train_entries_monroe.pkl")
dev_data_synth  = MonroeData(prefix + "data/csv/dev_corpus_synth_10fold.csv", prefix + "data/entries/dev_corpus_synth_10fold.pkl")

In [8]:
test_data_synth  = MonroeData(prefix + "data/csv/test_corpus_synth_10fold.csv", prefix + "data/entries/test_corpus_synth_10fold.pkl")

In [9]:
def composite_score(eval_df, speaker="gameid"):
    """
    This is the scoring function that Julia came up with
    """
    mean_scores = eval_df.groupby(speaker).numOutcome.mean()
    mean_numCleanWords = eval_df.groupby(speaker).numCleanWords.mean()
    mean_clkTime = eval_df.groupby(speaker).clkTime.mean()
    true_scores = mean_scores / mean_clkTime / mean_numCleanWords
    max_score = true_scores.max()
    true_scores /= max_score # normalize the scores
    return true_scores

In [46]:
# 4. Imaginative Listener
def imaginative_listener(model_file="../model/imaginative_listener_with_distractors_linear100hd5epoch_GLOVE_MSE.params"):
    print("Initializing featurizers")
    caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.EndingTokenizer)
    color_phi = ColorFeaturizer(color_phi_fourier, "rgb", normalized=True)

    def target_color_target(data_entry):
        return np.array(data_entry.colors[0].rgb_norm)

    feature_handler = FeatureHandler(train_data, test_data_synth, caption_phi, color_phi, target_fn=target_color_target,
                                randomized_colors=False) #using TEST data now :) 

    print("Obtaining training features") # get features even if you're runnning the pretrained model for example
    #train_features = feature_handler.train_features()
    #train_targets = feature_handler.train_targets()

    imaginative_model = ImaginativeListener(ColorGenerator, criterion=torch.nn.CosineEmbeddingLoss,
                            optimizer=torch.optim.Adam, lr=0.004, num_epochs=5)

    # Creating model
    MSELossSum = lambda: nn.MSELoss(reduction='sum') # sorry for this ugliness..... but this is me passing a parameter to the loss func
    imaginative_model = ImaginativeListener(ColorGenerator, criterion=MSELossSum,
                                optimizer=torch.optim.Adam, lr=0.001, num_epochs=5, use_color=True)
    imaginative_model.init_model(embed_dim=100, hidden_dim=50, vocab_size=feature_handler.caption_featurizer.caption_indexer.size,
                    color_in_dim=54, color_hidden_dim=50, weight_matrix=caption_featurizers.get_pretrained_glove(feature_handler.caption_featurizer.caption_indexer.idx2word.items(), 100, prefix=True))

    imaginative_model.load_model(model_file)
        
    print("Evaluating model")
    output_to_score_de = lambda outputs, targets: np.array([delta_e_dist(outputs[i], targets[i]) for i in range(len(targets))])
    # we want to score based on the model's predictions at the TARGET indices not listener clicked indices,
    # so we change the feature_handler's target function to do that:
    my_score_model = partial(score_model, speaker=Speaker.BY_GAME_ID_COND, return_df=True, score=Score.COMPOSITE)
    result = evaluate_model(test_data_synth, feature_handler, imaginative_model, output_to_score_de, my_score_model, accuracy=False)
    
    return result


In [51]:
def evaluate_imaginative_listener_samples():
    model_directory = "../imaginative_listener_samples"
    num_samples = 10
    aggregate_correlations = []
    close_correlations = []
    split_correlations = []
    far_correlations = []

    aggregate_accuracies = []
    close_accuracies = []
    split_accuracies = []
    far_accuracies = []

    for i in range(num_samples):
        print("Evaluating sample #{}".format(i))
        _, imaginative_listener_eval = imaginative_listener(model_file="{}/sample_{}.params".format(model_directory, i))
        il_true_scores = imaginative_listener_eval.groupby('gameid').numOutcome.mean()
        il_model_scores = imaginative_listener_eval.groupby('gameid').model_scores.mean()
        il_true_scores_composite = composite_score(imaginative_listener_eval)

        aggregate_correlations.append(stats.pearsonr(il_model_scores, il_true_scores_composite))
        # arbitrarily say we get it right if we assign a majority of the probability mass to it
        #aggregate_accuracies.append(sum(imaginative_listener_eval.model_scores > 0.5)/len(imaginative_listener_eval.model_scores))
        aggregate_accuracies.append(np.mean(imaginative_listener_eval.model_scores))

        # separate out conditions
        imaginative_listener_close = imaginative_listener_eval[imaginative_listener_eval.condition == "close"]
        imaginative_listener_split = imaginative_listener_eval[imaginative_listener_eval.condition == "split"]
        imaginative_listener_far = imaginative_listener_eval[imaginative_listener_eval.condition == "far"]

        imaginative_listener_close_true_scores = imaginative_listener_close.groupby('gameid').numOutcome.mean()
        imaginative_listener_close_model_scores = imaginative_listener_close.groupby('gameid').model_scores.mean()

        imaginative_listener_split_true_scores =  imaginative_listener_split.groupby('gameid').numOutcome.mean()
        imaginative_listener_split_model_scores = imaginative_listener_split.groupby('gameid').model_scores.mean()

        imaginative_listener_far_true_scores =  imaginative_listener_far.groupby('gameid').numOutcome.mean()
        imaginative_listener_far_model_scores = imaginative_listener_far.groupby('gameid').model_scores.mean()

        # turn true scores to composite, gricean scores
        imaginative_listener_close_composite = composite_score(imaginative_listener_close)
        imaginative_listener_split_composite = composite_score(imaginative_listener_split)
        imaginative_listener_far_composite   = composite_score(imaginative_listener_far)

        close_correlations.append(stats.pearsonr(imaginative_listener_close_composite, imaginative_listener_close_model_scores))
        split_correlations.append(stats.pearsonr(imaginative_listener_split_composite, imaginative_listener_split_model_scores))
        far_correlations.append(stats.pearsonr(imaginative_listener_far_composite, imaginative_listener_far_model_scores))

    #     close_accuracies.append(sum(imaginative_listener_close_model_scores > 0.5)/len(imaginative_listener_close_model_scores))
    #     split_accuracies.append(sum(imaginative_listener_split_model_scores > 0.5)/len(imaginative_listener_split_model_scores))
    #     far_accuracies.append(sum(imaginative_listener_far_model_scores > 0.5)/len(imaginative_listener_far_model_scores))
        close_accuracies.append(np.mean(imaginative_listener_close_model_scores))
        split_accuracies.append(np.mean(imaginative_listener_split_model_scores))
        far_accuracies.append(np.mean(imaginative_listener_far_model_scores))
        
        print("Most recent stats:")
        print("agg acc:", aggregate_accuracies[-1])
        print("clo acc:", close_accuracies[-1])
        print("spl acc:", split_accuracies[-1])
        print("far acc:", far_accuracies[-1])
        print("agg cor:", aggregate_correlations[-1])
        print("clo cor:", close_correlations[-1])
        print("spl cor:", split_correlations[-1])
        print("far cor:", far_correlations[-1])
        
    return {"aggregate_accuracies": aggregate_accuracies,
            "close_accuracies": close_accuracies,
            "split_accuracies": split_accuracies,
            "far_accuracies": far_accuracies,
            "aggregate_correlations": aggregate_correlations,
            "close_correlations":close_correlations,
            "far_correlations":far_correlations,
            "split_correlations": split_correlations}


In [52]:
results = evaluate_imaginative_listener_samples()

Evaluating sample #0
Initializing featurizers
Obtaining training features
Evaluating model
Got here to composite score
Most recent stats:
agg acc: 23.98516954890258
clo acc: 18.27353323876809
spl acc: 19.544536012224782
far acc: 34.14159079769889
agg cor: (-0.886422302978764, 4.250960310734327e-178)
clo cor: (-0.455535609378521, 2.0635307187084423e-28)
spl cor: (-0.5473674197436782, 1.3343968892457512e-42)
far cor: (-0.8657290229509013, 3.1337385241772896e-160)
Evaluating sample #1
Initializing featurizers
Obtaining training features
Evaluating model
Got here to composite score
Most recent stats:
agg acc: 24.211091965944536
clo acc: 18.65899781108387
spl acc: 19.767035392086786
far acc: 34.21259842311515
agg cor: (-0.8860109120142069, 1.0393073393501362e-177)
clo cor: (-0.44432579193215566, 5.887339051429764e-27)
spl cor: (-0.5267565591108372, 4.9684543265154734e-39)
far cor: (-0.8675670970509768, 1.079527020742103e-161)
Evaluating sample #2
Initializing featurizers
Obtaining training 

In [53]:
import pickle
with open("../results/imaginative_listener_assessment.pkl", "wb") as file:
    pickle.dump(results, file)

In [57]:
np.mean([cor[0] for cor in results['aggregate_correlations']])

-0.8857654592635387

In [33]:
aggregate_accuracies

[24.225927799225445]

In [34]:
close_accuracies

[18.50006086024096]

In [35]:
far_accuracies

[34.470457996575014]

In [36]:
split_accuracies

[19.710338275301137]

In [37]:
aggregate_correlations

[(-0.8799177945947197, 1.4376357425904765e-168)]

In [38]:
close_correlations

[(-0.4409533567945598, 5.226923223263904e-26)]

In [39]:
far_correlations

[(-0.8577761621065179, 5.865956224847408e-151)]

In [40]:
split_correlations

[(-0.5079681530415953, 2.896292325793216e-35)]

In [42]:
# 1. Literal Listener
# -----------------------------------------
# TODO: FILL IN PARAMETERS 
def literal_listener_experiment(train=False, epochs=5, embed_dim = 100, hidden_dim = 100, color_dim= 54, model_file="../model/literal_listener_5epoch-2.params"):

    # Initializing featurizers
    print("Initializing featurizers")
    caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.EndingTokenizer) # Use with parameter files that end in `endings_tkn`
    # caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.WhitespaceTokenizer) # Use with parameter files don't
    color_phi = ColorFeaturizer(color_phi_fourier, "rgb", normalized=True)
    feature_handler = FeatureHandler(train_data, test_data_synth, caption_phi, color_phi) # target function is initialized by default

    print("Initializing model")
    model = LiteralListener(CaptionEncoder, num_epochs = epochs)
    model.init_model(embed_dim = embed_dim, hidden_dim = hidden_dim, vocab_size = feature_handler.caption_featurizer.caption_indexer.size,
                 color_dim = color_dim)

    print(model_file)
    model.load_model(model_file)

    # convert the model output to a score for that particular round
    print("Evaluating model")
    output_to_score = lambda model_outputs, targets: np.exp(model_outputs[np.arange(len(model_outputs)), targets]) # get the model's predicted probablity at each target index and use that as the score
    my_score_model = partial(score_model, speaker=Speaker.BY_GAME_ID_COND, return_df=True, score=Score.COMPOSITE)
    eval_p = evaluate_model(test_data_synth, feature_handler, model, output_to_score, my_score_model, accuracy=False)

    output_to_score_acc = lambda model_outputs, targets: np.argmax(model_outputs, axis=1) == targets
    eval_acc = evaluate_model(test_data_synth, feature_handler, model, output_to_score_acc, my_score_model, accuracy=False)
    
    return (eval_p, eval_acc)

In [51]:
# I assume pragmatic will either be similar or need its own type of thing
def evaluate_literal_listener_samples():
    model_directory = "../literal_listener_samples"
    num_samples = 10
    aggregate_correlations = []
    close_correlations = []
    split_correlations = []
    far_correlations = []

    aggregate_accuracies = []
    close_accuracies = []
    split_accuracies = []
    far_accuracies = []


    for i in range(num_samples):
        print("Evaluating Literal Listener #{}".format(i))
        listener_eval, listener_eval_acc = literal_listener_experiment(model_file="{}/sample_{}.params".format(model_directory, i))
        _, listener_eval = listener_eval         # first item is correlation, which we will recalculate
        _, listener_eval_acc = listener_eval_acc # first item is correlation, which we will recalculate
        
        true_scores = listener_eval.groupby('gameid').numOutcome.mean()
        model_scores = listener_eval.groupby('gameid').model_scores.mean()
        true_scores_composite = composite_score(listener_eval)

        aggregate_correlations.append(stats.pearsonr(model_scores, true_scores_composite))
        # arbitrarily say we get it right if we assign a majority of the probability mass to it
        aggregate_accuracies.append(sum(listener_eval_acc.model_scores)/len(listener_eval_acc.model_scores))


        # separate out conditions
        listener_close = listener_eval[listener_eval.condition == "close"]
        listener_split = listener_eval[listener_eval.condition == "split"]
        listener_far =   listener_eval[listener_eval.condition == "far"]
        
        listener_close_acc = listener_eval_acc[listener_eval_acc.condition == "close"]
        listener_split_acc = listener_eval_acc[listener_eval_acc.condition == "split"]
        listener_far_acc =   listener_eval_acc[listener_eval_acc.condition == "far"]

        listener_close_true_scores =  listener_close.groupby('gameid').numOutcome.mean()
        listener_close_model_scores = listener_close.groupby('gameid').model_scores.mean()

        listener_split_true_scores =  listener_split.groupby('gameid').numOutcome.mean()
        listener_split_model_scores = listener_split.groupby('gameid').model_scores.mean()

        listener_far_true_scores =  listener_far.groupby('gameid').numOutcome.mean()
        listener_far_model_scores = listener_far.groupby('gameid').model_scores.mean()

        # turn true scores to composite, gricean scores
        listener_close_composite = composite_score(listener_close)
        listener_split_composite = composite_score(listener_split)
        listener_far_composite   = composite_score(listener_far)

        close_correlations.append(stats.pearsonr(listener_close_composite, listener_close_model_scores))
        split_correlations.append(stats.pearsonr(listener_split_composite, listener_split_model_scores))
        far_correlations.append(stats.pearsonr(listener_far_composite,     listener_far_model_scores))

        close_accuracies.append(sum(listener_close_acc.model_scores)/len(listener_close_acc.model_scores))
        split_accuracies.append(sum(listener_split_acc.model_scores)/len(listener_split_acc.model_scores))
        far_accuracies.append(sum(  listener_far_acc.model_scores)/len(listener_far_acc.model_scores))

    return {"aggregate_accuracies": aggregate_accuracies,
            "close_accuracies": close_accuracies,
            "split_accuracies": split_accuracies,
            "far_accuracies": far_accuracies,
            "aggregate_correlations": aggregate_correlations,
            "close_correlations":close_correlations,
            "far_correlations":far_correlations,
            "split_correlations": split_correlations}


In [52]:
lit_list_results = evaluate_literal_listener_samples()

Evaluating Literal Listener #0
Initializing featurizers
Initializing model
../literal_listener_samples/sample_0.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Literal Listener #1
Initializing featurizers
Initializing model
../literal_listener_samples/sample_1.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Literal Listener #2
Initializing featurizers
Initializing model
../literal_listener_samples/sample_2.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Literal Listener #3
Initializing featurizers
Initializing model
../literal_listener_samples/sample_3.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Literal Listener #4
Initializing featurizers
Initializing model
../literal_listener_samples/sample_4.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Literal Listener #5
Initializ

In [55]:
lit_list_results

{'aggregate_accuracies': [0.4652272727272727,
  0.465,
  0.465530303030303,
  0.46541666666666665,
  0.46541666666666665,
  0.4625,
  0.46579545454545457,
  0.46795454545454546,
  0.46299242424242426,
  0.4668181818181818],
 'aggregate_correlations': [(0.9551759145546841, 3.0935846835781653e-280),
  (0.9568476243580771, 1.7621962353396275e-284),
  (0.9603190679507451, 7.374375789094369e-294),
  (0.9574546896746611, 4.602201869499246e-286),
  (0.9577112363404624, 9.704977797334284e-287),
  (0.95267599364886, 3.502568069624804e-274),
  (0.9572465184635212, 1.6159093251291917e-285),
  (0.9620378666348999, 8.114004343696587e-299),
  (0.9534726902789679, 4.480954099693109e-276),
  (0.9591990329967709, 9.599559029309213e-291)],
 'close_accuracies': [0.4375,
  0.43886363636363634,
  0.44363636363636366,
  0.44022727272727274,
  0.43977272727272726,
  0.43136363636363634,
  0.4420454545454545,
  0.4456818181818182,
  0.4335227272727273,
  0.44136363636363635],
 'close_correlations': [(0.855525

In [79]:
with open("../results/literal_listener_assessment_true_acc.pkl", "wb") as file:
    pickle.dump(lit_list_results, file)

In [81]:
lit_list_results

{'aggregate_accuracies': [0.46049242424242426,
  0.4615530303030303,
  0.45943181818181816,
  0.46018939393939395,
  0.4612878787878788,
  0.45943181818181816,
  0.4622727272727273,
  0.4640530303030303,
  0.45928030303030304,
  0.46291666666666664],
 'aggregate_correlations': [(0.9551759148646606, 3.0935791851169376e-280),
  (0.9568476244064827, 1.7621957268376714e-284),
  (0.9603190673145864, 7.374406257773924e-294),
  (0.9574546888418075, 4.602225052553829e-286),
  (0.9577112364008573, 9.704974230231723e-287),
  (0.9526759938115862, 3.502564978285354e-274),
  (0.9572465172587161, 1.6159210417650658e-285),
  (0.9620378670937882, 8.113979043277967e-299),
  (0.9534726907653601, 4.4809420710264456e-276),
  (0.9591990326239181, 9.599581623980341e-291)],
 'close_accuracies': [0.38825757575757575,
  0.4147727272727273,
  0.38446969696969696,
  0.3996212121212121,
  0.39015151515151514,
  0.3693181818181818,
  0.3996212121212121,
  0.42045454545454547,
  0.38446969696969696,
  0.40340909090

In [31]:
# 5. Baseline Listener
# -----------------------------------------
def baseline_listener_experiment(train=False, model_file="../model/baseline_model.params"):

    # Initializing featurizers
    print("Initializing featurizers")
    caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.EndingTokenizer) # Use with parameter files that end in `endings_tkn`
    # caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.WhitespaceTokenizer) # Use with parameter files don't
    color_phi = ColorFeaturizer(color_phi_fourier, "rgb", normalized=True)
    feature_handler = FeatureHandler(train_data, test_data_synth, caption_phi, color_phi) # target function is initialized by default

    print("Initializing model")
    model = ColorOnlyBaseline(ColorSelector, optimizer=torch.optim.Adam, lr=0.001, num_epochs=5)
    model.init_model(color_dim=54)

    print(model_file)
    model.load_model(model_file)

    # convert the model output to a score for that particular round
    print("Evaluating model")
    output_to_score = lambda model_outputs, targets: np.exp(model_outputs[np.arange(len(model_outputs)), targets]) # get the model's predicted probablity at each target index and use that as the score
    my_score_model = partial(score_model, speaker=Speaker.BY_GAME_ID_COND, return_df=True, score=Score.COMPOSITE)
    eval_p = evaluate_model(test_data_synth, feature_handler, model, output_to_score, my_score_model, accuracy=False)


    output_to_score_acc = lambda model_outputs, targets: np.argmax(model_outputs, axis=1) == targets
    eval_acc = evaluate_model(test_data_synth, feature_handler, model, output_to_score_acc, my_score_model, accuracy=False)
    
    return (eval_p, eval_acc)
    



Just noting that there are two sources of randomness: one is in the order the colors are presented and the other is the model's training 

In [53]:
# I assume pragmatic will either be similar or need its own type of thing
def evaluate_baseline_listener_samples():
    model_directory = "../baseline_listener_samples"
    num_samples = 10
    aggregate_correlations = []
    close_correlations = []
    split_correlations = []
    far_correlations = []

    aggregate_accuracies = []
    close_accuracies = []
    split_accuracies = []
    far_accuracies = []


    for i in range(num_samples):
        print("Evaluating Baseline Listener #{}".format(i))
        listener_eval, listener_eval_acc = baseline_listener_experiment(model_file="{}/sample_{}.params".format(model_directory, i))
        _, listener_eval = listener_eval # first item is correlation, which we will recalculate
        _, listener_eval_acc = listener_eval_acc # first item is correlation, which we will recalculate
        true_scores = listener_eval.groupby('gameid').numOutcome.mean()
        model_scores = listener_eval.groupby('gameid').model_scores.mean()
        true_scores_composite = composite_score(listener_eval)

        aggregate_correlations.append(stats.pearsonr(model_scores, true_scores_composite))
        # arbitrarily say we get it right if we assign a majority of the probability mass to it
        aggregate_accuracies.append(sum(listener_eval_acc.model_scores)/len(listener_eval_acc.model_scores))

        # separate out conditions
        listener_close = listener_eval[listener_eval.condition == "close"]
        listener_split = listener_eval[listener_eval.condition == "split"]
        listener_far   = listener_eval[listener_eval.condition == "far"]
        
        listener_close_acc = listener_eval_acc[listener_eval_acc.condition == "close"]
        listener_split_acc = listener_eval_acc[listener_eval_acc.condition == "split"]
        listener_far_acc   = listener_eval_acc[listener_eval_acc.condition == "far"]

        listener_close_true_scores =  listener_close.groupby('gameid').numOutcome.mean()
        listener_close_model_scores = listener_close.groupby('gameid').model_scores.mean()

        listener_split_true_scores =  listener_split.groupby('gameid').numOutcome.mean()
        listener_split_model_scores = listener_split.groupby('gameid').model_scores.mean()

        listener_far_true_scores =  listener_far.groupby('gameid').numOutcome.mean()
        listener_far_model_scores = listener_far.groupby('gameid').model_scores.mean()
        
        # turn true scores to composite, gricean scores
        listener_close_composite = composite_score(listener_close)
        listener_split_composite = composite_score(listener_split)
        listener_far_composite   = composite_score(listener_far)

        close_correlations.append(stats.pearsonr(listener_close_composite, listener_close_model_scores))
        split_correlations.append(stats.pearsonr(listener_split_composite, listener_split_model_scores))
        far_correlations.append(stats.pearsonr(  listener_far_composite  , listener_far_model_scores))

        close_accuracies.append(sum(listener_close_acc.model_scores)/ len(listener_close_acc.model_scores))
        split_accuracies.append(sum(listener_split_acc.model_scores)/ len(listener_split_acc.model_scores))
        far_accuracies.append(  sum(listener_far_acc.model_scores)  / len(listener_far_acc.model_scores))

    return {"aggregate_accuracies": aggregate_accuracies,
            "close_accuracies": close_accuracies,
            "split_accuracies": split_accuracies,
            "far_accuracies": far_accuracies,
            "aggregate_correlations": aggregate_correlations,
            "close_correlations":close_correlations,
            "far_correlations":far_correlations,
            "split_correlations": split_correlations}



In [54]:
baseline_list_results = evaluate_baseline_listener_samples()

Evaluating Baseline Listener #0
Initializing featurizers
Initializing model
../baseline_listener_samples/sample_0.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Baseline Listener #1
Initializing featurizers
Initializing model
../baseline_listener_samples/sample_1.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Baseline Listener #2
Initializing featurizers
Initializing model
../baseline_listener_samples/sample_2.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Baseline Listener #3
Initializing featurizers
Initializing model
../baseline_listener_samples/sample_3.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Baseline Listener #4
Initializing featurizers
Initializing model
../baseline_listener_samples/sample_4.params
Evaluating model
Got here to composite score
Got here to composite score
Evaluating Baseline Listener #

In [45]:
baseline_list_results

{'aggregate_accuracies': [0.37575757575757573,
  0.33367424242424243,
  0.3375378787878788,
  0.3346969696969697,
  0.3334469696969697,
  0.33837121212121213,
  0.33829545454545457,
  0.33265151515151514,
  0.33382575757575755,
  0.3346590909090909],
 'aggregate_correlations': [(0.05179548924097473, 0.23477563700595694),
  (-0.009098554629149352, 0.834777927560781),
  (0.027786434212799433, 0.5240639701217923),
  (0.007695292785320762, 0.8599734963671637),
  (-0.042234798793224636, 0.332737824226761),
  (-0.01870782040291889, 0.668002026978048),
  (0.07613760691163626, 0.08048126238032255),
  (0.05845904067603059, 0.17983925541334453),
  (-0.012640497244356912, 0.7719847736291139),
  (0.0009980441765262324, 0.9817468212819142)],
 'close_accuracies': [0.3302272727272727,
  0.3264772727272727,
  0.3390909090909091,
  0.33340909090909093,
  0.33181818181818185,
  0.34125,
  0.34329545454545457,
  0.3331818181818182,
  0.33579545454545456,
  0.34125],
 'close_correlations': [(-0.0183593130

In [32]:
baseline_results_test = baseline_listener_experiment()

Initializing featurizers
Initializing model
../model/baseline_model.params
Evaluating model
Got here to composite score
Got here to composite score


In [35]:
baseline_results_test[0][0]

(-0.0670447301558705, 0.007602225488752366)

In [36]:
baseline_results_test[1][0]

(-0.05479847871132265, 0.029193110763098763)

In [46]:
# from https://stackoverflow.com/questions/15033511/compute-a-confidence-interval-from-sample-data
import numpy as np
import scipy.stats


def mean_confidence_interval(data, confidence=0.95):
    #a = 1.0 * np.array(data)
    a = data
    n = len(a)
    m, se = np.mean(a), scipy.stats.sem(a)
    h = se * scipy.stats.t.ppf((1 + confidence) / 2., n-1)
    print("${:.4f}\pm{:.4f}$".format(m, h))
    return m, m-h, m+h

In [110]:
mean_confidence_interval([l[0] for l in lit_list_results['far_correlations']])

(0.9467041899156712, 0.9453345265907224, 0.94807385324062)

In [111]:
0.9467041899156712-0.9453345265907224

0.001369663324948811

In [None]:
0.9467041899156712-0.001369663324948811

In [87]:
lit_list_results['aggregate_correlations']

[(0.9551759148646606, 3.0935791851169376e-280),
 (0.9568476244064827, 1.7621957268376714e-284),
 (0.9603190673145864, 7.374406257773924e-294),
 (0.9574546888418075, 4.602225052553829e-286),
 (0.9577112364008573, 9.704974230231723e-287),
 (0.9526759938115862, 3.502564978285354e-274),
 (0.9572465172587161, 1.6159210417650658e-285),
 (0.9620378670937882, 8.113979043277967e-299),
 (0.9534726907653601, 4.4809420710264456e-276),
 (0.9591990326239181, 9.599581623980341e-291)]

In [89]:
mean_confidence_interval(lit_list_results['aggregate_accuracies'])

(0.46109090909090905, 0.4599297914836187, 0.4622520266981994)

In [90]:
0.4599297914836187-0.46109090909090905

-0.001161117607290374

In [None]:
0.4599297914836187-0.001

In [100]:
mean_confidence_interval([l[0] for l in results['far_correlations']])

(-0.8663899276716822, -0.8686320928835145, -0.8641477624598499)

In [99]:
-0.8663899276716822--0.8686320928835145

0.008684566179368414

In [101]:
-0.8663899276716822-0.008684566179368414

-1.7350220205551967

In [104]:
mean_confidence_interval(results['aggregate_accuracies'])

(24.060857990181823, 23.994688173655458, 24.12702780670819)

In [106]:
24.060857990181823-23.994688173655458

0.06616981652636511

In [107]:
24.060857990181823+0.06616981652636511

24.12702780670819

In [60]:
import pickle
prag_list_results = None
with open("../results/pragmatic_listener_assessment_accuracy.pkl", "rb") as file:
    prag_list_results = pickle.load(file)

In [115]:
prag_list_results

{'aggregate_accuracies': [0.4533333333333333,
  0.4526136363636364,
  0.4530681818181818,
  0.4536742424242424,
  0.4531439393939394,
  0.45325757575757575,
  0.4536742424242424,
  0.4530681818181818,
  0.45268939393939395,
  0.4520075757575758],
 'aggregate_correlations': [(0.9615612280860988, 2.027065475884201e-297),
  (0.9618056146561487, 3.91275795421677e-298),
  (0.9617898446679253, 4.352331620136946e-298),
  (0.9618575041863672, 2.7555019753713967e-298),
  (0.9615458400800124, 2.2474727470917922e-297),
  (0.9617135918003048, 7.278264227716953e-298),
  (0.9619005374934682, 2.059438343024509e-298),
  (0.9616903550580581, 8.511110396845046e-298),
  (0.9616526527680964, 1.0968855063184333e-297),
  (0.9616802273009595, 9.111561381510886e-298)],
 'close_accuracies': [0.3958333333333333,
  0.3958333333333333,
  0.3977272727272727,
  0.3996212121212121,
  0.3958333333333333,
  0.3939393939393939,
  0.3996212121212121,
  0.3939393939393939,
  0.3958333333333333,
  0.3958333333333333],
 'c

In [61]:
mean_confidence_interval(prag_list_results['aggregate_accuracies'])

$0.4693\pm0.0004$


(0.46928787878787875, 0.4688625358768855, 0.46971322169887203)

In [127]:
mean_confidence_interval([l[0] for l in prag_list_results['far_correlations']])

$0.9486\pm0.0001$


(0.9485555018949304, 0.9484303757518484, 0.9486806280380125)

In [57]:
mean_confidence_interval(baseline_list_results['aggregate_accuracies'])

$0.3373\pm0.0102$


(0.33731439393939394, 0.32710468050521974, 0.34752410737356815)

In [56]:
mean_confidence_interval(lit_list_results['aggregate_accuracies'])

$0.4653\pm0.0011$


(0.46526515151515146, 0.46412343363903413, 0.4664068693912688)