In [1]:
# so we can use packages from parent directory
import sys
sys.path.append("..")

In [2]:
# code copied from example experiments.py
import torch
import torch.nn as nn
import numpy as np
from monroe_data import MonroeData, MonroeDataEntry, Color # last two for reading pkl file
import caption_featurizers
from color_featurizers import ColorFeaturizer, color_phi_fourier
from models import LiteralListener, LiteralSpeaker, ImaginativeListener, CaptionEncoder, CaptionGenerator, ColorGenerator
from evaluation import score_model, delta_e_dist, Speaker, Score
from experiment import FeatureHandler, evaluate_model

In [5]:
from functools import partial
from scipy import stats

In [17]:
import importlib
import experiment
importlib.reload(experiment)
from experiment import FeatureHandler, evaluate_model

In [3]:
prefix="../"
train_data = MonroeData(prefix + "data/csv/train_corpus_monroe.csv", prefix + "data/entries/train_entries_monroe.pkl")
dev_data_synth  = MonroeData(prefix + "data/csv/dev_corpus_synth_10fold.csv", prefix + "data/entries/dev_corpus_synth_10fold.pkl")

In [30]:
test_data_synth  = MonroeData(prefix + "data/csv/test_corpus_synth_10fold.csv", prefix + "data/entries/test_corpus_synth_10fold.pkl")

In [15]:
def composite_score(eval_df, speaker="gameid"):
    mean_scores = eval_df.groupby(speaker).numOutcome.mean()
    mean_numCleanWords = eval_df.groupby(speaker).numCleanWords.mean()
    mean_clkTime = eval_df.groupby(speaker).clkTime.mean()
    true_scores = mean_scores / mean_clkTime / mean_numCleanWords
    max_score = true_scores.max()
    true_scores /= max_score # normalize the scores
    return true_scores

In [46]:
# 4. Imaginative Listener
def imaginative_listener(model_file="../model/imaginative_listener_with_distractors_linear100hd5epoch_GLOVE_MSE.params"):
    print("Initializing featurizers")
    caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.EndingTokenizer)
    color_phi = ColorFeaturizer(color_phi_fourier, "rgb", normalized=True)

    def target_color_target(data_entry):
        return np.array(data_entry.colors[0].rgb_norm)

    feature_handler = FeatureHandler(train_data, test_data_synth, caption_phi, color_phi, target_fn=target_color_target,
                                randomized_colors=False) #using TEST data now :) 

    print("Obtaining training features") # get features even if you're runnning the pretrained model for example
    #train_features = feature_handler.train_features()
    #train_targets = feature_handler.train_targets()

    imaginative_model = ImaginativeListener(ColorGenerator, criterion=torch.nn.CosineEmbeddingLoss,
                            optimizer=torch.optim.Adam, lr=0.004, num_epochs=5)

    # Creating model
    MSELossSum = lambda: nn.MSELoss(reduction='sum') # sorry for this ugliness..... but this is me passing a parameter to the loss func
    imaginative_model = ImaginativeListener(ColorGenerator, criterion=MSELossSum,
                                optimizer=torch.optim.Adam, lr=0.001, num_epochs=5, use_color=True)
    imaginative_model.init_model(embed_dim=100, hidden_dim=50, vocab_size=feature_handler.caption_featurizer.caption_indexer.size,
                    color_in_dim=54, color_hidden_dim=50, weight_matrix=caption_featurizers.get_pretrained_glove(feature_handler.caption_featurizer.caption_indexer.idx2word.items(), 100, prefix=True))

    imaginative_model.load_model(model_file)
        
    print("Evaluating model")
    output_to_score_de = lambda outputs, targets: np.array([delta_e_dist(outputs[i], targets[i]) for i in range(len(targets))])
    # we want to score based on the model's predictions at the TARGET indices not listener clicked indices,
    # so we change the feature_handler's target function to do that:
    my_score_model = partial(score_model, speaker=Speaker.BY_GAME_ID_COND, return_df=True, score=Score.COMPOSITE)
    result = evaluate_model(test_data_synth, feature_handler, imaginative_model, output_to_score_de, my_score_model, accuracy=False)
    
    return result


In [51]:
def evaluate_imaginative_listener_samples():
    model_directory = "../imaginative_listener_samples"
    num_samples = 10
    aggregate_correlations = []
    close_correlations = []
    split_correlations = []
    far_correlations = []

    aggregate_accuracies = []
    close_accuracies = []
    split_accuracies = []
    far_accuracies = []

    for i in range(num_samples):
        print("Evaluating sample #{}".format(i))
        _, imaginative_listener_eval = imaginative_listener("{}/sample_{}.params".format(model_directory, i))
        il_true_scores = imaginative_listener_eval.groupby('gameid').numOutcome.mean()
        il_model_scores = imaginative_listener_eval.groupby('gameid').model_scores.mean()
        il_true_scores_composite = composite_score(imaginative_listener_eval)

        aggregate_correlations.append(stats.pearsonr(il_model_scores, il_true_scores_composite))
        # arbitrarily say we get it right if we assign a majority of the probability mass to it
        #aggregate_accuracies.append(sum(imaginative_listener_eval.model_scores > 0.5)/len(imaginative_listener_eval.model_scores))
        aggregate_accuracies.append(np.mean(imaginative_listener_eval.model_scores))

        # separate out conditions
        imaginative_listener_close = imaginative_listener_eval[imaginative_listener_eval.condition == "close"]
        imaginative_listener_split = imaginative_listener_eval[imaginative_listener_eval.condition == "split"]
        imaginative_listener_far = imaginative_listener_eval[imaginative_listener_eval.condition == "far"]

        imaginative_listener_close_true_scores = imaginative_listener_close.groupby('gameid').numOutcome.mean()
        imaginative_listener_close_model_scores = imaginative_listener_close.groupby('gameid').model_scores.mean()

        imaginative_listener_split_true_scores =  imaginative_listener_split.groupby('gameid').numOutcome.mean()
        imaginative_listener_split_model_scores = imaginative_listener_split.groupby('gameid').model_scores.mean()

        imaginative_listener_far_true_scores =  imaginative_listener_far.groupby('gameid').numOutcome.mean()
        imaginative_listener_far_model_scores = imaginative_listener_far.groupby('gameid').model_scores.mean()

        # turn true scores to composite, gricean scores
        imaginative_listener_close_composite = composite_score(imaginative_listener_close)
        imaginative_listener_split_composite = composite_score(imaginative_listener_split)
        imaginative_listener_far_composite   = composite_score(imaginative_listener_far)

        close_correlations.append(stats.pearsonr(imaginative_listener_close_composite, imaginative_listener_close_model_scores))
        split_correlations.append(stats.pearsonr(imaginative_listener_split_composite, imaginative_listener_split_model_scores))
        far_correlations.append(stats.pearsonr(imaginative_listener_far_composite, imaginative_listener_far_model_scores))

    #     close_accuracies.append(sum(imaginative_listener_close_model_scores > 0.5)/len(imaginative_listener_close_model_scores))
    #     split_accuracies.append(sum(imaginative_listener_split_model_scores > 0.5)/len(imaginative_listener_split_model_scores))
    #     far_accuracies.append(sum(imaginative_listener_far_model_scores > 0.5)/len(imaginative_listener_far_model_scores))
        close_accuracies.append(np.mean(imaginative_listener_close_model_scores))
        split_accuracies.append(np.mean(imaginative_listener_split_model_scores))
        far_accuracies.append(np.mean(imaginative_listener_far_model_scores))
        
        print("Most recent stats:")
        print("agg acc:", aggregate_accuracies[-1])
        print("clo acc:", close_accuracies[-1])
        print("spl acc:", split_accuracies[-1])
        print("far acc:", far_accuracies[-1])
        print("agg cor:", aggregate_correlations[-1])
        print("clo cor:", close_correlations[-1])
        print("spl cor:", split_correlations[-1])
        print("far cor:", far_correlations[-1])
        
    return {"aggregate_accuracies": aggregate_accuracies,
            "close_accuracies": close_accuracies,
            "split_accuracies": split_accuracies,
            "far_accuracies": far_accuracies,
            "aggregate_correlations": aggregate_correlations,
            "close_correlations":close_correlations,
            "far_correlations":far_correlations,
            "split_correlations": split_correlations}


In [52]:
results = evaluate_imaginative_listener_samples()

Evaluating sample #0
Initializing featurizers
Obtaining training features
Evaluating model
Got here to composite score
Most recent stats:
agg acc: 23.98516954890258
clo acc: 18.27353323876809
spl acc: 19.544536012224782
far acc: 34.14159079769889
agg cor: (-0.886422302978764, 4.250960310734327e-178)
clo cor: (-0.455535609378521, 2.0635307187084423e-28)
spl cor: (-0.5473674197436782, 1.3343968892457512e-42)
far cor: (-0.8657290229509013, 3.1337385241772896e-160)
Evaluating sample #1
Initializing featurizers
Obtaining training features
Evaluating model
Got here to composite score
Most recent stats:
agg acc: 24.211091965944536
clo acc: 18.65899781108387
spl acc: 19.767035392086786
far acc: 34.21259842311515
agg cor: (-0.8860109120142069, 1.0393073393501362e-177)
clo cor: (-0.44432579193215566, 5.887339051429764e-27)
spl cor: (-0.5267565591108372, 4.9684543265154734e-39)
far cor: (-0.8675670970509768, 1.079527020742103e-161)
Evaluating sample #2
Initializing featurizers
Obtaining training 

In [53]:
import pickle
with open("../results/imaginative_listener_assessment.pkl", "wb") as file:
    pickle.dump(results, file)

In [57]:
np.mean([cor[0] for cor in results['aggregate_correlations']])

-0.8857654592635387

In [33]:
aggregate_accuracies

[24.225927799225445]

In [34]:
close_accuracies

[18.50006086024096]

In [35]:
far_accuracies

[34.470457996575014]

In [36]:
split_accuracies

[19.710338275301137]

In [37]:
aggregate_correlations

[(-0.8799177945947197, 1.4376357425904765e-168)]

In [38]:
close_correlations

[(-0.4409533567945598, 5.226923223263904e-26)]

In [39]:
far_correlations

[(-0.8577761621065179, 5.865956224847408e-151)]

In [40]:
split_correlations

[(-0.5079681530415953, 2.896292325793216e-35)]

In [None]:
# 1. Literal Listener
# -----------------------------------------
# TODO: FILL IN PARAMETERS 
def literal_listener_experiment(train=False, epochs=5, embed_dim = 100, hidden_dim = 100, color_dim= 54, model_file="../model/literal_listener_5epoch-2.params"):

    # Initializing featurizers
    print("Initializing featurizers")
    caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.EndingTokenizer) # Use with parameter files that end in `endings_tkn`
    # caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.WhitespaceTokenizer) # Use with parameter files don't
    color_phi = ColorFeaturizer(color_phi_fourier, "rgb", normalized=True)
    feature_handler = FeatureHandler(train_data, test_data_synth, caption_phi, color_phi) # target function is initialized by default

    print("Initializing model")
    model = LiteralListener(CaptionEncoder, num_epochs = epochs)
    model.init_model(embed_dim = embed_dim, hidden_dim = hidden_dim, vocab_size = feature_handler.caption_featurizer.caption_indexer.size,
                 color_dim = color_dim)

    model.load_model(model_file)

    # convert the model output to a score for that particular round
    print("Evaluating model")
    output_to_score = lambda model_outputs, targets: np.exp(model_outputs[np.arange(len(model_outputs)), targets]) # get the model's predicted probablity at each target index and use that as the score
    my_score_model = partial(score_model, speaker=Speaker.BY_GAME_ID_COND, return_df=True, score=Score.COMPOSITE)
    return evaluate_model(test_data_synth, feature_handler, model, output_to_score, my_score_model, accuracy=False)

In [42]:
# I assume pragmatic will either be similar or need its own type of thing
def evaluate_literal_listener_samples():
    model_directory # FILL THIS IN = "../literal_listener_samples"
    num_samples = 10
    aggregate_correlations = []
    close_correlations = []
    split_correlations = []
    far_correlations = []

    aggregate_accuracies = []
    close_accuracies = []
    split_accuracies = []
    far_accuracies = []


    for i in range(num_samples):
        _, listener_eval = literal_listener("{}/sample_{}.params".format(model_directory, i))
        true_scores = listener_eval.groupby('gameid').numOutcome.mean()
        model_scores = listener_eval.groupby('gameid').model_scores.mean()
        true_scores_composite = composite_score(listener_eval)

        aggregate_correlations.append(stats.pearsonr(model_scores, true_scores_composite))
        # arbitrarily say we get it right if we assign a majority of the probability mass to it
        aggregate_accuracies.append(sum(listener_eval.model_scores > 0.5)/len(listener_eval.model_scores))

        # separate out conditions
        listener_close = listener_eval[listener_eval.condition == "close"]
        listener_split = listener_eval[listener_eval.condition == "split"]
        listener_far =   listener_eval[listener_eval.condition == "far"]

        listener_close_true_scores =  listener_close.groupby('gameid').numOutcome.mean()
        listener_close_model_scores = listener_close.groupby('gameid').model_scores.mean()

        listener_split_true_scores =  listener_split.groupby('gameid').numOutcome.mean()
        listener_split_model_scores = listener_split.groupby('gameid').model_scores.mean()

        listener_far_true_scores =  listener_far.groupby('gameid').numOutcome.mean()
        listener_far_model_scores = listener_far.groupby('gameid').model_scores.mean()

        # turn true scores to composite, gricean scores
        listener_close_composite = composite_score(listener_close)
        listener_split_composite = composite_score(listener_split)
        listener_far_composite   = composite_score(listener_far)

        close_correlations.append(stats.pearsonr(listener_close_composite, listener_close_model_scores))
        split_correlations.append(stats.pearsonr(listener_split_composite, listener_split_model_scores))
        far_correlations.append(stats.pearsonr(listener_far_composite,     listener_far_model_scores))

        close_accuracies.append(sum(listener_close_model_scores > 0.5)/len(listener_close_model_scores))
        split_accuracies.append(sum(listener_split_model_scores > 0.5)/len(listener_split_model_scores))
        far_accuracies.append(sum(  listener_far_model_scores > 0.5)/len(listener_far_model_scores))

    return {"aggregate_accuracies": aggregate_accuracies,
            "close_accuracies": close_accuracies,
            "split_accuracies": split_accuracies,
            "far_accuracies": far_accuracies,
            "aggregate_correlations": aggregate_correlations,
            "close_correlations":close_correlations,
            "far_correlations":far_correlations,
            "split_correlations": split_correlations}
