OK, so we can write a class/function that performs the evaluation:
given the scores created by the model and the training data, output
the summary statistic (which will likely be a correlation coefficient).

At the moment, there are three degrees of freedom:
1. Whether to use the Spearman (ranked) correlation or the Pearson correlation (default: Pearson)
2. Whether to treat each game as its own speaker or to treat each unique worker as its own speaker (default: game)
3. Whether to correlate with the simple pragmatic metric (average number of correct guesses) or a composite metric
   (including length of utterance, word specificity, click time, etc.). It might also be interesting to
   investigate weighting the different conditions differently, but for now we are doing a macro-average.


In [1]:
# so we can use packages from parent directory
import sys
sys.path.append("..")

In [5]:
from scipy import stats # for pearsonr, spearmanr
import numpy as np
import pandas as pd # for handling test data frame
from enum import Enum, auto

In [10]:
# relevant enums for options
class Speaker(Enum):
    """How rows are grouped into "speakers"; each value is a grouping column name."""

    # one speaker per game
    BY_GAME_ID = "gameid"
    # one speaker per unique worker
    BY_WORKER_ID = "workerid_uniq"
    
class Regressor(Enum):
    """Correlation measure used to compare true and model scores.

    Each member's value is the name of a function in ``scipy.stats``.
    Plain functions cannot be used as Enum values directly: the Enum
    machinery treats functions defined in the class body as methods, so
    they would not become members at all (no iteration, no ``.name``).

    Members are callable, so ``Regressor.PEARSON(x, y)`` returns exactly
    what ``stats.pearsonr(x, y)`` returns.
    """

    PEARSON = "pearsonr"
    SPEARMAN = "spearmanr"

    def __call__(self, x, y):
        """Apply the selected ``scipy.stats`` correlation function to x and y."""
        return getattr(stats, self.value)(x, y)
    
class Score(Enum):
    """Which ground-truth metric the model scores are compared against."""

    # simple metric; values are assigned automatically in definition order
    SIMPLE = auto()
    # composite metric
    COMPOSITE = auto()
    

In [None]:
def calculate_scores(eval_df, score=Score.SIMPLE, speaker=Speaker.BY_GAME_ID):
    """
    Compute the ground-truth score of every speaker in ``eval_df``.

    Args:
        eval_df: DataFrame holding a "numOutcome" column and the column
            named by ``speaker.value``.
        score: Score member selecting the metric. Only Score.SIMPLE is
            implemented.
        speaker: Speaker member whose value names the grouping column.
            Defaults to grouping by game.

    Returns:
        For Score.SIMPLE, the mean of "numOutcome" per speaker (one entry
        per distinct value of the grouping column).

    Raises:
        NotImplementedError: for any score type other than Score.SIMPLE.
    """
    # Compare against the class attribute: reaching one member through
    # another (``score.SIMPLE``) is not supported on every Python version.
    if score == Score.SIMPLE:
        return eval_df.groupby(speaker.value).numOutcome.mean()

    # Fail loudly instead of silently returning None.
    raise NotImplementedError(f"score type {score!r} is not supported yet")


def score_model(test_data, scores, speaker=Speaker.BY_GAME_ID, regressor=Regressor.PEARSON, score=Score.SIMPLE):
    """
    Correlate per-speaker model scores with per-speaker ground-truth scores.

    Assume scores are in the same order as the test data (i.e. 0th row is 0th score)

    Args:
        test_data: object whose ``data`` attribute is a DataFrame with at
            least the columns "gameid", "roundNum" and "numOutcome"
            (plus "workerid_uniq" when grouping by worker, and "contents",
            "clkTime", "msgTime" for the composite score).
        scores: one model score per row of ``test_data.data``.
        speaker: Speaker member whose value names the grouping column.
        regressor: callable invoked as ``regressor(true_scores, model_scores)``.
        score: Score member selecting the ground-truth metric.

    Returns:
        Whatever ``regressor`` returns for the two per-speaker series.

    Raises:
        NotImplementedError: if the requested score type yields no
            ground-truth scores (the composite metric is not supported yet).
    """
    relevant_columns = ["gameid", "roundNum", "numOutcome"]
    if speaker == Speaker.BY_WORKER_ID:
        relevant_columns.append(Speaker.BY_WORKER_ID.value)

    if score == Score.COMPOSITE:
        # no support for this yet but probably also need:
        relevant_columns.extend(["contents", "clkTime", "msgTime"])

    # Work on a copy so the caller's frame is not modified.
    eval_df = test_data.data[relevant_columns].copy()
    eval_df["model_scores"] = scores  # why we need scores to be in same order as rows

    by_speaker = eval_df.groupby(speaker.value)

    # Compare against the class attribute: reaching one member through
    # another (``score.SIMPLE``) is not supported on every Python version.
    if score == Score.SIMPLE:
        # calculate scores as the mean of the number of successful utterances
        # a speaker has
        true_scores = by_speaker.numOutcome.mean()
    else:
        true_scores = calculate_scores(eval_df, score)
        if true_scores is None:
            # Do not hand None to the regressor; report the real problem.
            raise NotImplementedError(f"score type {score!r} is not supported yet")

    # calculate a model score per speaker (mean over that speaker's rows)
    model_scores = by_speaker.model_scores.mean()

    return regressor(true_scores, model_scores)
    

In [8]:
# scratch check: enum members can be stored in a list like any other object
x = [Speaker.BY_GAME_ID]

In [15]:
# notebook echo: evaluating the member on its own displays its repr
Speaker.BY_GAME_ID

<Speaker.BY_GAME_ID: 'gameid'>

In [2]:
import nltk

# Toy example: both sentences are split into tokens on whitespace.
hypothesis = "It is a cat at room".split()
reference = "It is a cat inside the room".split()
# sentence_bleu takes a list of references (there may be several)
BLEUscore = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis)
print(BLEUscore)

0.4548019047027907


In [3]:
def score_speaker_bleu(true_captions, generated_captions):
    """
    Average sentence-level BLEU of generated captions against true captions.

    The i-th generated caption is scored against the i-th true caption as
    its single reference.

    Args:
        true_captions: sequence of reference captions.
            NOTE(review): presumably each caption is a list of tokens, as
            in the sentence_bleu example in this notebook; confirm.
        generated_captions: sequence of hypothesis captions, indexable and
            at least as long as ``true_captions``.

    Returns:
        Mean BLEU over all caption pairs, or 0.0 when ``true_captions`` is
        empty.

    Raises:
        IndexError: if ``generated_captions`` is shorter than
            ``true_captions``.
    """
    n_captions = len(true_captions)
    if n_captions == 0:
        # nothing to average; avoid dividing by zero
        return 0.0

    sentence_bleu = nltk.translate.bleu_score.sentence_bleu
    total_bleu = 0
    for i, true_caption in enumerate(true_captions):
        # use the matching true caption as the reference (not always the first)
        total_bleu += sentence_bleu([true_caption], generated_captions[i])
    return total_bleu / n_captions  # avg bleu
    