In [None]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import pandas as pd
import numpy as np
import json
# NOTE(review): star import; names used below that are not defined in this
# notebook (loadData, correspondenceProbabilities, analogyScore, codeToggler)
# presumably come from here — confirm against helpers.
from helpers import *
import matplotlib.pyplot as plt
%matplotlib inline
# Use the ggplot style sheet and a small (8pt) font for every figure.
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 8})
import itertools
# NOTE(review): codeToggler is not defined in this notebook; its effect cannot
# be determined from here — confirm what it does.
codeToggler()

In [None]:
# Load the experiment data. loadData is not defined in this notebook
# (presumably provided by the star import of `helpers` — confirm); its
# four return values are unpacked into the names used by the cells below.
df, survey, objNames, trial_types = loadData()

In [None]:
# For every entry in trial_types, compute the correspondence likelihoods and
# the matching surprisal values (negative natural log of the likelihoods).
likelihoods = {}
surprisals = {}
for pics in trial_types:
    # Key: the first element of the entry with its last character dropped.
    trial_type = pics[0][:-1]
    probabilities = correspondenceProbabilities(pics[0], pics[1], df, objNames)
    likelihoods[trial_type] = probabilities
    surprisals[trial_type] = -np.log(probabilities)

In [None]:
# Score every trial: look up the surprisal table that belongs to the row's
# trial type and pass it, together with the row's analogies, to analogyScore.
# The resulting list is in row order and is stored as a new column.
scores = [
    analogyScore(row_analogies, surprisals[row_type])
    for row_analogies, row_type in zip(df['analogies'], df['trial_type'])
]
df['analogy_score'] = scores

# How good is this score?
This section compares our scoring function with the naive surprisal and the raw frequency of each analogy.

In [None]:
# String form of every analogy, used as the grouping key below.
# (presumably needed because the raw `analogies` values are not hashable — confirm)
df['analogies_str'] = df['analogies'].astype(str)

# Only the 'KA' trial type is compared; group its trials by analogy once and
# reuse the grouping for both aggregates.
ka_by_analogy = df[df['trial_type'] == 'KA'].groupby('analogies_str')

# Analogy scores. Taking averages doesn't change a thing.
# The column is selected *before* aggregating: averaging the whole frame would
# also try to average the non-numeric columns, which raises a TypeError on
# pandas >= 2.0.
analogy_scores = ka_by_analogy['analogy_score'].mean()

# Frequencies: number of (non-null) trials per analogy.
frequencies = ka_by_analogy['analogies'].count()

# Comparison dataframe, one row per analogy.
compare = pd.DataFrame(index=analogy_scores.index)
compare['score'] = analogy_scores
compare['frequency'] = frequencies
# Naive surprisal: negative log of each analogy's relative frequency.
compare['surprisal'] = -np.log(frequencies / frequencies.sum())
compare = compare.sort_values(by='frequency', ascending=False)
# Prettify the index labels for plotting: u'' -> multiplication sign,
# square brackets -> parentheses.
compare.index = compare.index.str.replace("u''", u'\u00D7').str.replace('[', '(').str.replace(']', ')')

In [None]:
# Plot all three columns against the analogy labels; 'score' and 'surprisal'
# are drawn on a secondary (right-hand) y-axis, 'frequency' on the left one.
ax = compare.plot(secondary_y=['score', 'surprisal'], figsize=(7,3.5), style='-o', lw=1.1)

# Labels
ax.right_ax.set_ylabel('score and surprisal values')
ax.set_ylabel('frequency')
ax.set_xlabel('')
# One tick per analogy, labelled with the (prettified) analogy string.
ax.set_xticks(range(len(compare.index)))
ax.set_xticklabels(compare.index, rotation=-40, horizontalalignment='left')
# NOTE(review): hard-coded limits; only ticks 0..10 are visible, so this
# assumes at most 11 analogies should be shown — confirm.
ax.set_xlim([-.3,10.3])
ax.right_ax.set_ylim([0,5])

# Legends: two separate legends, both anchored to the right of the plot area.
# The first belongs to pyplot's current axes, the second to the left axes.
legend = plt.legend(bbox_to_anchor=(1.10,1.05), loc='upper left')
legend.get_frame().set_color('white')
legend2 = ax.legend(bbox_to_anchor=(1.10,.85), loc='upper left')
legend2.get_frame().set_color('white')

# Title
plt.title('Score, frequencies and surprisal value for different analogies', y=1.08);

# Save plot
plt.tight_layout()
# Both legends sit outside the axes, so both must be listed as extra artists;
# otherwise the tight bounding box can crop the second legend out of the PNG.
plt.savefig('exports/scoring-comparison.png', format='png', dpi=300,
    bbox_extra_artists=(legend, legend2), bbox_inches='tight')

In [None]:
# Write the trials dataframe, including the columns added to it in this
# notebook, to a JSON file.
df.to_json(path_or_buf='data/trials-with-scores.json')