# Listener-Hallucinating Speaker

In [1]:
# Notebook metadata: author and course/term identifier.
__author__ = "Christopher Leung"
__version__ = "CS224u, Stanford, Spring 2020"

## Set-up

See [colors_overview.ipynb](colors_overview.ipynb) for set-up instructions and other background details.

In [2]:
from colors import ColorsCorpusReader
import os
from sklearn.model_selection import train_test_split
from torch_color_selector import (
    ColorizedNeuralListener, create_example_dataset)
from torch_color_describer import ColorizedInputDescriber
import utils
from utils import START_SYMBOL, END_SYMBOL, UNK_SYMBOL
import numpy as np
import torch

In [3]:
# Fix the random seeds through the course utility so that repeated runs are comparable.
# NOTE(review): which libraries get seeded is decided inside utils — confirm it covers torch.
utils.fix_random_seeds()

In [4]:
# Location of the corpus CSV, expressed relative to the notebook's working directory.
COLORS_SRC_FILENAME = os.path.join(
    "data", "colors", "filteredCorpus.csv")

## All two-word examples as a dev corpus

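So that you don't have to sit through excessively long training runs during development, I suggest working with the two-word-only subset of the corpus until you enter into the late stages of system testing. Note that the reader below is created with `word_count=None`, so the two-word restriction is not applied in this run.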

In [5]:
# Build a reader over the corpus file with colour normalization switched on.
# NOTE(review): word_count=None presumably disables the word-count filter — the
# example count displayed below (46994) equals the train + test sizes reported
# later in this notebook (35245 + 11749). Confirm against ColorsCorpusReader.
dev_corpus = ColorsCorpusReader(
    COLORS_SRC_FILENAME, 
    word_count=None, 
    normalize_colors=True)

In [6]:
# Materialize everything the reader yields into a list so it can be counted and reused.
dev_examples = list(dev_corpus.read())

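Because `word_count=None` is used above, the count below is the size of the whole corpus rather than of the two-word subset (which has about one-third the examples of the full corpus):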

In [7]:
# Display how many examples were read from the corpus.
len(dev_examples)

46994

## Dev dataset

Let's load the saved training and test data.

In [8]:
def load_from_pickle(directory="."):
    '''Load the saved vocabulary, train/test splits and listener embedding.

    Parameters
    ----------
    directory: str
        Folder that contains the pickle files. Defaults to the current
        working directory, which is where the files were looked up before
        this parameter existed.

    Returns
    -------
    tuple
        (dev_vocab, dev_seqs_test, dev_seqs_train, dev_cols_test,
        dev_cols_train, embedding): one unpickled object per file, in
        exactly that order.

    Raises
    ------
    FileNotFoundError
        If one of the expected pickle files is missing from `directory`.
    '''
    import pickle

    # One file per returned object; this order defines the order of the result tuple.
    filenames = (
        'dev_vocab.pickle',
        'dev_seqs_test.pickle',
        'dev_seqs_train.pickle',
        'dev_cols_test.pickle',
        'dev_cols_train.pickle',
        'embedding.pickle',
    )

    loaded = []
    for filename in filenames:
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that were produced by a trusted run of this project.
        with open(os.path.join(directory, filename), 'rb') as handle:
            loaded.append(pickle.load(handle))
    return tuple(loaded)
# Unpack the loaded objects into the notebook-level names used by the cells below.
dev_vocab, dev_seqs_test, dev_seqs_train, dev_cols_test, dev_cols_train, embedding = load_from_pickle()

At this point, our preprocessing steps are complete, and we can fit a first model.

## GloVe embeddings

We also load the GloVe embedding that was used by the speaker.

In [9]:
def load_glove_from_pickle(directory="."):
    '''Load the saved GloVe vocabulary and embedding.

    Parameters
    ----------
    directory: str
        Folder that contains the pickle files. Defaults to the current
        working directory, which is where the files were looked up before
        this parameter existed.

    Returns
    -------
    tuple
        (dev_glove_vocab, dev_glove_embedding), unpickled from
        'dev_glove_vocab.pickle' and 'dev_glove_embedding.pickle'.

    Raises
    ------
    FileNotFoundError
        If either pickle file is missing from `directory`.
    '''
    import pickle

    loaded = []
    for filename in ('dev_glove_vocab.pickle', 'dev_glove_embedding.pickle'):
        # SECURITY: unpickling can execute arbitrary code. Only load files
        # that were produced by a trusted run of this project.
        with open(os.path.join(directory, filename), 'rb') as handle:
            loaded.append(pickle.load(handle))
    return tuple(loaded)
# Unpack the GloVe vocabulary and embedding used to build the literal speaker below.
dev_glove_vocab, dev_glove_embedding = load_glove_from_pickle()

The above might dramatically change your vocabulary, depending on how many items from your vocab are in the GloVe space.

## Load the Literal Listener

In [10]:
# Construct the literal listener and restore its saved weights from disk.
# The device falls back to the CPU when CUDA is unavailable, so the notebook
# can also be run on a machine without a GPU.
# NOTE(review): load_model may need a CPU map_location to read a checkpoint
# that was saved from a GPU — confirm in ColorizedNeuralListener.
literal_listener = ColorizedNeuralListener(
    dev_vocab,
    embed_dim=100,
    embedding=embedding,
    hidden_dim=100,
    max_iter=100,
    batch_size=256,
    dropout_prob=0.,
    eta=0.001,
    lr_rate=0.96,
    warm_start=True,
    device='cuda' if torch.cuda.is_available() else 'cpu')
literal_listener.load_model("literal_listener.pt")

Using cuda




In [11]:
# Listener predictions for the test and training splits; predict() receives the
# colour contexts first and the utterances second.
test_preds = literal_listener.predict(dev_cols_test, dev_seqs_test)
train_preds = literal_listener.predict(dev_cols_train, dev_seqs_train)

  color_seqs = torch.FloatTensor(color_seqs)


In [12]:
# Count the predictions equal to 2 and print the hit rate, first for the test
# split and then for the training split.
# NOTE(review): 2 is presumably the position of the target colour — confirm.
correct = sum(1 for pred in test_preds if pred == 2)
print("test", correct, "/", len(test_preds), correct / len(test_preds))
correct = sum(1 for pred in train_preds if pred == 2)
print("train", correct, "/", len(train_preds), correct / len(train_preds))

test 9213 / 11749 0.7841518427100179
train 29932 / 35245 0.8492552135054617


## Load the Literal Speaker

In [13]:
# Construct the literal speaker over the GloVe vocabulary/embedding and restore
# its saved weights from literal_speaker.pt. No device is passed here, so the
# class's own default applies.
literal_speaker = ColorizedInputDescriber(
    dev_glove_vocab, 
    embedding=dev_glove_embedding, 
    hidden_dim=100, 
    max_iter=40, 
    eta=0.005,
    batch_size=128)
literal_speaker.load_model("literal_speaker.pt")

Using cuda


In [14]:
# Baseline: the speaker's own listener_accuracy on the test contexts and their
# reference utterances, before any training on hallucinations.
literal_speaker.listener_accuracy(dev_cols_test, dev_seqs_test)

  color_seqs = torch.FloatTensor(color_seqs)
  perp = [np.prod(s)**(-1/len(s)) for s in scores]


0.8149629755723892

## Hallucinating Pragmatic Speaker

We use the term Hallucinating Pragmatic Speaker for a speaker that first takes the k highest-probability utterances that the literal speaker produces to describe the context, and then filters them again by keeping the top m utterances that maximize the literal listener's likelihood of selecting the correct color.

At a high level, the idea here is that the speaker produces candidate utterances that it thinks are grammatically correct, and then picks the top m utterances that maximize the listener's understanding. We will refer to these utterances as hallucinations.

In [15]:
def generate_listener_hallucinations(input_colors, num_hallucinations=5, alpha=0.544, k_samples=10,
                                     speaker=None, listener=None):
    '''This method generates listener hallucinations.

    For every colour context the speaker proposes `k_samples` utterances, the
    listener scores each of them, and the `num_hallucinations` best-scoring
    utterances are returned.

    Parameters
    ----------
    input_colors:
        A list of size (n,m,p) of int where each example has a list of m colors. Each color
        is embedded in size p.
    num_hallucinations: int
        How many utterances to keep per example. Must not exceed `k_samples`.
    alpha: float
        Exponent applied to the listener probabilities before they are
        normalized per example. For any alpha > 0 this rescaling is monotonic,
        so it does not change which utterances rank highest.
    k_samples: int
        How many candidate utterances the speaker is asked for per example.
    speaker:
        Object providing `sample_utterances(colors, k_samples=...)`. When None,
        the notebook-level `literal_speaker` is used.
    listener:
        Object providing `predict(colors, utterances, probabilities=True)`.
        When None, the notebook-level `literal_listener` is used.

    Returns
    -------
    prag_speaker_pred:
        (n,num_hallucinations,*) The top sentences from the speaker that maximizes the
        likelihood that the listener will choose the target color, best first. Each
        sentence can be of different length and is tokenized.

    Raises
    ------
    ValueError
        If `num_hallucinations` is larger than `k_samples`.
    '''
    # Fail before the expensive sampling step rather than inside torch.topk afterwards.
    if num_hallucinations > k_samples:
        raise ValueError(
            "num_hallucinations (%d) cannot exceed k_samples (%d)"
            % (num_hallucinations, k_samples))
    # Nothing to describe: avoid calling the models with empty batches.
    if len(input_colors) == 0:
        return []

    # Resolve the models at call time so the notebook globals remain the default.
    speaker = literal_speaker if speaker is None else speaker
    listener = literal_listener if listener is None else listener

    print("Sampling utterances")
    utterances = speaker.sample_utterances(input_colors, k_samples=k_samples)

    print("Preparing Data")
    # Prepare data, flatten the target utterances and repeat the input colors per k_sample
    target_utterances = [seq for seq_list in utterances for seq in seq_list]
    input_colors_extended = [item for item in input_colors for _ in range(k_samples)]

    print("Calculating probabilities")
    utterance_probs = listener.predict(input_colors_extended, target_utterances, probabilities=True)
    # Column 2 is read as the probability of the target colour.
    # NOTE(review): assumes the target sits at index 2 of each context — confirm.
    utterance_probs = torch.FloatTensor([preds[2] for preds in utterance_probs]).view(-1, k_samples)
    utterance_probs = utterance_probs ** alpha

    # Normalize per example so each row sums to one.
    total = torch.sum(utterance_probs, dim=1).unsqueeze(1)
    normalized_utterance_probs = utterance_probs / total

    print("Finding top m utterances")
    # Find the best num_hallucinations utterances that maximize the listener likelihood
    _, best_utter_indices = torch.topk(normalized_utterance_probs, num_hallucinations, dim=1)

    # Index into the utterances to find the sequence candidates; tolist() turns
    # the index tensor into plain ints for list indexing.
    prag_speaker_pred = [
        [seqs[utter_index] for utter_index in row]
        for seqs, row in zip(utterances, best_utter_indices.tolist())
    ]
    return prag_speaker_pred

Let's generate the input colors needed to predict for different candidate targets.

In [16]:
# Number of colour contexts handed to generate_listener_hallucinations per call.
HALLUCINATION_PARTITION_SIZE = 10000

top_hallucinations = []
for start in range(0, len(dev_cols_train), HALLUCINATION_PARTITION_SIZE):
    col_partition = dev_cols_train[start:start + HALLUCINATION_PARTITION_SIZE]
    # Release cached CUDA memory before processing the next partition.
    torch.cuda.empty_cache()
    third_col_speaker_pred = generate_listener_hallucinations(col_partition, num_hallucinations=5, k_samples=8)
    # Keep only the best-ranked hallucination per example. Results stay grouped
    # per partition here; a later cell flattens them.
    top_hallucinations.append([seqs[0] for seqs in third_col_speaker_pred])

Sampling utterances


  color_seqs = torch.FloatTensor(color_seqs).to(self.device)


Preparing Data
Calculating probabilities
Finding top m utterances
Sampling utterances
Preparing Data
Calculating probabilities
Finding top m utterances
Sampling utterances
Preparing Data
Calculating probabilities
Finding top m utterances
Sampling utterances
Preparing Data
Calculating probabilities
Finding top m utterances


In [17]:
# Flatten the per-partition lists into a single list with one utterance per
# training example.
# Guard: once flattened, the first item of every entry is a token string rather
# than a token list, so re-running this cell leaves the data untouched instead
# of flattening the utterances into individual tokens.
if any(len(seqs) > 0 and not isinstance(seqs[0], str) for seqs in top_hallucinations):
    top_hallucinations = [seq for seqs in top_hallucinations for seq in seqs]
top_hallucinations[:5]

[['<s>', 'grey', '</s>'],
 ['<s>', 'green', ',', 'bright', 'green', '</s>'],
 ['<s>', 'red', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'dull', 'green', '</s>']]

Each example has m candidate hallucinations, of which only the single best one is kept above.

We can show that taking the best hallucination for each example produces near-100% listener accuracy on the training set. This suggests that the space of language that the speaker has learnt can almost perfectly solve the Stanford Colors problem.

In [18]:
# Listener accuracy when it hears the best hallucination for each training context.
# NOTE(review): 2 is presumably the position of the target colour — confirm.
listened_preds = literal_listener.predict(dev_cols_train, top_hallucinations)
correct = sum(1 for pred in listened_preds if pred == 2)
# These are training-set contexts, so the report is labelled "train".
print("train", correct, "/", len(listened_preds), correct/len(listened_preds))

test 34883 / 35245 0.9897290395800823


How do these utterances capture the space of color differences so well? More work is needed to examine this, and it is an excellent research direction.

One other thing we can do is to train the speaker on these hallucinations.

In [19]:
# Turn on warm_start before refitting. NOTE(review): presumably this makes fit()
# continue from the loaded weights instead of re-initializing them — confirm.
literal_speaker.warm_start = True
# We only reassign the optimizer, not the graph.
# The optimizer is rebuilt from the speaker's own optimizer class, learning rate
# (eta) and l2_strength, over the current model parameters.
literal_speaker.opt = literal_speaker.optimizer(
                literal_speaker.model.parameters(),
                lr=literal_speaker.eta,
                weight_decay=literal_speaker.l2_strength)

In [20]:
# Fit the speaker on the training contexts, using the selected hallucinations
# as the target utterances.
literal_speaker.fit(dev_cols_train, top_hallucinations)

Epoch 40; err = 29.312468945980072

ColorizedInputDescriber(
	hidden_dim=100,
	batch_size=128,
	max_iter=40,
	eta=0.005,
	optimizer=<class 'torch.optim.adam.Adam'>,
	l2_strength=0,
	embed_dim=100,
	embedding=[[ 0.1394268  -0.47498924 -0.22497068 ...  0.02911435  0.47107838
   0.3607797 ]
 [ 0.38472     0.49351     0.49096    ...  0.026263    0.39052
   0.52217   ]
 [-0.66099    -0.073023    0.92379    ... -0.22556     0.8148
  -0.44052   ]
 ...
 [ 0.4765     -0.14409    -0.49884    ... -1.1854     -0.88582
  -0.57597   ]
 [-0.29881     0.81797     1.002      ... -0.23776    -0.90741
   0.55244   ]
 [ 0.38433493  0.12163181  0.07975889 ... -0.19281575  0.35057666
  -0.15122421]])

Let's see how it did.

In [21]:
# Let the refit speaker describe the training contexts, then check how often
# the listener picks colour index 2 from those descriptions.
# NOTE(review): 2 is presumably the position of the target colour — confirm.
speaker_preds_train = literal_speaker.predict(dev_cols_train)
listened_preds = literal_listener.predict(dev_cols_train, speaker_preds_train)
correct = sum(1 for pred in listened_preds if pred == 2)
# These are training-set contexts, so the report is labelled "train".
print("train", correct, "/", len(listened_preds), correct/len(listened_preds))

  color_seqs = torch.FloatTensor(color_seqs)


test 33997 / 35245 0.9645907220882395


In [22]:
# Preview the first 420 utterances the speaker produced for the training contexts.
speaker_preds_train[:420]

[['<s>', 'grey', '</s>'],
 ['<s>', 'dark', '+er', 'green', '</s>'],
 ['<s>', 'red', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'dull', 'green', '</s>'],
 ['<s>', 'dark', 'purple', 'with', 'blue', '</s>'],
 ['<s>', 'tan', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'yellow', '+ish', '</s>'],
 ['<s>', 'dull', 'purple', '</s>'],
 ['<s>', 'teal', '</s>'],
 ['<s>', 'gray', '</s>'],
 ['<s>', 'light', 'blue', '</s>'],
 ['<s>', 'bright', '+est', 'blue', '</s>'],
 ['<s>', 'green', '</s>'],
 ['<s>', 'purple', '</s>'],
 ['<s>', 'tan', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'pink', '+ish', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'red', '</s>'],
 ['<s>', 'purple', '</s>'],
 ['<s>', 'blue', '</s>'],
 ['<s>', 'orange', '</s>'],
 ['<s>', 'bright', '+est', 'purple', '</s>'],
 ['<s>', 'brown', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'pink', '+ish', 'purple', '</s>'],
 ['<s>', 'dull', 'purple', '</s>'],
 ['<s>', 'brown', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'bright', 'pink', '</s>'],

In [23]:
# Same evaluation on the test contexts: the speaker describes each context and
# the listener's choices are compared against colour index 2.
# NOTE(review): 2 is presumably the position of the target colour — confirm.
speaker_preds_test = literal_speaker.predict(dev_cols_test)
listened_preds = literal_listener.predict(dev_cols_test, speaker_preds_test)
correct = sum(1 for pred in listened_preds if pred == 2)
print("test", correct, "/", len(listened_preds), correct / len(listened_preds))

test 11249 / 11749 0.9574431866541834


In [24]:
# Preview a handful of test-set utterances instead of dumping the whole list
# into the notebook output.
speaker_preds_test[:30]

[['<s>', 'purple', '</s>'],
 ['<s>', 'bright', 'pink', '</s>'],
 ['<s>', 'blu', '+ish', 'blue', '</s>'],
 ['<s>', 'bright', '+est', 'purple', '</s>'],
 ['<s>', 'purple', '</s>'],
 ['<s>', 'brown', '</s>'],
 ['<s>', 'green', '+ish', 'green', '</s>'],
 ['<s>', 'purple', '</s>'],
 ['<s>', 'bright', 'pink', '</s>'],
 ['<s>', 'grey', '</s>'],
 ['<s>', 'pink', '+ish', 'grey', '</s>'],
 ['<s>', 'dark', 'pupl', '</s>'],
 ['<s>', 'orange', '</s>'],
 ['<s>', 'bright', '+est', '</s>'],
 ['<s>', 'bright', 'pink', '</s>'],
 ['<s>', 'the', 'dark', '+er', 'purple', '</s>'],
 ['<s>', 'bright', 'pink', '</s>'],
 ['<s>', 'dark', '+er', 'green', '</s>'],
 ['<s>', 'blue', '</s>'],
 ['<s>', 'dark', '+er', 'green', '</s>'],
 ['<s>', 'yellow', '</s>'],
 ['<s>', 'dark', 'purple', '</s>'],
 ['<s>', 'orange', '</s>'],
 ['<s>', 'pink', '</s>'],
 ['<s>', 'brown', '</s>'],
 ['<s>', 'bright', '+est', '</s>'],
 ['<s>',
  'mint',
  'green',
  '#',
  '#',
  '#',
  '#',
  'not',
  'the',
  'aquamarine',
  '</s>'],
 ['<

In [25]:
# Per-example perplexities of the reference test utterances under the refit speaker.
perp = literal_speaker.perplexities(dev_cols_test, dev_seqs_test)

In [26]:
# Mean perplexity over the test examples, leaving out entries that are infinite.
print(torch.mean(torch.FloatTensor([p for p in perp if p != float("inf")])))

tensor(13381.2324)


In [27]:
# Same listener_accuracy call as before the refit, now evaluated on the speaker
# after it was trained on the hallucinations.
literal_speaker.listener_accuracy(dev_cols_test, dev_seqs_test)

0.7311260532811303

In [28]:
# Average number of tokens per predicted test utterance; the <s> and </s>
# markers are part of each sequence and are counted too.
sum(map(len, speaker_preds_test)) / len(speaker_preds_test)

4.00468124946804

In [29]:
# Average number of tokens per predicted training utterance; the <s> and </s>
# markers are part of each sequence and are counted too.
sum(map(len, speaker_preds_train)) / len(speaker_preds_train)

4.0286849198467864