In [None]:
# Train literal listener and speaker to use DNC representations:

In [1]:
import sys
sys.path.append("../color-evaluation/")

In [26]:
import torch
import torch.nn as nn
import dnc
from monroe_data import MonroeData, MonroeDataEntry, Color # last two for reading pkl file
import caption_featurizers
from color_featurizers import ColorFeaturizer, color_phi_fourier
from models import LiteralListener, LiteralSpeaker, CaptionEncoder, CaptionGenerator, PytorchModel, ColorEncoder, BeamNode
from experiment import FeatureHandler
import scipy.stats as stats
from evaluation import score_model, Score

import numpy as np
from queue import PriorityQueue

In [52]:
# load in DNC
class DNCEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim, color_dim):
        super(DNCEncoder, self).__init__()
        self.embed = nn.Embedding(vocab_size, embed_dim)
        self.rnn = dnc.DNC(
          input_size=embed_dim + color_dim,
          hidden_size=128,
          rnn_type='lstm',
          num_layers=4,
          nr_cells=100,
          cell_size=32,
          read_heads=4,
          batch_first=True,
          gpu_id=-1,
          debug=True
        )
        self.decoder = nn.Linear(embed_dim + color_dim, vocab_size) # don't predict over padding tag
        self.logsoftmax = nn.LogSoftmax(dim=1)
        
        self.vocab_size = vocab_size
        
    def forward(self, caption, color):
        embeds = self.embed(caption)
        color_reps = color.repeat(1, caption.shape[1], 1) # repeat color for number of tokens in captions
        # concatenate colors to caption
        inputs = torch.cat((embeds, color_reps), dim=2) # cat along the innermost dimension
        # dnc magic
        (controller_hidden, memory, read_vectors) = (None, None, None)
        output, (controller_hidden, memory, read_vectors), debug_memory = \
          self.rnn(inputs, (controller_hidden, memory, read_vectors), reset_experience=True)

        result = self.decoder(output)
        return self.logsoftmax(result)

In [177]:
class DNCSpeaker(PytorchModel):
    
    def predict(self, X, sample=1, beam_width=5):
        all_tokens = []
        self.model.eval()
        max_gen_len = 20
    
        self.model.eval()
        if not torch.cuda.is_available():
            torch.manual_seed(10) # for determinism
        else:
            torch.cuda.manual_seed_all(10)
            
        with torch.no_grad():
            for i, feature in enumerate(X):
                caption, colors = feature
                caption = torch.tensor(caption, dtype=torch.long)
                colors = torch.tensor(colors, dtype=torch.float)

    
                beam_nodes = PriorityQueue()
                ended_list = []
    
                tokens = caption[:, 0].view(-1, 1) # begin at start token
                #print(tokens)
                start = BeamNode(0, tokens, False)
                beam_nodes.put(start)
    
                for i in range(max_gen_len + 1):
                    node = beam_nodes.get()
                    if node.ended:
                        ended_list.append(np.array(node.tokens[0].numpy()))
                        if len(ended_list) == sample:
                            break
                    else:
                        tokens = node.tokens
                        vocab_preds = self.model(tokens, colors)[:,-1:,:] # just distribution over last token
                        log_probs, prediction_indices = vocab_preds.topk(beam_width, dim=2)  # taking the topk predictions
                        for j in range(beam_width):
                            prediction_index = prediction_indices[:,-1,j:j+1] # a single prediction
                            log_prob = log_probs[0][0][j].item()
                            updated_tokens = tokens.clone()
                            updated_tokens = torch.cat((updated_tokens, prediction_index), dim=1)
                            updated_log_prob = node.log_prob + log_prob
                            ended = ((i == max_gen_len - 1) or (prediction_index.item() == caption[:, -1].item()))#.view(-1, 1)))
                            new_node = BeamNode(updated_log_prob, updated_tokens, ended)
                            beam_nodes.put(new_node)
                if sample == 1: # for backwards compatability
                    all_tokens.append(np.array(ended_list))
                else:
                    all_tokens.append(ended_list)
        return all_tokens
    
    def train_iter(self, caption_tensor, color_tensor, target_tensor, criterion):
        model_output = self.model(caption_tensor, color_tensor)
        
        #model_output = model_output[:, :-1, :].squeeze(0)
        model_output = model_output.view(-1, self.model.vocab_size)
        target_tensor = target_tensor.view(-1)
        loss = criterion(model_output, target_tensor)
        return loss

In [121]:
def color_phi_fourier(color_list, space="hsv", resolution=3):
    """
    This is lifted straight but modified to take straight hsv from https://github.com/futurulus/colors-in-context/blob/2e7b830668cd039830154e7e8f211c6d4415d30f/vectorizers.py#L650
    Haven't figured out how it works yet. but it seems to be the only feature function to get somewhat decent results so far
    """
#     if space not in ("rgb_norm", "hsv_norm"):
#         print("Space must be rgb_norm or hsv_norm to use fourier transform")
#         return None

    resolution = [resolution for _ in color_list]
    colors = np.array([color_list])
    # I'm pretty sure ranges aren't actually used anywhere...
    if space == "rgb_norm":
        ranges = np.array([256, 256, 256])
    else:
        ranges = np.array([361, 101, 101])

    color_list = [color_list[0]/360, color_list[1]/100, color_list[2]/100]
    
    # Using a Fourier representation causes colors at the boundary of the
    # space to behave as if the space is toroidal: red = 255 would be
    # about the same as red = 0. We don't want this... so we divide
    # all of the rgb values by 2. (If we were doing this with hsv
    # we wouldn't divide the h value by two becaus it actually is
    # polar, so 360 is close to 0 (both are red)
    if space == "rgb_norm":
        xyz = colors / 2
    else:
        xyz = colors / 2
        xyz[:, 0] *= 2 # this is the 'h' of the 'hsv'

    ax, ay, az = [np.arange(0, g) for g, r in zip(resolution, ranges)]
    gx, gy, gz = np.meshgrid(ax, ay, az)

    arg = (np.multiply.outer(xyz[:, 0], gx) +
           np.multiply.outer(xyz[:, 1], gy) +
           np.multiply.outer(xyz[:, 2], gz))

    repr_complex = np.exp(-2j * np.pi * (arg % 1.0)).swapaxes(1, 2).reshape((xyz.shape[0], -1))
    result = np.hstack([repr_complex.real, repr_complex.imag]).astype(np.float32)
    return result[0]

In [178]:
# get DNC
dnc_speaker = DNCSpeaker(DNCEncoder, num_epochs=5)
dnc_speaker.init_model(vocab_size = 368, embed_dim=100, color_dim=54) # vocab size just copied from dnc training notebook
# load model manually because we need to set "map_location" to cpu
dnc_speaker.model.load_state_dict(torch.load("checkpoint_4.params", map_location='cpu'))
#dnc_speaker.load_model("./model_checkpoint_4.params")

In [3]:
# get data
monroe_train_data = MonroeData("../color-evaluation/data/csv/train_corpus_monroe.csv", "../color-evaluation/data/entries/train_entries_monroe.pkl")
monroe_dev_data = MonroeData("../color-evaluation/data/csv/dev_corpus_monroe.csv", "../color-evaluation/data/entries/dev_entries_monroe.pkl")

In [179]:
# build dnc color featurizer:
def dnc_phi(color_list, space):
    if space != "hsv":
        print("Space must be hsv to use dnc")
        return None
    # we're going to do a greedy search for the tokens and then max-pool their embeddings
    features = [
        [np.array([[0, 1]]), np.array([color_phi_fourier(color_list)])]
    ]
    #print(features)
    predicted_tokens = dnc_speaker.predict(features) # looks like [array([[  0, 123,   1]])]
    
    predicted_tokens = predicted_tokens[0].flatten() # looks like array([  0, 123,   1])
    
    predicted_tokens = predicted_tokens[1:-1] # get rid of nasty start and end tokens
    
    # get embeddings for predicted tokens
    #print("hello", predicted_tokens)
    embeds = dnc_speaker.model.embed(torch.from_numpy(np.array(predicted_tokens)))
    return embeds

In [159]:
dnc_phi([100.0, 30.0, 30.0], space="hsv")

[[array([[0, 1]]), array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0.]], dtype=float32)]]
tensor([[0]])
hello [120]


tensor([[-0.0100, -0.9641, -1.5174,  0.0861,  1.5254,  1.0565,  0.4181,  0.1787,
          0.6285, -1.9980, -0.4956,  0.3684,  0.7323,  0.9005,  0.4452,  0.9927,
         -0.8477,  1.3443, -0.5014, -1.2765, -0.6567,  0.4856,  0.4202, -0.3767,
         -1.8097,  0.5443, -0.1397,  0.4094,  0.1381,  0.8562, -0.1073, -0.2321,
          0.1718,  0.0061, -0.3351, -0.0614,  1.2263,  1.3344,  0.3503, -0.4874,
         -0.7214, -0.4114,  1.2891, -1.2204,  1.3222, -0.1335, -0.2869,  1.0041,
         -1.9206,  1.0088,  0.4697,  0.6737,  1.4490, -1.4709,  0.3612, -0.3839,
         -1.1166,  0.5365, -0.4841,  0.5475, -1.6019,  1.1423, -0.9571,  2.1764,
         -0.6859,  1.9283,  0.6891,  1.1430, -0.4513, -0.3581, -1.1672, -1.0731,
          0.1990,  0.0097,  1.9685,  0.5761, -0.2622,  0.4326,  0.5301,  0.3559,
         -1.5529, -0.5831,  0.8287,  0.9119,  1.2720, -0.5678, -0.7216,  0.2540,
          0.6690, -1.2522, -1.0299, -0.3800,  0.4937,  0.6474, -0.5182, -2.2360,
          0.8794, -0.2466, -

In [40]:
test_features = [
        [np.array([[0, 1]]), np.array([[100.0, 30.0, 30.0]])]
    ]

In [41]:
test_features

[[array([[0, 1]]), array([[100.,  30.,  30.]])]]

In [160]:
test_phi = color_phi_fourier(np.array([100.,  30.,  30. ]))

In [161]:
test_phi

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0.], dtype=float32)

In [169]:
dnc_speaker.predict([[np.array([[0, 2, 1]]), test_phi]], beam_width=5, sample=1)

tensor([[0]])


[array([[  0, 120,   1]])]

In [141]:
test_predicted_tokens

[array([[  0, 367, 367, 367, 367,   2,   2,   2,   2, 367, 367, 367, 367,
         367, 367, 367, 365, 365,   2,   2,   2]])]

In [60]:
np.array([[0, 1]])[:, -1]

array([1])

In [55]:
test_predicted_tokens

[array([], dtype=float64)]

In [39]:
np.array([[0, 1]]).shape

(1, 2)

In [30]:
embeds = dnc_phi([100.0, 30.0, 30.0], space="hsv")

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 2. Got 2 and 1 in dimension 0 at /Users/administrator/nightlies/pytorch-1.0.0/wheel_build_dirs/wheel_3.6/pytorch/aten/src/TH/generic/THTensorMoreMath.cpp:1333

In [180]:
# OK because DNC isn't working, let's do this with the regular literal speaker/listener

print("Initializing featurizers")
caption_phi = caption_featurizers.CaptionFeaturizer(tokenizer=caption_featurizers.EndingTokenizer) # Use with parameter files that end in `endings_tkn` - using endings tokenizer to separate endings like "ish" and "er"
color_phi = ColorFeaturizer(dnc_phi, "hsv", normalized=False)


feature_handler = FeatureHandler(monroe_train_data, monroe_dev_data, caption_phi, color_phi,
                                randomized_colors=False)



Initializing featurizers


In [181]:
%%time
print("Obtaining training features") # get features even if you're runnning the pretrained model for example
train_features = feature_handler.train_features()
train_targets = feature_handler.train_targets()

Obtaining training features
CPU times: user 1h 4min 43s, sys: 2min 22s, total: 1h 7min 5s
Wall time: 1h 6min


In [182]:
import pickle
with open("dnc_lit_listener_and_speaker_train_features.params", "wb") as file:
    pickle.dump(train_features, file)

In [184]:
import numpy as np

In [189]:
np.allclose(*[i.detach().numpy() for i in train_features[0][1]])

False

In [200]:
np.allclose([i.detach().numpy() for i in train_features[0][1]][0][0], [i.detach().numpy() for i in train_features[8784][1]][1][0])

True

In [183]:
for i in train_features

[array([0, 1, 2, 3, 4, 5, 6]),
 array([tensor([[-0.0100, -0.9641, -1.5174,  0.0861,  1.5254,  1.0565,  0.4181,  0.1787,
           0.6285, -1.9980, -0.4956,  0.3684,  0.7323,  0.9005,  0.4452,  0.9927,
          -0.8477,  1.3443, -0.5014, -1.2765, -0.6567,  0.4856,  0.4202, -0.3767,
          -1.8097,  0.5443, -0.1397,  0.4094,  0.1381,  0.8562, -0.1073, -0.2321,
           0.1718,  0.0061, -0.3351, -0.0614,  1.2263,  1.3344,  0.3503, -0.4874,
          -0.7214, -0.4114,  1.2891, -1.2204,  1.3222, -0.1335, -0.2869,  1.0041,
          -1.9206,  1.0088,  0.4697,  0.6737,  1.4490, -1.4709,  0.3612, -0.3839,
          -1.1166,  0.5365, -0.4841,  0.5475, -1.6019,  1.1423, -0.9571,  2.1764,
          -0.6859,  1.9283,  0.6891,  1.1430, -0.4513, -0.3581, -1.1672, -1.0731,
           0.1990,  0.0097,  1.9685,  0.5761, -0.2622,  0.4326,  0.5301,  0.3559,
          -1.5529, -0.5831,  0.8287,  0.9119,  1.2720, -0.5678, -0.7216,  0.2540,
           0.6690, -1.2522, -1.0299, -0.3800,  0.4937,  0.64

In [111]:
assess_features = feature_handler.test_features()
assess_targets = feature_handler.test_targets()

In [112]:
assess_features[0]

[array([ 0, 96,  6]),
 array([[ 1.0000000e+00,  2.0711137e-01, -9.1420978e-01, -1.2241068e-01,
         -9.9631262e-01, -2.9028466e-01, -9.7003126e-01,  3.6807224e-02,
          9.8527765e-01, -2.0711137e-01, -1.0000000e+00, -2.0711137e-01,
         -9.4560730e-01,  1.2241068e-01,  9.9631262e-01,  4.3861625e-01,
          9.7003126e-01, -3.6807224e-02, -9.1420978e-01,  2.0711137e-01,
          1.0000000e+00,  5.1410276e-01,  9.4560730e-01, -1.2241068e-01,
          7.8834641e-01, -4.3861625e-01, -9.7003126e-01,  0.0000000e+00,
         -9.7831738e-01, -4.0524131e-01, -9.9247956e-01, -8.5797310e-02,
          9.5694035e-01,  2.4298018e-01,  9.9932235e-01,  1.7096189e-01,
         -9.7831738e-01, -1.2246469e-16,  9.7831738e-01,  3.2531029e-01,
          9.9247956e-01,  8.5797310e-02,  8.9867449e-01, -2.4298018e-01,
         -9.9932235e-01,  4.0524131e-01,  9.7831738e-01,  0.0000000e+00,
          8.5772860e-01, -3.2531029e-01, -9.9247956e-01, -6.1523157e-01,
         -8.9867449e-01,  2.4

In [None]:
for feature in assess_features:
    