In [1]:
# Make the parent directory importable so the project's modules (one level up) can be loaded.
# The membership check keeps this cell idempotent: re-running it does not keep appending
# duplicate ".." entries to sys.path.
import sys

if ".." not in sys.path:
    sys.path.append("..")

In [2]:
from monroe_data import MonroeData, MonroeDataEntry, Color # for loading in training data
import caption_featurizers                              # for getting caption representations
import color_featurizers                                # for getting color representations
from experiment import FeatureHandler                   # for combining caption and color features

from models import PytorchModel, LiteralSpeaker         # model base that handles training / evaluation

In [3]:
# Load the train and dev splits; each MonroeData is given a CSV corpus path
# and a .pkl entries path.
train_data = MonroeData(
    "../data/csv/train_corpus_monroe.csv",
    "../data/entries/train_entries_monroe.pkl",
)
dev_data = MonroeData(
    "../data/csv/dev_corpus_monroe.csv",
    "../data/entries/dev_entries_monroe.pkl",
)

In [4]:
# Feature functions.
# Captions: a CaptionFeaturizer configured with the character-level tokenizer.
caption_phi = caption_featurizers.CaptionFeaturizer(
    tokenizer=caption_featurizers.CharacterTokenizer
)

# Colors: use the Fourier representation from the HSV space, with all HSV values
# normalized to be between 0 and 1.
color_phi = color_featurizers.ColorFeaturizer(
    color_featurizers.color_phi_fourier,
    "hsv",
    normalized=True,
)

In [5]:
# The speaker's target is the sequence of tokens that follow the SOS token.
def speaker_target(data_entry):
    """Build the speaker's training target for a single data entry.

    Featurizes ``data_entry.caption`` with the notebook-level ``caption_phi`` and
    returns the resulting caption ids with the first one dropped. Per the comment
    above, that first id is the SOS token (TODO confirm against the featurizer).

    NOTE(review): the original author marked the ``to_string_features`` call as
    unverified ("this probably works...").
    """
    # to_string_features yields a pair; only the second element (the ids) is used here
    _unused, token_ids = caption_phi.to_string_features(data_entry.caption)
    return token_ids[1:]
# Build the feature handler from:
#   - the train and dev data,
#   - our caption and color feature functions,
#   - a function that turns one element of our data (train or dev) into its target.
# We only care about the target (which is the first color), so we do not randomize the
# order of the colors and risk messing that up.
# We aren't going to use the dev data to train the model, but we still include it in the
# feature handler.
feature_handler = FeatureHandler(
    train_data,
    dev_data,
    caption_phi,
    color_phi,
    target_fn=speaker_target,
    randomized_colors=False,
)

In [6]:
# Pull the training inputs and their targets out of the feature handler
# (targets are produced via the target_fn given to the handler — presumably; verify in FeatureHandler).
X_train = feature_handler.train_features()
y_train = feature_handler.train_targets()

In [7]:
# now would be where we define the model:
import torch
from torch import nn
import numpy as np

In [13]:
class UnconditionalCharacterCaptionGenerator(nn.Module):
    """Caption generator that never looks at the color input.

    Pipeline: embedding -> LSTM (batch_first) -> linear projection to ``vocab_size``
    -> log-softmax over dimension 2.

    Args:
        vocab_size: number of rows in the embedding table and width of the output layer.
        embed_dim: size of each embedding vector (the LSTM's input size).
        speaker_hidden_dim: hidden size of the LSTM (the linear layer's input size).
    """

    def __init__(self, vocab_size, embed_dim, speaker_hidden_dim):
        super(UnconditionalCharacterCaptionGenerator, self).__init__()

        # text pathway: token ids -> embeddings -> LSTM states -> vocabulary scores
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)
        self.speaker_lstm = nn.LSTM(
            input_size=embed_dim,
            hidden_size=speaker_hidden_dim,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=speaker_hidden_dim, out_features=vocab_size)
        # normalizes along dim 2, i.e. the vocabulary axis of the linear layer's 3-D output
        self.logsoftmax = nn.LogSoftmax(dim=2)

    def forward(self, color, captions):
        """Return log-softmax-normalized vocabulary scores for every caption position.

        Args:
            color: accepted only so this model fits the same framework as the other
                caption generators; it is never read because this speaker is
                UNCONDITIONAL.
            captions: token ids to embed — assumes a (batch, seq_len) integer tensor,
                since the LSTM is batch_first and the log-softmax acts on dim 2
                (TODO confirm against the caller).

        Returns:
            The output of ``LogSoftmax(dim=2)`` applied to the projected LSTM outputs.
        """
        # Pure teacher forcing for now: the given captions are always the LSTM input and
        # the network's own predictions are never fed back in some percentage of the time.
        # Per the original note, not feeding predictions back leads to greater instability
        # but is cleaner to implement.
        lstm_states = self.speaker_lstm(self.embed(captions))[0]
        return self.logsoftmax(self.linear(lstm_states))
        

In [14]:
# Wrap the generator in LiteralSpeaker for training; the same wrapper can later be used
# to sample from the model.
unconditional_character_s0 = LiteralSpeaker(
    UnconditionalCharacterCaptionGenerator,
    optimizer=torch.optim.Adam,
    lr=0.004,
    num_epochs=5,
)
# The vocabulary size comes from the caption featurizer's indexer held by the feature handler.
unconditional_character_s0.init_model(
    vocab_size=feature_handler.caption_featurizer.caption_indexer.size,
    embed_dim=50,
    speaker_hidden_dim=100,
)

In [15]:
# Train the speaker on the training features and targets; per-epoch progress, running loss,
# and an average validation loss are printed in the output below.
unconditional_character_s0.fit(X_train, y_train)

---EPOCH 0---
0m 0s (0:0 0.00%) 0.0040
0m 5s (0:1000 7.90%) 1.3070
0m 12s (0:2000 15.79%) 1.0848
0m 18s (0:3000 23.69%) 1.1525
0m 26s (0:4000 31.58%) 1.0775
0m 33s (0:5000 39.48%) 1.2551
0m 39s (0:6000 47.37%) 0.9923
0m 45s (0:7000 55.27%) 0.9124
0m 51s (0:8000 63.17%) 1.0461
0m 57s (0:9000 71.06%) 0.8670
1m 3s (0:10000 78.96%) 0.9100
1m 9s (0:11000 86.85%) 0.9022
1m 15s (0:12000 94.75%) 0.8522
AFTER EPOCH 2999 - AVERAGE VALIDATION LOSS: 1.127891745229562
---EPOCH 1---
1m 24s (1:0 0.00%) 0.0007
1m 29s (1:1000 7.90%) 0.8786
1m 35s (1:2000 15.79%) 0.9381
1m 43s (1:3000 23.69%) 1.0348
1m 50s (1:4000 31.58%) 0.9996
1m 58s (1:5000 39.48%) 1.1612
2m 5s (1:6000 47.37%) 0.9259
2m 11s (1:7000 55.27%) 0.8601
2m 18s (1:8000 63.17%) 0.9946
2m 24s (1:9000 71.06%) 0.8351
2m 30s (1:10000 78.96%) 0.8675
2m 35s (1:11000 86.85%) 0.8709
2m 40s (1:12000 94.75%) 0.8265
AFTER EPOCH 2999 - AVERAGE VALIDATION LOSS: 1.0845355052848658
---EPOCH 2---
2m 48s (2:0 0.00%) 0.0007
2m 54s (2:1000 7.90%) 0.8513
3m 0s (

In [16]:
# Save the model under ../model (presumably its trained parameters, given the .params
# file name — verify in LiteralSpeaker.save_model).
unconditional_character_s0.save_model("../model/unconditional_character_literal_speaker.params")