In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import os
import torch
import numpy as np
import torch.nn as nn
import matplotlib.pyplot as plt
from cs231n.coco_utils import load_coco_data, sample_coco_minibatch, decode_captions

%matplotlib inline
# Notebook-wide matplotlib defaults: figure size, pixel-exact image
# rendering (no interpolation) and a grayscale colormap for images.
plt.rcParams.update({
    'figure.figsize': (10.0, 8.0),
    'image.interpolation': 'nearest',
    'image.cmap': 'gray',
})

In [4]:
from distance_senteces import Distance_Sentences
from distance_image import Distance_Image

In [5]:
class CaptioningModel(nn.Module):
    """
    LSTM-based captioning model driven by image features.

    Image feature vectors of size D are projected to the LSTM hidden size H and
    used as the initial hidden state of every LSTM layer (the initial cell
    state is zero). Caption tokens are embedded into vectors of dimension W,
    fed through the LSTM, and every LSTM output is mapped to scores over the V
    vocabulary entries.

    Note that no regularization is used.
    """

    def __init__(self, word_to_idx,
                 input_dim=512,
                 wordvec_dim=128,
                 hidden_dim=128,
                 num_layers=1,
                 use_cuda = False,
                 device = torch.device("cuda:0")):
        """
        Construct a new CaptioningModel instance.

        Inputs:
        - word_to_idx: A dictionary giving the vocabulary. It contains V entries,
          and maps each string to a unique integer in the range [0, V). It must
          contain '<NULL>'; '<START>' and '<END>' are optional here, but
          '<START>' is required by `sample`.
        - input_dim: Dimension D of input image feature vectors.
        - wordvec_dim: Dimension W of word vectors.
        - hidden_dim: Dimension H for the hidden state of the LSTM.
        - num_layers: Number of stacked LSTM layers.
        - use_cuda: Request CUDA; honoured only if CUDA is actually available.
        - device: Device recorded in `self.device` when CUDA is used; otherwise
          `self.device` is the CPU device.
        """
        super(CaptioningModel, self).__init__()

        # Always define `use_cuda`, so code reading the attribute (for example
        # a wrapper module forwarding it) never hits an AttributeError on
        # CPU-only runs.
        self.use_cuda = bool(use_cuda) and torch.cuda.is_available()
        self.device = device if self.use_cuda else torch.device("cpu")

        self.wordvec_dim = wordvec_dim
        self.num_layers = num_layers
        vocab_size = len(word_to_idx)
        self.vocab_size = vocab_size

        self.W_embed = nn.Embedding(vocab_size, wordvec_dim)  # word vectors
        self.W_proj = nn.Linear(input_dim, hidden_dim)        # image features -> h0

        self.rnn = nn.LSTM(input_size=wordvec_dim,
                           hidden_size=hidden_dim,
                           num_layers=self.num_layers,
                           batch_first=True)

        self.W_vocab = nn.Linear(hidden_dim, vocab_size)      # hidden state -> vocab scores
        self.criterion = nn.CrossEntropyLoss()

        self._null = word_to_idx['<NULL>']
        self._start = word_to_idx.get('<START>', None)
        self._end = word_to_idx.get('<END>', None)
        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}

    def _initial_state(self, features):
        """Build the LSTM (h0, c0): projected features for every layer, zero cell state."""
        h0 = self.W_proj(features).unsqueeze(0).repeat(self.num_layers, 1, 1)
        # zeros_like keeps c0 on the same device/dtype as h0, so the model works
        # wherever its parameters live, independent of `self.device`.
        c0 = torch.zeros_like(h0)
        return h0, c0

    def forward(self, features, captions):
        """
        Score every vocabulary word at every caption position.

        Inputs:
        - features: Tensor of image features of shape (N, D).
        - captions: Long tensor of token indices of shape (N, T).

        Returns:
        - Tensor of shape (N, T, V) with unnormalized vocabulary scores.
        """
        word_embedding = self.W_embed(captions)
        hiddens, _ = self.rnn(word_embedding, self._initial_state(features))
        return self.W_vocab(hiddens)

    def sample(self, features, max_length=30):
        """
        Run a test-time forward pass for the model, greedily decoding captions
        for the input feature vectors.

        At each timestep the current word is embedded and passed, together with
        the previous hidden and cell state, to the LSTM; the top layer's hidden
        state is mapped to vocabulary scores and the highest-scoring word
        becomes the next word. The initial hidden state is an affine transform
        of the image features, the initial cell state is zero, and the initial
        word is the <START> token. Decoding does not stop at <END>.

        Inputs:
        - features: Tensor of input image features of shape (N, D).
        - max_length: Maximum length T of generated captions (at least 1).

        Returns:
        - captions: NumPy array of shape (N, max_length) of integers in the
          range [0, V). Column 0 holds the <START> token; columns 1.. hold the
          sampled words.

        Raises:
        - ValueError: if the vocabulary has no '<START>' token.
        """
        if self._start is None:
            raise ValueError("sample() requires a '<START>' token in word_to_idx")

        N = features.shape[0]
        # Decoding returns NumPy indices only, so no autograd graph is needed.
        with torch.no_grad():
            captions = torch.full((N, max_length), self._null,
                                  dtype=torch.long, device=features.device)
            captions[:, 0] = self._start

            next_h, next_c = self._initial_state(features)

            for t in range(max_length - 1):
                word = self.W_embed(captions[:, t]).unsqueeze(1)  # (N, 1, W)
                _, (next_h, next_c) = self.rnn(word, (next_h, next_c))
                scores = self.W_vocab(next_h[-1])                 # top layer -> (N, V)
                captions[:, t + 1] = scores.max(dim=1)[1]

        return captions.cpu().numpy()

In [6]:
class Discriminator(nn.Module):
    """
    Set-level caption discriminator.

    Each caption is embedded by running a CaptioningModel on all-zero "image
    features" and keeping the vocabulary scores of the last time step.
    Embeddings are grouped into sets of `set_size` consecutive captions, passed
    through a sentence-distance layer and an image-distance layer, and the
    concatenated distance features are projected to two log-probabilities per
    set.
    """

    def __init__(self,
                 word_to_idx,
                 input_Dim=1,
                 wordvec_Dim=128,
                 hidden_Dim=128,
                 num_layers=1,
                 N=128,
                 O=128,
                 image_input=512,
                 set_size=10,
                 use_cuda = False,
                 device = torch.device("cpu"),
                 verbose=True):
        """
        Construct a new Discriminator instance.

        Inputs:
        - word_to_idx: Vocabulary dictionary mapping each word to an index.
        - input_Dim: Width of the dummy feature vector fed to the sentence
          embedding model.
        - wordvec_Dim: Word vector dimension of the sentence embedding model.
        - hidden_Dim: LSTM hidden dimension of the sentence embedding model.
        - num_layers: Number of LSTM layers of the sentence embedding model.
        - N, O: Size arguments forwarded to the distance layers; the final
          projection expects (set_size + 1) * O input features.
          (NOTE(review): exact meaning of N is defined by Distance_Sentences /
          Distance_Image - confirm there.)
        - image_input: Image feature dimension forwarded to Distance_Image.
        - set_size: Number of consecutive captions forming one set.
        - use_cuda: Request CUDA; honoured only if CUDA is actually available.
        - device: Device recorded in `self.device` when CUDA is used.
        - verbose: When True (the default, matching the historical behaviour)
          `forward` prints intermediate tensor shapes; pass False to silence it.
        """
        super(Discriminator, self).__init__()

        # Always define `use_cuda`: it is forwarded to CaptioningModel below and
        # previously did not exist on the CPU path (AttributeError).
        self.use_cuda = bool(use_cuda) and torch.cuda.is_available()
        self.device = device if self.use_cuda else torch.device("cpu")
        self.verbose = verbose
        self.input_dim = input_Dim

        self.sentence_embedding= CaptioningModel(word_to_idx,
                                                 input_dim=input_Dim,
                                                 wordvec_dim=wordvec_Dim,
                                                 num_layers = num_layers,
                                                 hidden_dim=hidden_Dim,
                                                 use_cuda = self.use_cuda,
                                                 device = self.device)

        vocab_size=len(word_to_idx)
        self.distance_layer_sentences = Distance_Sentences(vocab_size, N, O)
        self.distance_layer_images = Distance_Image(vocab_size, N, O, image_input)

        self.set_size = set_size
        self.projection = nn.Linear((self.set_size+1)*O, 2)

    def _log_shape(self, *message):
        """Print a debug message only when verbose output is enabled."""
        if self.verbose:
            print(*message)

    def forward(self, captions, features):
        """
        Classify sets of captions against image features.

        Inputs:
        - captions: Long tensor of token indices of shape (nsamples, T);
          nsamples must be a multiple of `set_size`.
        - features: Image features handed to the image-distance layer
          (presumably one row per caption set - confirm against Distance_Image).

        Returns:
        - Tensor with two log-probabilities per caption set (log-softmax over
          dim 1).
        """
        nsamples, _ = captions.shape

        # Dummy all-zero features whose width matches the embedding model's
        # input projection, created on the same device as the captions.
        ft = torch.zeros(nsamples, self.input_dim, device=captions.device)
        self._log_shape("input to embedding", ft.shape)
        S = self.sentence_embedding(ft, captions)
        self._log_shape("Sentence Embedding Shape", S.shape)
        S = S[:,-1,:]  # keep only the last time step of every caption
        self._log_shape("Sentence Embedding Modified Shape", S.shape)

        # Group consecutive captions into sets of `set_size`.
        S = S.view(S.shape[0] // self.set_size, self.set_size, -1)
        self._log_shape(S.shape)

        o_sentence = self.distance_layer_sentences.forward(S)
        self._log_shape("sentence distance", o_sentence.shape)

        o_image = self.distance_layer_images.forward(S,features)
        self._log_shape("image distance", o_image.shape)

        o = torch.cat((o_image, o_sentence), 1)
        self._log_shape("distances concatenated", o.shape)

        D = nn.functional.log_softmax(self.projection(o),dim=1)
        self._log_shape("logmax performed", D.shape)

        return D

# Load MS-COCO data
As in the previous notebook, we will use the Microsoft COCO dataset for captioning.

In [7]:
# Read the COCO captioning data from disk into a dictionary. The PCA-reduced
# image features are used here; flip the flag to work with the original
# features instead.
data = load_coco_data(pca_features=True)

# Summarize each entry: shape and dtype for NumPy arrays, length otherwise.
for k, v in data.items():
    details = (v.shape, v.dtype) if type(v) is np.ndarray else (len(v),)
    print(k, type(v), *details)

train_captions <class 'numpy.ndarray'> (400135, 17) int32
train_image_idxs <class 'numpy.ndarray'> (400135,) int32
val_captions <class 'numpy.ndarray'> (195954, 17) int32
val_image_idxs <class 'numpy.ndarray'> (195954,) int32
train_features <class 'numpy.ndarray'> (82783, 512) float32
val_features <class 'numpy.ndarray'> (40504, 512) float32
idx_to_word <class 'list'> 1004
word_to_idx <class 'dict'> 1004
train_urls <class 'numpy.ndarray'> (82783,) <U63
val_urls <class 'numpy.ndarray'> (40504,) <U63


In [8]:
# Prefer the GPU this notebook was developed on, but degrade gracefully so the
# notebook still runs on machines with fewer GPUs or none at all.
PREFERRED_GPU = 2
if torch.cuda.device_count() > PREFERRED_GPU:
    device = torch.device("cuda", PREFERRED_GPU)
elif torch.cuda.is_available():
    device = torch.device("cuda", 0)
else:
    device = torch.device("cpu")

In [9]:
# Load a reduced training split and draw one minibatch of 1000 examples.
sample_data = load_coco_data(max_train=20000)
minibatch = sample_coco_minibatch(sample_data, split='train', batch_size=1000)

# Unpack the minibatch and move captions (as int64 indices) and features to
# the selected device.
captions, features, urls = minibatch
captions = torch.from_numpy(captions).to(device=device, dtype=torch.long)
features = torch.from_numpy(features).to(device)

In [10]:
# Sanity check: caption tensor shape, feature tensor shape, vocabulary size.
for summary in (captions.shape, features.shape, len(sample_data['word_to_idx'])):
    print(summary)

torch.Size([1000, 17])
torch.Size([1000, 512])
1004


In [27]:
# Decode the first caption of the minibatch back into words, stopping at the
# first padding token.
sentence = []
for ix in captions[0]:
    wrd = sample_data['idx_to_word'][ix]
    # Equality, not `in`: `wrd in "<NULL>"` is a substring test and would also
    # stop on any word that happens to be a substring of "<NULL>".
    if wrd == "<NULL>":
        break
    sentence.append(wrd)
print(" ".join(sentence))

<START> a silver train going down some train tracks <END>


In [28]:
# Build the discriminator for this vocabulary and feature width, then move its
# parameters to the selected device.
discriminator_model = Discriminator(
    word_to_idx=sample_data['word_to_idx'],
    image_input=features.shape[1],
    use_cuda=True,
    device=device,
)
discriminator_model = discriminator_model.to(device)

In [29]:
# Smoke-test the forward pass: all captions in the minibatch together with the
# first 100 feature rows.
h = discriminator_model(captions, features[:100])

input to embedding torch.Size([1000, 1])
Sentence Embedding Shape torch.Size([1000, 17, 1004])
Sentence Embedding Modified Shape torch.Size([1000, 1004])
torch.Size([100, 10, 1004])
sentence distance torch.Size([100, 1280])
image distance torch.Size([100, 128])
distances concatenated torch.Size([100, 1408])
logmax performed torch.Size([100, 2])


In [30]:
%%time
# Reduce the discriminator output to a scalar and backpropagate through it to
# check that gradients flow end to end.
h = h.sum()
h.backward()

CPU times: user 948 ms, sys: 390 ms, total: 1.34 s
Wall time: 1.11 s
