In [1]:
from builtins import range
from builtins import object
import numpy as np

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence

from fede.distance_senteces import Distance_Sentences
from fede.distance_image import Distance_Image

In [None]:
class CaptioningRNN(nn.Module):
    """
    Caption scorer: a stacked LSTM over word embeddings whose initial hidden
    state is a linear projection of the image features.

    Notation used in the shape comments below: the model receives image
    feature vectors of size D, has a vocab size of V, works on sequences of
    length T, has an LSTM hidden dimension of H, uses word vectors of
    dimension W, and operates on minibatches of size N.

    The only regularization is dropout between the stacked LSTM layers.
    """

    def __init__(self,
                 word_to_idx,
                 input_dim=512,
                 wordvec_dim=128,
                 hidden_dim=128,
                 num_layers=10,
                 dropout=0.5,
                 dtype=torch.float32):
        """
        Construct a new CaptioningRNN instance.

        Inputs:
        - word_to_idx: A dictionary giving the vocabulary. It contains V entries,
          and maps each string to a unique integer in the range [0, V). It must
          contain the key '<NULL>'; '<START>' and '<END>' are optional.
        - input_dim: Dimension D of input image feature vectors.
        - wordvec_dim: Dimension W of word vectors.
        - hidden_dim: Dimension H for the hidden state of the LSTM.
        - num_layers: Number of stacked LSTM layers.
        - dropout: Dropout probability applied between stacked LSTM layers.
        - dtype: Floating point torch dtype the parameters are cast to.

        Raises:
        - KeyError: if '<NULL>' is missing from word_to_idx.
        """
        super(CaptioningRNN, self).__init__()

        vocab_size = len(word_to_idx)
        self.vocab_size = vocab_size
        self.wordvec_dim = wordvec_dim
        self.num_layers = num_layers

        # Sub-modules are created in a fixed order (embedding, projection,
        # LSTM, vocab head) so that seeded initialisation stays reproducible.
        self.W_embed = nn.Embedding(vocab_size, wordvec_dim)
        self.W_proj = nn.Linear(input_dim, hidden_dim)
        self.rnn = nn.LSTM(input_size=wordvec_dim,
                           hidden_size=hidden_dim,
                           num_layers=num_layers,
                           batch_first=True,
                           dropout=dropout)
        # Maps every hidden state to unnormalised scores over the vocabulary.
        self.W_vocab = nn.Linear(hidden_dim, vocab_size)

        self.criterion = nn.CrossEntropyLoss()

        # Indices of the special tokens.
        self._null = word_to_idx['<NULL>']
        self._start = word_to_idx.get('<START>', None)
        self._end = word_to_idx.get('<END>', None)

        self.word_to_idx = word_to_idx
        self.idx_to_word = {i: w for w, i in word_to_idx.items()}

        # `dtype` used to be read from an undefined name, which made the
        # constructor raise NameError. It is now an explicit argument and is
        # actually applied to the floating point parameters.
        self.dtype = dtype
        self.to(dtype)

    def forward(self, features, captions):
        """
        Compute vocabulary scores for every position of every caption.

        Inputs:
        - features: Image features of shape (N, D).
        - captions: Integer word indices of shape (N, T); cast to int64 here.

        Returns:
        - Unnormalised scores of shape (N, T, V).
        """
        # The projected image features seed the hidden state of every layer.
        h0 = self.W_proj(features)                              # (N, H)
        h0 = h0.unsqueeze(0).repeat(self.num_layers, 1, 1)      # (layers, N, H)

        # zeros_like keeps the cell state on the same device and dtype as h0,
        # so no separate CUDA flag is needed.
        c0 = torch.zeros_like(h0)

        word_embedding = self.W_embed(captions.long())          # (N, T, W)
        hiddens, _ = self.rnn(word_embedding, (h0, c0))         # (N, T, H)

        return self.W_vocab(hiddens)


In [None]:
class Discriminator(nn.Module):
    """
    Two-way classifier over sets of captions paired with image features.

    Every caption is encoded with a CaptioningRNN (fed all-zero image
    features), keeping only the scores of the last time step. The encodings
    are grouped into consecutive sets of `set_size` captions, passed through a
    sentence distance layer and an image distance layer, and the concatenated
    result is projected to two log-probabilities per set.

    NOTE(review): the meaning of the two output classes (presumably real vs.
    generated) is not visible in this file - confirm against the training code.
    """

    def __init__(self, word_to_idx, input_Dim=1, wordvec_Dim=128,
                 hidden_Dim=128, N=128, O=128, image_input=512, set_size=10):
        """
        Construct a new Discriminator instance.

        Inputs:
        - word_to_idx: Vocabulary dictionary, forwarded to CaptioningRNN.
        - input_Dim: Width of the placeholder image features fed to the
          sentence encoder.
        - wordvec_Dim: Word vector dimension of the sentence encoder.
        - hidden_Dim: Hidden dimension of the sentence encoder.
        - N, O: Forwarded to the distance layers; O also sizes the final
          projection, whose input width is (set_size + 1) * O.
        - image_input: Forwarded to the image distance layer.
        - set_size: Number of captions grouped into one set.
        """
        super(Discriminator, self).__init__()

        # The original referenced `MY_CaptioningRNN`, a name that is not
        # defined or imported in this file; CaptioningRNN is the encoder class.
        self.sentence_embedding = CaptioningRNN(word_to_idx,
                                                input_dim=input_Dim,
                                                wordvec_dim=wordvec_Dim,
                                                hidden_dim=hidden_Dim)

        vocab_size = len(word_to_idx)
        self.distance_layer_sentences = Distance_Sentences(vocab_size, N, O)
        self.distance_layer_images = Distance_Image(vocab_size, N, O, image_input)

        # Remembered so forward() can build placeholder features of the
        # right width.
        self.input_Dim = input_Dim
        self.set_size = set_size
        self.projection = nn.Linear((self.set_size + 1) * O, 2)

    def forward(self, captions, features):
        """
        Score sets of captions against image features.

        Inputs:
        - captions: Integer word indices of shape (num_captions, T);
          num_captions must be a multiple of set_size.
        - features: Image features, passed unchanged to the image distance
          layer. NOTE(review): presumably one row per set - confirm.

        Returns:
        - Log-probabilities over the two classes, one row per set.

        Raises:
        - ValueError: if num_captions is not a multiple of set_size.
        """
        num_captions = captions.shape[0]
        if num_captions % self.set_size != 0:
            # Without this check the reshape below could silently mix
            # captions of different sets whenever the element count happens
            # to divide evenly.
            raise ValueError(
                "number of captions (%d) must be a multiple of set_size (%d)"
                % (num_captions, self.set_size))

        # The sentence encoder is driven without image information: it gets
        # all-zero features, created on the model's own device and dtype.
        reference = self.projection.weight
        blank_features = torch.zeros(num_captions, self.input_Dim,
                                     device=reference.device,
                                     dtype=reference.dtype)

        scores = self.sentence_embedding(blank_features, captions)
        last_step = scores[:, -1, :]    # keep only the final time step

        # Integer division: `/` yields a float in Python 3, which is not a
        # valid shape argument.
        sets = last_step.reshape(num_captions // self.set_size,
                                 self.set_size, -1)

        # NOTE(review): the distance layers are project classes not visible
        # here, so they are still invoked through .forward as before.
        o_sentence = self.distance_layer_sentences.forward(sets)
        o_image = self.distance_layer_images.forward(sets, features)

        combined = torch.cat((o_image, o_sentence), 1)
        return nn.functional.log_softmax(self.projection(combined), dim=1)
        
