In [5]:
import torch

In [1]:
import torch.nn as nn

In [3]:
class Decoder(nn.Module):
    """Caption decoder that unrolls an ``nn.LSTMCell`` one time step at a time.

    The initial hidden and cell states are computed from the image features by
    two linear layers (``init_h`` and ``init_c``).

    Args:
        embedding_size: Size of each token embedding.
        vocab_size: Number of tokens in the vocabulary (width of the output layer).
        hidden_size: Size of the LSTM hidden and cell states.
        start_token: Token id fed to the LSTM on the first prediction step.
        end_token: Token id that marks a caption as finished.
        pad_token: Token id written at positions after a caption has finished.
        feature_size: Size of the last dimension of ``features``. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, embedding_size, vocab_size, hidden_size, start_token, end_token, pad_token,
                 feature_size=512):
        super(Decoder, self).__init__()
        self.hidden_size = hidden_size
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTMCell(embedding_size, hidden_size)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.init_h = nn.Linear(feature_size, hidden_size)  # features -> initial hidden state of the LSTMCell
        self.init_c = nn.Linear(feature_size, hidden_size)  # features -> initial cell state of the LSTMCell
        self.vocab_size = vocab_size
        # Kept for callers that read it; the methods below follow the device of their inputs instead.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, features, captions, caption_lengths):
        """Teacher-forced pass over a batch of captions.

        The batch is sorted by decreasing caption length so that, at step ``t``,
        the captions still being decoded form a prefix of the batch.

        Args:
            features: Image features, one row per sample; last dimension must
                match ``feature_size``.
            captions: Token ids, indexed as ``captions[sample, position]``.
            caption_lengths: 1-D tensor with one length per sample
                (assumed to count every token of the caption -- TODO confirm with the data loader).

        Returns:
            ``(output, sort_ind)``. ``output`` has shape
            ``(batch, max(caption_lengths) - 1, vocab_size)`` and holds the raw
            scores in *sorted* order, with zeros at steps beyond a caption's
            length. ``sort_ind`` is the permutation that was applied, so callers
            can reorder their targets.
        """
        batch_size = features.shape[0]
        sorted_caption_lengths, sort_ind = caption_lengths.sort(descending=True)
        features = features[sort_ind]
        captions = captions[sort_ind]
        caption_embeddings = self.embedding(captions)
        # The last token of a caption is never fed as input, hence one step less.
        decode_lengths = sorted_caption_lengths - 1
        max_steps = int(decode_lengths.max()) if batch_size > 0 else 0
        # Allocate on the same device as the inputs so the scores do not silently end up on the CPU.
        output = torch.zeros((batch_size, max_steps, self.vocab_size), device=features.device)
        hx = self.init_h(features)
        cx = self.init_c(features)
        for t in range(max_steps):
            # Number of captions that still have a token to consume at step t.
            batch_size_t = int((decode_lengths > t).sum())
            hx, cx = self.lstm(caption_embeddings[:batch_size_t, t, :], (hx[:batch_size_t], cx[:batch_size_t]))
            output[:batch_size_t, t, :] = self.fc(hx)
        return output, sort_ind

    def predict(self, features, max_length=100):
        """Greedy decoding: feed the arg-max token back in until ``end_token``.

        Every step writes into full-batch buffers, so row ``b`` of both results
        always belongs to sample ``b`` even after other samples have finished.

        Args:
            features: Image features, one row per sample.
            max_length: Upper bound on the number of decoding steps.

        Returns:
            ``(scores, tokens)``. ``scores`` has shape ``(batch, steps, vocab_size)``
            with zeros at steps after a sample finished; ``tokens`` has shape
            ``(batch, steps)`` with ``pad_token`` at those steps.
        """
        batch_size = features.shape[0]
        device = features.device
        hx = self.init_h(features)
        cx = self.init_c(features)
        current_pred = torch.full((batch_size,), self.start_token, dtype=torch.long, device=device)
        output = []
        predictions = []
        step = 0
        while step < max_length:
            unfinished_pred = current_pred != self.end_token
            if not bool(unfinished_pred.any()):
                break
            # Only the samples that have not emitted end_token are advanced.
            hx_, cx_ = self.lstm(self.embedding(current_pred[unfinished_pred]),
                                 (hx[unfinished_pred], cx[unfinished_pred]))
            raw_scores = self.fc(hx_)
            hx[unfinished_pred] = hx_
            cx[unfinished_pred] = cx_
            prediction = raw_scores.argmax(1)
            # Scatter the results of the active rows back to their batch positions.
            step_scores = raw_scores.new_zeros((batch_size, raw_scores.shape[1]))
            step_scores[unfinished_pred] = raw_scores
            step_tokens = torch.full((batch_size,), self.pad_token, dtype=torch.long, device=device)
            step_tokens[unfinished_pred] = prediction
            output.append(step_scores)
            predictions.append(step_tokens)
            current_pred[unfinished_pred] = prediction
            step += 1
        if not output:
            # Empty batch or max_length <= 0: nothing was decoded.
            return (features.new_zeros((batch_size, 0, self.vocab_size)),
                    torch.zeros((batch_size, 0), dtype=torch.long, device=device))
        return torch.stack(output, dim=1), torch.stack(predictions, dim=1)
        
        

In [4]:
class DecoderImprovedTraining(nn.Module):
    """Caption decoder trained with a fused ``nn.LSTM`` over packed sequences.

    For step-by-step prediction the trained weights are copied into an
    ``nn.LSTMCell`` by :meth:`prepare_for_prediction`; the ``predict*`` methods
    use that cell and therefore require it to have been called first.

    Args:
        embedding_size: Size of each token embedding.
        vocab_size: Number of tokens in the vocabulary (width of the output layer).
        hidden_size: Size of the LSTM hidden and cell states.
        start_token: Token id fed to the LSTM on the first prediction step.
        end_token: Token id that marks a caption as finished.
        pad_token: Token id written at positions after a caption has finished.
        feature_size: Size of the last dimension of ``features``. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, embedding_size, vocab_size, hidden_size, start_token, end_token, pad_token,
                 feature_size=512):
        super(DecoderImprovedTraining, self).__init__()
        self.hidden_size = hidden_size
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token
        self.embedding_size = embedding_size
        self.embedding = nn.Embedding(vocab_size, embedding_size)
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.init_h = nn.Linear(feature_size, hidden_size)  # features -> initial hidden state of the LSTM
        self.init_c = nn.Linear(feature_size, hidden_size)  # features -> initial cell state of the LSTM
        self.vocab_size = vocab_size
        # Kept for callers that read it; the methods below follow the device of their inputs instead.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    def forward(self, features, captions, caption_length):
        """Teacher-forced pass using a packed sequence.

        ``pack_padded_sequence`` is called with its default ``enforce_sorted=True``,
        so the batch must already be ordered by decreasing ``caption_length``.

        Args:
            features: Image features, one row per sample.
            captions: Token ids, indexed as ``captions[sample, position]``.
            caption_length: 1-D tensor of caption lengths; one step less than
                each length is fed to the LSTM.

        Returns:
            Raw scores of shape ``(batch, max(caption_length) - 1, vocab_size)``.
        """
        caption_embeddings = self.embedding(captions)
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(
            caption_embeddings, caption_length.cpu() - 1, batch_first=True)
        # nn.LSTM expects states shaped (num_layers, batch, hidden).
        h0 = self.init_h(features).unsqueeze(0)
        c0 = self.init_c(features).unsqueeze(0)
        packed_hidden, _ = self.lstm(packed_embeddings, (h0, c0))
        hidden, _ = nn.utils.rnn.pad_packed_sequence(packed_hidden, padding_value=0, batch_first=True)
        return self.fc(hidden)

    def prepare_for_prediction(self, beam_width):
        """Create ``self.lstm_cell`` from the trained LSTM weights and store ``beam_width``.

        The cell is created on the device that currently holds the LSTM weights.
        """
        weights_device = next(self.lstm.parameters()).device
        self.lstm_cell = nn.LSTMCell(self.embedding_size, self.hidden_size, device=weights_device)
        # nn.LSTM names its parameters e.g. "weight_ih_l0"; dropping the last
        # three characters ("_l0") yields the nn.LSTMCell names.
        self.lstm_cell.load_state_dict({key[:-3]: state for key, state in self.lstm.state_dict().items()})
        self.beam_width = beam_width

    def predict_first_tokens(self, features):
        """First beam-search step: feed ``start_token`` and keep the top ``beam_width`` tokens.

        Returns:
            ``(score, tokens, h1, c1)`` where ``score`` and ``tokens`` have shape
            ``(batch, beam_width)`` (log-probabilities and token ids) and
            ``h1``/``c1`` are the LSTM states after the step.
        """
        batch_size = features.shape[0]
        h0 = self.init_h(features)
        c0 = self.init_c(features)
        start_tokens = torch.full((batch_size,), self.start_token, dtype=torch.long, device=features.device)
        h1, c1 = self.lstm_cell(self.embedding(start_tokens), (h0, c0))
        log_probs = nn.functional.log_softmax(self.fc(h1), 1)
        score, tokens = log_probs.topk(self.beam_width, dim=1)  # both shaped batch x beam_width
        return score, tokens, h1, c1

    def predict_next_token(self, score, h_i, c_i, tokens):
        """One further beam-search step.

        The accumulated ``score`` of each row is added to the log-probabilities
        of its continuations, and the top ``beam_width`` entries are selected
        over *all* rows jointly. ``new_tokens`` therefore contains indices into
        the flattened ``rows x vocab_size`` score matrix, and the returned
        states are not reordered.

        Returns:
            ``(new_score, new_tokens, h, c)``.
        """
        h, c = self.lstm_cell(self.embedding(tokens), (h_i, c_i))
        log_probs = nn.functional.log_softmax(self.fc(h), 1)
        total_score = (log_probs + score.unsqueeze(dim=-1)).flatten()
        new_score, new_tokens = total_score.topk(self.beam_width)
        return new_score, new_tokens, h, c

    def predict(self, features, max_length=100):
        """Greedy decoding: feed the arg-max token back in until ``end_token``.

        Every step writes into full-batch buffers, so row ``b`` of both results
        always belongs to sample ``b`` even after other samples have finished.

        Args:
            features: Image features, one row per sample.
            max_length: Upper bound on the number of decoding steps.

        Returns:
            ``(scores, tokens)``. ``scores`` has shape ``(batch, steps, vocab_size)``
            with zeros at steps after a sample finished; ``tokens`` has shape
            ``(batch, steps)`` with ``pad_token`` at those steps.
        """
        batch_size = features.shape[0]
        device = features.device
        hx = self.init_h(features)
        cx = self.init_c(features)
        current_pred = torch.full((batch_size,), self.start_token, dtype=torch.long, device=device)
        output = []
        predictions = []
        step = 0
        while step < max_length:
            unfinished_pred = current_pred != self.end_token
            if not bool(unfinished_pred.any()):
                break
            # Only the samples that have not emitted end_token are advanced.
            hx_, cx_ = self.lstm_cell(self.embedding(current_pred[unfinished_pred]),
                                      (hx[unfinished_pred], cx[unfinished_pred]))
            raw_scores = self.fc(hx_)
            hx[unfinished_pred] = hx_
            cx[unfinished_pred] = cx_
            prediction = raw_scores.argmax(1)
            # Scatter the results of the active rows back to their batch positions.
            step_scores = raw_scores.new_zeros((batch_size, raw_scores.shape[1]))
            step_scores[unfinished_pred] = raw_scores
            step_tokens = torch.full((batch_size,), self.pad_token, dtype=torch.long, device=device)
            step_tokens[unfinished_pred] = prediction
            output.append(step_scores)
            predictions.append(step_tokens)
            current_pred[unfinished_pred] = prediction
            step += 1
        if not output:
            # Empty batch or max_length <= 0: nothing was decoded.
            return (features.new_zeros((batch_size, 0, self.vocab_size)),
                    torch.zeros((batch_size, 0), dtype=torch.long, device=device))
        return torch.stack(output, dim=1), torch.stack(predictions, dim=1)

In [None]:
class DecoderPretrainedEmbeddings(nn.Module):
    """Caption decoder whose token embeddings start from a pretrained matrix.

    The embedding layer is created with ``freeze=False``, so it is fine-tuned
    together with the rest of the model. Training uses a fused ``nn.LSTM``; for
    step-by-step prediction the weights are copied into an ``nn.LSTMCell`` by
    :meth:`prepare_for_prediction`, which the ``predict*`` methods require.

    Args:
        embedding_matrix: 2-D tensor of pretrained embeddings; its second
            dimension defines the embedding size.
        vocab_size: Width of the output layer.
        hidden_size: Size of the LSTM hidden and cell states.
        start_token: Token id fed to the LSTM on the first prediction step.
        end_token: Token id that marks a caption as finished.
        pad_token: Token id written at positions after a caption has finished.
        feature_size: Size of the last dimension of ``features``. Defaults to
            512, the value that used to be hard-coded.
    """

    def __init__(self, embedding_matrix, vocab_size, hidden_size, start_token, end_token, pad_token,
                 feature_size=512):
        super(DecoderPretrainedEmbeddings, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.hidden_size = hidden_size
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token
        self.embedding_size = embedding_matrix.shape[1]
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix.to(self.device), freeze=False)
        self.lstm = nn.LSTM(self.embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.init_h = nn.Linear(feature_size, hidden_size)  # features -> initial hidden state of the LSTM
        self.init_c = nn.Linear(feature_size, hidden_size)  # features -> initial cell state of the LSTM
        self.vocab_size = vocab_size

    def forward(self, features, captions, caption_length):
        """Teacher-forced pass using a packed sequence.

        ``pack_padded_sequence`` is called with its default ``enforce_sorted=True``,
        so the batch must already be ordered by decreasing ``caption_length``.

        Args:
            features: Image features, one row per sample.
            captions: Token ids, indexed as ``captions[sample, position]``.
            caption_length: 1-D tensor of caption lengths; one step less than
                each length is fed to the LSTM.

        Returns:
            Raw scores of shape ``(batch, max(caption_length) - 1, vocab_size)``.
        """
        caption_embeddings = self.embedding(captions)
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(
            caption_embeddings, caption_length.cpu() - 1, batch_first=True)
        # nn.LSTM expects states shaped (num_layers, batch, hidden).
        h0 = self.init_h(features).unsqueeze(0)
        c0 = self.init_c(features).unsqueeze(0)
        packed_hidden, _ = self.lstm(packed_embeddings, (h0, c0))
        hidden, _ = nn.utils.rnn.pad_packed_sequence(packed_hidden, padding_value=0, batch_first=True)
        return self.fc(hidden)

    def prepare_for_prediction(self, beam_width):
        """Create ``self.lstm_cell`` from the trained LSTM weights and store ``beam_width``.

        The cell is created on the device that currently holds the LSTM weights.
        """
        weights_device = next(self.lstm.parameters()).device
        self.lstm_cell = nn.LSTMCell(self.embedding_size, self.hidden_size, device=weights_device)
        # nn.LSTM names its parameters e.g. "weight_ih_l0"; dropping the last
        # three characters ("_l0") yields the nn.LSTMCell names.
        self.lstm_cell.load_state_dict({key[:-3]: state for key, state in self.lstm.state_dict().items()})
        self.beam_width = beam_width

    def predict_first_tokens(self, features):
        """First beam-search step: feed ``start_token`` and keep the top ``beam_width`` tokens.

        Returns:
            ``(score, tokens, h1, c1)`` where ``score`` and ``tokens`` have shape
            ``(batch, beam_width)`` (log-probabilities and token ids) and
            ``h1``/``c1`` are the LSTM states after the step.
        """
        batch_size = features.shape[0]
        h0 = self.init_h(features)
        c0 = self.init_c(features)
        start_tokens = torch.full((batch_size,), self.start_token, dtype=torch.long, device=features.device)
        h1, c1 = self.lstm_cell(self.embedding(start_tokens), (h0, c0))
        log_probs = nn.functional.log_softmax(self.fc(h1), 1)
        score, tokens = log_probs.topk(self.beam_width, dim=1)  # both shaped batch x beam_width
        return score, tokens, h1, c1

    def predict_next_token(self, score, h_i, c_i, tokens):
        """One further beam-search step.

        The accumulated ``score`` of each row is added to the log-probabilities
        of its continuations, and the top ``beam_width`` entries are selected
        over *all* rows jointly. ``new_tokens`` therefore contains indices into
        the flattened ``rows x vocab_size`` score matrix, and the returned
        states are not reordered.

        Returns:
            ``(new_score, new_tokens, h, c)``.
        """
        h, c = self.lstm_cell(self.embedding(tokens), (h_i, c_i))
        log_probs = nn.functional.log_softmax(self.fc(h), 1)
        total_score = (log_probs + score.unsqueeze(dim=-1)).flatten()
        new_score, new_tokens = total_score.topk(self.beam_width)
        return new_score, new_tokens, h, c

    def predict(self, features, max_length=100):
        """Greedy decoding: feed the arg-max token back in until ``end_token``.

        Every step writes into full-batch buffers, so row ``b`` of both results
        always belongs to sample ``b`` even after other samples have finished.

        Args:
            features: Image features, one row per sample.
            max_length: Upper bound on the number of decoding steps.

        Returns:
            ``(scores, tokens)``. ``scores`` has shape ``(batch, steps, vocab_size)``
            with zeros at steps after a sample finished; ``tokens`` has shape
            ``(batch, steps)`` with ``pad_token`` at those steps.
        """
        batch_size = features.shape[0]
        device = features.device
        hx = self.init_h(features)
        cx = self.init_c(features)
        current_pred = torch.full((batch_size,), self.start_token, dtype=torch.long, device=device)
        output = []
        predictions = []
        step = 0
        while step < max_length:
            unfinished_pred = current_pred != self.end_token
            if not bool(unfinished_pred.any()):
                break
            # Only the samples that have not emitted end_token are advanced.
            hx_, cx_ = self.lstm_cell(self.embedding(current_pred[unfinished_pred]),
                                      (hx[unfinished_pred], cx[unfinished_pred]))
            raw_scores = self.fc(hx_)
            hx[unfinished_pred] = hx_
            cx[unfinished_pred] = cx_
            prediction = raw_scores.argmax(1)
            # Scatter the results of the active rows back to their batch positions.
            step_scores = raw_scores.new_zeros((batch_size, raw_scores.shape[1]))
            step_scores[unfinished_pred] = raw_scores
            step_tokens = torch.full((batch_size,), self.pad_token, dtype=torch.long, device=device)
            step_tokens[unfinished_pred] = prediction
            output.append(step_scores)
            predictions.append(step_tokens)
            current_pred[unfinished_pred] = prediction
            step += 1
        if not output:
            # Empty batch or max_length <= 0: nothing was decoded.
            return (features.new_zeros((batch_size, 0, self.vocab_size)),
                    torch.zeros((batch_size, 0), dtype=torch.long, device=device))
        return torch.stack(output, dim=1), torch.stack(predictions, dim=1)


In [1]:
class DecoderPretrainedWithImageFeatureForPrediction(nn.Module):
    """Caption decoder that feeds the image to the LSTM as its first input.

    The image features are projected to the embedding size and used as the
    first LSTM input (the LSTM states start at zero). A second projection of the
    features to the hidden size is added to every LSTM output before the output
    layer. Token embeddings start from a pretrained matrix and are fine-tuned.

    The ``predict*`` methods use ``self.lstm_cell`` and therefore require
    :meth:`prepare_for_prediction` to have been called first.

    Args:
        feature_size: Size of the last dimension of ``features``.
        embedding_matrix: 2-D tensor of pretrained embeddings; its second
            dimension defines the embedding size.
        hidden_size: Size of the LSTM hidden and cell states.
        vocab_size: Width of the output layer.
        start_token: Token id used as the initial "current prediction".
        end_token: Token id that marks a caption as finished.
        pad_token: Token id written at positions after a caption has finished.
    """

    def __init__(self, feature_size, embedding_matrix, hidden_size, vocab_size, start_token, end_token, pad_token):
        super(DecoderPretrainedWithImageFeatureForPrediction, self).__init__()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.hidden_size = hidden_size
        self.start_token = start_token
        self.end_token = end_token
        self.pad_token = pad_token
        self.embedding_size = embedding_matrix.shape[1]
        self.features_to_embedding = nn.Linear(feature_size, self.embedding_size)
        self.features_to_hidden = nn.Linear(feature_size, self.hidden_size)
        self.embedding = nn.Embedding.from_pretrained(embedding_matrix.to(self.device), freeze=False)
        self.lstm = nn.LSTM(self.embedding_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)
        self.vocab_size = vocab_size

    def forward(self, features, captions, caption_length):
        """Teacher-forced pass with the image embedding prepended to the caption.

        ``pack_padded_sequence`` is called with its default ``enforce_sorted=True``,
        so the batch must already be ordered by decreasing ``caption_length``.

        Args:
            features: Image features, one row per sample.
            captions: Token ids, indexed as ``captions[sample, position]``.
            caption_length: 1-D tensor of caption lengths; exactly that many
                steps of the (image + caption) input sequence are consumed.

        Returns:
            Raw scores of shape ``(batch, max(caption_length), vocab_size)``.
        """
        batch_size = features.shape[0]
        feature_embeddings = nn.functional.relu(self.features_to_embedding(features).unsqueeze(1))
        caption_embeddings = self.embedding(captions)
        # Input sequence: [image embedding, token_0, token_1, ...].
        all_embeddings = torch.cat([feature_embeddings, caption_embeddings], dim=1)
        h0 = torch.zeros((1, batch_size, self.hidden_size), device=features.device)
        c0 = torch.zeros((1, batch_size, self.hidden_size), device=features.device)
        packed_embeddings = nn.utils.rnn.pack_padded_sequence(all_embeddings, caption_length.cpu(), batch_first=True)
        packed_hidden, _ = self.lstm(packed_embeddings, (h0, c0))
        hidden, _ = nn.utils.rnn.pad_packed_sequence(packed_hidden, padding_value=0, batch_first=True)
        # The same image projection is added at every time step (broadcast over dim 1).
        features_hidden = nn.functional.relu(self.features_to_hidden(features).unsqueeze(1))
        return self.fc(hidden + features_hidden)

    def prepare_for_prediction(self, beam_width):
        """Create ``self.lstm_cell`` from the trained LSTM weights and store ``beam_width``.

        The cell is created on the device that currently holds the LSTM weights.
        """
        weights_device = next(self.lstm.parameters()).device
        self.lstm_cell = nn.LSTMCell(self.embedding_size, self.hidden_size, device=weights_device)
        # nn.LSTM names its parameters e.g. "weight_ih_l0"; dropping the last
        # three characters ("_l0") yields the nn.LSTMCell names.
        self.lstm_cell.load_state_dict({key[:-3]: state for key, state in self.lstm.state_dict().items()})
        self.beam_width = beam_width

    def predict_first_tokens(self, features):
        """First beam-search step: feed the image embedding and keep the top ``beam_width`` tokens.

        Returns:
            ``(score, tokens, h1, c1)`` where ``score`` and ``tokens`` have shape
            ``(batch, beam_width)`` (log-probabilities and token ids) and
            ``h1``/``c1`` are the LSTM states after the step.
        """
        batch_size = features.shape[0]
        h0 = torch.zeros((batch_size, self.hidden_size), device=features.device)
        c0 = torch.zeros((batch_size, self.hidden_size), device=features.device)
        feature_embeddings = nn.functional.relu(self.features_to_embedding(features))
        features_hidden = nn.functional.relu(self.features_to_hidden(features))
        h1, c1 = self.lstm_cell(feature_embeddings, (h0, c0))
        log_probs = nn.functional.log_softmax(self.fc(h1 + features_hidden), 1)
        score, tokens = log_probs.topk(self.beam_width, dim=1)  # both shaped batch x beam_width
        return score, tokens, h1, c1

    def predict_next_token(self, features, score, h_i, c_i, tokens):
        """One further beam-search step.

        The accumulated ``score`` of each row is added to the log-probabilities
        of its continuations, and the top ``beam_width`` entries are selected
        over *all* rows jointly. ``new_tokens`` therefore contains indices into
        the flattened ``rows x vocab_size`` score matrix, and the returned
        states are not reordered.

        Returns:
            ``(new_score, new_tokens, h, c)``.
        """
        features_hidden = nn.functional.relu(self.features_to_hidden(features))
        h, c = self.lstm_cell(self.embedding(tokens), (h_i, c_i))
        log_probs = nn.functional.log_softmax(self.fc(h + features_hidden), 1)
        total_score = (log_probs + score.unsqueeze(dim=-1)).flatten()
        new_score, new_tokens = total_score.topk(self.beam_width)
        return new_score, new_tokens, h, c

    def predict(self, features, max_length=50):
        """Greedy decoding: image embedding first, then the arg-max token is fed back.

        Every step writes into full-batch buffers, so row ``b`` of all results
        always belongs to sample ``b`` even after other samples have finished.

        Args:
            features: Image features, one row per sample.
            max_length: Upper bound on the number of decoding steps.

        Returns:
            ``(scores, predictions, pred_lengths)``. ``scores`` has shape
            ``(batch, steps, vocab_size)`` with zeros after a sample finished;
            ``predictions`` has shape ``(batch, steps)`` with ``pad_token`` at
            those steps; ``pred_lengths`` has shape ``(batch,)`` and counts the
            steps in which each sample was decoded (its ``end_token`` included).
        """
        batch_size = features.shape[0]
        device = features.device
        hx = torch.zeros((batch_size, self.hidden_size), device=device)
        cx = torch.zeros((batch_size, self.hidden_size), device=device)
        feature_embeddings = nn.functional.relu(self.features_to_embedding(features))
        features_hidden = nn.functional.relu(self.features_to_hidden(features))
        current_pred = torch.full((batch_size,), self.start_token, dtype=torch.long, device=device)
        output = []
        predictions = []
        active_steps = []
        step = 0
        while step < max_length:
            unfinished_pred = current_pred != self.end_token
            if not bool(unfinished_pred.any()):
                break
            if step == 0:
                # The first LSTM input is the projected image, not a token.
                embeddings = feature_embeddings[unfinished_pred]
            else:
                embeddings = self.embedding(current_pred[unfinished_pred])
            hx_, cx_ = self.lstm_cell(embeddings, (hx[unfinished_pred], cx[unfinished_pred]))
            raw_scores = self.fc(hx_ + features_hidden[unfinished_pred])
            hx[unfinished_pred] = hx_
            cx[unfinished_pred] = cx_
            prediction = raw_scores.argmax(1)
            # Scatter the results of the active rows back to their batch positions.
            step_scores = raw_scores.new_zeros((batch_size, raw_scores.shape[1]))
            step_scores[unfinished_pred] = raw_scores
            step_tokens = torch.full((batch_size,), self.pad_token, dtype=torch.long, device=device)
            step_tokens[unfinished_pred] = prediction
            output.append(step_scores)
            predictions.append(step_tokens)
            active_steps.append(unfinished_pred)
            current_pred[unfinished_pred] = prediction
            step += 1

        if not output:
            # Empty batch or max_length <= 0: nothing was decoded.
            return (features.new_zeros((batch_size, 0, self.vocab_size)),
                    torch.zeros((batch_size, 0), dtype=torch.long, device=device),
                    torch.zeros((batch_size,), dtype=torch.long, device=device))

        # Lengths come from the activity masks, so a model that happens to
        # emit pad_token itself does not shorten the reported length.
        pred_lengths = torch.stack(active_steps, dim=1).sum(1)
        return torch.stack(output, dim=1), torch.stack(predictions, dim=1), pred_lengths

NameError: name 'nn' is not defined