In [1]:
import torch
import torch.nn as nn
import torchvision.models as models
from torch.nn.utils.rnn import pack_padded_sequence
from torch.autograd import Variable
import torch.nn.functional as F
from torch.nn import init

In [2]:
class AttentiveCNN( nn.Module ):
    def __init__( self, embed_size, hidden_size ):
        super( AttentiveCNN, self ).__init__()
        
        """# vgg16 backend
        vgg = models.vgg16(pretrained=True)
        modules = list( vgg.children() )[ :-1 ] # delete the last fc layer
        vgg_conv = nn.Sequential( *modules ) # last conv feature"""
        
        # resnet backend
        resnet = models.resnet152(pretrained=True)
        modules = list( resnet.children() )[ :-2 ] # delete the last fc layer and pooling layer
        resnet_conv = nn.Sequential( *modules ) # last conv feature
        
        
        self.resnet_conv = resnet_conv
        self.avgpool = nn.AvgPool2d( 7 )
        self.affine_a = nn.Linear( 2048, hidden_size ) # v_i = W_a * A
        self.affine_b = nn.Linear( 2048, embed_size )  # v_g = W_b * a^g
        self.batch_norm = nn.BatchNorm2d(2048, affine=False)
        
        # Dropout before affine transformation
        self.dropout = nn.Dropout( 0.5 )
        
        self.init_weights()
        
    def init_weights( self ):
        """Initialize the weights."""
        init.kaiming_uniform( self.affine_a.weight, mode='fan_in' )
        init.kaiming_uniform( self.affine_b.weight, mode='fan_in' )
        self.affine_a.bias.data.fill_( 0 )
        self.affine_b.bias.data.fill_( 0 )
        
        
    def forward( self, images ):
        '''
        Input: images
        Output: V=[v_1, ..., v_n], v_g
        '''
        
        # Last conv layer feature map
        A = self.resnet_conv( images )
        A = self.batch_norm(A)
        
        # a^g, average pooling feature map
        a_g = self.avgpool( A )
        a_g = a_g.view( a_g.size(0), -1 )
        
        # V = [ v_1, v_2, ..., v_49 ]
        V = A.view( A.size( 0 ), A.size( 1 ), -1 ).transpose( 1,2 )
        V = F.relu( self.affine_a( self.dropout( V ) ) )
        
        v_g = F.relu( self.affine_b( self.dropout( a_g ) ) )
        
        return V, v_g

In [3]:
# Attention Block for C_hat calculation
class Atten( nn.Module ):
    def __init__( self, hidden_size ):
        super( Atten, self ).__init__()

        self.affine_v = nn.Linear( hidden_size, 49, bias=False ) # W_v
        self.affine_g = nn.Linear( hidden_size, 49, bias=False ) # W_g
        self.affine_s = nn.Linear( hidden_size, 49, bias=False ) # W_s
        self.affine_h = nn.Linear( 49, 1, bias=False ) # w_h
        
        self.dropout = nn.Dropout( 0.5 )
        self.init_weights()
        
    def init_weights( self ):
        """Initialize the weights."""
        init.xavier_uniform( self.affine_v.weight )
        init.xavier_uniform( self.affine_g.weight )
        init.xavier_uniform( self.affine_h.weight )
        init.xavier_uniform( self.affine_s.weight )
        
    def forward( self, V, h_t, s_t ):
        '''
        Input: V=[v_1, v_2, ... v_k], h_t, s_t from LSTM
        Output: c_hat_t, attention feature map
        '''
        
        # W_v * V + W_g * h_t * 1^T
        content_v = self.affine_v( self.dropout( V ) ).unsqueeze( 1 ) \
                    + self.affine_g( self.dropout( h_t ) ).unsqueeze( 2 )
        
        # z_t = W_h * tanh( content_v )
        z_t = self.affine_h( self.dropout( F.tanh( content_v ) ) ).squeeze( 3 )
        alpha_t = F.softmax( z_t.view( -1, z_t.size( 2 ) ) ).view( z_t.size( 0 ), z_t.size( 1 ), -1 )
        
        # Construct c_t: B x seq x hidden_size
        c_t = torch.bmm( alpha_t, V ).squeeze( 2 )
        
        # W_s * s_t + W_g * h_t
        content_s = self.affine_s( self.dropout( s_t ) ) + self.affine_g( self.dropout( h_t ) )
        # w_t * tanh( content_s )
        z_t_extended = self.affine_h( self.dropout( F.tanh( content_s ) ) )
        
        # Attention score between sentinel and image content
        extended = torch.cat( ( z_t, z_t_extended ), dim=2 )
        alpha_hat_t = F.softmax( extended.view( -1, extended.size( 2 ) ) ).view( extended.size( 0 ), extended.size( 1 ), -1 )
        beta_t = alpha_hat_t[ :, :, -1 ]
        
        # c_hat_t = beta * s_t + ( 1 - beta ) * c_t
        beta_t = beta_t.unsqueeze( 2 )
        c_hat_t = beta_t * s_t + ( 1 - beta_t ) * c_t

        return c_hat_t, alpha_t, beta_t

In [4]:
# Sentinel BLock    
class Sentinel( nn.Module ):
    def __init__( self, input_size, hidden_size ):
        super( Sentinel, self ).__init__()

        self.affine_x = nn.Linear( input_size, hidden_size, bias=False )
        self.affine_h = nn.Linear( hidden_size, hidden_size, bias=False )
        
        # Dropout applied before affine transformation
        self.dropout = nn.Dropout( 0.5 )
        
        self.init_weights()
        
    def init_weights( self ):
        init.xavier_uniform( self.affine_x.weight )
        init.xavier_uniform( self.affine_h.weight )
        
    def forward( self, x_t, h_t_1, cell_t ):
        
        # g_t = sigmoid( W_x * x_t + W_h * h_(t-1) )        
        gate_t = self.affine_x( self.dropout( x_t ) ) + self.affine_h( self.dropout( h_t_1 ) )
        gate_t = F.sigmoid( gate_t )
        
        # Sentinel embedding
        s_t =  gate_t * F.tanh( cell_t )
        
        return s_t

In [5]:
# Adaptive Attention Block: C_t, Spatial Attention Weights, Sentinel embedding    
class AdaptiveBlock( nn.Module ):
    
    def __init__( self, embed_size, hidden_size, vocab_size ):
        super( AdaptiveBlock, self ).__init__()

        # Sentinel block
        self.sentinel = Sentinel( embed_size * 2, hidden_size )
        
        # Image Spatial Attention Block
        self.atten = Atten( hidden_size )
        
        # Final Caption generator
        self.mlp = nn.Linear( hidden_size, vocab_size )
        
        # Dropout layer inside Affine Transformation
        self.dropout = nn.Dropout( 0.5 )
        
        self.hidden_size = hidden_size
        self.init_weights()
        
    def init_weights( self ):
        '''
        Initialize final classifier weights
        '''
        init.kaiming_normal( self.mlp.weight, mode='fan_in' )
        self.mlp.bias.data.fill_( 0 )
        
        
    def forward( self, x, hiddens, cells, V ):
        
        # hidden for sentinel should be h0-ht-1
        h0 = self.init_hidden( x.size(0) )[0].transpose( 0,1 )
        
        # h_(t-1): B x seq x hidden_size ( 0 - t-1 )
        if hiddens.size( 1 ) > 1:
            hiddens_t_1 = torch.cat( ( h0, hiddens[ :, :-1, : ] ), dim=1 )
        else:
            hiddens_t_1 = h0

        # Get Sentinel embedding, it's calculated blockly    
        sentinel = self.sentinel( x, hiddens_t_1, cells )
        
        # Get C_t, Spatial attention, sentinel score
        c_hat, atten_weights, beta = self.atten( V, hiddens, sentinel )
        
        # Final score along vocabulary
        scores = self.mlp( self.dropout( c_hat + hiddens ) )
        
        return scores, atten_weights, beta
    
    def init_hidden( self, bsz ):
        '''
        Hidden_0 & Cell_0 initialization
        '''
        weight = next( self.parameters() ).data
        
        if torch.cuda.is_available():
            return ( Variable( weight.new( 1 , bsz, self.hidden_size ).zero_().cuda() ),
                    Variable( weight.new( 1,  bsz, self.hidden_size ).zero_().cuda() ) ) 
        else: 
            return ( Variable( weight.new( 1 , bsz, self.hidden_size ).zero_() ),
                    Variable( weight.new( 1,  bsz, self.hidden_size ).zero_() ) ) 

In [6]:
# Caption Decoder
class Decoder( nn.Module ):
    def __init__( self, embed_size, vocab_size, hidden_size ):
        super( Decoder, self ).__init__()

        # word embedding
        self.embed = nn.Embedding( vocab_size, embed_size )
        
        # LSTM decoder: input = [ w_t; v_g ] => 2 x word_embed_size;
        self.LSTM = nn.LSTM( embed_size * 2, hidden_size, 1, batch_first=True )
        
        # Save hidden_size for hidden and cell variable 
        self.hidden_size = hidden_size
        
        # Adaptive Attention Block: Sentinel + C_hat + Final scores for caption sampling
        self.adaptive = AdaptiveBlock( embed_size, hidden_size, vocab_size )
        
    def forward( self, V, v_g , captions, states=None ):
        
        # Word Embedding
        embeddings = self.embed( captions )
        
        # x_t = [w_t;v_g]
        x = torch.cat( ( embeddings, v_g.unsqueeze( 1 ).expand_as( embeddings ) ), dim=2 )
        
        # Hiddens: Batch x seq_len x hidden_size
        # Cells: seq_len x Batch x hidden_size, default setup by Pytorch
        if torch.cuda.is_available():
            hiddens = Variable( torch.zeros( x.size(0), x.size(1), self.hidden_size ).cuda() )
            cells = Variable( torch.zeros( x.size(1), x.size(0), self.hidden_size ).cuda() )
        else:
            hiddens = Variable( torch.zeros( x.size(0), x.size(1), self.hidden_size ) )
            cells = Variable( torch.zeros( x.size(1), x.size(0), self.hidden_size ) )            
        
        # Recurrent Block
        # Retrieve hidden & cell for Sentinel simulation
        for time_step in range( x.size( 1 ) ):
            
            # Feed in x_t one at a time
            x_t = x[ :, time_step, : ]
            x_t = x_t.unsqueeze( 1 )
            
            h_t, states = self.LSTM( x_t, states )
            
            # Save hidden and cell
            hiddens[ :, time_step, : ] = h_t  # Batch_first
            cells[ time_step, :, : ] = states[ 1 ]
        
        # cell: Batch x seq_len x hidden_size
        cells = cells.transpose( 0, 1 )

        # Data parallelism for adaptive attention block
        if torch.cuda.device_count() > 1:
            ids = range( torch.cuda.device_count() )
            adaptive_block_parallel = nn.DataParallel( self.adaptive, device_ids=ids )
            
            scores, atten_weights, beta = adaptive_block_parallel( x, hiddens, cells, V )
        else:
            scores, atten_weights, beta = self.adaptive( x, hiddens, cells, V )
        
        # Return states for Caption Sampling purpose
        return scores, states, atten_weights, beta

In [7]:
# Whole Architecture with Image Encoder and Caption decoder        
class Encoder2Decoder( nn.Module ):
    def __init__( self, embed_size, vocab_size, hidden_size ):
        super( Encoder2Decoder, self ).__init__()
        
        # Image CNN encoder and Adaptive Attention Decoder
        self.encoder = AttentiveCNN( embed_size, hidden_size )
        self.decoder = Decoder( embed_size, vocab_size, hidden_size )
        
        
    def forward( self, images, captions, lengths ):
        
        # Data parallelism for V v_g encoder if multiple GPUs are available
        # V=[ v_1, ..., v_k ], v_g in the original paper
        if torch.cuda.device_count() > 1:
            device_ids = range( torch.cuda.device_count() )
            encoder_parallel = torch.nn.DataParallel( self.encoder, device_ids=device_ids )
            V, v_g = encoder_parallel( images ) 
        else:
            V, v_g = self.encoder( images )
        
        # Language Modeling on word prediction
        scores, _, _,_ = self.decoder( V, v_g, captions )
        
        # Pack it to make criterion calculation more efficient
        packed_scores = pack_padded_sequence( scores, lengths, batch_first=True )
        
        return packed_scores
    
    # Caption generator
    def sampler( self, images, max_len=20 ):
        """
        Samples captions for given image features (Greedy search).
        """
        
        # Data parallelism if multiple GPUs
        if torch.cuda.device_count() > 1:
            device_ids = range( torch.cuda.device_count() )
            encoder_parallel = torch.nn.DataParallel( self.encoder, device_ids=device_ids )
            V, v_g = encoder_parallel( images ) 
        else:    
            V, v_g = self.encoder( images )
            
        # Build the starting token Variable <start> (index 1): B x 1
        if torch.cuda.is_available():
            captions = Variable( torch.LongTensor( images.size( 0 ), 1 ).fill_( 1 ).cuda() )
        else:
            captions = Variable( torch.LongTensor( images.size( 0 ), 1 ).fill_( 1 ) )
        
        # Get generated caption idx list, attention weights and sentinel score
        sampled_ids = []
        attention = []
        Beta = []
        
        # Initial hidden states
        states = None

        for i in range( max_len ):

            scores, states, atten_weights, beta = self.decoder( V, v_g, captions, states ) 
            predicted = scores.max( 2 )[ 1 ] # argmax
            captions = predicted
            
            # Save sampled word, attention map and sentinel at each timestep
            sampled_ids.append( captions )
            attention.append( atten_weights )
            Beta.append( beta )
        
        # caption: B x max_len
        # attention: B x max_len x 49
        # sentinel: B x max_len
        sampled_ids = torch.cat( sampled_ids, dim=1 )
        attention = torch.cat( attention, dim=1 )
        Beta = torch.cat( Beta, dim=1 )
        
        return sampled_ids, attention, Beta

In [8]:
import math
import json
import argparse
import torch
import torch.nn as nn
import numpy as np
import os
import pickle
from utils import CocoImageFolder, coco_eval, to_var
from data_loader import get_loader 
from build_vocab import Vocabulary
from torch.autograd import Variable 
from torchvision import transforms
from torch.nn.utils.rnn import pack_padded_sequence

In [9]:
if __name__ == '__main__':
    
    parser = argparse.ArgumentParser(description='Add attributes.')
    parser.add_argument( '-f', default='self', help='To make it runnable in jupyter' )
    parser.add_argument( '--model_path', type=str, default='./models-attentive/',
                         help='path for saving trained models')
    parser.add_argument('--crop_size', type=int, default=224 ,
                        help='size for randomly cropping images')
    parser.add_argument('--vocab_path', type=str, default='./data/vocab.pkl',
                        help='path for vocabulary wrapper')
    parser.add_argument('--image_dir', type=str, default='./data/resize/train2017' ,
                        help='directory for resized training images')
    parser.add_argument('--val_dir', type=str, default='./data/resize/val2017',
                        help='directory for resized validation images' )
    parser.add_argument('--caption_path', type=str,
                        default='./data/annotations/captions_train2017.json',
                        help='path for train annotation json file')
    parser.add_argument('--caption_val_path', type=str,
                        default='./data/annotations/captions_val2017.json',
                        help='path for validation annotation json file')
    parser.add_argument('--log_step', type=int, default=50,
                        help='step size for printing log info')
    parser.add_argument('--seed', type=int, default=123,
                        help='random seed for model reproduction')
    
    # ---------------------------Hyper Parameter Setup------------------------------------
    
    # CNN fine-tuning
    parser.add_argument('--fine_tune_start_layer', type=int, default=6,
                        help='CNN fine-tuning layers from: [0-7]')
    parser.add_argument('--cnn_epoch', type=int, default=20,
                        help='start fine-tuning CNN after')
    
    # Optimizer Adam parameter
    parser.add_argument( '--alpha', type=float, default=0.8,
                         help='alpha in Adam' )
    parser.add_argument( '--beta', type=float, default=0.999,
                         help='beta in Adam' )
    parser.add_argument( '--learning_rate', type=float, default=4e-4,
                         help='learning rate for the whole model' )
    parser.add_argument( '--learning_rate_cnn', type=float, default=1e-4,
                         help='learning rate for fine-tuning CNN' )
    
    # LSTM hyper parameters
    parser.add_argument( '--embed_size', type=int, default=256,
                         help='dimension of word embedding vectors, also dimension of v_g' )
    parser.add_argument( '--hidden_size', type=int, default=512,
                         help='dimension of lstm hidden states' )
    
    # Training details
    parser.add_argument( '--pretrained', type=str, default='', help='start from checkpoint or scratch' )
    parser.add_argument( '--num_epochs', type=int, default=10 )
    parser.add_argument( '--batch_size', type=int, default=10 )
    
    # For eval_size > 30, it will cause cuda OOM error.
    parser.add_argument( '--eval_size', type=int, default=30 ) 
    parser.add_argument( '--num_workers', type=int, default=2 )
    parser.add_argument( '--clip', type=float, default=0.1 )
    parser.add_argument( '--lr_decay', type=int, default=20, help='epoch at which to start lr decay' )
    parser.add_argument( '--learning_rate_decay_every', type=int, default=50,
                         help='decay learning rate at every this number')
    
    
    args = parser.parse_args()
    
    print ('------------------------Model and Training Details--------------------------')
    print(args)

------------------------Model and Training Details--------------------------
Namespace(alpha=0.8, batch_size=10, beta=0.999, caption_path='./data/annotations/captions_train2017.json', caption_val_path='./data/annotations/captions_val2017.json', clip=0.1, cnn_epoch=20, crop_size=224, embed_size=256, eval_size=30, f='/run/user/1003/jupyter/kernel-4fba79ec-a942-416f-9081-85d2ff07fad1.json', fine_tune_start_layer=6, hidden_size=512, image_dir='./data/resize/train2017', learning_rate=0.0004, learning_rate_cnn=0.0001, learning_rate_decay_every=50, log_step=50, lr_decay=20, model_path='./models-attentive/', num_epochs=10, num_workers=2, pretrained='', seed=123, val_dir='./data/resize/val2017', vocab_path='./data/vocab.pkl')


In [10]:
transform = transforms.Compose([ 
        transforms.RandomCrop( args.crop_size ),
        transforms.RandomHorizontalFlip(), 
        transforms.ToTensor(), 
        transforms.Normalize(( 0.485, 0.456, 0.406 ), 
                             ( 0.229, 0.224, 0.225 ))])

In [11]:
# Load vocabulary wrapper.
with open( args.vocab_path, 'rb') as f:
    vocab = pickle.load( f )

In [18]:
# Build training data loader
"""
data_loader = get_loader( args.image_dir, args.caption_path, vocab, 
                          transform, 30,
                          shuffle=True, num_workers=args.num_workers )

# for testing the data, we can just use validation set to train, use the following loader
"""
data_loader = get_loader( args.val_dir, args.caption_val_path, vocab, 
                          transform, 80,
                          shuffle=True, num_workers=args.num_workers ) 


loading annotations into memory...
Done (t=0.04s)
creating index...
index created!


In [19]:
# Load pretrained model or build from scratch
adaptive = Encoder2Decoder( args.embed_size, len(vocab), args.hidden_size )

In [20]:
start_epoch = 3
# Constructing CNN parameters for optimization, only fine-tuning higher layers
cnn_subs = list( adaptive.encoder.resnet_conv.children() )
cnn_params = [ list( sub_module.parameters() ) for sub_module in cnn_subs ]
cnn_params = [ item for sublist in cnn_params for item in sublist ]

'''cnn_optimizer = torch.optim.Adam( cnn_params, lr=args.learning_rate_cnn, 
                                  betas=( args.alpha, args.beta ) )'''


cnn_optimizer = torch.optim.SGD(cnn_params, lr=0.0001, momentum=0.9)

In [21]:
# Other parameter optimization
params = list( adaptive.encoder.affine_a.parameters() ) + list( adaptive.encoder.affine_b.parameters() ) \
            + list( adaptive.decoder.parameters() )

# Will decay later    
learning_rate = args.learning_rate

# Language Modeling Loss, Optimizers
LMcriterion = nn.CrossEntropyLoss()

# Change to GPU mode if available
if torch.cuda.is_available():
    adaptive.cuda()
    LMcriterion.cuda()

# Train the Models
total_step = len( data_loader )

bleu_scores = []
best_cider = 0.0
best_epoch = 0
loss_list = []

In [None]:
# Start Training 
for epoch in range(start_epoch, 11):
    optimizer = torch.optim.SGD( params, lr=5e-2, momentum=0.9 )
    # optimizer = torch.optim.Adam( params, lr=learning_rate )

    # Language Modeling Training
    print ('------------------Training for Epoch %d----------------'%(epoch))
    for i, (images, captions, lengths, _ ) in enumerate( data_loader ):

        # Set mini-batch dataset
        images = to_var( images )
        captions = to_var( captions )
        lengths = [ cap_len - 1  for cap_len in lengths ]
        targets = pack_padded_sequence( captions[:,1:], lengths, batch_first=True )[0]

        # Forward, Backward and Optimize
        adaptive.train()
        adaptive.zero_grad()

        packed_scores = adaptive( images, captions, lengths )

        # Compute loss and backprop
        loss = LMcriterion( packed_scores[0], targets )
        loss.backward()

        # Gradient clipping for gradient exploding problem in LSTM
        for p in adaptive.decoder.LSTM.parameters():
            p.data.clamp_( -args.clip, args.clip )

        optimizer.step()

        # Start learning rate decay
        if epoch > args.lr_decay:

            frac = ( epoch - args.cnn_epoch ) / args.learning_rate_decay_every
            decay_factor = math.pow( 0.5, frac )

            # Decay the learning rate
            learning_rate = learning_rate * decay_factor

        # Start CNN fine-tuning
        if epoch > 10:
            cnn_optimizer.step()

        # Print log info
        if i % args.log_step == 0:
            print('Epoch [%d/%d], Step [%d/%d], CrossEntropy Loss: %.4f, Perplexity: %5.4f'\
                  %( epoch, args.num_epochs, i, total_step, loss.data[0], np.exp( loss.data[0] ) )) 
            loss_list.append(loss.data[0])
        if i % 500 == 0 and i != 0:
            torch.save( adaptive.state_dict(), os.path.join( args.model_path, 'adaptive-%d-%d.pkl'%( epoch, i ) ) )
            bleu = coco_eval( adaptive, args, epoch, i )
            bleu_scores.append( bleu )
            print('Bleu_2 score: %.4f'%(bleu))

    # Save the Adaptive Attention model after each epoch
    torch.save( adaptive.state_dict(), 
               os.path.join( args.model_path, 'adaptive-%d-%d.pkl'%( epoch, i ) ) )

    # Evaluation on validation set        
    bleu = coco_eval( adaptive, args, epoch, i )
    bleu_scores.append( bleu )        
'''
    if cider > best_cider:
        best_cider = cider
        best_epoch = epoch

    if len( cider_scores ) > 5:

        last_6 = cider_scores[-6:]
        last_6_max = max( last_6 )

        # Test if there is improvement, if not do early stopping
        if last_6_max != best_cider:

            print ('No improvement with CIDEr in the last 6 epochs...Early stopping triggered.')
            print ('Model of best epoch #: %d with CIDEr score %.2f'%( best_epoch, best_cider ))
            break
'''

------------------Training for Epoch 3----------------




Epoch [3/10], Step [0/313], CrossEntropy Loss: 6.2430, Perplexity: 514.3945
Epoch [3/10], Step [50/313], CrossEntropy Loss: 5.1451, Perplexity: 171.5811
Epoch [3/10], Step [100/313], CrossEntropy Loss: 4.5343, Perplexity: 93.1592
Epoch [3/10], Step [150/313], CrossEntropy Loss: 4.2974, Perplexity: 73.5102
Epoch [3/10], Step [200/313], CrossEntropy Loss: 4.1376, Perplexity: 62.6515
Epoch [3/10], Step [250/313], CrossEntropy Loss: 4.2928, Perplexity: 73.1723
Epoch [3/10], Step [300/313], CrossEntropy Loss: 4.1212, Perplexity: 61.6315
---------------------Start evaluation on MS-COCO dataset-----------------------
[100/167]
------------------------Caption Generated-------------------------------------
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 47554, 'reflen': 47969, 'guess': [47554, 42554, 37554, 



Epoch [4/10], Step [0/313], CrossEntropy Loss: 4.0842, Perplexity: 59.3922
Epoch [4/10], Step [50/313], CrossEntropy Loss: 3.7888, Perplexity: 44.2020
Epoch [4/10], Step [100/313], CrossEntropy Loss: 4.0337, Perplexity: 56.4684
Epoch [4/10], Step [150/313], CrossEntropy Loss: 3.9233, Perplexity: 50.5679
Epoch [4/10], Step [200/313], CrossEntropy Loss: 3.5814, Perplexity: 35.9254
Epoch [4/10], Step [250/313], CrossEntropy Loss: 3.8574, Perplexity: 47.3430
Epoch [4/10], Step [300/313], CrossEntropy Loss: 3.5843, Perplexity: 36.0297
---------------------Start evaluation on MS-COCO dataset-----------------------
[100/167]
------------------------Caption Generated-------------------------------------
loading annotations into memory...
Done (t=0.05s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.01s)
creating index...
index created!
tokenization...
setting up scorers...
computing Bleu score...
{'testlen': 43232, 'reflen': 45234, 'guess': [43232, 38232, 33232, 28



Epoch [5/10], Step [0/313], CrossEntropy Loss: 3.6402, Perplexity: 38.1008
Epoch [5/10], Step [50/313], CrossEntropy Loss: 3.4401, Perplexity: 31.1899
Epoch [5/10], Step [100/313], CrossEntropy Loss: 3.6523, Perplexity: 38.5614
Epoch [5/10], Step [150/313], CrossEntropy Loss: 3.2954, Perplexity: 26.9872
Epoch [5/10], Step [200/313], CrossEntropy Loss: 3.5373, Perplexity: 34.3745
