In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import random
import json
import os
import time
import tensorflow as tf
from tensorflow.contrib import rnn


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
#from sklearn.model_selection import train_test_split
import tensorflow.contrib.legacy_seq2seq as seq2seq
from utilities import show_graph
#from util import inv_sigmoid, linear_decay, dec_print_train, dec_print_val, dec_print_test

import unicodedata
import re
import numpy as np
import os
import io
import time
import collections
import json
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
random.seed(0)
np.random.seed(0)
tf.set_random_seed(0)

n_inputs        = 4096
n_hidden        = 600
val_batch_size  = 100 #100
n_frames        = 80
max_caption_len = 50
forget_bias_red = 1.0
forget_bias_gre = 1.0
dropout_prob    = 0.5
n_attention     = n_hidden

special_tokens  = {'<PAD>': 0, '<BOS>': 1, '<EOS>': 2, '<UNK>': 3}
phases = {'train': 0, 'val': 1, 'test': 2}

In [3]:
class S2VT:
    def __init__(self, vocab_num = 0,lr = 1e-4):

        self.vocab_num = vocab_num
        self.learning_rate = lr
        self.saver = None

    def set_saver(self, saver):
        self.saver = saver
     
    def build_model(self, feat, captions=None, cap_len=None, sampling=None, phase=0):

        weights = {
            'W_feat': tf.Variable( tf.random_uniform([n_inputs, n_hidden], -0.1, 0.1), name='W_feat'), 
            'W_dec': tf.Variable(tf.random_uniform([n_hidden, self.vocab_num], -0.1, 0.1), name='W_dec')
        }
        biases = {
            'b_feat':  tf.Variable( tf.zeros([n_hidden]), name='b_feat'),
            'b_dec': tf.Variable(tf.zeros([self.vocab_num]), name='b_dec')
        }   
        embeddings = {
         'emb': tf.Variable(tf.random_uniform([self.vocab_num, n_hidden], -0.1, 0.1), name='emb')
        }

        batch_size = tf.shape(feat)[0]

        if phase != phases['test']:
            # cap_len: (250, 1) -> (250, 50)
            cap_mask = tf.sequence_mask(cap_len, max_caption_len, dtype=tf.float32)
     
        if phase == phases['train']: #  add noise
            noise = tf.random_uniform(tf.shape(feat), -0.1, 0.1, dtype=tf.float32)
            feat = feat + noise

        if phase == phases['train']:
            feat = tf.nn.dropout(feat, dropout_prob)

        feat = tf.reshape(feat, [-1, n_inputs])
        image_emb = tf.matmul(feat, weights['W_feat']) + biases['b_feat']
        image_emb = tf.reshape(image_emb, [-1, n_frames, n_hidden])
        image_emb = tf.transpose(image_emb, perm=[1, 0, 2])
        
        with tf.variable_scope('LSTM1'):
            lstm_red = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=forget_bias_red, state_is_tuple=True)
            if phase == phases['train']:
                lstm_red = tf.contrib.rnn.DropoutWrapper(lstm_red, output_keep_prob=dropout_prob)    
        with tf.variable_scope('LSTM2'):
            lstm_gre = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=forget_bias_gre, state_is_tuple=True)
            if phase == phases['train']:
                lstm_gre = tf.contrib.rnn.DropoutWrapper(lstm_gre, output_keep_prob=dropout_prob)    

        state_red = lstm_red.zero_state(batch_size, dtype=tf.float32)
        state_gre = lstm_gre.zero_state(batch_size, dtype=tf.float32)

        padding = tf.zeros([batch_size, n_hidden])

        h_src = []
        for i in range(0, n_frames):
            with tf.variable_scope("LSTM1"):
                output_red, state_red = lstm_red(image_emb[i,:,:], state_red)
            
            with tf.variable_scope("LSTM2"):
                output_gre, state_gre = lstm_gre(tf.concat([padding, output_red], axis=1), state_gre)
                h_src.append(output_gre) # even though padding is augmented, output_gre/state_gre's shape not change

        h_src = tf.stack(h_src, axis = 0)

        bos = tf.ones([batch_size, n_hidden])
        padding_in = tf.zeros([batch_size, n_hidden])

        logits = []
        max_prob_index = None

        

        cross_ent_list = []
        for i in range(0, max_caption_len):

            with tf.variable_scope("LSTM1"):
                output_red, state_red = lstm_red(padding_in, state_red)

            if i == 0:
                with tf.variable_scope("LSTM2"):
                    con = tf.concat([bos, output_red], axis=1)
                    output_gre, state_gre = lstm_gre(con, state_gre)
            else:
                if phase == phases['train']:
                    if sampling[i] == True:
                        feed_in = captions[:, i - 1]
                    else:
                        feed_in = tf.argmax(logit_words, 1)
                else:
                    feed_in = tf.argmax(logit_words, 1)
                with tf.device("/cpu:0"):
                    embed_result = tf.nn.embedding_lookup(embeddings['emb'], feed_in)
                with tf.variable_scope("LSTM2"):
                    con = tf.concat([embed_result, output_red], axis=1)
                    output_gre, state_gre = lstm_gre(con, state_gre)

            logit_words = tf.matmul(output_gre, weights['W_dec']) + biases['b_dec']
            logits.append(logit_words)

            if phase != phases['test']:
                labels = captions[:, i]
                one_hot_labels = tf.one_hot(labels, self.vocab_num, on_value = 1, off_value = None, axis = 1) 
                cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=one_hot_labels)
                cross_entropy = cross_entropy * cap_mask[:, i]
                cross_ent_list.append(cross_entropy)
        
        loss = 0.0
        if phase != phases['test']:
            cross_entropy_tensor = tf.stack(cross_ent_list, 1)
            loss = tf.reduce_sum(cross_entropy_tensor, axis=1)
            loss = tf.divide(loss, tf.cast(cap_len, tf.float32))
            loss = tf.reduce_mean(loss, axis=0)

        logits = tf.stack(logits, axis = 0)
        logits = tf.reshape(logits, (max_caption_len, batch_size, self.vocab_num))
        logits = tf.transpose(logits, [1, 0, 2])
        
        summary = None
        if phase == phases['train']:
            summary = tf.summary.scalar('training_loss', loss)
        elif phase == phases['val']:
            summary = tf.summary.scalar('validation_loss', loss)

        return logits, loss, summary

    def inference(self, logits):
        
        #print('using greedy search...')
        dec_pred = tf.argmax(logits, 2)
        return dec_pred

    def optimize(self, loss_op):

        params = tf.trainable_variables()
        optimizer = tf.train.AdamOptimizer(self.learning_rate)#.minimize(loss_op)
        gradients, variables = zip(*optimizer.compute_gradients(loss_op))
        gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        train_op = optimizer.apply_gradients(zip(gradients, params))

        return train_op

In [4]:
def tokenize(line,token='word'):
    if token == 'word':
        return [line.split(' ')]
    elif token == 'char':
        return [list(line)]
    else:
        print('ERROR: unknown token type '+token)

In [5]:
def count_tokens(tokanized_sentences):
    # Flatten a list of token lists into a list of tokens
    tokens = [tk for line in tokanized_sentences for tk in line]
    return collections.Counter(tokens)

In [6]:

def parse_vid_data_into_batches(filename,batch_size,feat_filepath):
    
    with open(filename, 'r') as f:
        datastore = json.load(f)
        
    batches = len(datastore)/batch_size
    batches = int(batches)
        
    i = 0
    j = 0
    
    vid_batch = {}
    sentence_set = {}
    
    for data in datastore:
        
        #### Extracting all feature vectors per video

        
        #vid_feat_list = []
        
        video_id = data["id"]
        features = np.load(feat_filepath.format(video_id))

        vid_framefeats = [] #list of all feature vectors per video. Shape = [80,4096]
        
        for array in features:
            vid_framefeats.append(array)

        if j not in vid_batch:
            vid_batch[j] = []

        vid_batch[j].append(vid_framefeats)
        
        
        #### Extracting only a single sentence per video into a standalone dict

        sentences = data["caption"]
        sentences = [word.lower() for word in sentences] #Normalize the case
        table = str.maketrans('', '', string.punctuation) #Normalize the punctuation
        sentences = [word.translate(table) for word in sentences]

        sentence_set[i] = sentences[0] #0 for only the first sentence\
        
        i = i+1

        if i%batch_size == 0:
            j = j+1            
            
    return vid_batch, batches

In [7]:
def extract_sentences(filename, feat_filepath):
    
    sentence_set = {}
    
    with open(filename, 'r') as f:
        datastore = json.load(f)
        
    i = 0
    for data in datastore:
        
        #### Extracting only a single sentence per video into a standalone dict

        sentences = data["caption"]
        sentences = [word.lower() for word in sentences] #Normalize the case
        table = str.maketrans('', '', string.punctuation) #Normalize the punctuation
        sentences = [word.translate(table) for word in sentences]

        sentence_set[i] = sentences[0] #0 for only the first sentence\
        
        i = i+1
        
    return sentence_set

In [8]:
# Mapping string tokens to numertical indices.
def listVocab(sentence_set):
    
    PAD_token = 0
    BOS_token = 1
    EOS_token = 2
    UNK_token = 3
    
    all_tokens = []
    word_count = {}
    token2index = {"<PAD>": 0,"<BOS>":1,"<EOS>":2,"<UNK>":3}
    index2token = {PAD_token: "<PAD>", BOS_token: "<BOS>", EOS_token: "<EOS>", UNK_token: "<UNK>"}
    
    #for set_i in vid_sentence_set:
    #    sentence_set = vid_sentence_set[set_i]
    #    for line in sentence_set: 
    
    for n in sentence_set:
        line = sentence_set[n]
        tokenized_captions = tokenize(line) #Seperate the words
        all_tokens += tokenized_captions
    
    counter = count_tokens(all_tokens) #Count the word repeatitions in each set
    
    counter_dict = counter.items()
    counter_sort = sorted(counter_dict, key=lambda x:x[1],reverse=True) #sort by frequency of occurance 
    #print(counter_sort)

    i = len(index2token)
    values = [0,1,2,3]
    tokens = ["<PAD>","<BOS>","<EOS>","<UNK>"]
    for token, freq in counter_sort:
        word_count[token] = freq
        index2token[i] = token
        token2index[token] = i
        values += [i]
        tokens += [token]
        i+=1
        
    word_count['<PAD>'] = i
    word_count['<BOS>'] = i
    word_count['<EOS>'] = i
    word_count['<UNK>'] = i
    
    bias_init_vector = np.array([1.0 * word_count[ index2token[i] ] for i in index2token])
    bias_init_vector /= np.sum(bias_init_vector) # normalize to frequencies
    bias_init_vector = np.log(bias_init_vector)
    bias_init_vector -= np.max(bias_init_vector) # shift to nice numeric range
    
    return [word_count, tokens, values, token2index, index2token, len(index2token),bias_init_vector]

In [9]:
def flattenList(nestedList,output): 
    for i in nestedList: 
        if type(i) == list: 
            flattenList(i,output) 
        else: 
            output.append(i) 
            
    return output

def num_encode(test_sentence,index2token,tokens,tokenized_sentence=[],num_encoded_sentence=[]):
    
    tokenized_sentence.clear()
    num_encoded_sentence.clear()
    
    tokenized_sentence = ["<BOS>"] + tokenize(test_sentence) + ["<EOS>"]
    #print(tokenized_sentence)
    output=[]
    tokenized_sentence = flattenList(tokenized_sentence,output)
    
    cap_len = len(tokenized_sentence)
    
    while len(tokenized_sentence) < MAX_WORDS:
        tokenized_sentence.append("<PAD>")    
    
    #print(len(tokenized_sentence))
    
    for ind, token in enumerate(tokenized_sentence):
        if token in tokens:
            for i in range(0,len(index2token)):
                if token == index2token[i]: 
                    num_encoded_sentence.append(i) 
                    
            #print("token exists")
        else:
            num_encoded_sentence.append(3)
            tokenized_sentence[ind] = tokens[3]
            #print("token unknown")
            
            
                
    #print(len(num_encoded_sentence))

        
    return tokenized_sentence, num_encoded_sentence, cap_len

In [10]:

def parse_sentence_data_into_batches(sentence_set, index2token,tokens,batch_size):

    tokenizedsentence_batch = {}
    intencode_batch = {}
    cap_len_batch = {}

    ii = 0
    jj = 0  

    for n in sentence_set:
        sentence = sentence_set[n]

        tokenized_sentence,encoded_sentence, cap_len = num_encode(sentence,index2token,tokens)
        
        #print(np.shape(encoded_sentence))

        tokenized_sentence = list(tokenized_sentence)
        encoded_sentence = list(encoded_sentence)

        if jj not in intencode_batch:
            #onehot_batch[jj] = []
            intencode_batch[jj] = []
            tokenizedsentence_batch[jj] = []
            cap_len_batch[jj] = []

        #print(np.shape(onehot_encoded_sentence))    
        #onehot_batch[jj].append(onehot_encoded_sentence)
        intencode_batch[jj].append(encoded_sentence)
        tokenizedsentence_batch[jj].append(tokenized_sentence)
        cap_len_batch[jj].append(cap_len)

        ii = ii+1

        if ii%batch_size == 0:
            jj = jj+1
            
        
    return tokenizedsentence_batch, intencode_batch, cap_len_batch

In [11]:
filename_train = 'MLDS_hw2_1_data/training_label.json'
filename_test = 'MLDS_hw2_1_data/testing_label.json'
feat_filepath_train = "MLDS_hw2_1_data/training_data/feat/{}.npy"
feat_filepath_test = "MLDS_hw2_1_data/testing_data/feat/{}.npy"

ckpt_path = 'saved_model/trained_model.ckpt'

# forget_bias_red = 1.0
# forget_bias_gre = 1.0
# dropout_prob    = 0.5

batch_size = 50

MAX_WORDS = max_caption_len #max number of words in a caption
n_features = n_inputs
no_of_frames = n_frames
sizeof_sentence= MAX_WORDS
learning_rate = 0.0001
n_hidden = n_hidden

#### PARSE TRAINING DATA #####

#Parse Training Data into batches
vid_batch, n_batches = parse_vid_data_into_batches(filename_train,batch_size,feat_filepath_train)
print("The number of videos in the training set are %d and each video has 80 frames with 4096 features/units each" % (n_batches*batch_size))

# Extracting captions for each video
sentence_set = extract_sentences(filename_train,feat_filepath_train)

word_count, tokens, values, token2index, index2token, n_words,bias_init_vector = listVocab(sentence_set)
print("There are %d unique words in the captions dataset" % n_words)

tokenizedsentence_batch, intencode_batch, cap_len_batch = parse_sentence_data_into_batches(sentence_set,index2token,tokens,batch_size)

# # integer encode
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(values)
# integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# integer_encoded

The number of videos in the training set are 1450 and each video has 80 frames with 4096 features/units each
There are 1988 unique words in the captions dataset


In [12]:
#### PARSE TESTING DATA #####

#Parse Testing Data into batches
vid_batch_test, n_batches_test = parse_vid_data_into_batches(filename_test,batch_size,feat_filepath_test)
print("The number of videos in the test set are %d and each video has 80 frames with 4096 features/units each" % (n_batches_test*batch_size))

# Extracting captions for each video
sentence_set_test = extract_sentences(filename_test,feat_filepath_test)
tokenizedsentence_batch_test, intencode_batch_test, cap_len_batch_test = parse_sentence_data_into_batches(sentence_set_test,index2token,tokens,batch_size)


The number of videos in the test set are 100 and each video has 80 frames with 4096 features/units each


In [13]:
def schedule_sampling(sampling_prob, cap_len_batch):

        sampling = np.ones(max_caption_len, dtype = bool)
        for l in range(max_caption_len):
            if np.random.uniform(0,1,1) < sampling_prob:
                sampling[l] = True
            else:
                sampling[l] = False
         
        sampling[0] = True
        return sampling

In [14]:
def inv_sigmoid(num_epo):

    # 0.88 to 0.12 (-2.0 to 2.0)
    x = np.arange(-2.0, 2.0, (4.0/num_epo))
    y = 1/(1 + np.e**x)
    #y = np.ones(num_epo)
    print(y)
    return y

In [15]:
def dec_print_train(pred, cap_len, label, idx2word, batch_size, values):
    
    i = np.random.randint(0, batch_size)
    eos_pred = max_caption_len - 1
    eos = cap_len[i] - 1
    for j in range(0, max_caption_len):
            if pred[i][j] == special_tokens['<EOS>']:
                eos_pred = j
                break
    
    pre = list( map (lambda x: idx2word[x] , pred[i][0:eos_pred])  )
    lab = list( map (lambda x: idx2word[x] , label[i][0:eos])  )
    print('\nid: ' + str(values[i]) + '\nanswer: ' + str(lab) + '\nprediction: ' + str(pre))

In [16]:
# samp_prob = inv_sigmoid(num_epochs)
# samp_prob

NameError: name 'num_epochs' is not defined

In [None]:
# samp = schedule_sampling(samp_prob[epo], caption_lens_batch)
# samp

In [19]:
from tqdm import tqdm
train_graph = tf.Graph()
gpu_config = tf.ConfigProto()
gpu_config.gpu_options.allow_growth = True

print('train_graph: start')

vocab_num = n_words
num_epochs = 10
num_display_steps = 15

with train_graph.as_default():
    feat = tf.placeholder(tf.float32, [None, n_frames, n_inputs], name='video_features')
    captions = tf.placeholder(tf.int32, [None, max_caption_len], name='captions')
    sampling = tf.placeholder(tf.bool, [max_caption_len], name='sampling')
    cap_len = tf.placeholder(tf.int32, [None], name='cap_len')
    model = S2VT(vocab_num=vocab_num, lr=learning_rate)
    logits, loss_op, summary = model.build_model(feat, captions, cap_len, sampling, phases['train'])
    dec_pred = model.inference(logits)
    train_op = model.optimize(loss_op)

    model.set_saver(tf.train.Saver(max_to_keep = 3))
    init = tf.global_variables_initializer()
train_sess = tf.Session(graph=train_graph, config=gpu_config)


train_sess.run(init)

samp_prob = inv_sigmoid(num_epochs)
pbar = tqdm(range(0, num_epochs))

for epo in pbar:
    num_steps = n_batches
    epo_loss = 0
    for i in range(0, num_steps):
        data_batch = np.array(vid_batch[i])
        label_batch = np.array(intencode_batch[i])
        caption_lens_batch = np.array(cap_len_batch[i])
        
        #data_batch, label_batch, caption_lens_batch, id_batch = datasetTrain.next_batch()
        
        samp = schedule_sampling(samp_prob[epo], caption_lens_batch)
        
        if i % num_display_steps == 1:
            # training 
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            _, loss, p, summ = train_sess.run([train_op, loss_op, dec_pred, summary], 
                            feed_dict={feat: data_batch,
                                       captions: label_batch,
                                       cap_len: caption_lens_batch,
                                       sampling: samp},
                            options=run_options)
            
            #summary_writer.add_summary(summ, global_step=(epo * num_steps) + i)
            print("\n[Train. Prediction] Epoch " + str(epo) + ", step " + str(i) + "/" + str(num_steps) + "......",)
            
            dec_print_train(p, caption_lens_batch, label_batch, index2token, batch_size, values)

        else:
            _, loss, p = train_sess.run([train_op, loss_op, dec_pred], 
                            feed_dict={feat: data_batch,
                                       captions: label_batch,
                                       cap_len: caption_lens_batch,
                                       sampling: samp})

        epo_loss += loss
        pbar.set_description("Epoch " + str(epo) + ", step " + str(i) + "/" + str(num_steps) + \
            ", (Training Loss: " + "{:.4f}".format(loss) + \
            ", samp_prob: " + "{:.4f}".format(samp_prob[epo]) + ")" )

    print("\n[FINISHED] Epoch " + str(epo) + ", (Training Loss (per epoch): " + "{:.4f}".format(epo_loss) + " samp_prob: " + "{:.4f}".format(samp_prob[epo]) + ")")



train_graph: start




  0%|          | 0/10 [00:00<?, ?it/s][A[A

[0.88079708 0.83201839 0.76852478 0.68997448 0.59868766 0.5
 0.40131234 0.31002552 0.23147522 0.16798161]




Epoch 0, step 0/29, (Training Loss: 7.6513, samp_prob: 0.8808):   0%|          | 0/10 [00:40<?, ?it/s][A[A

Epoch 0, step 1/29, (Training Loss: 7.5223, samp_prob: 0.8808):   0%|          | 0/10 [01:20<?, ?it/s][A[A


[Train. Prediction] Epoch 0, step 1/29......

id: 40
answer: ['<BOS>', 'a', 'baby', 'is', 'repeatedly', 'kissing', 'his', 'reflection', 'in', 'a', 'mirror']
prediction: ['patted', 'saxaphone', 'petted', 'saxaphone', 'patted', 'microwave', 'microwave', 'chin', 'tatoos', 'quickly', 'show', 'pinkish', 'pealing', 'story', 'its', 'adds', 'hip', 'squeezes', 'dish', 'lotion', 'instruments', 'his', 'bolt', 'steps', 'singing', 'lady', 'animal', 'hair', 'drills', 'nicholson', 'backpack', 'tricycle', 'biking', 'pulp', 'sprinkles', 'bra', 'fenced', 'kick', 'close', 'yard', 'each', 'add', 'skull', 'target', 'noose', 'add', 'puddle', 'putting', 'processed']




Epoch 0, step 2/29, (Training Loss: 7.3753, samp_prob: 0.8808):   0%|          | 0/10 [01:22<?, ?it/s][A[A

Epoch 0, step 3/29, (Training Loss: 7.2473, samp_prob: 0.8808):   0%|          | 0/10 [01:24<?, ?it/s][A[A

Epoch 0, step 4/29, (Training Loss: 7.1124, samp_prob: 0.8808):   0%|          | 0/10 [01:26<?, ?it/s][A[A

Epoch 0, step 5/29, (Training Loss: 6.9959, samp_prob: 0.8808):   0%|          | 0/10 [01:29<?, ?it/s][A[A

Epoch 0, step 6/29, (Training Loss: 6.8193, samp_prob: 0.8808):   0%|          | 0/10 [01:31<?, ?it/s][A[A

Epoch 0, step 7/29, (Training Loss: 6.6973, samp_prob: 0.8808):   0%|          | 0/10 [01:33<?, ?it/s][A[A

Epoch 0, step 8/29, (Training Loss: 6.5132, samp_prob: 0.8808):   0%|          | 0/10 [01:35<?, ?it/s][A[A

Epoch 0, step 9/29, (Training Loss: 6.2832, samp_prob: 0.8808):   0%|          | 0/10 [01:37<?, ?it/s][A[A

Epoch 0, step 10/29, (Training Loss: 6.0948, samp_prob: 0.8808):   0%|          | 0/10 [01:39<?, ?it/s][A[A

Epoch 0


[Train. Prediction] Epoch 0, step 16/29......

id: 47
answer: ['<BOS>', 'two', 'men', 'are', 'walking', 'down', 'a', 'street', 'holding', 'their', 'jackets', 'and', 'talking']
prediction: ['<BOS>', 'a', 'a', 'a', 'a', 'a']




Epoch 0, step 17/29, (Training Loss: 5.5791, samp_prob: 0.8808):   0%|          | 0/10 [01:55<?, ?it/s][A[A

Epoch 0, step 18/29, (Training Loss: 5.5023, samp_prob: 0.8808):   0%|          | 0/10 [01:57<?, ?it/s][A[A

Epoch 0, step 19/29, (Training Loss: 5.3681, samp_prob: 0.8808):   0%|          | 0/10 [01:59<?, ?it/s][A[A

Epoch 0, step 20/29, (Training Loss: 5.6569, samp_prob: 0.8808):   0%|          | 0/10 [02:01<?, ?it/s][A[A

Epoch 0, step 21/29, (Training Loss: 5.6055, samp_prob: 0.8808):   0%|          | 0/10 [02:03<?, ?it/s][A[A

Epoch 0, step 22/29, (Training Loss: 5.4339, samp_prob: 0.8808):   0%|          | 0/10 [02:05<?, ?it/s][A[A

Epoch 0, step 23/29, (Training Loss: 5.4650, samp_prob: 0.8808):   0%|          | 0/10 [02:07<?, ?it/s][A[A

Epoch 0, step 24/29, (Training Loss: 5.0805, samp_prob: 0.8808):   0%|          | 0/10 [02:09<?, ?it/s][A[A

Epoch 0, step 25/29, (Training Loss: 5.2738, samp_prob: 0.8808):   0%|          | 0/10 [02:12<?, ?it/s][A[A



[FINISHED] Epoch 0, (Training Loss (per epoch): 174.9346 samp_prob: 0.8808)




Epoch 1, step 0/29, (Training Loss: 5.1056, samp_prob: 0.8320):  10%|█         | 1/10 [02:20<20:46, 138.55s/it] [A[A

Epoch 1, step 1/29, (Training Loss: 4.9625, samp_prob: 0.8320):  10%|█         | 1/10 [02:22<20:46, 138.55s/it][A[A


[Train. Prediction] Epoch 1, step 1/29......

id: 28
answer: ['<BOS>', 'a', 'boy', 'takes', 'a', 'drink', 'from', 'a', 'plastic', 'cup', 'makes', 'a', 'face', 'and', 'tosses', 'the', 'liquid', 'toward', 'some', 'plants', 'hitting', 'a', 'camera', 'sitting', 'on', 'the', 'ledge']
prediction: ['<BOS>', 'a', 'a', 'a', 'a']




Epoch 1, step 2/29, (Training Loss: 4.9682, samp_prob: 0.8320):  10%|█         | 1/10 [02:25<20:46, 138.55s/it][A[A

Epoch 1, step 3/29, (Training Loss: 4.9703, samp_prob: 0.8320):  10%|█         | 1/10 [02:27<20:46, 138.55s/it][A[A

Epoch 1, step 4/29, (Training Loss: 4.9985, samp_prob: 0.8320):  10%|█         | 1/10 [02:29<20:46, 138.55s/it][A[A

Epoch 1, step 5/29, (Training Loss: 4.8432, samp_prob: 0.8320):  10%|█         | 1/10 [02:31<20:46, 138.55s/it][A[A

Epoch 1, step 6/29, (Training Loss: 5.0794, samp_prob: 0.8320):  10%|█         | 1/10 [02:33<20:46, 138.55s/it][A[A

Epoch 1, step 7/29, (Training Loss: 4.9403, samp_prob: 0.8320):  10%|█         | 1/10 [02:35<20:46, 138.55s/it][A[A

Epoch 1, step 8/29, (Training Loss: 5.0638, samp_prob: 0.8320):  10%|█         | 1/10 [02:37<20:46, 138.55s/it][A[A

Epoch 1, step 9/29, (Training Loss: 4.9186, samp_prob: 0.8320):  10%|█         | 1/10 [02:40<20:46, 138.55s/it][A[A

Epoch 1, step 10/29, (Training Loss: 4.7393, s


[Train. Prediction] Epoch 1, step 16/29......

id: 30
answer: ['<BOS>', 'a', 'kid', 'gets', 'knocked', 'down', 'by', 'an', 'animal']
prediction: ['<BOS>', 'a', 'a', 'man', 'a', 'a']




Epoch 1, step 17/29, (Training Loss: 4.8104, samp_prob: 0.8320):  10%|█         | 1/10 [02:57<20:46, 138.55s/it][A[A

Epoch 1, step 18/29, (Training Loss: 4.7955, samp_prob: 0.8320):  10%|█         | 1/10 [02:59<20:46, 138.55s/it][A[A

Epoch 1, step 19/29, (Training Loss: 4.7082, samp_prob: 0.8320):  10%|█         | 1/10 [03:01<20:46, 138.55s/it][A[A

Epoch 1, step 20/29, (Training Loss: 5.0966, samp_prob: 0.8320):  10%|█         | 1/10 [03:03<20:46, 138.55s/it][A[A

Epoch 1, step 21/29, (Training Loss: 5.0640, samp_prob: 0.8320):  10%|█         | 1/10 [03:05<20:46, 138.55s/it][A[A

Epoch 1, step 22/29, (Training Loss: 4.8599, samp_prob: 0.8320):  10%|█         | 1/10 [03:07<20:46, 138.55s/it][A[A

Epoch 1, step 23/29, (Training Loss: 4.9470, samp_prob: 0.8320):  10%|█         | 1/10 [03:09<20:46, 138.55s/it][A[A

Epoch 1, step 24/29, (Training Loss: 4.6353, samp_prob: 0.8320):  10%|█         | 1/10 [03:12<20:46, 138.55s/it][A[A

Epoch 1, step 25/29, (Training Loss: 4


[FINISHED] Epoch 1, (Training Loss (per epoch): 141.9564 samp_prob: 0.8320)




Epoch 2, step 0/29, (Training Loss: 4.7063, samp_prob: 0.7685):  20%|██        | 2/10 [03:22<15:24, 115.60s/it] [A[A

Epoch 2, step 1/29, (Training Loss: 4.6019, samp_prob: 0.7685):  20%|██        | 2/10 [03:25<15:24, 115.60s/it][A[A


[Train. Prediction] Epoch 2, step 1/29......

id: 28
answer: ['<BOS>', 'a', 'boy', 'takes', 'a', 'drink', 'from', 'a', 'plastic', 'cup', 'makes', 'a', 'face', 'and', 'tosses', 'the', 'liquid', 'toward', 'some', 'plants', 'hitting', 'a', 'camera', 'sitting', 'on', 'the', 'ledge']
prediction: ['<BOS>', 'a', 'a', 'a', 'a', 'a']




Epoch 2, step 2/29, (Training Loss: 4.6039, samp_prob: 0.7685):  20%|██        | 2/10 [03:27<15:24, 115.60s/it][A[A

Epoch 2, step 3/29, (Training Loss: 4.6417, samp_prob: 0.7685):  20%|██        | 2/10 [03:29<15:24, 115.60s/it][A[A

Epoch 2, step 4/29, (Training Loss: 4.7206, samp_prob: 0.7685):  20%|██        | 2/10 [03:31<15:24, 115.60s/it][A[A

Epoch 2, step 5/29, (Training Loss: 4.4727, samp_prob: 0.7685):  20%|██        | 2/10 [03:33<15:24, 115.60s/it][A[A

Epoch 2, step 6/29, (Training Loss: 4.6912, samp_prob: 0.7685):  20%|██        | 2/10 [03:35<15:24, 115.60s/it][A[A

Epoch 2, step 7/29, (Training Loss: 4.6470, samp_prob: 0.7685):  20%|██        | 2/10 [03:37<15:24, 115.60s/it][A[A

Epoch 2, step 8/29, (Training Loss: 4.7772, samp_prob: 0.7685):  20%|██        | 2/10 [03:39<15:24, 115.60s/it][A[A

Epoch 2, step 9/29, (Training Loss: 4.6451, samp_prob: 0.7685):  20%|██        | 2/10 [03:42<15:24, 115.60s/it][A[A

Epoch 2, step 10/29, (Training Loss: 4.4056, s


[Train. Prediction] Epoch 2, step 16/29......

id: 37
answer: ['<BOS>', 'a', 'man', 'is', 'riding', 'a', 'motorcycle', 'with', 'a', 'woman', 'riding', 'behind', 'him', 'as', 'a', 'passenger']
prediction: ['<BOS>', 'a', 'man', 'man', 'a']




Epoch 2, step 17/29, (Training Loss: 4.5613, samp_prob: 0.7685):  20%|██        | 2/10 [03:59<15:24, 115.60s/it][A[A

Epoch 2, step 18/29, (Training Loss: 4.5519, samp_prob: 0.7685):  20%|██        | 2/10 [04:01<15:24, 115.60s/it][A[A

Epoch 2, step 19/29, (Training Loss: 4.5450, samp_prob: 0.7685):  20%|██        | 2/10 [04:03<15:24, 115.60s/it][A[A

Epoch 2, step 20/29, (Training Loss: 4.8421, samp_prob: 0.7685):  20%|██        | 2/10 [04:05<15:24, 115.60s/it][A[A

Epoch 2, step 21/29, (Training Loss: 4.8389, samp_prob: 0.7685):  20%|██        | 2/10 [04:07<15:24, 115.60s/it][A[A

Epoch 2, step 22/29, (Training Loss: 4.6863, samp_prob: 0.7685):  20%|██        | 2/10 [04:09<15:24, 115.60s/it][A[A

Epoch 2, step 23/29, (Training Loss: 4.7569, samp_prob: 0.7685):  20%|██        | 2/10 [04:11<15:24, 115.60s/it][A[A

Epoch 2, step 24/29, (Training Loss: 4.5265, samp_prob: 0.7685):  20%|██        | 2/10 [04:13<15:24, 115.60s/it][A[A

Epoch 2, step 25/29, (Training Loss: 4


[FINISHED] Epoch 2, (Training Loss (per epoch): 134.5950 samp_prob: 0.7685)




Epoch 3, step 0/29, (Training Loss: 4.4651, samp_prob: 0.6900):  30%|███       | 3/10 [04:24<11:36, 99.45s/it] [A[A

Epoch 3, step 1/29, (Training Loss: 4.3919, samp_prob: 0.6900):  30%|███       | 3/10 [04:26<11:36, 99.45s/it][A[A


[Train. Prediction] Epoch 3, step 1/29......

id: 41
answer: ['<BOS>', 'a', 'cartoon', 'is', 'swinging']
prediction: ['<BOS>', 'a', 'is', 'is', 'a', 'a']




Epoch 3, step 2/29, (Training Loss: 4.4435, samp_prob: 0.6900):  30%|███       | 3/10 [04:28<11:36, 99.45s/it][A[A

Epoch 3, step 3/29, (Training Loss: 4.4851, samp_prob: 0.6900):  30%|███       | 3/10 [04:31<11:36, 99.45s/it][A[A

Epoch 3, step 4/29, (Training Loss: 4.5470, samp_prob: 0.6900):  30%|███       | 3/10 [04:33<11:36, 99.45s/it][A[A

Epoch 3, step 5/29, (Training Loss: 4.4076, samp_prob: 0.6900):  30%|███       | 3/10 [04:35<11:36, 99.45s/it][A[A

Epoch 3, step 6/29, (Training Loss: 4.6167, samp_prob: 0.6900):  30%|███       | 3/10 [04:37<11:36, 99.45s/it][A[A

Epoch 3, step 7/29, (Training Loss: 4.4688, samp_prob: 0.6900):  30%|███       | 3/10 [04:39<11:36, 99.45s/it][A[A

Epoch 3, step 8/29, (Training Loss: 4.7145, samp_prob: 0.6900):  30%|███       | 3/10 [04:41<11:36, 99.45s/it][A[A

Epoch 3, step 9/29, (Training Loss: 4.5214, samp_prob: 0.6900):  30%|███       | 3/10 [04:43<11:36, 99.45s/it][A[A

Epoch 3, step 10/29, (Training Loss: 4.2878, samp_prob


[Train. Prediction] Epoch 3, step 16/29......

id: 1
answer: ['<BOS>', 'a', 'chef', 'pealing', 'a', 'onion', 'for', 'a', 'dish']
prediction: ['<BOS>', 'a', 'woman', 'a', 'a', 'a']




Epoch 3, step 17/29, (Training Loss: 4.4513, samp_prob: 0.6900):  30%|███       | 3/10 [05:00<11:36, 99.45s/it][A[A

Epoch 3, step 18/29, (Training Loss: 4.4800, samp_prob: 0.6900):  30%|███       | 3/10 [05:03<11:36, 99.45s/it][A[A

Epoch 3, step 19/29, (Training Loss: 4.4768, samp_prob: 0.6900):  30%|███       | 3/10 [05:05<11:36, 99.45s/it][A[A

Epoch 3, step 20/29, (Training Loss: 4.7669, samp_prob: 0.6900):  30%|███       | 3/10 [05:07<11:36, 99.45s/it][A[A

Epoch 3, step 21/29, (Training Loss: 4.7495, samp_prob: 0.6900):  30%|███       | 3/10 [05:09<11:36, 99.45s/it][A[A

Epoch 3, step 22/29, (Training Loss: 4.5687, samp_prob: 0.6900):  30%|███       | 3/10 [05:11<11:36, 99.45s/it][A[A

Epoch 3, step 23/29, (Training Loss: 4.6284, samp_prob: 0.6900):  30%|███       | 3/10 [05:13<11:36, 99.45s/it][A[A

Epoch 3, step 24/29, (Training Loss: 4.4396, samp_prob: 0.6900):  30%|███       | 3/10 [05:15<11:36, 99.45s/it][A[A

Epoch 3, step 25/29, (Training Loss: 4.5445, s


[FINISHED] Epoch 3, (Training Loss (per epoch): 131.0247 samp_prob: 0.6900)




Epoch 4, step 0/29, (Training Loss: 4.3718, samp_prob: 0.5987):  40%|████      | 4/10 [05:26<08:48, 88.16s/it] [A[A

Epoch 4, step 1/29, (Training Loss: 4.3259, samp_prob: 0.5987):  40%|████      | 4/10 [05:28<08:48, 88.16s/it][A[A


[Train. Prediction] Epoch 4, step 1/29......

id: 4
answer: ['<BOS>', 'a', 'man', 'playing', 'drums']
prediction: ['<BOS>', 'a', 'man', 'is', 'is', 'a', 'a']




Epoch 4, step 2/29, (Training Loss: 4.3677, samp_prob: 0.5987):  40%|████      | 4/10 [05:30<08:48, 88.16s/it][A[A

Epoch 4, step 3/29, (Training Loss: 4.3510, samp_prob: 0.5987):  40%|████      | 4/10 [05:32<08:48, 88.16s/it][A[A

Epoch 4, step 4/29, (Training Loss: 4.4381, samp_prob: 0.5987):  40%|████      | 4/10 [05:35<08:48, 88.16s/it][A[A

Epoch 4, step 5/29, (Training Loss: 4.2893, samp_prob: 0.5987):  40%|████      | 4/10 [05:37<08:48, 88.16s/it][A[A

Epoch 4, step 6/29, (Training Loss: 4.4677, samp_prob: 0.5987):  40%|████      | 4/10 [05:39<08:48, 88.16s/it][A[A

Epoch 4, step 7/29, (Training Loss: 4.3891, samp_prob: 0.5987):  40%|████      | 4/10 [05:41<08:48, 88.16s/it][A[A

Epoch 4, step 8/29, (Training Loss: 4.6505, samp_prob: 0.5987):  40%|████      | 4/10 [05:43<08:48, 88.16s/it][A[A

Epoch 4, step 9/29, (Training Loss: 4.4511, samp_prob: 0.5987):  40%|████      | 4/10 [05:45<08:48, 88.16s/it][A[A

Epoch 4, step 10/29, (Training Loss: 4.2584, samp_prob


[Train. Prediction] Epoch 4, step 16/29......

id: 15
answer: ['<BOS>', 'a', 'car', 'is', 'driving']
prediction: ['<BOS>', 'a', 'woman', 'is', 'a']




Epoch 4, step 17/29, (Training Loss: 4.4379, samp_prob: 0.5987):  40%|████      | 4/10 [06:02<08:48, 88.16s/it][A[A

Epoch 4, step 18/29, (Training Loss: 4.3486, samp_prob: 0.5987):  40%|████      | 4/10 [06:05<08:48, 88.16s/it][A[A

Epoch 4, step 19/29, (Training Loss: 4.3936, samp_prob: 0.5987):  40%|████      | 4/10 [06:07<08:48, 88.16s/it][A[A

Epoch 4, step 20/29, (Training Loss: 4.7151, samp_prob: 0.5987):  40%|████      | 4/10 [06:09<08:48, 88.16s/it][A[A

Epoch 4, step 21/29, (Training Loss: 4.7047, samp_prob: 0.5987):  40%|████      | 4/10 [06:11<08:48, 88.16s/it][A[A

Epoch 4, step 22/29, (Training Loss: 4.5438, samp_prob: 0.5987):  40%|████      | 4/10 [06:13<08:48, 88.16s/it][A[A

Epoch 4, step 23/29, (Training Loss: 4.6031, samp_prob: 0.5987):  40%|████      | 4/10 [06:15<08:48, 88.16s/it][A[A

Epoch 4, step 24/29, (Training Loss: 4.3387, samp_prob: 0.5987):  40%|████      | 4/10 [06:17<08:48, 88.16s/it][A[A

Epoch 4, step 25/29, (Training Loss: 4.4631, s


[FINISHED] Epoch 4, (Training Loss (per epoch): 128.9962 samp_prob: 0.5987)




Epoch 5, step 0/29, (Training Loss: 4.3278, samp_prob: 0.5000):  50%|█████     | 5/10 [06:28<06:41, 80.35s/it] [A[A

Epoch 5, step 1/29, (Training Loss: 4.2125, samp_prob: 0.5000):  50%|█████     | 5/10 [06:30<06:41, 80.35s/it][A[A


[Train. Prediction] Epoch 5, step 1/29......

id: 41
answer: ['<BOS>', 'a', 'cartoon', 'is', 'swinging']
prediction: ['<BOS>', 'a', 'is', 'is', 'is']




Epoch 5, step 2/29, (Training Loss: 4.2432, samp_prob: 0.5000):  50%|█████     | 5/10 [06:32<06:41, 80.35s/it][A[A

Epoch 5, step 3/29, (Training Loss: 4.3247, samp_prob: 0.5000):  50%|█████     | 5/10 [06:34<06:41, 80.35s/it][A[A

Epoch 5, step 4/29, (Training Loss: 4.3481, samp_prob: 0.5000):  50%|█████     | 5/10 [06:37<06:41, 80.35s/it][A[A

Epoch 5, step 5/29, (Training Loss: 4.1663, samp_prob: 0.5000):  50%|█████     | 5/10 [06:39<06:41, 80.35s/it][A[A

Epoch 5, step 6/29, (Training Loss: 4.3850, samp_prob: 0.5000):  50%|█████     | 5/10 [06:41<06:41, 80.35s/it][A[A

Epoch 5, step 7/29, (Training Loss: 4.2901, samp_prob: 0.5000):  50%|█████     | 5/10 [06:43<06:41, 80.35s/it][A[A

Epoch 5, step 8/29, (Training Loss: 4.5620, samp_prob: 0.5000):  50%|█████     | 5/10 [06:45<06:41, 80.35s/it][A[A

Epoch 5, step 9/29, (Training Loss: 4.3278, samp_prob: 0.5000):  50%|█████     | 5/10 [06:47<06:41, 80.35s/it][A[A

Epoch 5, step 10/29, (Training Loss: 4.1292, samp_prob


[Train. Prediction] Epoch 5, step 16/29......

id: 38
answer: ['<BOS>', 'a', 'person', 'cutting', 'up', 'vegatables']
prediction: ['<BOS>', 'a', 'woman', 'is', 'a', 'a']




Epoch 5, step 17/29, (Training Loss: 4.3304, samp_prob: 0.5000):  50%|█████     | 5/10 [07:05<06:41, 80.35s/it][A[A

Epoch 5, step 18/29, (Training Loss: 4.2453, samp_prob: 0.5000):  50%|█████     | 5/10 [07:07<06:41, 80.35s/it][A[A

Epoch 5, step 19/29, (Training Loss: 4.2703, samp_prob: 0.5000):  50%|█████     | 5/10 [07:09<06:41, 80.35s/it][A[A

Epoch 5, step 20/29, (Training Loss: 4.5995, samp_prob: 0.5000):  50%|█████     | 5/10 [07:11<06:41, 80.35s/it][A[A

Epoch 5, step 21/29, (Training Loss: 4.5685, samp_prob: 0.5000):  50%|█████     | 5/10 [07:13<06:41, 80.35s/it][A[A

Epoch 5, step 22/29, (Training Loss: 4.4919, samp_prob: 0.5000):  50%|█████     | 5/10 [07:15<06:41, 80.35s/it][A[A

Epoch 5, step 23/29, (Training Loss: 4.4734, samp_prob: 0.5000):  50%|█████     | 5/10 [07:17<06:41, 80.35s/it][A[A

Epoch 5, step 24/29, (Training Loss: 4.2423, samp_prob: 0.5000):  50%|█████     | 5/10 [07:19<06:41, 80.35s/it][A[A

Epoch 5, step 25/29, (Training Loss: 4.4181, s


[FINISHED] Epoch 5, (Training Loss (per epoch): 126.2917 samp_prob: 0.5000)




Epoch 6, step 0/29, (Training Loss: 4.2749, samp_prob: 0.4013):  60%|██████    | 6/10 [07:30<04:59, 74.90s/it] [A[A

Epoch 6, step 1/29, (Training Loss: 4.1802, samp_prob: 0.4013):  60%|██████    | 6/10 [07:32<04:59, 74.90s/it][A[A


[Train. Prediction] Epoch 6, step 1/29......

id: 21
answer: ['<BOS>', 'a', 'man', 'holding', 'an', 'umbrella', 'has', 'jumped', 'a', 'hurdle', 'and', 'a', 'wall']
prediction: ['<BOS>', 'a', 'man', 'is', 'is', 'a', 'a', 'a']




Epoch 6, step 2/29, (Training Loss: 4.1878, samp_prob: 0.4013):  60%|██████    | 6/10 [07:35<04:59, 74.90s/it][A[A

Epoch 6, step 3/29, (Training Loss: 4.2511, samp_prob: 0.4013):  60%|██████    | 6/10 [07:37<04:59, 74.90s/it][A[A

Epoch 6, step 4/29, (Training Loss: 4.2856, samp_prob: 0.4013):  60%|██████    | 6/10 [07:39<04:59, 74.90s/it][A[A

Epoch 6, step 5/29, (Training Loss: 4.1226, samp_prob: 0.4013):  60%|██████    | 6/10 [07:41<04:59, 74.90s/it][A[A

Epoch 6, step 6/29, (Training Loss: 4.3789, samp_prob: 0.4013):  60%|██████    | 6/10 [07:43<04:59, 74.90s/it][A[A

Epoch 6, step 7/29, (Training Loss: 4.2560, samp_prob: 0.4013):  60%|██████    | 6/10 [07:45<04:59, 74.90s/it][A[A

Epoch 6, step 8/29, (Training Loss: 4.4255, samp_prob: 0.4013):  60%|██████    | 6/10 [07:47<04:59, 74.90s/it][A[A

Epoch 6, step 9/29, (Training Loss: 4.2851, samp_prob: 0.4013):  60%|██████    | 6/10 [07:49<04:59, 74.90s/it][A[A

Epoch 6, step 10/29, (Training Loss: 4.0205, samp_prob


[Train. Prediction] Epoch 6, step 16/29......

id: 43
answer: ['<BOS>', 'a', 'boy', 'kneeling', 'in', 'front', 'of', 'a', 'bench', 'is', 'moving', 'his', 'arms', 'and', 'body', 'rhythmically', 'and', 'then', 'he', 'begins', 'to', 'dance', 'on', 'and', 'around', 'the', 'bench']
prediction: ['<BOS>', 'a', 'is', 'is', 'is', 'a']




Epoch 6, step 17/29, (Training Loss: 4.2646, samp_prob: 0.4013):  60%|██████    | 6/10 [08:07<04:59, 74.90s/it][A[A

Epoch 6, step 18/29, (Training Loss: 4.2106, samp_prob: 0.4013):  60%|██████    | 6/10 [08:09<04:59, 74.90s/it][A[A

Epoch 6, step 19/29, (Training Loss: 4.2364, samp_prob: 0.4013):  60%|██████    | 6/10 [08:11<04:59, 74.90s/it][A[A

Epoch 6, step 20/29, (Training Loss: 4.6084, samp_prob: 0.4013):  60%|██████    | 6/10 [08:13<04:59, 74.90s/it][A[A

Epoch 6, step 21/29, (Training Loss: 4.5034, samp_prob: 0.4013):  60%|██████    | 6/10 [08:15<04:59, 74.90s/it][A[A

Epoch 6, step 22/29, (Training Loss: 4.4040, samp_prob: 0.4013):  60%|██████    | 6/10 [08:17<04:59, 74.90s/it][A[A

Epoch 6, step 23/29, (Training Loss: 4.4571, samp_prob: 0.4013):  60%|██████    | 6/10 [08:19<04:59, 74.90s/it][A[A

Epoch 6, step 24/29, (Training Loss: 4.1726, samp_prob: 0.4013):  60%|██████    | 6/10 [08:22<04:59, 74.90s/it][A[A

Epoch 6, step 25/29, (Training Loss: 4.3282, s


[FINISHED] Epoch 6, (Training Loss (per epoch): 124.6124 samp_prob: 0.4013)




Epoch 7, step 0/29, (Training Loss: 4.1741, samp_prob: 0.3100):  70%|███████   | 7/10 [08:32<03:33, 71.05s/it] [A[A

Epoch 7, step 1/29, (Training Loss: 4.1009, samp_prob: 0.3100):  70%|███████   | 7/10 [08:34<03:33, 71.05s/it][A[A


[Train. Prediction] Epoch 7, step 1/29......

id: 43
answer: ['<BOS>', 'a', 'young', 'man', 'is', 'hammering', 'nails', 'into', 'a', 'strip', 'of', 'wood', 'with', 'a', 'camera']
prediction: ['<BOS>', 'a', 'man', 'is', 'is', 'a', 'a']




Epoch 7, step 2/29, (Training Loss: 4.1210, samp_prob: 0.3100):  70%|███████   | 7/10 [08:37<03:33, 71.05s/it][A[A

Epoch 7, step 3/29, (Training Loss: 4.1853, samp_prob: 0.3100):  70%|███████   | 7/10 [08:39<03:33, 71.05s/it][A[A

Epoch 7, step 4/29, (Training Loss: 4.2327, samp_prob: 0.3100):  70%|███████   | 7/10 [08:41<03:33, 71.05s/it][A[A

Epoch 7, step 5/29, (Training Loss: 4.0579, samp_prob: 0.3100):  70%|███████   | 7/10 [08:43<03:33, 71.05s/it][A[A

Epoch 7, step 6/29, (Training Loss: 4.2757, samp_prob: 0.3100):  70%|███████   | 7/10 [08:45<03:33, 71.05s/it][A[A

Epoch 7, step 7/29, (Training Loss: 4.1824, samp_prob: 0.3100):  70%|███████   | 7/10 [08:47<03:33, 71.05s/it][A[A

Epoch 7, step 8/29, (Training Loss: 4.4081, samp_prob: 0.3100):  70%|███████   | 7/10 [08:49<03:33, 71.05s/it][A[A

Epoch 7, step 9/29, (Training Loss: 4.2084, samp_prob: 0.3100):  70%|███████   | 7/10 [08:51<03:33, 71.05s/it][A[A

Epoch 7, step 10/29, (Training Loss: 3.9972, samp_prob


[Train. Prediction] Epoch 7, step 16/29......

id: 22
answer: ['<BOS>', 'a', 'boy', 'is', 'singing', 'into', 'a', 'mic']
prediction: ['<BOS>', 'a', 'man', 'is', 'a']




Epoch 7, step 17/29, (Training Loss: 4.2344, samp_prob: 0.3100):  70%|███████   | 7/10 [09:09<03:33, 71.05s/it][A[A

Epoch 7, step 18/29, (Training Loss: 4.1712, samp_prob: 0.3100):  70%|███████   | 7/10 [09:11<03:33, 71.05s/it][A[A

Epoch 7, step 19/29, (Training Loss: 4.1624, samp_prob: 0.3100):  70%|███████   | 7/10 [09:13<03:33, 71.05s/it][A[A

Epoch 7, step 20/29, (Training Loss: 4.4953, samp_prob: 0.3100):  70%|███████   | 7/10 [09:15<03:33, 71.05s/it][A[A

Epoch 7, step 21/29, (Training Loss: 4.4343, samp_prob: 0.3100):  70%|███████   | 7/10 [09:17<03:33, 71.05s/it][A[A

Epoch 7, step 22/29, (Training Loss: 4.3511, samp_prob: 0.3100):  70%|███████   | 7/10 [09:19<03:33, 71.05s/it][A[A

Epoch 7, step 23/29, (Training Loss: 4.3630, samp_prob: 0.3100):  70%|███████   | 7/10 [09:22<03:33, 71.05s/it][A[A

Epoch 7, step 24/29, (Training Loss: 4.1038, samp_prob: 0.3100):  70%|███████   | 7/10 [09:24<03:33, 71.05s/it][A[A

Epoch 7, step 25/29, (Training Loss: 4.2630, s


[FINISHED] Epoch 7, (Training Loss (per epoch): 122.6865 samp_prob: 0.3100)




Epoch 8, step 0/29, (Training Loss: 4.1456, samp_prob: 0.2315):  80%|████████  | 8/10 [09:34<02:16, 68.36s/it] [A[A

Epoch 8, step 1/29, (Training Loss: 4.0571, samp_prob: 0.2315):  80%|████████  | 8/10 [09:37<02:16, 68.36s/it][A[A


[Train. Prediction] Epoch 8, step 1/29......

id: 11
answer: ['<BOS>', 'a', 'boy', 'sitting', 'at', 'a', 'picnic', 'table', 'watches', 'a', 'man', 'hurdle', 'over', 'the', 'table', 'and', 'do', 'a', 'back', 'flip']
prediction: ['<BOS>', 'a', 'man', 'a', 'is', 'a', 'a']




Epoch 8, step 2/29, (Training Loss: 4.0920, samp_prob: 0.2315):  80%|████████  | 8/10 [09:39<02:16, 68.36s/it][A[A

Epoch 8, step 3/29, (Training Loss: 4.1351, samp_prob: 0.2315):  80%|████████  | 8/10 [09:41<02:16, 68.36s/it][A[A

Epoch 8, step 4/29, (Training Loss: 4.2552, samp_prob: 0.2315):  80%|████████  | 8/10 [09:43<02:16, 68.36s/it][A[A

Epoch 8, step 5/29, (Training Loss: 4.0080, samp_prob: 0.2315):  80%|████████  | 8/10 [09:45<02:16, 68.36s/it][A[A

Epoch 8, step 6/29, (Training Loss: 4.1859, samp_prob: 0.2315):  80%|████████  | 8/10 [09:47<02:16, 68.36s/it][A[A

Epoch 8, step 7/29, (Training Loss: 4.0661, samp_prob: 0.2315):  80%|████████  | 8/10 [09:49<02:16, 68.36s/it][A[A

Epoch 8, step 8/29, (Training Loss: 4.3613, samp_prob: 0.2315):  80%|████████  | 8/10 [09:52<02:16, 68.36s/it][A[A

Epoch 8, step 9/29, (Training Loss: 4.1085, samp_prob: 0.2315):  80%|████████  | 8/10 [09:54<02:16, 68.36s/it][A[A

Epoch 8, step 10/29, (Training Loss: 3.9214, samp_prob


[Train. Prediction] Epoch 8, step 16/29......

id: 5
answer: ['<BOS>', 'a', 'man', 'shoots', 'in', 'a', 'practice', 'range']
prediction: ['<BOS>', 'a', 'man', 'is', 'a', 'a', 'a']




Epoch 8, step 17/29, (Training Loss: 4.1076, samp_prob: 0.2315):  80%|████████  | 8/10 [10:11<02:16, 68.36s/it][A[A

Epoch 8, step 18/29, (Training Loss: 4.0751, samp_prob: 0.2315):  80%|████████  | 8/10 [10:13<02:16, 68.36s/it][A[A

Epoch 8, step 19/29, (Training Loss: 4.1204, samp_prob: 0.2315):  80%|████████  | 8/10 [10:15<02:16, 68.36s/it][A[A

Epoch 8, step 20/29, (Training Loss: 4.4291, samp_prob: 0.2315):  80%|████████  | 8/10 [10:17<02:16, 68.36s/it][A[A

Epoch 8, step 21/29, (Training Loss: 4.4133, samp_prob: 0.2315):  80%|████████  | 8/10 [10:20<02:16, 68.36s/it][A[A

Epoch 8, step 22/29, (Training Loss: 4.3278, samp_prob: 0.2315):  80%|████████  | 8/10 [10:22<02:16, 68.36s/it][A[A

Epoch 8, step 23/29, (Training Loss: 4.2951, samp_prob: 0.2315):  80%|████████  | 8/10 [10:24<02:16, 68.36s/it][A[A

Epoch 8, step 24/29, (Training Loss: 4.0918, samp_prob: 0.2315):  80%|████████  | 8/10 [10:26<02:16, 68.36s/it][A[A

Epoch 8, step 25/29, (Training Loss: 4.2404, s


[FINISHED] Epoch 8, (Training Loss (per epoch): 121.0575 samp_prob: 0.2315)




Epoch 9, step 0/29, (Training Loss: 4.0550, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:37<01:06, 66.52s/it] [A[A

Epoch 9, step 1/29, (Training Loss: 3.9690, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:39<01:06, 66.52s/it][A[A


[Train. Prediction] Epoch 9, step 1/29......

id: 1
answer: ['<BOS>', 'a', 'woman', 'is', 'dancing']
prediction: ['<BOS>', 'a', 'man', 'is', 'is', 'a']




Epoch 9, step 2/29, (Training Loss: 4.0585, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:41<01:06, 66.52s/it][A[A

Epoch 9, step 3/29, (Training Loss: 4.0366, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:43<01:06, 66.52s/it][A[A

Epoch 9, step 4/29, (Training Loss: 4.0889, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:45<01:06, 66.52s/it][A[A

Epoch 9, step 5/29, (Training Loss: 3.8871, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:47<01:06, 66.52s/it][A[A

Epoch 9, step 6/29, (Training Loss: 4.1065, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:49<01:06, 66.52s/it][A[A

Epoch 9, step 7/29, (Training Loss: 4.0142, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:52<01:06, 66.52s/it][A[A

Epoch 9, step 8/29, (Training Loss: 4.3185, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:54<01:06, 66.52s/it][A[A

Epoch 9, step 9/29, (Training Loss: 4.0799, samp_prob: 0.1680):  90%|█████████ | 9/10 [10:56<01:06, 66.52s/it][A[A

Epoch 9, step 10/29, (Training Loss: 3.8034, samp_prob


[Train. Prediction] Epoch 9, step 16/29......

id: 3
answer: ['<BOS>', 'five', 'kittens', 'in', 'a', 'row', 'and', 'then', 'in', 'a', 'circle', 'are', 'eating', 'off', 'of', 'plates']
prediction: ['<BOS>', 'a', 'woman', 'is', 'on']




Epoch 9, step 17/29, (Training Loss: 4.0351, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:13<01:06, 66.52s/it][A[A

Epoch 9, step 18/29, (Training Loss: 4.0366, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:15<01:06, 66.52s/it][A[A

Epoch 9, step 19/29, (Training Loss: 4.0383, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:17<01:06, 66.52s/it][A[A

Epoch 9, step 20/29, (Training Loss: 4.3465, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:19<01:06, 66.52s/it][A[A

Epoch 9, step 21/29, (Training Loss: 4.3445, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:22<01:06, 66.52s/it][A[A

Epoch 9, step 22/29, (Training Loss: 4.2996, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:24<01:06, 66.52s/it][A[A

Epoch 9, step 23/29, (Training Loss: 4.2624, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:26<01:06, 66.52s/it][A[A

Epoch 9, step 24/29, (Training Loss: 4.0026, samp_prob: 0.1680):  90%|█████████ | 9/10 [11:28<01:06, 66.52s/it][A[A

Epoch 9, step 25/29, (Training Loss: 4.1877, s


[FINISHED] Epoch 9, (Training Loss (per epoch): 119.2239 samp_prob: 0.1680)





In [None]:
# model = Video_Caption_Generator(dim_image=n_features, 
#                                 n_words = n_words, 
#                                 dim_hidden = n_hidden, 
#                                 batch_size=batch_size, 
#                                 n_lstm_steps=80,
#                                 n_video_lstm_step=80,
#                                 n_caption_lstm_step=80,
#                                 bias_init_vector=bias_init_vector)

In [None]:
# tf_loss, tf_video, tf_video_mask, tf_caption, tf_caption_mask, tf_probs = model.build_model()


In [None]:
# n_words = n_words

# with tf.Graph().as_default() as graph:
    

#     weights_enc = tf.Variable(tf.random_uniform([n_features, n_hidden],-0.1,0.1),name="weights_enc")
#     bias_enc = tf.Variable(tf.zeros([n_hidden]),name="bias_enc")

#     weights_dec = tf.Variable(tf.random_uniform([n_hidden, n_words],-0.1,0.1),name="weights_dec")
#     bias_dec = tf.Variable(tf.zeros([n_words]),name="bias_dec")


#     x_video = tf.placeholder(tf.float32, (None, no_of_frames, n_features),'video_features') #inputs

#     batch_size = tf.shape(x_video)[0]
    
#     x_video_drop = tf.nn.dropout(x_video, 0.5)
    
#     x_video_flat = tf.reshape(x_video_drop,[-1,n_features])

#     y_label = tf.placeholder(tf.int32,(None, sizeof_sentence),'captions') #outputs


#     #sampling = tf.placeholder(tf.bool, [sizeof_sentence], name='sampling')
#     padding = tf.zeros([batch_size, n_hidden])

#     loss = 0.0

#     ########## DATA ###########
#     # Example: For i = 0
#     #batch_x = np.array(vid_batch[0])
#     #batch_y = np.array(intencode_batch[0])
#     ###########################

#     input_embedding = tf.matmul(x_video_flat,weights_enc) + bias_enc
#     input_embedding = tf.reshape(input_embedding,[-1, no_of_frames,n_hidden])
#     input_embed = tf.transpose(input_embedding, perm=[1, 0, 2])

#     with tf.device("/cpu:0"):
#         output_embedding = tf.Variable(tf.random_uniform((n_words, n_hidden),-0.1,0.1), name='dec_embedding')
#     # output_embed = tf.nn.embedding_lookup(output_embedding,y_label)
    
#     ## ENCODING #################################
    
#     with tf.variable_scope("LSTM1"):
#         lstm1 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden,state_is_tuple=True)
#         lstm1 = tf.contrib.rnn.DropoutWrapper(lstm1, output_keep_prob=0.5)    

#     with tf.variable_scope("LSTM2"):
#         lstm2 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, state_is_tuple=True)
#         lstm2 = tf.contrib.rnn.DropoutWrapper(lstm2, output_keep_prob=0.5)    


#     state1 = lstm1.zero_state(batch_size, dtype=tf.float32)
#     state2 = lstm2.zero_state(batch_size, dtype=tf.float32)
    
#     for i in range(0, no_of_frames):
        
#         if i > 0:
#                 tf.get_variable_scope().reuse_variables()
                
#         with tf.variable_scope("LSTM1"):
#             output1, state1 = lstm1(input_embed[i,:,:], state1)

#         with tf.variable_scope("LSTM2"):
#             output2, state2 = lstm2(tf.concat([padding, output1], axis=1), state2)
    
#     ## DECODING ##################################
    
#     bos = tf.ones([batch_size, n_hidden])
#     padding_in = tf.zeros([batch_size, n_hidden])

#     logits = []
#     cross_ent_list=[]
#     max_prob_index = None


#     for i in range(0, MAX_WORDS):
        
#         tf.get_variable_scope().reuse_variables()

        
#         with tf.variable_scope("LSTM1"):
#             output1, state1 = lstm1(padding_in, state1)
            
#         if i == 0:
            
#             with tf.variable_scope("LSTM2"):
#                 con = tf.concat([bos, output1], axis=1)
#                 output2, state2 = lstm2(con, state2)
                
#         else:
            
#             with tf.device("/cpu:0"):
            
#                 feed_in = y_label[:,i]
#                 #feed_in = tf.argmax()
#                 output_embed = tf.nn.embedding_lookup(output_embedding,feed_in)
                
#             with tf.variable_scope("LSTM2"):
#                 con = tf.concat([output_embed, output1], axis=1)
#                 output2, state2 = lstm2(con, state2)

#         logit_words = tf.matmul(output2, weights_dec) + bias_dec
#         logits.append(logit_words)

#         word_i = y_label[:,i]

#         one_hot_labels = tf.one_hot(word_i, n_words, on_value = 1, off_value = None, axis = 1) 
#         cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=one_hot_labels)
#         cross_ent_list.append(cross_entropy)
        
        
#         #current_loss = tf.reduce_sum(cross_entropy)/batch_size
#         #loss = loss + current_loss

#     cross_entropy_tensor = tf.stack(cross_ent_list, 1)
#     loss = tf.reduce_sum(cross_entropy_tensor, axis=1)
#     loss = tf.divide(loss, tf.cast(tf.Variable(sizeof_sentence), tf.float32))

#     loss = tf.reduce_mean(loss, axis=0)
    
#     summary = tf.summary.scalar('training_loss', loss)

#     params = tf.trainable_variables()
#     #optimizer = tf.train.AdamOptimizer(learning_rate)#.minimize(loss_op)
#     optimizer = tf.train.GradientDescentOptimizer(learning_rate)
#     train_op = optimizer.minimize(loss)

#     #train_step = optimizer.minimize(loss)
    
# #     gradients, variables = zip(*optimizer.compute_gradients(loss))
# #     gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
# #     train_op = optimizer.apply_gradients(zip(gradients, params))
    
# #     logits = tf.stack(logits, axis = 0)
# #     logits = tf.reshape(logits, (sizeof_sentence, batch_size, n_words))
# #     logits = tf.transpose(logits, [1, 0, 2])
# #     preds = tf.argmax(logits,2)
# #     correct_pred = tf.equal(tf.argmax(preds,1), tf.argmax(y_label,1))
# #     accuracy = tf.reduce_mean(correct_pred)

#     logits = tf.stack(logits,axis=0)
#     logits = tf.transpose(logits, [1, 0, 2])
#     output_preds = tf.argmax(logits,2)
    
#     #correct_pred = tf.equal(tf.argmax(output_preds, 1), tf.argmax(y_label, 1))
#     #accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
#     saver = tf.train.Saver(max_to_keep=3)


In [None]:
# loss

In [None]:
# run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)

# gpu_config = tf.ConfigProto()

# with tf.Session(graph=graph,config=gpu_config) as sess:

#     loss_list_train = []
#     loss_list_test = []
#     preds_dict = {}

#     sess.run(tf.global_variables_initializer())
#     epochs = 10

#     #training
#     n=0

#     for epoch in range(epochs):

#         for i in range(n_batches):

#             batch_x = np.array(vid_batch[i])
#             #batch_x = np.reshape(batch_x,[-1,n_features])
#             batch_y = np.array(intencode_batch[i])

#             _, batch_loss, preds = sess.run([train_op, loss, logits], feed_dict = {x_video: batch_x, y_label: batch_y})        

#             loss_list_train.append(batch_loss)
#             print("train: %f " % (batch_loss))

            
#             n = n+1

       
#     #testing
    
#         saver.save(sess,ckpt_path, global_step=n)
#         print('Model saved at ' + ckpt_path)
        
    
#     for i in range(n_batches_test):

#         batch_x_test = np.array(vid_batch_test[i])
#         #batch_x = np.reshape(batch_x,[-1,n_features])
#         batch_y_test = np.array(intencode_batch_test[i])

#         acc = sess.run(accuracy, feed_dict = {x_video: batch_x_test, y_label: batch_y_test})        
#         print("accuracy %f" % acc)

# #         loss_list_test.append(batch_loss)
# #         print("test:", batch_loss)
    
# #         preds_dict[i] = batch_preds 

In [None]:
# with tf.variable_scope("encoding") as encoding_scope:
#     lstm_enc = tf.contrib.rnn.BasicLSTMCell(n_hidden)
#     _, last_state = tf.nn.dynamic_rnn(lstm_enc, inputs=input_embed, dtype=tf.float32)

In [None]:
# with tf.variable_scope("decoding") as decoding_scope:
#     # TODO: create the decoder LSTMs, this is very similar to the above
#     # you will need to set initial_state=last_state from the encoder
#     lstm_dec = tf.contrib.rnn.BasicLSTMCell(n_hidden)
#     dec_outputs, _ = tf.nn.dynamic_rnn(lstm_dec,inputs=output_embed, dtype=tf.float32)

In [None]:
# #connect outputs to 
# logits = tf.contrib.layers.fully_connected(dec_outputs, num_outputs=len(index2token), activation_fn=None) 

# with tf.name_scope("optimization"):
#     # Loss function
#     loss = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.ones([batch_size, sizeof_sentence]))
#     # Optimizer
#     optimizer = tf.train.RMSPropOptimizer(1e-3).minimize(loss)

In [None]:
# output_dec.get_shape().as_list()

In [None]:
# state_dec[0].get_shape().as_list()

In [None]:
# x_video.get_shape().as_list()

In [None]:
# from utilities import show_graph
# show_graph(tf.get_default_graph().as_graph_def())

In [None]:
# def RNN(x, weights1, biases1):
    
#     x = tf.unstack(x,no_of_frames,1)
    
#     lstm_encoder = tf.keras.layers.LSTM(n_hidden, return_state=True) #reuse=tf.AUTO_REUSE)
#     output_encoder,state_h,state_c = lstm_encoder(x) #,dtype=tf.float32)
#     encoder_states = [state_h,state_c]
    
#     decoder
    
#     return tf.matmul(output1[-1],weights1) + bias1

In [None]:
# np.shape(vid_batch[1])

In [None]:
# logits = RNN(x_video,weights1,bias1)
# prediction = tf.nn.softmax(logits)


# loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_label))


# optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
# train_op = optimizer.minimize(loss_op)

# # Evaluate model (with test logits, for dropout to be disabled)
# correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y_label, 1))
# accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [None]:
# batch_y = np.array(intencode_batch[1])
# np.shape(batch_y)

In [None]:
# batch_x = np.array(vid_batch[0])
# print(np.shape(batch_x))
# batch_x = np.reshape(batch_x,[-1,n_features])
# np.shape(batch_x)

In [None]:
# preds_dict[0]
# def predicted_sentence(preds_dict):
    

In [None]:
# batch_x = np.array(vid_batch[0])
# batch_y = np.array(intencode_batch[0])

# with tf.Session() as sess:
#     sess.run(init)

#     sess.run(train_op, feed_dict={x_video: batch_x, y_label: batch_y})

In [None]:


# image_emb = tf.nn.xw_plus_b(x_video, weights1, bias1) 
# #image_emb = tf.reshape(image_emb, [batch_size, no_of_frames, n_hidden])

# #lstm2 = tf.keras.layers.LSTMCell(n_hidden)

# padding = tf.zeros([batch_size, n_hidden])


# #Only read the frames


        

            
                
# logit_words = tf.nn.xw_plus_b(output2, weights2, bias2)
# cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words,onehot_encoded)

# loss = tf.reduce_sum(cross_entropy)

In [None]:
# with tf.Session() as sess:
#     with sess.as_default():
#         print(tf.nn.embedding_lookup(onehot_encoded,[1]).eval())

In [None]:
# inputs