In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals
import random
import json
import os
import time
import tensorflow as tf
from tensorflow.contrib import rnn


import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
#from sklearn.model_selection import train_test_split
import tensorflow.contrib.legacy_seq2seq as seq2seq
from utilities import show_graph

import unicodedata
import re
import numpy as np
import os
import io
import time
import collections
import json
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



In [2]:
def tokenize(line,token='word'):
    if token == 'word':
        return [line.split(' ')]
    elif token == 'char':
        return [list(line)]
    else:
        print('ERROR: unknown token type '+token)

In [3]:
def count_tokens(tokanized_sentences):
    # Flatten a list of token lists into a list of tokens
    tokens = [tk for line in tokanized_sentences for tk in line]
    return collections.Counter(tokens)

In [4]:

def parse_vid_data_into_batches(filename,batch_size,feat_filepath):
    
    with open(filename, 'r') as f:
        datastore = json.load(f)
        
    batches = len(datastore)/batch_size
    batches = int(batches)
        
    i = 0
    j = 0
    
    vid_batch = {}
    sentence_set = {}
    
    for data in datastore:
        
        #### Extracting all feature vectors per video

        
        #vid_feat_list = []
        
        video_id = data["id"]
        features = np.load(feat_filepath.format(video_id))

        vid_framefeats = [] #list of all feature vectors per video. Shape = [80,4096]
        
        for array in features:
            vid_framefeats.append(array)

        if j not in vid_batch:
            vid_batch[j] = []

        vid_batch[j].append(vid_framefeats)
        
        
        #### Extracting only a single sentence per video into a standalone dict

        sentences = data["caption"]
        sentences = [word.lower() for word in sentences] #Normalize the case
        table = str.maketrans('', '', string.punctuation) #Normalize the punctuation
        sentences = [word.translate(table) for word in sentences]

        sentence_set[i] = sentences[0] #0 for only the first sentence\
        
        i = i+1

        if i%batch_size == 0:
            j = j+1            
            
    return vid_batch, batches

In [5]:
def extract_sentences(filename, feat_filepath):
    
    sentence_set = {}
    
    with open(filename, 'r') as f:
        datastore = json.load(f)
        
    i = 0
    for data in datastore:
        
        #### Extracting only a single sentence per video into a standalone dict

        sentences = data["caption"]
        sentences = [word.lower() for word in sentences] #Normalize the case
        table = str.maketrans('', '', string.punctuation) #Normalize the punctuation
        sentences = [word.translate(table) for word in sentences]

        sentence_set[i] = sentences[0] #0 for only the first sentence\
        
        i = i+1
        
    return sentence_set

In [6]:
# Mapping string tokens to numertical indices.
def listVocab(sentence_set):
    
    PAD_token = 0
    BOS_token = 1
    EOS_token = 2
    UNK_token = 3
    
    all_tokens = []
    token2index = {"<PAD>": 0,"<BOS>":1,"<EOS>":2,"<UNK>":3}
    index2token = {PAD_token: "<PAD>", BOS_token: "<BOS>", EOS_token: "<EOS>", UNK_token: "<UNK>"}
    
    #for set_i in vid_sentence_set:
    #    sentence_set = vid_sentence_set[set_i]
    #    for line in sentence_set: 
    
    for n in sentence_set:
        line = sentence_set[n]
        tokenized_captions = tokenize(line) #Seperate the words
        all_tokens += tokenized_captions
    
    counter = count_tokens(all_tokens) #Count the word repeatitions in each set
    
    counter_dict = counter.items()
    counter_sort = sorted(counter_dict, key=lambda x:x[1],reverse=True) #sort by frequency of occurance 
    #print(counter_sort)

    i = len(index2token)
    values = [0,1,2,3]
    tokens = ["<PAD>","<BOS>","<EOS>","<UNK>"]
    for token, freq in counter_sort:
        index2token[i] = token
        token2index[token] = i
        values += [i]
        tokens += [token]
        i+=1
    
    return [tokens, values, token2index, index2token, len(index2token)]

In [7]:
def flattenList(nestedList,output): 
    for i in nestedList: 
        if type(i) == list: 
            flattenList(i,output) 
        else: 
            output.append(i) 
            
    return output

def num_encode(test_sentence,index2token,tokens,tokenized_sentence=[],num_encoded_sentence=[]):
    
    tokenized_sentence.clear()
    num_encoded_sentence.clear()
    
    tokenized_sentence = ["<BOS>"] + tokenize(test_sentence) + ["<EOS>"]
    #print(tokenized_sentence)
    output=[]
    tokenized_sentence = flattenList(tokenized_sentence,output)

    while len(tokenized_sentence) < MAX_WORDS:
        tokenized_sentence.append("<PAD>")    
    #print(len(tokenized_sentence))
    for ind, token in enumerate(tokenized_sentence):
        if token in tokens:
            for i in range(0,len(index2token)):
                if token == index2token[i]: 
                    num_encoded_sentence.append(i) 
                    
            #print("token exists")
        else:
            num_encoded_sentence.append(3)
            tokenized_sentence[ind] = tokens[3]
            #print("token unknown")
            
            
                
    #print(len(num_encoded_sentence))

        
    return tokenized_sentence, num_encoded_sentence

In [8]:

def parse_sentence_data_into_batches(sentence_set, index2token,tokens,batch_size):

    tokenizedsentence_batch = {}
    intencode_batch = {}

    ii = 0
    jj = 0  

    for n in sentence_set:
        sentence = sentence_set[n]

        tokenized_sentence,encoded_sentence = num_encode(sentence,index2token,tokens)
        
        #print(np.shape(encoded_sentence))

        tokenized_sentence = list(tokenized_sentence)
        encoded_sentence = list(encoded_sentence)

        if jj not in intencode_batch:
            #onehot_batch[jj] = []
            intencode_batch[jj] = []
            tokenizedsentence_batch[jj] = []

        #print(np.shape(onehot_encoded_sentence))    
        #onehot_batch[jj].append(onehot_encoded_sentence)
        intencode_batch[jj].append(encoded_sentence)
        tokenizedsentence_batch[jj].append(tokenized_sentence)

        ii = ii+1

        if ii%batch_size == 0:
            jj = jj+1
            
        
    return tokenizedsentence_batch, intencode_batch

In [9]:
filename_train = 'MLDS_hw2_1_data/training_label.json'
filename_test = 'MLDS_hw2_1_data/testing_label.json'
feat_filepath_train = "MLDS_hw2_1_data/training_data/feat/{}.npy"
feat_filepath_test = "MLDS_hw2_1_data/testing_data/feat/{}.npy"

ckpt_path = 'saved_model/trained_model.ckpt'


batch_size = 50

MAX_WORDS = 80 #max number of words in a caption
n_features = 4096
no_of_frames = 80
sizeof_sentence= MAX_WORDS
learning_rate = 0.001

#### PARSE TRAINING DATA #####

#Parse Training Data into batches
vid_batch, n_batches = parse_vid_data_into_batches(filename_train,batch_size,feat_filepath_train)
print("The number of videos in the training set are %d and each video has 80 frames with 4096 features/units each" % (n_batches*batch_size))

# Extracting captions for each video
sentence_set = extract_sentences(filename_train,feat_filepath_train)

tokens, values, token2index, index2token, n_words = listVocab(sentence_set)
print("There are %d unique words in the captions dataset" % n_words)

tokenizedsentence_batch, intencode_batch = parse_sentence_data_into_batches(sentence_set,index2token,tokens,batch_size)

# # integer encode
# label_encoder = LabelEncoder()
# integer_encoded = label_encoder.fit_transform(values)
# integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
# integer_encoded

The number of videos in the training set are 1450 and each video has 80 frames with 4096 features/units each
There are 1988 unique words in the captions dataset


In [10]:
#### PARSE TESTING DATA #####

#Parse Testing Data into batches
vid_batch_test, n_batches_test = parse_vid_data_into_batches(filename_test,batch_size,feat_filepath_test)
print("The number of videos in the test set are %d and each video has 80 frames with 4096 features/units each" % (n_batches_test*batch_size))

# Extracting captions for each video
sentence_set_test = extract_sentences(filename_test,feat_filepath_test)
tokenizedsentence_batch_test, intencode_batch_test = parse_sentence_data_into_batches(sentence_set_test,index2token,tokens,batch_size)


The number of videos in the test set are 100 and each video has 80 frames with 4096 features/units each


In [11]:
n_words = n_words
n_hidden = 600

with tf.Graph().as_default() as graph:
    

    weights_enc = tf.Variable(tf.random_uniform([n_features, n_hidden],-0.1,0.1),name="weights_enc")
    bias_enc = tf.Variable(tf.zeros([n_hidden]),name="bias_enc")

    weights_dec = tf.Variable(tf.random_uniform([n_hidden, n_words],-0.1,0.1),name="weights_dec")
    bias_dec = tf.Variable(tf.zeros([n_words]),name="bias_dec")


    x_video = tf.placeholder(tf.float32, (None, no_of_frames, n_features),'video_features') #inputs

    batch_size = tf.shape(x_video)[0]
    
    x_video_drop = tf.nn.dropout(x_video, 0.5)
    
    x_video_flat = tf.reshape(x_video_drop,[-1,n_features])

    y_label = tf.placeholder(tf.int32,(None, sizeof_sentence),'captions') #outputs


    #sampling = tf.placeholder(tf.bool, [sizeof_sentence], name='sampling')
    padding = tf.zeros([batch_size, n_hidden])

    loss = 0.0

    ########## DATA ###########
    # Example: For i = 0
    #batch_x = np.array(vid_batch[0])
    #batch_y = np.array(intencode_batch[0])
    ###########################

    input_embedding = tf.matmul(x_video_flat,weights_enc) + bias_enc
    input_embedding = tf.reshape(input_embedding,[-1, no_of_frames,n_hidden])
    input_embed = tf.transpose(input_embedding, perm=[1, 0, 2])

    with tf.device("/cpu:0"):
        output_embedding = tf.Variable(tf.random_uniform((n_words, n_hidden),-0.1,0.1), name='dec_embedding')
    # output_embed = tf.nn.embedding_lookup(output_embedding,y_label)
    
    ## ENCODING #################################
    
    with tf.variable_scope("LSTM1"):
        lstm1 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden,state_is_tuple=True)
        lstm1 = tf.contrib.rnn.DropoutWrapper(lstm1, output_keep_prob=0.5)    

    with tf.variable_scope("LSTM2"):
        lstm2 = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, state_is_tuple=True)
        lstm2 = tf.contrib.rnn.DropoutWrapper(lstm2, output_keep_prob=0.5)    


    state1 = lstm1.zero_state(batch_size, dtype=tf.float32)
    state2 = lstm2.zero_state(batch_size, dtype=tf.float32)
    
    for i in range(0, no_of_frames):
        
        if i > 0:
                tf.get_variable_scope().reuse_variables()
                
        with tf.variable_scope("LSTM1"):
            output1, state1 = lstm1(input_embed[i,:,:], state1)

        with tf.variable_scope("LSTM2"):
            output2, state2 = lstm2(tf.concat([padding, output1], axis=1), state2)
    
    ## DECODING ##################################
    
    bos = tf.ones([batch_size, n_hidden])
    padding_in = tf.zeros([batch_size, n_hidden])

    logits = []
    cross_ent_list=[]
    max_prob_index = None


    for i in range(0, MAX_WORDS):
        
        tf.get_variable_scope().reuse_variables()

        
        with tf.variable_scope("LSTM1"):
            output1, state1 = lstm1(padding_in, state1)
            
        if i == 0:
            
            with tf.variable_scope("LSTM2"):
                con = tf.concat([bos, output1], axis=1)
                output2, state2 = lstm2(con, state2)
                
        else:
            
            with tf.device("/cpu:0"):
            
                feed_in = y_label[:,i]
                #feed_in = tf.argmax()
                output_embed = tf.nn.embedding_lookup(output_embedding,feed_in)
                
            with tf.variable_scope("LSTM2"):
                con = tf.concat([output_embed, output1], axis=1)
                output2, state2 = lstm2(con, state2)

        logit_words = tf.matmul(output2, weights_dec) + bias_dec
        logits.append(logit_words)

        word_i = y_label[:,i]

        one_hot_labels = tf.one_hot(word_i, n_words, on_value = 1, off_value = None, axis = 1) 
        cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logit_words, labels=one_hot_labels)
        cross_ent_list.append(cross_entropy)
        
        
        #current_loss = tf.reduce_sum(cross_entropy)/batch_size
        #loss = loss + current_loss

    cross_entropy_tensor = tf.stack(cross_ent_list, 1)
    loss = tf.reduce_sum(cross_entropy_tensor, axis=1)
    loss = tf.divide(loss, tf.cast(tf.Variable(sizeof_sentence), tf.float32))

    loss = tf.reduce_mean(loss, axis=0)
    
    summary = tf.summary.scalar('training_loss', loss)

    params = tf.trainable_variables()
    #optimizer = tf.train.AdamOptimizer(learning_rate)#.minimize(loss_op)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    train_op = optimizer.minimize(loss)

    #train_step = optimizer.minimize(loss)
    
#     gradients, variables = zip(*optimizer.compute_gradients(loss))
#     gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
#     train_op = optimizer.apply_gradients(zip(gradients, params))
    
#     logits = tf.stack(logits, axis = 0)
#     logits = tf.reshape(logits, (sizeof_sentence, batch_size, n_words))
#     logits = tf.transpose(logits, [1, 0, 2])
#     preds = tf.argmax(logits,2)
#     correct_pred = tf.equal(tf.argmax(preds,1), tf.argmax(y_label,1))
#     accuracy = tf.reduce_mean(correct_pred)

    logits = tf.stack(logits,axis=0)
    logits = tf.transpose(logits, [1, 0, 2])
    output_preds = tf.argmax(logits,2)
    
    #correct_pred = tf.equal(tf.argmax(output_preds, 1), tf.argmax(y_label, 1))
    #accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    
    saver = tf.train.Saver(max_to_keep=3)


Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [12]:
loss

<tf.Tensor 'Mean:0' shape=() dtype=float32>

In [16]:
run_opts = tf.RunOptions(report_tensor_allocations_upon_oom = True)

gpu_config = tf.ConfigProto()

with tf.Session(graph=graph,config=gpu_config) as sess:

    loss_list_train = []
    loss_list_test = []
    preds_dict = {}

    sess.run(tf.global_variables_initializer())
    epochs = 10

    #training
    n=0

    for epoch in range(epochs):

        for i in range(n_batches):

            batch_x = np.array(vid_batch[i])
            #batch_x = np.reshape(batch_x,[-1,n_features])
            batch_y = np.array(intencode_batch[i])

            _, batch_loss, preds = sess.run([train_op, loss, logits], feed_dict = {x_video: batch_x, y_label: batch_y})        

            loss_list_train.append(batch_loss)
            print("train: %f " % (batch_loss))

            
            n = n+1

       
    #testing
    
        saver.save(sess,ckpt_path, global_step=n)
        print('Model saved at ' + ckpt_path)
        
    
    for i in range(n_batches_test):

        batch_x_test = np.array(vid_batch_test[i])
        #batch_x = np.reshape(batch_x,[-1,n_features])
        batch_y_test = np.array(intencode_batch_test[i])

        acc = sess.run(accuracy, feed_dict = {x_video: batch_x_test, y_label: batch_y_test})        
        print("accuracy %f" % acc)

#         loss_list_test.append(batch_loss)
#         print("test:", batch_loss)
    
#         preds_dict[i] = batch_preds 

train: 7.596560 
train: 7.589375 
train: 7.579945 
train: 7.575838 
train: 7.568577 
train: 7.560453 
train: 7.555425 
train: 7.548172 
train: 7.541787 
train: 7.532041 
train: 7.528676 
train: 7.522356 
train: 7.512043 
train: 7.511115 
train: 7.500875 
train: 7.491843 
train: 7.489013 
train: 7.479468 
train: 7.475438 
train: 7.466541 
train: 7.461650 
train: 7.450930 
train: 7.448647 
train: 7.440543 
train: 7.428465 
train: 7.426885 
train: 7.420441 
train: 7.413893 
train: 7.404820 
Model saved at saved_model/trained_model.ckpt
train: 7.398826 
train: 7.393348 
train: 7.380279 
train: 7.371067 
train: 7.371828 
train: 7.363610 
train: 7.355991 
train: 7.353664 
train: 7.345027 
train: 7.332693 
train: 7.328154 
train: 7.319838 
train: 7.309660 
train: 7.310757 
train: 7.299979 
train: 7.291158 
train: 7.293193 
train: 7.277243 
train: 7.271609 
train: 7.262727 
train: 7.265707 
train: 7.255448 
train: 7.250957 
train: 7.236266 
train: 7.221282 
train: 7.223709 
train: 7.209186 
tr

NameError: name 'accuracy' is not defined

array([[ 6.0200958e+00,  1.1233552e+00, -1.3844694e-01, ...,
        -4.0627268e-01,  3.8382119e-01,  6.6204590e-01],
       [ 6.8074985e+00,  5.2401006e-01,  8.9435023e-01, ...,
        -2.1259536e-01,  5.9858853e-01,  1.4138797e-01],
       [ 5.7976232e+00,  1.5898039e+00,  6.6757840e-01, ...,
        -2.6962292e-01,  1.7390178e-01,  2.0177306e-03],
       ...,
       [ 8.2757206e+00,  7.3791459e-02,  8.7332702e-01, ...,
        -5.5419064e-01,  3.6085778e-01,  6.4031482e-01],
       [ 8.1912451e+00, -2.4131502e-01,  8.9898396e-01, ...,
         5.2647275e-01,  2.8451374e-01,  7.5696123e-01],
       [ 8.3191118e+00,  6.0896546e-01,  9.1250223e-01, ...,
        -5.7238603e-01,  3.2245705e-01,  5.4325676e-01]], dtype=float32)

In [None]:
# with tf.variable_scope("encoding") as encoding_scope:
#     lstm_enc = tf.contrib.rnn.BasicLSTMCell(n_hidden)
#     _, last_state = tf.nn.dynamic_rnn(lstm_enc, inputs=input_embed, dtype=tf.float32)

In [None]:
# with tf.variable_scope("decoding") as decoding_scope:
#     # TODO: create the decoder LSTMs, this is very similar to the above
#     # you will need to set initial_state=last_state from the encoder
#     lstm_dec = tf.contrib.rnn.BasicLSTMCell(n_hidden)
#     dec_outputs, _ = tf.nn.dynamic_rnn(lstm_dec,inputs=output_embed, dtype=tf.float32)

In [None]:
# #connect outputs to 
# logits = tf.contrib.layers.fully_connected(dec_outputs, num_outputs=len(index2token), activation_fn=None) 

# with tf.name_scope("optimization"):
#     # Loss function
#     loss = tf.contrib.seq2seq.sequence_loss(logits, targets, tf.ones([batch_size, sizeof_sentence]))
#     # Optimizer
#     optimizer = tf.train.RMSPropOptimizer(1e-3).minimize(loss)

In [None]:
# output_dec.get_shape().as_list()

In [None]:
# state_dec[0].get_shape().as_list()

In [None]:
# x_video.get_shape().as_list()

In [None]:
# from utilities import show_graph
# show_graph(tf.get_default_graph().as_graph_def())

In [None]:
# def RNN(x, weights1, biases1):
    
#     x = tf.unstack(x,no_of_frames,1)
    
#     lstm_encoder = tf.keras.layers.LSTM(n_hidden, return_state=True) #reuse=tf.AUTO_REUSE)
#     output_encoder,state_h,state_c = lstm_encoder(x) #,dtype=tf.float32)
#     encoder_states = [state_h,state_c]
    
#     decoder
    
#     return tf.matmul(output1[-1],weights1) + bias1

In [None]:
# np.shape(vid_batch[1])

In [None]:
# logits = RNN(x_video,weights1,bias1)
# prediction = tf.nn.softmax(logits)


# loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=y_label))


# optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate)
# train_op = optimizer.minimize(loss_op)

# # Evaluate model (with test logits, for dropout to be disabled)
# correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(y_label, 1))
# accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

In [None]:
# batch_y = np.array(intencode_batch[1])
# np.shape(batch_y)

In [None]:
# batch_x = np.array(vid_batch[0])
# print(np.shape(batch_x))
# batch_x = np.reshape(batch_x,[-1,n_features])
# np.shape(batch_x)

In [None]:
preds_dict[0]
def predicted_sentence(preds_dict):
    

In [None]:
# batch_x = np.array(vid_batch[0])
# batch_y = np.array(intencode_batch[0])

# with tf.Session() as sess:
#     sess.run(init)

#     sess.run(train_op, feed_dict={x_video: batch_x, y_label: batch_y})

In [None]:


# image_emb = tf.nn.xw_plus_b(x_video, weights1, bias1) 
# #image_emb = tf.reshape(image_emb, [batch_size, no_of_frames, n_hidden])

# #lstm2 = tf.keras.layers.LSTMCell(n_hidden)

# padding = tf.zeros([batch_size, n_hidden])


# #Only read the frames


        

            
                
# logit_words = tf.nn.xw_plus_b(output2, weights2, bias2)
# cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words,onehot_encoded)

# loss = tf.reduce_sum(cross_entropy)

In [None]:
# with tf.Session() as sess:
#     with sess.as_default():
#         print(tf.nn.embedding_lookup(onehot_encoded,[1]).eval())

In [None]:
# inputs