In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import collections
import json
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
#def dictionary

In [3]:
def tokenize(line,token='word'):
    if token == 'word':
        return [line.split(' ')]
    elif token == 'char':
        return [list(line)]
    else:
        print('ERROR: unknown token type '+token)

In [4]:
def count_tokens(tokanized_sentences):
    # Flatten a list of token lists into a list of tokens
    tokens = [tk for line in tokanized_sentences for tk in line]
    return collections.Counter(tokens)

In [5]:
filename = 'MLDS_hw2_1_data/training_label.json'
with open(filename, 'r') as f:
    datastore = json.load(f)

In [6]:
vid_feat_set = {}
vid_sentence_set = {}

sizeof_train = 0
for data in datastore:
    video_id = data["id"]
    vid_feat_set[video_id]=None
    vid_sentence_set[video_id]=None
    sizeof_train = sizeof_train+1

In [7]:
for data in datastore:
    video_id = data["id"]
    features = np.load("MLDS_hw2_1_data/training_data/feat/{}.npy".format(video_id))
    #print(video_id)
    
    vid_framefeats = []

    for array in features:
        vid_framefeats.append(array)
    
    vid_feat_set[video_id] = vid_framefeats
    
    #print("reading sentences in: %s" % video_id)
    sentences = data["caption"]
    sentences = [word.lower() for word in sentences] #Normalize the case
    table = str.maketrans('', '', string.punctuation) #Normalize the punctuation
    sentences = [word.translate(table) for word in sentences]
    vid_sentence_set[video_id] = sentences

In [8]:
#sentence_set
print("The number of videos in the training set are %d and each video has 80 frames with 4096 features/units each" % len(vid_feat_set))

The number of videos in the training set are 1450 and each video has 80 frames with 4096 features/units each


In [9]:
# for set_i in vid_sentence_set:
#     print(set_i)
#     print(vid_sentence_set[set_i])


In [10]:
# #vid_feat_set['WqQonRVs7WA_0_10.avi']

# count = 0
# for x in vid_sentence_set: 
#     if isinstance(vid_sentence_set[x], list): 
#         count += len(vid_sentence_set[x]) 
# print(count) 

In [11]:
# Mapping string tokens to numertical indices.
def listVocab(vid_sentence_set):
    
    PAD_token = 0
    BOS_token = 1
    EOS_token = 2
    UNK_token = 3
    
    all_tokens = []
    values = [0,1,2,3]
    token_index = [(PAD_token, "<PAD>"), (BOS_token, "<BOS>"), (EOS_token, "<EOS>"), (UNK_token, "<UNK>")]
    for set_i in vid_sentence_set:
        sentence_set = vid_sentence_set[set_i]
        for line in sentence_set: 
            tokenized_captions = tokenize(line) #Seperate the words
            all_tokens += tokenized_captions
    
    counter = count_tokens(all_tokens) #Count the word repeatitions in each set
    
    counter_dict = counter.items()
    counter_sort = sorted(counter_dict, key=lambda x:x[1],reverse=True) #sort by frequency of occurance 
    #print(counter_sort)

    i = len(token_index)
    for token, freq in counter_sort:
        token_index.append((i,token))
        values+=[i]
        i+=1
    
    return [values, token_index, len(token_index)]

In [12]:
values, token_index, nums = listVocab(vid_sentence_set)
print("There are %d unique words in the captions dataset" % nums)

# integer encode
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values)

# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)


There are 6061 unique words in the captions dataset


In [13]:
#counter_dict

In [14]:
#np.shape(onehot_encoded)

In [15]:
def flattenList(nestedList,output): 
    for i in nestedList: 
        if type(i) == list: 
            flattenList(i,output) 
        else: 
            output.append(i) 
            
    return output

In [16]:
MAX_WORDS = 50

def num_encode(test_sentence,token_index,tokenized_sentence=[],num_encoded_sentence=[],onehot_encoded_sentence=[]):
    
    tokenized_sentence.clear()
    num_encoded_sentence.clear()
    onehot_encoded_sentence.clear()
    
    tokenized_sentence = ["<BOS>"] + tokenize(test_sentence) + ["<EOS>"]
    #print(tokenized_sentence)
    output=[]
    tokenized_sentence = flattenList(tokenized_sentence,output)

    while len(tokenized_sentence) < MAX_WORDS:
        tokenized_sentence.append("<PAD>")
    #print(len(tokenized_sentence))
   
    for token in tokenized_sentence:
        for i in range(0,len(token_index)):
            if token == token_index[i][1]: 
                num_encoded_sentence.append(i) 
                onehot_encoded_sentence.append(onehot_encoded[i])
        
    return tokenized_sentence, num_encoded_sentence, onehot_encoded_sentence


In [17]:
#for set_i in sentence_set:
num_encoded_dict = {}
onehot_encoded_dict = {}

for data in datastore:
    video_id = data["id"]
    num_encoded_dict[video_id]=None
    onehot_encoded_dict[video_id]=None
  
    
for vid in vid_sentence_set:
    sentence_set = vid_sentence_set[vid]
    
    num_encoded_per_set = []
    onehot_encoded_per_set = []

    for line in sentence_set:
        #print(line)
        _,encoded_sentence,onehot_encoded_sentence = num_encode(line,token_index)
        #print(len(tokenized_sen))
        encoded_sentence = list(encoded_sentence)
        onehot_encoded_sentence = list(onehot_encoded_sentence)

        #print(type(encoded_sentence)) 
        #print("nxt")
        num_encoded_per_set.append(encoded_sentence)
        #print(num_encoded_per_set)

        
        onehot_encoded_per_set.append(onehot_encoded_sentence)

    #print(num_encoded_per_set)
    
    num_encoded_dict[vid] = num_encoded_per_set
    onehot_encoded_dict[vid] = onehot_encoded_per_set

In [18]:
n_hidden = 1000
vocab_size = nums

weights = tf.Variable(tf.random_normal([n_hidden,vocab_size]))
bias = tf.Variable(tf.random_normal([vocab_size]))

#the input is the feature set ( i think.) the label is the caption (i think)

In [31]:



n_frames = 80
data = datastore[1]
x_input = vid_feat_set[datastore[1]["id"]]
y_label = onehot_encoded_dict[datastore[1]["id"]]

x = tf.placeholder(tf.float32, shape=[n_frames,None])

def RNN(x, weights, bias):

    x = tf.reshape(x, [-1, n_frames])

    # 1-layer LSTM with n_hidden units.
    rnn_cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden)

    # generate prediction
    outputs, states = tf.nn.rnn_cell.static_rnn(rnn_cell, x, dtype=tf.float32)

    # there are n_input outputs but
    # we only want the last output
    return tf.matmul(outputs[-1], weights) + bias

pred = RNN(x,weights,bias)

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.


AttributeError: module 'tensorflow._api.v1.nn.rnn_cell' has no attribute 'static_rnn'

In [None]:
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=y))
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

In [None]:
_, loss, onehot_pred = session.run([optimizer, cost, pred], feed_dict={x: x_input, y: })

In [26]:
np.shape(x_input)

(80, 4096)

In [None]:
np.shape(features[1])

In [None]:
sizeof_train

In [20]:
vid_feat_set[datastore[1]["id"]]

[array([2.84397435, 0.        , 0.        , ..., 0.        , 0.        ,
        2.99545813]),
 array([2.90854335, 0.        , 0.        , ..., 0.        , 0.        ,
        2.98446941]),
 array([3.3231349 , 0.        , 0.        , ..., 0.        , 0.        ,
        3.39244986]),
 array([3.49265981, 0.        , 0.        , ..., 0.        , 0.        ,
        2.94543219]),
 array([3.73490572, 0.        , 0.        , ..., 0.        , 0.        ,
        3.22727823]),
 array([3.83639526, 0.        , 0.        , ..., 0.        , 0.        ,
        3.28317118]),
 array([3.67887425, 0.        , 0.        , ..., 0.        , 0.        ,
        3.11239529]),
 array([3.79342461, 0.        , 0.        , ..., 0.        , 0.00770801,
        3.01491785]),
 array([3.46713281, 0.        , 0.        , ..., 0.        , 0.37530333,
        3.66016102]),
 array([2.77445221, 0.        , 0.        , ..., 0.        , 0.65369743,
        3.74102354]),
 array([2.73554325, 0.        , 0.        , ..., 0