In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split

import unicodedata
import re
import numpy as np
import os
import io
import time
import collections
import json
import string

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
#def dictionary

In [3]:
def tokenize(line,token='word'):
    """Split one sentence into tokens, wrapped in a single-element outer list.

    Args:
        line: The sentence text to split.
        token: 'word' to split on whitespace, 'char' to split into
            individual characters.

    Returns:
        A list containing exactly one list of tokens, e.g.
        ``[['a', 'man', 'runs']]``. The extra nesting is kept because
        callers in this notebook extend a list of token lists with the
        result or flatten it afterwards.

    Raises:
        ValueError: If ``token`` is neither 'word' nor 'char'.
    """
    if token == 'word':
        # split() without a separator collapses runs of whitespace and ignores
        # leading/trailing whitespace, so captions such as "a  man " (left
        # behind after punctuation removal) do not produce '' tokens.
        return [line.split()]
    if token == 'char':
        return [list(line)]
    # Fail loudly instead of printing and implicitly returning None, which
    # would only surface later as an unrelated TypeError in the caller.
    raise ValueError('ERROR: unknown token type ' + str(token))

In [4]:
def count_tokens(tokanized_sentences):
    """Tally how often each token occurs across a collection of token lists.

    Args:
        tokanized_sentences: Iterable of token sequences (one per sentence).

    Returns:
        A ``collections.Counter`` mapping each token to its total number of
        occurrences, with keys in first-seen order.
    """
    tally = collections.Counter()
    # Walk every sentence and every token in it, counting as we go.
    for sentence_tokens in tokanized_sentences:
        for tk in sentence_tokens:
            tally[tk] += 1
    return tally

In [5]:
# Load the training annotations. Entries are later read via their "id" and
# "caption" keys.
filename = 'MLDS_hw2_1_data/training_label.json'
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default locale encoding.
with open(filename, 'r', encoding='utf-8') as f:
    datastore = json.load(f)

In [6]:
# Register every video id up front with an empty (None) slot in both lookup
# tables, counting the training entries along the way.
vid_feat_set, vid_sentence_set = {}, {}
sizeof_train = 0

for data in datastore:
    video_id = data["id"]
    vid_feat_set[video_id] = vid_sentence_set[video_id] = None
    sizeof_train += 1

In [7]:
# Fill the two lookup tables: per-video feature rows and normalised captions.

# The punctuation-stripping table does not depend on the video, so build it
# once instead of on every loop iteration.
table = str.maketrans('', '', string.punctuation)

for data in datastore:
    video_id = data["id"]
    features = np.load("MLDS_hw2_1_data/training_data/feat/{}.npy".format(video_id))

    # Iterating the loaded array yields its first-axis entries (presumably one
    # row per frame — TODO confirm against the feature files).
    vid_feat_set[video_id] = list(features)

    # Normalise every caption: lower-case it, then strip punctuation.
    vid_sentence_set[video_id] = [
        caption.lower().translate(table) for caption in data["caption"]
    ]

In [8]:
#sentence_set
# Report how many videos were loaded (the frame/feature sizes in the message
# are fixed text, not measured from the data).
num_train_videos = len(vid_feat_set)
print("The number of videos in the training set are %d and each video has 80 frames with 4096 features/units each" % num_train_videos)

The number of videos in the training set are 1450 and each video has 80 frames with 4096 features/units each


In [9]:
# for set_i in vid_sentence_set:
#     print(set_i)
#     print(vid_sentence_set[set_i])


In [10]:
# #vid_feat_set['WqQonRVs7WA_0_10.avi']

# count = 0
# for x in vid_sentence_set: 
#     if isinstance(vid_sentence_set[x], list): 
#         count += len(vid_sentence_set[x]) 
# print(count) 

In [11]:
# Map string tokens to numerical indices (and back).
def listVocab(vid_sentence_set):
    """Build the caption vocabulary, ordered by descending token frequency.

    Indices 0-3 are reserved for "<PAD>", "<BOS>", "<EOS>" and "<UNK>";
    every token found in the captions is numbered from 4 upwards, most
    frequent first.

    Args:
        vid_sentence_set: Mapping of video id to a list of caption strings.

    Returns:
        A list ``[values, token_index, index_token, size]`` where ``values``
        is the list of all assigned indices, ``token_index`` maps token to
        index, ``index_token`` maps index to token, and ``size`` is the
        number of entries in ``index_token``.
    """
    specials = ["<PAD>", "<BOS>", "<EOS>", "<UNK>"]
    token_index = {tok: idx for idx, tok in enumerate(specials)}
    index_token = {idx: tok for idx, tok in enumerate(specials)}
    values = list(range(len(specials)))

    # Collect the token list of every caption of every video.
    all_tokens = []
    for captions in vid_sentence_set.values():
        for line in captions:
            all_tokens.extend(tokenize(line))

    # Rank tokens by how often they occur, most frequent first.
    ranked = sorted(
        count_tokens(all_tokens).items(),
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Number the ranked tokens right after the reserved special tokens.
    for idx, (token, _freq) in enumerate(ranked, start=len(specials)):
        index_token[idx] = token
        token_index[token] = idx
        values.append(idx)

    return [values, token_index, index_token, len(index_token)]

In [12]:
# Build the vocabulary from every caption in the training set.
values, token_index, index_token, nums = listVocab(vid_sentence_set)
print("There are %d unique words in the captions dataset" % nums)

# Integer-encode the vocabulary indices and shape them as a single column,
# which is the 2-D layout the one-hot encoder is fitted on.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(values).reshape(-1, 1)

# Dense one-hot encoding with one row per vocabulary entry.
onehot_encoder = OneHotEncoder(sparse=False)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)


There are 6061 unique words in the captions dataset


In [13]:
#counter_dict

In [14]:
#np.shape(onehot_encoded)

In [15]:
def flattenList(nestedList,output):
    """Append every non-list element of ``nestedList`` to ``output``, depth first.

    Only objects whose type is exactly ``list`` are descended into; anything
    else (including tuples) is appended as-is.

    Args:
        nestedList: Arbitrarily nested list structure to flatten.
        output: List that receives the flattened elements (mutated in place).

    Returns:
        The same ``output`` list that was passed in.
    """
    for element in nestedList:
        if type(element) is not list:
            output.append(element)
            continue
        # Nested list: recurse so its leaves land in the same accumulator.
        flattenList(element, output)

    return output

In [16]:
MAX_WORDS = 80

def num_encode(test_sentence,index_token,tokenized_sentence=None,num_encoded_sentence=None,onehot_encoded_sentence=None):
    """Encode one caption as padded tokens, vocabulary indices and one-hot rows.

    The sentence is wrapped in "<BOS>" / "<EOS>", tokenised with ``tokenize``
    and right-padded with "<PAD>" up to ``MAX_WORDS`` tokens. Sentences that
    are already longer than ``MAX_WORDS`` are not truncated.

    Args:
        test_sentence: Caption text to encode.
        index_token: Index-to-token lookup supporting ``len()`` and integer
            indexing from 0 to ``len(index_token) - 1``.
        tokenized_sentence: Optional list; if given it is cleared (the
            returned token list is always a new list).
        num_encoded_sentence: Optional list that is cleared and then filled
            in place with the vocabulary indices; a new list is created
            when omitted.
        onehot_encoded_sentence: Optional list that is cleared and then
            filled in place with the matching rows of the module-level
            ``onehot_encoded`` matrix; a new list is created when omitted.

    Returns:
        Tuple ``(tokenized_sentence, num_encoded_sentence,
        onehot_encoded_sentence)``.
    """
    # Default to fresh lists on every call. Mutable default arguments are
    # created once and shared between calls, so earlier results would be
    # overwritten by later calls.
    if tokenized_sentence is not None:
        tokenized_sentence.clear()
    if num_encoded_sentence is None:
        num_encoded_sentence = []
    else:
        num_encoded_sentence.clear()
    if onehot_encoded_sentence is None:
        onehot_encoded_sentence = []
    else:
        onehot_encoded_sentence.clear()

    # tokenize() returns a nested list, so flatten after adding the markers.
    tokenized_sentence = flattenList(["<BOS>"] + tokenize(test_sentence) + ["<EOS>"], [])

    # Right-pad to the fixed length; a non-positive difference adds nothing.
    tokenized_sentence.extend(["<PAD>"] * (MAX_WORDS - len(tokenized_sentence)))

    # Build the reverse lookup once instead of scanning the whole vocabulary
    # for every token. setdefault keeps the lowest index if a token repeats.
    token_to_index = {}
    for i in range(len(index_token)):
        token_to_index.setdefault(index_token[i], i)

    # Out-of-vocabulary tokens map to "<UNK>" so the encoded sequence keeps
    # the same length as the token sequence; they are skipped only when the
    # vocabulary has no "<UNK>" entry.
    unk_index = token_to_index.get("<UNK>")

    for token in tokenized_sentence:
        idx = token_to_index.get(token, unk_index)
        if idx is None:
            continue
        num_encoded_sentence.append(idx)
        onehot_encoded_sentence.append(onehot_encoded[idx])

    return tokenized_sentence, num_encoded_sentence, onehot_encoded_sentence


In [None]:
#for set_i in sentence_set:
# Encode every caption of every video, both as index sequences and as
# sequences of one-hot rows, keyed by video id.
num_encoded_dict = {}
onehot_encoded_dict = {}

# Pre-register every video id with an empty slot in both tables.
for data in datastore:
    video_id = data["id"]
    num_encoded_dict[video_id] = onehot_encoded_dict[video_id] = None

for vid, sentence_set in vid_sentence_set.items():
    num_encoded_per_set = []
    onehot_encoded_per_set = []

    for line in sentence_set:
        _, encoded_sentence, onehot_encoded_sentence = num_encode(line, index_token)

        # Store copies so every entry owns its own list, independent of any
        # buffer reuse inside num_encode.
        num_encoded_per_set.append(list(encoded_sentence))
        onehot_encoded_per_set.append(list(onehot_encoded_sentence))

    num_encoded_dict[vid] = num_encoded_per_set
    onehot_encoded_dict[vid] = onehot_encoded_per_set

In [97]:
# Draft of a video-captioning graph using the TensorFlow 1.x graph API:
# a linear embedding of each frame's features, followed by one LSTM cell that
# is first stepped over the frames and then stepped once per caption position.
# NOTE(review): this cell looks unfinished — see the notes below before
# relying on it.

# Model and batch dimensions.
n_hidden = 1000
n_words = nums
sizeof_frame = 4096
no_of_frames = 80
sizeof_sentence= 80
batch_size = 100 #100 videos at a time

# Parameters of the frame-embedding projection (sizeof_frame -> n_hidden).
weights1 = tf.Variable(tf.random_normal([sizeof_frame,n_hidden]))
bias1 = tf.Variable(tf.zeros([n_hidden]))

# Output projection (n_hidden -> n_words); currently disabled, yet still
# referenced at the bottom of this cell.
#weights2 = tf.Variable(tf.random_normal([n_hidden,n_words]))
#bias2 = tf.Variable(tf.zeros([n_words]))


#the input is the feature set ( i think.) the label is the caption (i think)

# Batch of per-frame features for each video.
x_video = tf.placeholder(tf.float32, [batch_size, no_of_frames, sizeof_frame])
#x_input = tf.reshape(x_video,[-1,sizeof_frame])

# Batch of caption token ids, one row of sizeof_sentence ids per video.
label_caption = tf.placeholder(tf.int32,[batch_size, sizeof_sentence])

# NOTE(review): x_video is 3-D here; the commented-out reshapes around this
# line suggest xw_plus_b was meant to receive a 2-D tensor — TODO confirm.
image_emb = tf.nn.xw_plus_b(x_video, weights1, bias1) 
#image_emb = tf.reshape(image_emb, [batch_size, no_of_frames, n_hidden])

lstm1 = tf.keras.layers.LSTMCell(n_hidden)
#lstm2 = tf.keras.layers.LSTMCell(n_hidden)

# All-zero input fed to the LSTM during the caption stage.
padding = tf.zeros([batch_size, n_hidden])


#Only read the frames

for i in range(no_of_frames): 
    tf.get_variable_scope().reuse_variables()
    
    # NOTE(review): the cell is called with inputs only and output1 is
    # overwritten on every step; no recurrent state is passed between steps
    # here — confirm against the LSTMCell call signature.
    output1 = lstm1(image_emb[:,i,:])
    #output2 = lstm2(tf.concat([padding,output1],1))

#Reading the captions

for i in range(sizeof_sentence):
    
    tf.get_variable_scope().reuse_variables()

    #word_i = tf.nn.embedding_lookup(onehot_encoded, label_caption[:,i])
    output1 = lstm1(padding)
    #output2 = lstm2(tf.concat([word_i,output1],1))
        

            
                
# NOTE(review): output2, weights2 and bias2 are only assigned in commented-out
# lines of this cell, so the next statement fails unless they are defined in a
# cell not shown here.
logit_words = tf.nn.xw_plus_b(output2, weights2, bias2)
# NOTE(review): the labels passed here are the vocabulary-wide one-hot matrix,
# not something derived from label_caption; also check whether this function
# accepts positional arguments in the TensorFlow version in use.
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logit_words,onehot_encoded)

# Total loss: sum of the per-example cross-entropy values.
loss = tf.reduce_sum(cross_entropy)

TypeError: Value passed to parameter 'indices' has DataType float32 not in list of allowed values: int32, int64

In [112]:
# Sanity check: fetch the row of the one-hot matrix stored at index 1.
with tf.Session() as sess:
    row_lookup = tf.nn.embedding_lookup(onehot_encoded, [1])
    print(sess.run(row_lookup))

[[0. 1. 0. ... 0. 0. 0.]]


In [80]:
# NOTE(review): `inputs` is not assigned in any cell visible here, so this
# display relies on leftover kernel state — confirm where it is defined or
# remove the cell.
inputs

<tf.Tensor 'Placeholder_20:0' shape=(32, 10, 8) dtype=float32>