In [1]:
import tensorflow as tf
import numpy as np
import pickle as pkl
import sklearn as sk
from sklearn import preprocessing as pre
from pathos.pools import ProcessPool as Pool


  from ._conv import register_converters as _register_converters


In [2]:
# Special vocabulary tokens
GO, STOP = '<START>', '<END>'
pad, unknown = '<PAD>', '<UNKNOWN>'

# Batch size, beam-search width and number of training epochs
BATCH = 64
BEAM_WIDTH = 5
EPOCHS = 10

In [3]:
#load data
# Pickle streams are binary, so the files are opened with 'rb'; text mode only
# works by accident on some platforms/protocols.
# NOTE(review): pickle.load can execute arbitrary code -- only load caption
# files that come from a trusted source.
with open('./captions/token_train.pkl', 'rb') as f:
    trainCaptions=pkl.load(f)
with open('./captions/token_dev.pkl', 'rb') as f:
    devCaptions=pkl.load(f)
with open('./captions/token_test.pkl', 'rb') as f:
    testCaptions=pkl.load(f)

In [4]:
def processToken(caps):
    '''Wrap the tokens of every caption record in the GO/STOP sentinels.

    caps: iterable of indexable records; item 0 is carried through unchanged
        and item 1 is the caption's token sequence.

    Returns a new list of [item0, [GO] + tokens + [STOP]] pairs. No padding is
    applied here.
    '''
    return [[cap[0], [GO] + list(cap[1]) + [STOP]] for cap in caps]

In [5]:
# Add the GO/STOP sentinels to all three splits.
# NOTE: the names are rebound in place, so running this cell twice wraps the
# captions twice.
trainCaptions, devCaptions, testCaptions = map(
    processToken, (trainCaptions, devCaptions, testCaptions))

In [6]:
def listofwords(data):
    '''Return the vocabulary of `data` as a list of distinct tokens.

    data: iterable of records whose item 1 is a sequence of hashable tokens.

    Tokens are returned in order of first appearance. A set tracks what has
    already been emitted so each membership test is O(1) instead of a scan
    over the growing result list.
    '''
    seen = set()
    vocab = []
    for record in data:
        for token in record[1]:
            if token not in seen:
                seen.add(token)
                vocab.append(token)
    return vocab

In [7]:
# Vocabulary (and its size) over the train and dev captions combined;
# the test captions do not contribute.
trainVocab = listofwords(trainCaptions + devCaptions)
trainVocabSize = len(trainVocab)

In [8]:
# Token count of every caption in each split (item 1 of each record)
trainCapLen = [len(rec[1]) for rec in trainCaptions]
devCapLen = [len(rec[1]) for rec in devCaptions]
testCapLen = [len(rec[1]) for rec in testCaptions]

In [9]:
# Fit a label encoder on the vocabulary (words <-> integer ids)
pre_op = pre.LabelEncoder().fit(trainVocab)
# Identity matrix with one row per encoded class
onehoter = np.identity(len(pre_op.classes_))

# Encode each caption's tokens as integer ids
trainSeq = [pre_op.transform(rec[1]) for rec in trainCaptions]
devSeq = [pre_op.transform(rec[1]) for rec in devCaptions]
# Item 0 of each record, kept in the same order as the encoded captions
trainID = [rec[0] for rec in trainCaptions]
devID = [rec[0] for rec in devCaptions]



In [10]:
#word to int with pool
# Encodes the captions with pre_op.transform mapped over a process pool
# created as Pool(8).
# NOTE(review): this rebinds trainSeq/devSeq/trainID/devID, which the preceding
# cell also computes -- confirm only one of the two cells is meant to run.
p=Pool(8)


trainSeq=p.map(pre_op.transform,[i[1] for i in trainCaptions])
devSeq=p.map(pre_op.transform,[i[1] for i in devCaptions])
trainID=[i[0] for i in trainCaptions]
devID=[i[0] for i in devCaptions]



In [11]:
#appending stops
# Right-pad every encoded caption with the STOP id up to MAX_LEN so the
# captions stack into a regular 2-D array. np.pad raises if a caption is
# longer than MAX_LEN (negative pad width).
MAX_LEN=33
# The STOP id does not change between captions, so encode it once instead of
# calling the encoder for every sequence.
STOP_ID = pre_op.transform([STOP])[0]
trainSeqReg = np.array([
    np.pad(seq, (0, MAX_LEN - len(seq)), 'constant', constant_values=STOP_ID)
    for seq in trainSeq])
devSeqReg = np.array([
    np.pad(seq, (0, MAX_LEN - len(seq)), 'constant', constant_values=STOP_ID)
    for seq in devSeq])

In [12]:
# Load the per-video feature arrays and record each one's original length
videoFeats = np.load('./features/consilidated_feats.npy')
videoFeatSize = np.array([len(feat) for feat in videoFeats])
# Zero-pad each array along its first axis to 28 rows so they stack into one
# regular array; the second axis is left as is.
videoFeats = np.array([
    np.pad(feat, pad_width=[(0, 28 - len(feat)), (0, 0)], mode='constant')
    for feat in videoFeats])

In [13]:
# Start from an empty default graph, then fix the model sizes
tf.reset_default_graph()
embedding_size = 512      # width of the decoder word embeddings
lstm_units = 512          # units in the encoder/decoder LSTM cells
dropout_keep_prob = 0.5   # keep probability fed during training

In [14]:
# Graph inputs; all of them are supplied through feed_dict at run time.
# Padded video features: 28 rows of 2048 values per example.
source_seq = tf.placeholder(shape=(None,28,2048),dtype=tf.float32)
# Encoded captions, 33 ids per example.
target_seq = tf.placeholder(shape=(None,33),dtype=tf.int32)
# Per-example length passed to the encoder RNN as sequence_length.
source_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
# Per-example length handed to the TrainingHelper.
target_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
# Loss labels, 32 ids per example (fed as the captions minus their first column).
no_start_target_seq = tf.placeholder(shape=(None,32),dtype=tf.int32)
# NOTE(review): (None) is just None, i.e. an unspecified shape rather than a
# 1-D shape; the notebook feeds a plain int here -- confirm a scalar is intended.
batch_size = tf.placeholder(shape=(None),dtype=tf.int32)
# Per-example length used to build the loss mask.
real_target_seq_len= tf.placeholder(shape=(None,), dtype=tf.int32)
# Dropout keep probability for the encoder/decoder cells.
keep_prob= tf.placeholder(dtype=tf.float32)

In [15]:
#output embeddings
# Embedding table with one embedding_size-wide row per vocabulary entry,
# followed by the lookup of the target caption ids.
# NOTE(review): the next cell rebinds both names using tf.Variable -- confirm
# which of the two definitions is meant to be used.
embedding_matrix_decode = tf.get_variable(
    name="embedding_matrix_de",
    shape=[trainVocabSize, embedding_size],
    dtype=tf.float32)
decoder_input_embedded = tf.nn.embedding_lookup(embedding_matrix_decode, target_seq) 


In [16]:
#output embeddings
# Embedding table initialised from a random normal, one embedding_size-wide
# row per vocabulary entry, followed by the lookup of the target caption ids.
embedding_matrix_decode = tf.Variable(initial_value=tf.random_normal(shape=[trainVocabSize, embedding_size],dtype=tf.float32))
decoder_input_embedded = tf.nn.embedding_lookup(embedding_matrix_decode, target_seq) 


In [17]:
#encoder
# One BasicLSTMCell wrapped with dropout on both its inputs and outputs;
# keep_prob is a placeholder so dropout can be switched off by feeding 1.0.
encoderCell=tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(lstm_units),input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)
# Run the cell over the video features; source_seq_len is passed as the
# per-example sequence length.
encoder_outputs,encoder_final_state=tf.nn.dynamic_rnn(cell=encoderCell,inputs=source_seq,sequence_length=source_seq_len,
                 dtype=tf.float32)

#expri
#encoder_outputs_tiled=tf.contrib.seq2seq.tile_batch(encoder_outputs,multiplier=BEAM_WIDTH)
#encoder ends here

Instructions for updating:
Use the retry module or similar alternatives.


#exp
# Repeat every encoder tensor BEAM_WIDTH times along the batch axis so that
# each beam hypothesis gets its own copy.
tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
    encoder_outputs, multiplier=BEAM_WIDTH)
tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
    encoder_final_state, multiplier=BEAM_WIDTH)
tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
    source_seq_len, multiplier=BEAM_WIDTH)

#exp
# The attention memory has to be the *tiled* encoder outputs: it must have the
# same batch size as memory_sequence_length (tiled) and as the decoder state
# built below for BATCH * BEAM_WIDTH rows.
attention_mechanism = tf.contrib.seq2seq.LuongAttention(
    lstm_units, tiled_encoder_outputs,
    memory_sequence_length=tiled_sequence_length)
attention_cell = tf.contrib.seq2seq.AttentionWrapper(
    tf.contrib.rnn.LSTMCell(lstm_units), attention_mechanism,
    attention_layer_size=lstm_units)
# Zero attention state whose cell state is replaced by the tiled encoder
# final state.
decoder_initial_state = attention_cell.zero_state(
    dtype=tf.float32, batch_size=BATCH * BEAM_WIDTH)
decoder_initial_state = decoder_initial_state.clone(
    cell_state=tiled_encoder_final_state)

In [18]:
#attention
# Luong attention over the (untiled) encoder outputs, used by the training
# decoder. No memory_sequence_length is passed here.
with tf.variable_scope("myScope"):
    attention_mechanism_train = tf.contrib.seq2seq.LuongAttention(lstm_units,encoder_outputs)

In [19]:
#attention
#expri
# Luong attention for inference: the encoder outputs are repeated BEAM_WIDTH
# times along the batch axis before being used as the attention memory.
# The scope is re-entered with AUTO_REUSE -- presumably so the weights are
# shared with the training attention; TODO confirm.
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
        encoder_outputs, multiplier=BEAM_WIDTH)

    attention_mechanism_infer = tf.contrib.seq2seq.LuongAttention(lstm_units,tiled_encoder_outputs)

In [20]:
#Projection layer and decoder cell
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Dense layer producing one output per vocabulary entry; passed as
    # output_layer to the decoders.
    output_layer = tf.layers.Dense(trainVocabSize)

    # One BasicLSTMCell with dropout on inputs and outputs; the same cell
    # object is handed to the attention wrappers built later.
    decoder_cell=tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(lstm_units),input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)


In [21]:
# Start the decoder from the encoder's final state (rebinds any earlier
# decoder_initial_state).
decoder_initial_state=encoder_final_state

In [22]:
#decoder Attention wrapper
#expri
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Wrap the decoder cell with the training attention mechanism.
    decoder_cell_train = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell, attention_mechanism_train,
            attention_layer_size=lstm_units,alignment_history=False)
    # Zero attention state for exactly BATCH rows, with its cell state replaced
    # by the encoder final state; batches of any other size will not match.
    decoder_initial_state_train = decoder_cell_train.zero_state(BATCH, tf.float32).clone(cell_state=decoder_initial_state)

In [23]:
#Training helper and decoder
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Teacher forcing: the decoder is fed the embedded target captions, with
    # target_seq_len giving the number of steps per example.
    helper = tf.contrib.seq2seq.TrainingHelper(decoder_input_embedded,target_seq_len)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell_train, helper, initial_state=decoder_initial_state_train,output_layer=output_layer)#,output_layer=projection_layer)
    outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder)
    # Projected decoder outputs; these are the logits used by the loss.
    logits = outputs.rnn_output


#Inference helper(greedy) and decoder
# Starts every sequence with the encoded GO id and stops a sequence when the
# encoded STOP id is produced.
helper2 = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_matrix_decode,tf.fill([batch_size],
                                                    np.int32(pre_op.transform([GO])[0])),
                                                   np.int32(pre_op.transform([STOP])[0]))


# NOTE(review): this decoder uses the bare decoder_cell (no attention wrapper)
# and is built outside "myScope" -- confirm that is intended.
decoder2 = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper2, decoder_initial_state,output_layer=output_layer)#,output_layer=projection_layer)

# Decoding is capped at 42 steps. This rebinds outputs/state/seq_len from the
# training decoder.
outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder2,maximum_iterations=32+10)

translations_logits = outputs.rnn_output
trs=outputs.sample_id

In [24]:
#expri
#decoder Attention wrapper
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Wrap the same decoder cell with the inference (tiled-memory) attention.
    decoder_cell_infer = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell, attention_mechanism_infer,
            attention_layer_size=lstm_units,alignment_history=False)
    # Zero state for BATCH rows with the encoder final state as cell state;
    # only its first field is used afterwards (it is tiled for beam search).
    decoder_initial_state_infer = decoder_cell_infer.zero_state(BATCH, tf.float32).clone(cell_state=decoder_initial_state)

In [25]:
#Beam Search decoder
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Repeat the first field of the inference state BEAM_WIDTH times along
    # the batch axis.
    decoder_initial_state_tiled = tf.contrib.seq2seq.tile_batch(
        decoder_initial_state_infer[0], multiplier=BEAM_WIDTH)

    # Fresh zero attention state for BATCH*BEAM_WIDTH rows whose cell state is
    # the tiled state from above (the name is rebound).
    decoder_initial_state_tiled=decoder_cell_infer.zero_state(batch_size=BATCH*BEAM_WIDTH,dtype=tf.float32).clone(cell_state=decoder_initial_state_tiled)


    # Define a beam-search decoder
    # start_tokens is sized by the batch_size placeholder while the initial
    # state is sized by the BATCH constant, so batch_size must be fed BATCH.
    decoder3 = tf.contrib.seq2seq.BeamSearchDecoder(
            cell=decoder_cell_infer,
            embedding=embedding_matrix_decode,
            start_tokens=tf.fill([batch_size],np.int32(pre_op.transform([GO])[0])),
            end_token=np.int32(pre_op.transform([STOP])[0]),
            initial_state=decoder_initial_state_tiled,
            beam_width=BEAM_WIDTH,
            output_layer=output_layer,
            length_penalty_weight=0.0)
    # Decoding is capped at 42 steps; rebinds outputs/state/seq_len.
    outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder3,maximum_iterations=32+10)


    # Predicted token ids for every beam; the inference cell later slices
    # [:,:,0] out of this tensor.
    trs_beam=outputs.predicted_ids


In [26]:
#loss and optimizer
# Per-position cross entropy between the training logits and the labels
# (fed as the captions with their first column removed).
cross_entropy=tf.nn.sparse_softmax_cross_entropy_with_logits(labels=no_start_target_seq,logits=logits)

# 1.0 for the first real_target_seq_len positions of each row, 0.0 afterwards;
# the mask width is taken from the first entry of target_seq_len.
# NOTE(review): real_target_seq_len is fed caption lengths that still count the
# GO token while the labels have it removed, so the mask appears to reach one
# position past the final STOP -- confirm this is intended.
target_weights = tf.sequence_mask(real_target_seq_len, target_seq_len[0], dtype=logits.dtype)

# Masked loss summed (not averaged) over batch and time.
loss=tf.reduce_sum(cross_entropy*target_weights)
train = tf.train.AdamOptimizer().minimize(loss)


In [27]:
#dont touch
# Longest caption in each split
maxtlen = max(trainCapLen)
maxvlen = max(devCapLen)
# Every example gets the same decode length: the longest *training* caption
# minus one, for both the train and the dev list.
t_newlen = [maxtlen - 1] * len(trainCapLen)
v_newlen = [maxtlen - 1] * len(devCapLen)

In [28]:
# Open an interactive session and initialise every graph variable in it
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())

In [29]:
#makes training batch
def getTrainBatch(indexs):
    '''Assemble the arrays for one training mini-batch.

    indexs: re-iterable sequence of positions into the training captions.

    Returns (sourceBatch, targetBatch, sourceBatchLen, targetBatchLen): the
    padded video features and their original lengths, looked up through
    trainID, plus the padded encoded captions and their lengths.
    '''
    video_ids = [trainID[pos] for pos in indexs]
    features = np.array([videoFeats[vid] for vid in video_ids])
    feature_lens = np.array([videoFeatSize[vid] for vid in video_ids])
    captions = np.array([trainSeqReg[pos] for pos in indexs])
    caption_lens = np.array([trainCapLen[pos] for pos in indexs])
    return features, captions, feature_lens, caption_lens

#makes dev batch
def getDevBatch(indexs):
    '''Assemble the arrays for one dev (validation) mini-batch.

    indexs: re-iterable sequence of positions into the dev captions.

    Returns (sourceBatch, targetBatch, sourceBatchLen, targetBatchLen): the
    padded video features and their original lengths, looked up through
    devID, plus the padded encoded captions and their lengths.
    '''
    video_ids = [devID[pos] for pos in indexs]
    features = np.array([videoFeats[vid] for vid in video_ids])
    feature_lens = np.array([videoFeatSize[vid] for vid in video_ids])
    captions = np.array([devSeqReg[pos] for pos in indexs])
    caption_lens = np.array([devCapLen[pos] for pos in indexs])
    return features, captions, feature_lens, caption_lens

In [35]:
#training starts here
training_losses=[]
valid_losses=[]
tData=np.arange(len(trainSeqReg))
dData=np.arange(len(devSeqReg))
for j in range(EPOCHS):
    np.random.shuffle(tData) #makes them iid
    training_loss=0
    for i in range(len(trainSeqReg)/BATCH):
        start=i*BATCH
        stop=(i+1)*BATCH
        sourceBatch,targetBatch,sourceBatchLen,targetBatchLen=getTrainBatch(tData[start:stop])
        _,lost=sess.run([train,loss],feed_dict={source_seq:sourceBatch,
                                                target_seq:targetBatch,
                                              source_seq_len:sourceBatchLen,
                                                target_seq_len:t_newlen[start:stop],
                                                real_target_seq_len:targetBatchLen,
                                                no_start_target_seq:np.array(targetBatch)[:,1:],
                                                batch_size:BATCH,keep_prob:dropout_keep_prob
                                                })
        
        training_loss+=lost
        print lost,
    #calculate t_loss
    training_losses.append(training_loss/len(trainSeqReg))
    
    #calculate v_loss
    validation_loss=0
    for k in range(len(devSeqReg)/BATCH):
        start=k*BATCH
        stop=(k+1)*BATCH
        sourceBatch,targetBatch,sourceBatchLen,targetBatchLen=getDevBatch(dData[start:stop])
        lost=sess.run(loss,feed_dict={source_seq:sourceBatch,
                                                target_seq:targetBatch,
                                              source_seq_len:sourceBatchLen,
                                                target_seq_len:t_newlen[start:stop],
                                                real_target_seq_len:targetBatchLen,
                                                no_start_target_seq:np.array(targetBatch)[:,1:],
                                                batch_size:BATCH,keep_prob:1.0
                                                })
        validation_loss += lost
        
    valid_losses.append(validation_loss/len(devSeqReg))
    
    print "Epoch:%d training loss%.4f: valid loss:%.4f"% (j,training_losses[-1],valid_losses[-1])
    #print "Epoch:%d training loss%.4f"% (j,training_losses[-1])

1366.1873 1103.7656 977.2186 1296.4275 1122.3535 1199.1206 1144.1187 1051.6063 1088.3834 1226.6135 934.60736 1269.094 1035.1611 1018.27954 1134.164 1109.9584 1030.8466 1382.7062 1427.0569 1077.8042 1134.3044 1063.8728 1155.397 1098.0195 1000.97107 1067.421 1064.2029 1195.1809 955.45465 1135.749 1282.596 1102.6926 1057.5059 1072.8859 1022.80536 1198.9703 1117.1619 1211.3171 1322.3865 982.9385 1095.1302 1169.1172 1212.7375 1014.91187 1230.5325 1021.62634 1172.9965 1128.5067 1313.7864 1266.1942 1053.6648 996.3048 1261.7249 963.902 1000.3915 1154.362 1130.0903 1119.9026 1204.3511 1211.133 1097.0999 1215.4326 1254.8677 1104.7021 1135.4076 1120.6469 1173.5815 1052.1907 1116.939 1268.6812 1306.9806 1267.7295 980.7903 1134.8469 1186.2616 1181.3304 974.40564 1051.0992 1344.787 1117.3325 956.23376 1186.4305 1000.2412 1097.6564 1074.2224 1160.2568 1175.4019 1046.153 1077.5818 1300.5293 1106.0906 1112.4055 1130.7147 1304.8691 1071.856 1180.9243 1115.1365 973.8957 1118.3229 1184.6088 1011.2898 1353

KeyboardInterrupt: 

In [47]:
# Beam-search captions for videos 1300..1363
data=videoFeats[1300:1364]
data_len=videoFeatSize[1300:1364]
gen_sum=[]
for i in range(len(data)//BATCH):
    # Feed one BATCH-sized slice per iteration; previously start/stop were
    # hard-coded and unused, so the whole of `data` was fed every time.
    start=i*BATCH
    stop=(i+1)*BATCH

    load_trs=trs_beam
    y=sess.run(load_trs,feed_dict={source_seq:data[start:stop],
                                   source_seq_len:data_len[start:stop],
                                   batch_size:BATCH,keep_prob:1.0
                                   })
    #print y

    # Keep index 0 of the last axis, i.e. the first beam of each example
    # (assumes predicted_ids is laid out [batch, time, beam] -- TODO confirm).
    y=y[:,:,0]

    for t in y:
        gen_sum.append(t)



# Greedy-decoder captions for videos 1300..1363.
# gen_sum is reset here, so results collected earlier are discarded.
data = videoFeats[1300:1364]
data_len = videoFeatSize[1300:1364]
gen_sum = []
for i in range(len(data) // BATCH):
    start, stop = i * BATCH, (i + 1) * BATCH
    load_trs = trs
    feed = {
        source_seq: data[start:stop],
        source_seq_len: data_len[start:stop],
        batch_size: BATCH,
        keep_prob: 1.0,   # no dropout at inference time
    }
    y = sess.run(load_trs, feed_dict=feed)
    # one row of token ids per example
    for t in y:
        gen_sum.append(t)


In [48]:
#processing summaries
# Id of the STOP token under the fitted encoder. It is looked up instead of
# hard-coding 58, which is only valid for one particular vocabulary.
stop_id = pre_op.transform([STOP])[0]
summs = []
for ids in gen_sum:
    # Drop every STOP id, then decode the remaining ids in a single call;
    # decoding token by token passes scalars to inverse_transform.
    kept = [int(tok) for tok in ids if tok != stop_id]
    words = pre_op.inverse_transform(kept) if kept else []
    summs.append(' '.join(words))

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:

  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


In [49]:
# Show the decoded captions (Python 2 print statement)
print summs

['a man cooking his kichen', 'a dog is riding a skateboard', 'a man is pouring oil into a pan', 'a woman is slicing a cucumber', 'a man is dancing', 'a woman is feeding a baby', 'a man is dancing', 'a woman is pouring oil into a bowl', 'a man is dancing', 'a man is shuffling cards', 'a man is carrying another man', 'a man is running', 'a man is lifting weights', 'a man cooking his kichen', 'a man is dancing', 'a man is dancing', 'two polar bears are fighting', 'a man is dancing', 'a man cooking his kichen', 'a polar bear is running', 'a man is riding a motorcycle', 'a girl is brushing her hair', 'an animal is eating', 'a woman is peeling a potato', 'a man is dancing', 'a man is riding a motorcycle', 'a man cooking his kichen', 'a woman is feeding a man', 'a baby is laughing', 'a man cooking his kichen', 'a woman is pouring oil into a bowl', 'a man is swimming', 'a woman is feeding a man', 'a man is eating noodles', 'a man cooking his kichen', 'people are dancing', 'a man cooking his ki

In [46]:
# Display rows 200..263 of the test clip index
np.load('./clip_index/testIndex.npy')[200:264]

array([['1500', '0xx13BuvVmo_25_36'],
       ['1501', '0hyZ__3YhZc_364_370'],
       ['1502', 'gMqKUPeTAkg_17_30'],
       ['1503', '8MVo7fje_oE_85_90'],
       ['1504', 's-dSFyz_5Ww_31_41'],
       ['1505', 'YmXCfQm0_CA_109_120'],
       ['1506', 'xgIIcPSh4EU_0_6'],
       ['1507', 'uZEGu-TA2cU_42_58'],
       ['1508', '97JhYpoWxzY_0_4'],
       ['1509', 'Kxa0mnDj0bs_113_124'],
       ['1510', 'fvBs0xpEZhQ_10_30'],
       ['1511', '5YJaS2Eswg0_22_26'],
       ['1512', 'i2GgBwlwV0c_24_31'],
       ['1513', 'zSPBC8EO6dY_122_126'],
       ['1514', 'hW8TKz2Aea4_5_12'],
       ['1515', 'zHy7pM0U49w_103_109'],
       ['1516', 'hJFBXHtxKIc_204_209'],
       ['1517', 'm7x8uIdg2XU_67_73'],
       ['1518', 'KPPCwmU5OHQ_227_238'],
       ['1519', '1dfR0A_BXjw_524_532'],
       ['1520', 'yAD_TS5L2d4_4_11'],
       ['1521', 'OIjsSu_I4So_6_10'],
       ['1522', '5CS4nLI2ZX8_50_59'],
       ['1523', '3FnUTQMJVXI_31_36'],
       ['1524', 'DIebwNHGjm8_27_38'],
       ['1525', 'c_XV7nPoRg8_2_12'],
    

In [None]:
# List every variable in the graph; tf.all_variables() is a deprecated alias
# of tf.global_variables().
tf.global_variables()