In [1]:
# Silence every Python warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')


import tensorflow as tf
import numpy as np
import pickle as pkl
from sklearn import preprocessing as pre
#from pathos.pools import ProcessPool as Pool
# Seed TensorFlow (graph-level seed of the current default graph) and NumPy's global RNG.
tf.set_random_seed(1234)
np.random.seed(1234)

import xmlrpclib

# XML-RPC client; calValBleu() sends the generated captions to it through s.calcScore(...).
# NOTE(review): the address is hard-coded - confirm the scoring service is reachable from this machine.
s = xmlrpclib.ServerProxy('http://10.21.230.64:8778')

In [2]:
import os
# Number the GPUs in PCI bus order and expose only device "1" to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "1",
})


In [47]:
# Hyper-parameters and special tokens for the whole notebook.

# Sentinel tokens
GO = '<START>'
STOP = '<END>'
pad = '<PAD>'
unknown = '<UNKNOWN>'

# Batching, decoding and schedule
BATCH = 1024
BEAM_WIDTH = 5
EPOCHS = 100
LAM = 0.9  # weight of loss1 in total_loss; loss2 gets (1-LAM)

# Model sizes and regularisation
embedding_size = 512
lstm_units = 1024
dropout_keep_prob = 0.5

# Early stopping
PATIENCE = 50
PATIENCE_MONITOR = True

GLOVE = True  # if true embedding size will reset to 300

In [4]:
def loadGloveModel(gloveFile):
    """Read a GloVe text file into a dict mapping word -> embedding vector.

    Every non-blank line is split on whitespace: the first field is the
    word, the remaining fields are parsed as floats into a 1-D numpy array.

    Args:
        gloveFile: path of the text file to read.

    Returns:
        dict {word: np.ndarray}. If a word occurs more than once the last
        occurrence wins.

    Raises:
        ValueError: if a field after the word cannot be parsed as a float.
    """
    print("Loading Glove Model")
    model = {}
    # `with` closes the handle even when a line fails to parse
    with open(gloveFile,'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # blank line: there is no word to index
                continue
            model[splitLine[0]] = np.array([float(val) for val in splitLine[1:]])
    # one %-formatted string prints the same text under Python 2 and Python 3
    print("Done. %d  words loaded!" % len(model))
    return model
# The GloVe file loaded later in the notebook is the 300-d one, so the embedding width is forced to 300.
if GLOVE:
    embedding_size=300

In [5]:
#load data
# Pickle streams are binary, so the files are opened with 'rb' (text mode can
# corrupt them on platforms that translate newlines).
# NOTE(review): pickle.load can execute arbitrary code - only load caption files from a trusted source.
with open('./captions/token_train.pkl','rb') as f:
    trainCaptions=pkl.load(f)
with open('./captions/token_dev.pkl','rb') as f:
    devCaptions=pkl.load(f)
with open('./captions/token_test.pkl','rb') as f:
    testCaptions=pkl.load(f)

In [6]:
def processToken(caps):
    """Wrap every caption's token list in the GO / STOP sentinels.

    Args:
        caps: iterable of items where item[0] is an identifier and item[1]
            is an iterable of tokens.

    Returns:
        A new list of [identifier, [GO] + tokens + [STOP]] pairs in input
        order. No padding is added here.
    """
    return [[cap[0],[GO]+list(cap[1])+[STOP]] for cap in caps]

In [7]:
# Add the <START>/<END> sentinels to every caption of each split.
# NOTE(review): the inputs are rebound to the outputs, so running this cell twice wraps the captions twice.
trainCaptions=processToken(trainCaptions)
devCaptions=processToken(devCaptions)
testCaptions=processToken(testCaptions)

In [8]:
def listofwords(data):
    '''Collect the vocabulary of a list of captions.

    Args:
        data: iterable of items whose element [1] is an iterable of
            (hashable) tokens.

    Returns:
        List of the distinct tokens, in order of first appearance.
    '''
    vocab=[]
    seen=set()  # O(1) membership test; scanning the list made the whole pass quadratic
    for item in data:
        for token in item[1]:
            if token not in seen:
                seen.add(token)
                vocab.append(token)
    return vocab

In [9]:
#find the vocab and size
# The vocabulary is built from the train and dev captions together; test captions are not included.
trainVocab=listofwords(trainCaptions+devCaptions)
trainVocabSize=len(trainVocab)

In [10]:
#number of tokens of every caption, per split
trainCapLen=[len(tokens) for vid,tokens in trainCaptions]
devCapLen=[len(tokens) for vid,tokens in devCaptions]
testCapLen=[len(tokens) for vid,tokens in testCaptions]

In [11]:
#Label Encoder for output transform
# Maps every vocabulary token to an integer id (transform) and back (inverse_transform).
pre_op=pre.LabelEncoder()
pre_op.fit(trainVocab)
# Identity matrix with one row per class, i.e. row k is the one-hot vector of id k.
# NOTE(review): not referenced anywhere else in the visible notebook - confirm it is still needed.
onehoter=np.identity(len(pre_op.classes_))

In [12]:
#word to int
# Encode each caption's tokens to integer ids with the fitted LabelEncoder (train and dev only).
trainSeq=[pre_op.transform(i[1]) for i in trainCaptions]
devSeq=[pre_op.transform(i[1]) for i in devCaptions]
# First element of every caption entry; the batch builders use it to index videoFeats / videoFeatSize.
trainID=[i[0] for i in trainCaptions]
devID=[i[0] for i in devCaptions]



#word to int with pool
# Parallel variant of the encoding above: maps pre_op.transform over the captions with a pool of 8.
# NOTE(review): `Pool` is never bound in the visible notebook (its pathos import in the first
# cell is commented out), so this fragment raises NameError as written. It has no In[] prompt,
# so presumably it is a disabled cell - confirm.
p=Pool(8)


trainSeq=p.map(pre_op.transform,[i[1] for i in trainCaptions])
devSeq=p.map(pre_op.transform,[i[1] for i in devCaptions])
trainID=[i[0] for i in trainCaptions]
devID=[i[0] for i in devCaptions]



In [13]:
#appending stops

MAX_LEN=33
# Encode <END> once: the LabelEncoder lookup is loop-invariant and was being
# recomputed for every caption.
stop_id=pre_op.transform([STOP])[0]
# Right-pad every id sequence with the <END> id up to MAX_LEN entries.
# A sequence longer than MAX_LEN gives a negative pad width and np.pad raises ValueError.
trainSeqReg=np.array([np.pad(seq,(0,MAX_LEN-len(seq)),'constant',constant_values=stop_id) for seq in trainSeq])
devSeqReg=np.array([np.pad(seq,(0,MAX_LEN-len(seq)),'constant',constant_values=stop_id) for seq in devSeq])

In [14]:
#loading video features
# NOTE(review): presumably one 2-D feature matrix per video with a varying number of rows - confirm against the .npy file.
videoFeats=np.load(file='./features/consilidated_feats.npy')
# Length of each video's first axis before padding; fed later as source_seq_len.
videoFeatSize=np.array([len(i) for i in videoFeats])
#making the shape regular
# Pad the first axis of every entry up to 28 rows ('constant' mode pads with 0 by default); the second axis is untouched.
# An entry with more than 28 rows would give a negative pad width and np.pad would raise.
videoFeats=np.array([np.pad(i,mode='constant',pad_width=[(0,28-len(i)),(0,0)]) for i in videoFeats])

In [15]:
#glove
if GLOVE:
    gloveModel=loadGloveModel('./glove/glove.6B.300d.txt')
    # Build the table in LabelEncoder class order so that row k holds the vector
    # of the token encoded as id k.
    gloveEmbedding=[]
    for i in pre_op.classes_:
        # `in` replaces dict.has_key(), which is deprecated and gone in Python 3
        if i in gloveModel:
            gloveEmbedding.append(gloveModel[i])
        else:
            # token absent from the GloVe file: draw a random N(0,1) vector instead
            gloveEmbedding.append(np.random.normal(size=(300)))
    gloveEmbedding=np.array(gloveEmbedding)

Loading Glove Model
Done. 400000  words loaded!


In [57]:
#Lets build the graph
tf.reset_default_graph()
# tf.set_random_seed() only seeds the graph that is the default at call time.
# The reset above installs a fresh default graph, so the graph-level seed chosen
# in the first cell has to be applied again for initialisation/dropout to be repeatable.
tf.set_random_seed(1234)

In [58]:
# Encoder input: float features of shape (batch, 28, 2048).
source_seq = tf.placeholder(shape=(None,28,2048),dtype=tf.float32)
# Decoder input token ids, 33 per caption.
target_seq = tf.placeholder(shape=(None,33),dtype=tf.int32)
# Valid length of every source sequence (passed to dynamic_rnn).
source_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
# Sequence lengths handed to TrainingHelper; element 0 is also used as the loss-mask width.
target_seq_len = tf.placeholder(shape=(None,), dtype=tf.int32)
# Cross-entropy labels: the target ids with the first column dropped (32 per caption).
no_start_target_seq = tf.placeholder(shape=(None,32),dtype=tf.int32)
# Batch size used to size the start-token vectors of the inference decoders.
# shape=(None) is plain None, i.e. the shape is left unspecified.
batch_size = tf.placeholder(shape=(None),dtype=tf.int32)
# Real caption lengths; used to build the loss mask.
real_target_seq_len= tf.placeholder(shape=(None,), dtype=tf.int32)
# Keep probability of the LSTM DropoutWrappers.
keep_prob= tf.placeholder(dtype=tf.float32)

In [59]:
#output embeddings
if GLOVE:
    # initialise the embedding table from the pretrained GloVe matrix (cast to float32)
    embedding_matrix_decode = tf.Variable(initial_value=gloveEmbedding,
    name="embedding_matrix_de",
    expected_shape=[trainVocabSize, embedding_size],
    dtype=tf.float32)
else:
    # tf.get_variable() takes `shape`; `expected_shape` is a tf.Variable-only argument
    # and made this branch fail with TypeError whenever GLOVE was False.
    embedding_matrix_decode = tf.get_variable(
    name="embedding_matrix_de",
    shape=[trainVocabSize, embedding_size],
    dtype=tf.float32)
# Embedded decoder inputs: one embedding row per target token id.
decoder_input_embedded = tf.nn.embedding_lookup(embedding_matrix_decode, target_seq)


#output embeddings
# NOTE(review): this fragment has no In[] prompt - presumably a disabled cell. If it is run it
# rebinds embedding_matrix_decode to a randomly initialised variable, replacing the one defined just above.
embedding_matrix_decode = tf.Variable(initial_value=tf.random_normal(shape=[trainVocabSize, embedding_size],dtype=tf.float32))
decoder_input_embedded = tf.nn.embedding_lookup(embedding_matrix_decode, target_seq) 


In [60]:
#encoder
# Single BasicLSTMCell with dropout applied to its inputs and outputs.
encoderCell=tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(lstm_units),input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)
# Run the encoder over the video features; sequence_length carries each video's valid length.
encoder_outputs,encoder_final_state=tf.nn.dynamic_rnn(cell=encoderCell,inputs=source_seq,sequence_length=source_seq_len,
                 dtype=tf.float32)

#expri
#encoder_outputs_tiled=tf.contrib.seq2seq.tile_batch(encoder_outputs,multiplier=BEAM_WIDTH)
#encoder ends here

#exp
# Repeat every batch entry BEAM_WIDTH times, as beam search expects.
tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
    encoder_outputs, multiplier=BEAM_WIDTH)
tiled_encoder_final_state = tf.contrib.seq2seq.tile_batch(
    encoder_final_state, multiplier=BEAM_WIDTH)
tiled_sequence_length = tf.contrib.seq2seq.tile_batch(
    source_seq_len, multiplier=BEAM_WIDTH)

#exp
# NOTE(review): this experimental block pairs the untiled memory (encoder_outputs) with the
# tiled memory_sequence_length and fixes the batch to BATCH * BEAM_WIDTH. In the visible
# notebook attention_mechanism and attention_cell are not used again and decoder_initial_state
# is reassigned in a later cell - confirm whether the block is still needed.
attention_mechanism = tf.contrib.seq2seq.LuongAttention(lstm_units,encoder_outputs,memory_sequence_length=tiled_sequence_length)
attention_cell = tf.contrib.seq2seq.AttentionWrapper(tf.contrib.rnn.LSTMCell(lstm_units), attention_mechanism,attention_layer_size=lstm_units)
decoder_initial_state = attention_cell.zero_state(
    dtype=tf.float32, batch_size=BATCH * BEAM_WIDTH)
decoder_initial_state = decoder_initial_state.clone(
    cell_state=tiled_encoder_final_state)

In [61]:
#attention
# Luong attention over the untiled encoder outputs; consumed by the training decoder.
# No memory_sequence_length is passed here.
with tf.variable_scope("myScope"):
    attention_mechanism_train = tf.contrib.seq2seq.LuongAttention(lstm_units,encoder_outputs)

In [62]:
#attention
#expri
# Attention over encoder outputs tiled BEAM_WIDTH times; consumed by the beam-search decoder.
# NOTE(review): the scope is re-entered with AUTO_REUSE, presumably so these weights are shared
# with attention_mechanism_train - confirm the variable names actually coincide.
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    tiled_encoder_outputs = tf.contrib.seq2seq.tile_batch(
        encoder_outputs, multiplier=BEAM_WIDTH)

    attention_mechanism_infer = tf.contrib.seq2seq.LuongAttention(lstm_units,tiled_encoder_outputs)

In [63]:
#Projection layer and decoder cell
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Dense projection to trainVocabSize logits; the same layer object is handed to every decoder below.
    output_layer = tf.layers.Dense(trainVocabSize)

    # Decoder LSTM with input/output dropout; wrapped by both the train and the infer AttentionWrapper.
    decoder_cell=tf.contrib.rnn.DropoutWrapper(tf.contrib.rnn.BasicLSTMCell(lstm_units),input_keep_prob=keep_prob,
                                          output_keep_prob=keep_prob)


In [64]:
# The decoder starts from the encoder's final LSTM state (this rebinds the name assigned in the encoder cell).
decoder_initial_state=encoder_final_state

In [65]:
#decoder Attention wrapper
#expri
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Decoder LSTM wrapped with the training attention mechanism.
    decoder_cell_train = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell, attention_mechanism_train,
            attention_layer_size=lstm_units,alignment_history=False)
    # zero_state builds the wrapper state for a batch of BATCH; clone() swaps the encoder final state in as cell state.
    # NOTE(review): the batch is hard-wired to BATCH here, so presumably every fed batch must hold exactly BATCH rows.
    decoder_initial_state_train = decoder_cell_train.zero_state(BATCH, tf.float32).clone(cell_state=decoder_initial_state)

In [66]:
#Training helper and decoder
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # TrainingHelper feeds the embedded ground-truth tokens as decoder inputs.
    helper = tf.contrib.seq2seq.TrainingHelper(decoder_input_embedded,target_seq_len)
    decoder = tf.contrib.seq2seq.BasicDecoder(decoder_cell_train, helper, initial_state=decoder_initial_state_train,output_layer=output_layer)#,output_layer=projection_layer)
    outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder)
    # Per-step vocabulary scores produced through output_layer.
    logits = outputs.rnn_output


In [67]:
#cheap trick
# emd_copier assigns the current embedding matrix into emd_copy whenever it is evaluated;
# emd58 is that copy with row 58 multiplied by zero.
# NOTE(review): 58 is a hard-coded token id - presumably the <END> id (id 58 is also skipped when
# summaries are decoded); confirm with pre_op.transform([STOP]).
# NOTE(review): going through assign() presumably keeps the sentence-embedding loss from
# back-propagating into embedding_matrix_decode - confirm.
emd_copy=tf.Variable(tf.zeros(shape=embedding_matrix_decode.shape))
emd_copier=emd_copy.assign(embedding_matrix_decode)
mask58=np.ones(shape=emd_copier.shape)
mask58[58]=0
mask58=tf.constant(mask58,dtype=tf.float32)
emd58=emd_copier*mask58

In [68]:
#finding sentence embeddings
# Token ids emitted by the training decoder at every step.
sentence_ids=outputs.sample_id
# Look the ids up in the masked copy of the embedding table (row 58 zeroed).
decoder_output_embedded=tf.nn.embedding_lookup(emd58,sentence_ids)
# Lengths returned by dynamic_decode; not referenced again in the visible notebook.
maskMeter=seq_len
# Sentence vector: mean of the token embeddings over axis 1.
sentence_embedding=tf.reduce_mean(decoder_output_embedded,1)

In [69]:
# Video representation: the c and h parts of the encoder final state concatenated along axis 1.
video_embedding=tf.concat([decoder_initial_state.c,decoder_initial_state.h],1)

#Inference helper(greedy) and decoder
# Greedy decoding that starts every sequence with the <START> id and finishes on the <END> id.
helper2 = tf.contrib.seq2seq.GreedyEmbeddingHelper(embedding_matrix_decode,tf.fill([batch_size],
                                                    np.int32(pre_op.transform([GO])[0])),
                                                   np.int32(pre_op.transform([STOP])[0]))


# NOTE(review): this decoder runs the plain decoder_cell (no attention wrapper) from the raw encoder state.
decoder2 = tf.contrib.seq2seq.BasicDecoder(decoder_cell, helper2, decoder_initial_state,output_layer=output_layer)#,output_layer=projection_layer)

# Rebinds outputs/state/seq_len, which until here held the training decoder's results.
outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder2,maximum_iterations=32+10)

translations_logits = outputs.rnn_output
# Greedy token ids.
trs=outputs.sample_id

In [70]:
#expri
#decoder Attention wrapper
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Same decoder LSTM, wrapped with the attention mechanism built over the tiled encoder outputs.
    decoder_cell_infer = tf.contrib.seq2seq.AttentionWrapper(
            decoder_cell, attention_mechanism_infer,
            attention_layer_size=lstm_units,alignment_history=False)
    # Untiled initial wrapper state for a batch of BATCH, with the encoder final state as cell state.
    decoder_initial_state_infer = decoder_cell_infer.zero_state(BATCH, tf.float32).clone(cell_state=decoder_initial_state)

In [71]:
#Beam Search decoder
with tf.variable_scope("myScope",reuse=tf.AUTO_REUSE):
    # Tile element [0] of the wrapper state BEAM_WIDTH times.
    # NOTE(review): element [0] is presumably the cell_state field of the AttentionWrapperState - confirm.
    decoder_initial_state_tiled = tf.contrib.seq2seq.tile_batch(
        decoder_initial_state_infer[0], multiplier=BEAM_WIDTH)

    # Fresh wrapper state sized BATCH*BEAM_WIDTH whose cell state is the tiled state from above.
    decoder_initial_state_tiled=decoder_cell_infer.zero_state(batch_size=BATCH*BEAM_WIDTH,dtype=tf.float32).clone(cell_state=decoder_initial_state_tiled)


    # Define a beam-search decoder
    # Starts from the <START> id, ends on the <END> id, no length penalty.
    decoder3 = tf.contrib.seq2seq.BeamSearchDecoder(
            cell=decoder_cell_infer,
            embedding=embedding_matrix_decode,
            start_tokens=tf.fill([batch_size],np.int32(pre_op.transform([GO])[0])),
            end_token=np.int32(pre_op.transform([STOP])[0]),
            initial_state=decoder_initial_state_tiled,
            beam_width=BEAM_WIDTH,
            output_layer=output_layer,
            length_penalty_weight=0.0)
    # Rebinds outputs/state/seq_len once more; decoding is capped at 32+10 steps.
    outputs, state, seq_len = tf.contrib.seq2seq.dynamic_decode(decoder3,maximum_iterations=32+10)


    # Beam-search token ids; the evaluation code slices the last axis with [:,:,0].
    trs_beam=outputs.predicted_ids


In [72]:
#loss and optimizer

#loss1
# Per-token cross entropy between the training logits and the targets without their first column.
cross_entropy=tf.nn.sparse_softmax_cross_entropy_with_logits(labels=no_start_target_seq,logits=logits)

# Mask: 1 for positions below each caption's real length, 0 afterwards; width taken from target_seq_len[0].
target_weights = tf.sequence_mask(real_target_seq_len, target_seq_len[0], dtype=logits.dtype)

# Masked cross entropy summed over the whole batch (a sum, not a mean).
loss1=tf.reduce_sum(cross_entropy*target_weights)

#loss2
# Linear map of the sentence embedding into the 2*lstm_units space of video_embedding.
sentence_on_video_space=tf.layers.dense(inputs=sentence_embedding,units=2*lstm_units)

# tf.nn.l2_loss already returns a scalar, so the surrounding reduce_sum does not change the value.
loss2=tf.reduce_sum(tf.nn.l2_loss(sentence_on_video_space- video_embedding))

# Weighted combination of the two objectives.
total_loss = LAM * loss1 + (1-LAM) *loss2
#train = tf.train.AdamOptimizer().minimize(total_loss)

#gradient clipping stackoverflow
# Adam with default settings; gradients are clipped to a global norm of 10.0 before being applied.
optimizer = tf.train.AdamOptimizer()
gradients, variables = zip(*optimizer.compute_gradients(total_loss))
gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
train = optimizer.apply_gradients(zip(gradients, variables))


In [73]:
#dont touch
# Longest caption (in tokens) of the train and of the dev split.
maxtlen=max(trainCapLen)
maxvlen=max(devCapLen)
# Constant length fed as target_seq_len: longest training caption minus one, repeated once per training caption.
t_newlen=[maxtlen-1 for i in range(len(trainCapLen))]
# NOTE(review): also built from maxtlen rather than maxvlen; the visible training loop feeds t_newlen
# during validation as well, so maxvlen and v_newlen are never used there.
v_newlen=[maxtlen-1 for i in range(len(devCapLen))]

In [74]:
# InteractiveSession installs itself as the default session, which is what lets .run() below work without an argument.
sess=tf.InteractiveSession()
# Initialise every variable of the graph.
tf.global_variables_initializer().run()

In [75]:
# Checkpoint reader/writer used by the save/restore calls further down.
saver=tf.train.Saver()

In [76]:
def _gatherBatch(indexs,videoIDs,seqs,capLens):
    """Assemble the four arrays of one batch for the examples selected by `indexs`."""
    vids=[videoIDs[i] for i in indexs]
    sourceBatch=np.array([videoFeats[v] for v in vids])
    targetBatch=np.array([seqs[i] for i in indexs])
    sourceBatchLen=np.array([videoFeatSize[v] for v in vids])
    targetBatchLen=np.array([capLens[i] for i in indexs])
    return sourceBatch,targetBatch,sourceBatchLen,targetBatchLen

#makes training batch
def getTrainBatch(indexs):
    """Build one training batch.

    Args:
        indexs: sequence of positions into the training captions.

    Returns:
        (sourceBatch, targetBatch, sourceBatchLen, targetBatchLen): the
        video features and their lengths looked up through trainID, the
        padded id sequences from trainSeqReg and the caption lengths from
        trainCapLen, all as numpy arrays in the order of `indexs`.
    """
    return _gatherBatch(indexs,trainID,trainSeqReg,trainCapLen)

#makes dev batch
def getDevBatch(indexs):
    """Build one dev batch; same layout as getTrainBatch but read from devID, devSeqReg and devCapLen."""
    return _gatherBatch(indexs,devID,devSeqReg,devCapLen)

#restore model
# Loads previously saved weights into the session from BestModel/model.ckpt.
# NOTE(review): no In[] prompt - presumably a disabled cell; the path differs from the
# checkpoints written during training (model/...). Confirm before running.
saver.restore(sess, "BestModel/model.ckpt")

In [77]:
def calValBleu():
    """Caption videos 1200-1299 with beam search and have them scored remotely.

    The 100 videos are decoded in batches of exactly BATCH rows, the first
    beam entry of every result is turned back into words (id 58 is skipped),
    the captions are written to 'gen_dev.txt' (one
    "beam_size_5<TAB>vid<N><TAB><caption>" line per video, N starting at
    1200) and the same text is sent to the scoring service.

    Returns:
        Whatever s.calcScore(...) returns for the generated text.
    """
    data=videoFeats[1200:1300]
    data_len=videoFeatSize[1200:1300]
    if BATCH>100:
        # fill the single batch up to BATCH rows with videos from the start of the set;
        # their captions are discarded by the [:100] cut below
        data=np.concatenate([data,videoFeats[:BATCH-100]])
        data_len=np.concatenate([data_len,videoFeatSize[:BATCH-100]])

    def _decode(start,stop):
        # beam-search ids for rows start..stop, keeping index 0 of the last (beam) axis
        y=sess.run(trs_beam,feed_dict={source_seq:data[start:stop],
                                       source_seq_len:data_len[start:stop],
                                       batch_size:BATCH,keep_prob:1.0})
        return y[:,:,0]

    gen_sum=[]
    # `//` keeps the integer batch count explicit instead of relying on Python 2 `/`
    for i in range(len(data)//BATCH):
        gen_sum.extend(_decode(i*BATCH,(i+1)*BATCH))

    if BATCH<100:
        # decode the last BATCH rows again and keep only the ones not produced yet
        tail=_decode(len(data)-BATCH,len(data))
        gen_sum.extend(tail[-(len(data)-len(gen_sum)):])

    gen_sum=gen_sum[:100]
    #processing summaries
    summs=[]
    for ids in gen_sum:
        # NOTE(review): 58 is a hard-coded id, presumably <END> - confirm with pre_op.transform([STOP])
        words=[pre_op.inverse_transform(j) for j in ids if j!=58]
        summs.append(' '.join(words))
    # NOTE(review): the 'beam_size_5' label is fixed text and does not follow BEAM_WIDTH
    pred=''.join(['beam_size_5'+'\tvid'+str(vdo)+'\t'+summ+'\n'
                  for vdo,summ in enumerate(summs,1200)])
    # the file is still written for inspection; the text is sent directly instead of being read back
    with open('gen_dev.txt','w') as fle:
        fle.write(pred)
    return s.calcScore(pred)

In [78]:
# Re-applies the blanket 'ignore' filter that the first cell already installed.
warnings.filterwarnings('ignore')

In [None]:
#training starts here
# Best validation score seen so far and the early-stopping countdown.
bestVal=0
patience=PATIENCE

print 'starting training'
training_losses=[]
valid_losses=[]
valid_bleu=[]
# Index arrays over the training / dev captions; only the training one is shuffled.
tData=np.arange(len(trainSeqReg))
dData=np.arange(len(devSeqReg))
for j in range(EPOCHS):
    np.random.shuffle(tData) #makes them iid
    training_loss=0
    # Only full batches are run (Python 2 integer division); a trailing partial batch is skipped.
    for i in range(len(trainSeqReg)/BATCH):
        start=i*BATCH
        stop=(i+1)*BATCH
        sourceBatch,targetBatch,sourceBatchLen,targetBatchLen=getTrainBatch(tData[start:stop])
        # One optimisation step; target_seq_len gets the constant lengths, the loss mask the real ones,
        # and the labels are the targets without their first column.
        _,lost=sess.run([train,total_loss],feed_dict={source_seq:sourceBatch,
                                                target_seq:targetBatch,
                                              source_seq_len:sourceBatchLen,
                                                target_seq_len:t_newlen[start:stop],
                                                real_target_seq_len:targetBatchLen,
                                                no_start_target_seq:np.array(targetBatch)[:,1:],
                                                batch_size:BATCH,keep_prob:dropout_keep_prob
                                                })
        
        training_loss+=lost
        #print lost,
    #calculate t_loss
    # Summed loss divided by the number of training captions (captions of a skipped partial batch are counted too).
    training_losses.append(training_loss/len(trainSeqReg))
    
    #calculate v_loss
    validation_loss=0
    for k in range(len(devSeqReg)/BATCH):
        start=k*BATCH
        stop=(k+1)*BATCH
        sourceBatch,targetBatch,sourceBatchLen,targetBatchLen=getDevBatch(dData[start:stop])
        # Same feeds as training but without the train op and with keep_prob 1.0 (no dropout).
        # NOTE(review): t_newlen, not v_newlen, is fed here.
        lost=sess.run(total_loss,feed_dict={source_seq:sourceBatch,
                                                target_seq:targetBatch,
                                              source_seq_len:sourceBatchLen,
                                                target_seq_len:t_newlen[start:stop],
                                                real_target_seq_len:targetBatchLen,
                                                no_start_target_seq:np.array(targetBatch)[:,1:],
                                                batch_size:BATCH,keep_prob:1.0
                                                })
        validation_loss += lost
    # Score the generated dev captions remotely; only the last element of the reply is tracked.
    # NOTE(review): presumably that element is the BLEU score used for model selection - confirm with the service.
    valBleu=calValBleu()
    valid_bleu.append(valBleu[-1])
    valid_losses.append(validation_loss/len(devSeqReg))
    
    # New best score: checkpoint the model and restart the countdown (only when monitoring is on).
    if(valid_bleu[-1]>bestVal) and PATIENCE_MONITOR==True:
        bestVal=valid_bleu[-1]
        saver.save(sess, "model/bestModel.ckpt")
        print "saving model best"
        patience=PATIENCE
    print "Epoch:%d training loss:%.4f: valid loss:%.4f valid blue:%.4f"% (j,training_losses[-1],valid_losses[-1],valid_bleu[-1])
    # The decrement also runs right after a reset, so training stops after PATIENCE-1 epochs without improvement.
    patience-=1
    if patience==0 and PATIENCE_MONITOR==True:
        break
    

starting training
saving model best
Epoch:0 training loss:40.5945: valid loss:24.1349 valid blue:0.2440
Epoch:1 training loss:28.6342: valid loss:21.9241 valid blue:0.2073
saving model best
Epoch:2 training loss:25.7554: valid loss:20.7448 valid blue:0.2461
saving model best
Epoch:3 training loss:23.3277: valid loss:19.7748 valid blue:0.2526
saving model best
Epoch:4 training loss:21.3067: valid loss:19.3014 valid blue:0.2663
saving model best
Epoch:5 training loss:19.8009: valid loss:19.1443 valid blue:0.2819
saving model best
Epoch:6 training loss:18.5889: valid loss:19.0879 valid blue:0.2885
Epoch:7 training loss:17.5992: valid loss:19.0181 valid blue:0.2849
saving model best
Epoch:8 training loss:16.7231: valid loss:18.9906 valid blue:0.3178
Epoch:9 training loss:16.0198: valid loss:19.0069 valid blue:0.2980
Epoch:10 training loss:15.3950: valid loss:18.8839 valid blue:0.3175
saving model best
Epoch:11 training loss:14.8891: valid loss:19.0417 valid blue:0.3179
Epoch:12 training lo

In [80]:
# Reload the checkpoint that the training loop saved for the best validation score.
if PATIENCE_MONITOR:
    saver.restore(sess, "model/bestModel.ckpt")

INFO:tensorflow:Restoring parameters from model/bestModel.ckpt


In [51]:
#save model
# Writes the current session weights to model/continue.ckpt.
# NOTE(review): this cell's execution count (51) is lower than its neighbours', so it was run out of order.
saver.save(sess, "model/continue.ckpt")

'model/continue.ckpt'

In [81]:
# Test split: every video from index 1300 onwards.
data=videoFeats[1300:]
data_len=videoFeatSize[1300:]
# Batches must hold exactly BATCH rows, so the split is padded up to the next multiple of
# BATCH by repeating its own videos cyclically from the start. When BATCH is 1024 and the split
# has 670 videos (the count implied by the [:670] cut in the next cell) this appends data[:354],
# exactly what the previous hard-coded branch did; other batch sizes no longer drop the
# final partial batch.
n_pad=(-len(data))%BATCH
if n_pad:
    pad_idx=np.arange(n_pad)%len(data)
    data=np.concatenate([data,data[pad_idx]])
    data_len=np.concatenate([data_len,data_len[pad_idx]])
gen_sum=[]
# `//` keeps the integer batch count explicit instead of relying on Python 2 `/`
for i in range(len(data)//BATCH):
    start=i*BATCH
    stop=(i+1)*BATCH

    y=sess.run(trs_beam,feed_dict={source_seq:data[start:stop],
                                   source_seq_len:data_len[start:stop],
                                   batch_size:BATCH,keep_prob:1.0})
    # keep index 0 of the last (beam) axis for every row
    gen_sum.extend(y[:,:,0])

# The string literal below is disabled code (a tail-batch variant). As the last
# expression of the cell its value is echoed as the cell output.
'''        
start=len(data)-BATCH
stop=len(data)
y=sess.run(trs_beam,feed_dict={source_seq:data[start:stop],
                                               source_seq_len:data_len[start:stop],
                                              batch_size:BATCH,keep_prob:1.0
                                                })
y=y[:,:,0]

y=y[-(len(data)-len(gen_sum)):]
for t in y:
    gen_sum.append(t)
'''

'        \nstart=len(data)-BATCH\nstop=len(data)\ny=sess.run(trs_beam,feed_dict={source_seq:data[start:stop],\n                                               source_seq_len:data_len[start:stop],\n                                              batch_size:BATCH,keep_prob:1.0\n                                                })\ny=y[:,:,0]\n\ny=y[-(len(data)-len(gen_sum)):]\nfor t in y:\n    gen_sum.append(t)\n'

In [82]:
# Keep only the first 670 decoded captions, discarding those produced for the padding rows.
# NOTE(review): 670 is hard-coded - presumably the number of test videos (len(videoFeats)-1300); confirm.
gen_sum=gen_sum[:670]

# Greedy-decoder (trs) variant of the test decoding, restricted to videos 1300-1363.
# NOTE(review): the slice holds at most 64 videos, so with the configured BATCH of 1024
# range(len(data)/BATCH) is empty and gen_sum would be left empty. The fragment has no In[] prompt -
# presumably a disabled cell; confirm before running.
data=videoFeats[1300:1364]
data_len=videoFeatSize[1300:1364]
gen_sum=[]
for i in range(len(data)/BATCH):
    start=i*BATCH
    stop=(i+1)*BATCH
    load_trs=trs
    y=sess.run(load_trs,feed_dict={source_seq:data[start:stop],
                                               source_seq_len:data_len[start:stop],
                                              batch_size:BATCH,
                                                keep_prob:1.0
                                                })
    for t in y:
        gen_sum.append(t)


In [83]:
#processing summaries
# Turn every decoded id sequence back into words, skipping id 58, joined by single spaces.
summs=[]
for ids in gen_sum:
    words=[pre_op.inverse_transform(j) for j in ids if j!=58]
    summs.append(' '.join(words))

# Debug listing of every generated caption next to the matching entry of vops.
# NOTE(review): vops is only assigned further down in this fragment and ref_test is not defined
# anywhere in the visible notebook, so the fragment fails when run top to bottom. It has no
# In[] prompt - presumably a disabled cell.
for i in range(len(summs)):
    print i,summs[i],vops[i]

# First 64 entries of the test clip index file.
vops=np.load('../MSVD/clip_index/testIndex.npy')[:64]
ref_test

#reference captions of the dev split, one "vid<ID><TAB><caption>" line each
with open('ref_dev.txt','w+') as fle:
    for vid,tokens in devCaptions:
        # tokens[1:-1] leaves out the first and last token (the <START>/<END> sentinels)
        fle.write('vid'+str(vid)+'\t'+' '.join(tokens[1:-1])+'\n')

In [84]:
# Write one "beam_size_5<TAB>vid<N><TAB><caption>" line per generated caption, N counting up from 1300.
vdo=1300
with open('gen.txt','w+') as fle:
    for caption in summs:
        fle.write('\t'.join(['beam_size_5','vid'+str(vdo),caption])+'\n')
        vdo+=1