In [1]:
import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm
import tensorflow as tf
import numpy as np
import pickle as pkl
from sklearn import preprocessing as pre
#from pathos.pools import ProcessPool as Pool
# Seed TensorFlow's graph-level RNG and NumPy's global RNG.
tf.set_random_seed(1234)
np.random.seed(1234)

#from xmlrpc import client
# xmlrpclib is the Python 2 module name (Python 3 ships it as xmlrpc.client).
import xmlrpclib

# XML-RPC proxy; calValBleu calls s.calcScore(...) on it to score generated captions.
# NOTE(review): the endpoint address is hard-coded, so scoring needs this host to be reachable.
s = xmlrpclib.ServerProxy('http://10.21.230.64:8778')

In [2]:
import os
# Order CUDA devices by PCI bus id and expose only device "0" to this process.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"


In [3]:
#some hypers
# Special tokens: GO/STOP are wrapped around every caption, PAD fills captions up to MAX_LEN.
GO='<START>'
STOP='<END>'
PAD='<PAD>'
unknown='<UNKNOWN>'  # not referenced in the visible part of the notebook
BATCH=64  # fixed batch dimension of every placeholder
BEAM_WIDTH=5  # not referenced in the visible part of the notebook
EPOCHS=100  # upper bound on training epochs
LAM=0.9  # not referenced in the visible part of the notebook
embedding_size=512  # width of the token embedding table
lstm_units=512  # not referenced in the visible part of the notebook
dropout_keep_prob=0.8  # the model is given a dropout rate of 1-dropout_keep_prob
PATIENCE=20  # early-stopping budget (in epochs) used by the training loop
PATIENCE_MONITOR=True  # enables best-checkpoint saving and early stopping
GLOVE=False #if true embedding size will reset to 300
CONSTGLOVE=False  # not referenced in the visible part of the notebook
MAX_LEN=33  # captions (including GO/STOP) are padded to this many tokens
#new

SIN=True  # not referenced in the visible part of the notebook
NUM_BLOCKS=6  # attention blocks in the encoder and in the decoder
NUM_HEADS = 8  # heads passed to every multihead_attention call
FEAT_DIM=2048  # encoder input scaling factor and encoder feed-forward widths
NUM_UNITS=512  # decoder input scaling factor; decoder feed-forward inner width is NUM_UNITS*4
FRAMES=28  # frame dimension of enc_ph and length given to the encoder positional encoding
LR=0.0001  # base learning rate handed to noam_scheme
WARMUP=4000  # warm-up argument handed to noam_scheme

In [4]:
def loadGloveModel(gloveFile):
    '''Read a GloVe-style text file into a {word: vector} dictionary.

    gloveFile: path to a text file with one entry per line: the word first,
        followed by whitespace-separated float values.
    Returns: dict mapping each word to a 1-D numpy float array.
    Raises: ValueError if a value on a line cannot be parsed as a float.
    '''
    # Single-argument print(...) produces the same output on Python 2 and 3.
    print("Loading Glove Model")
    model = {}
    # `with` guarantees the handle is closed, even if a line fails to parse.
    with open(gloveFile,'r') as f:
        for line in f:
            splitLine = line.split()
            if not splitLine:
                # blank line (e.g. a trailing newline): nothing to parse
                continue
            word = splitLine[0]
            model[word] = np.array([float(val) for val in splitLine[1:]])
    # Same text as the original comma-separated print statement (note the two spaces).
    print("Done. %d  words loaded!" % len(model))
    return model
# The GloVe file loaded later in this notebook is the 300-d one, so the
# embedding width is overridden to match when GLOVE is enabled.
if GLOVE:
    embedding_size=300

In [5]:
#load data
# Pickle streams are binary: open them with 'rb' so loading does not depend on
# the platform's text-mode newline handling (and keeps working on Python 3).
# NOTE: pickle.load can execute arbitrary code; only load caption files you trust.
with open('./captions/token_train.pkl','rb') as f:
    trainCaptions=pkl.load(f)
with open('./captions/token_dev.pkl','rb') as f:
    devCaptions=pkl.load(f)
with open('./captions/token_test.pkl','rb') as f:
    testCaptions=pkl.load(f)

In [6]:
def processToken(caps):
    '''Wrap every caption's token sequence in the GO / STOP markers.

    caps: iterable of items where item[0] is kept untouched and item[1] is an
        iterable of tokens.
    Returns: list of [item[0], [GO] + tokens + [STOP]] pairs, in input order.
    No padding is applied here.
    '''
    # The former unused `empty=33-len(t)` local (a leftover of in-place padding) is gone.
    return [[cap[0], [GO]+list(cap[1])+[STOP]] for cap in caps]

In [7]:
# Add GO/STOP markers to every split.
# NOTE(review): each name is rebound to its own processed version, so re-running
# this cell wraps the captions in GO/STOP a second time.
trainCaptions=processToken(trainCaptions)
devCaptions=processToken(devCaptions)
testCaptions=processToken(testCaptions)

In [8]:
def listofwords(data):
    '''Collect the distinct tokens of a caption list, in first-seen order.

    data: iterable of items whose element 1 is an iterable of hashable tokens
        (strings in this notebook).
    Returns: list of unique tokens ordered by first appearance.
    '''
    vocab=[]
    seen=set()  # O(1) membership test; the list alone made this quadratic
    for item in data:
        for token in item[1]:
            if token not in seen:
                seen.add(token)
                vocab.append(token)
    return vocab

In [9]:
#find the vocab and size
trainVocab=listofwords(trainCaptions+devCaptions)
# The label encoder is fitted on trainVocab plus PAD, so token ids cover one
# more class than len(trainVocab). trainVocabSize sizes the embedding table
# and the one-hot targets, so it must count PAD too; otherwise the highest
# token id has no embedding row and no target column.
# NOTE: this changes the embedding shape, so checkpoints saved with the old
# size cannot be restored into the new graph.
trainVocabSize=len(set(trainVocab)|set([PAD]))

In [10]:
#find cap len
# Token count of every caption, including the GO/STOP markers added by processToken.
trainCapLen=[len(i[1]) for i in trainCaptions]
devCapLen=[len(i[1]) for i in devCaptions]
testCapLen=[len(i[1]) for i in testCaptions]

In [11]:
#Label Encoder for output transform
# Maps tokens to integer ids; fitted on the train+dev vocabulary plus PAD.
pre_op=pre.LabelEncoder()
pre_op.fit(trainVocab+[PAD])
# Identity matrix with one row per encoder class.
# NOTE(review): onehoter is not referenced in the visible part of the notebook.
onehoter=np.identity(len(pre_op.classes_))

In [12]:
#word to int
# Encode every caption as an array of token ids and keep the first element of
# each caption item (used later to index videoFeats) alongside it.
trainSeq=[pre_op.transform(i[1]) for i in trainCaptions]
devSeq=[pre_op.transform(i[1]) for i in devCaptions]
trainID=[i[0] for i in trainCaptions]
devID=[i[0] for i in devCaptions]



#word to int with pool
# NOTE(review): this repeats the encoding above through a process pool, but
# `Pool` is not defined -- its import at the top of the notebook is commented
# out -- so these lines raise NameError if executed. Presumably a disabled
# cell; confirm and remove or restore the import.
p=Pool(8)


trainSeq=p.map(pre_op.transform,[i[1] for i in trainCaptions])
devSeq=p.map(pre_op.transform,[i[1] for i in devCaptions])
trainID=[i[0] for i in trainCaptions]
devID=[i[0] for i in devCaptions]



In [13]:
#right-pad every id sequence with the PAD id up to MAX_LEN
# np.pad raises ValueError if a caption is longer than MAX_LEN (negative pad width).


trainSeqReg=[np.pad(i,(0,MAX_LEN-len(i)),'constant',constant_values=pre_op.transform([PAD])) for i in trainSeq]
devSeqReg=[np.pad(i,(0,MAX_LEN-len(i)),'constant',constant_values=pre_op.transform([PAD])) for i in devSeq]
# Stack into 2-D arrays with one row per caption.
trainSeqReg=np.array(trainSeqReg)
devSeqReg=np.array(devSeqReg)

In [14]:
#loading video features
videoFeats=np.load(file='./features/consilidated_feats.npy')
# Number of frames each video has before padding.
videoFeatSize=np.array([len(i) for i in videoFeats])
#making the shape regular
# Zero-pad every video along the frame axis up to FRAMES, the frame dimension
# declared for enc_ph, instead of repeating the literal 28 here.
# np.pad raises ValueError for a video with more than FRAMES frames.
videoFeats=np.array([np.pad(i,mode='constant',pad_width=[(0,FRAMES-len(i)),(0,0)]) for i in videoFeats])

In [15]:
#glove
# Build one embedding row per encoder class, in class-id order: the GloVe
# vector when the token is known, otherwise a random normal vector.
if GLOVE:
    gloveModel=loadGloveModel('./glove/glove.6B.300d.txt')
    gloveEmbedding=[]
    for i in pre_op.classes_:
        # `in` replaces dict.has_key, which exists only on Python 2.
        if i in gloveModel:
            gloveEmbedding.append(gloveModel[i])
        else:
            gloveEmbedding.append(np.random.normal(size=(300)))
    gloveEmbedding=np.array(gloveEmbedding)

In [16]:
from modules import *



In [17]:
#Lets build the graph
# Start from an empty default graph so re-running the cells below does not
# pile new ops onto a previous build.
tf.reset_default_graph()


In [18]:
#graph inputs (placeholders)
# Video features: the last dimension uses FEAT_DIM rather than a repeated 2048 literal.
enc_ph = tf.placeholder(tf.float32,(BATCH,FRAMES,FEAT_DIM))
# Per-video frame counts. `(BATCH)` is just the int BATCH, so the one-element
# tuple is spelled out; the resulting shape is the same.
# NOTE(review): declared float32 although it is fed integer counts, and encode()
# unpacks but never uses it -- confirm whether length masking was intended.
enc_len_ph = tf.placeholder(tf.float32,(BATCH,))
# Padded caption ids; sliced into decoder inputs [:, :-1] and targets [:, 1:].
y_ph = tf.placeholder('int32',(BATCH,MAX_LEN))
# Per-caption token counts.
y_len_ph=tf.placeholder('int32',(BATCH,))

In [19]:
#class Transformer:
# The string below is a leftover docstring from the class this code was adapted
# from; it is a bare expression with no effect at runtime.
'''
    xs: tuple of
        x: int32 tensor. (N, T1)
        x_seqlens: int32 tensor. (N,)
        sents1: str tensor. (N,)
    ys: tuple of
        decoder_input: int32 tensor. (N, T2)
        y: int32 tensor. (N, T2)
        y_seqlen: int32 tensor. (N, )
        sents2: str tensor. (N,)
    training: boolean.
    
def __init__(self, hp):
    #self.hp = hp
    #self.token2idx, self.idx2token = load_vocab(hp.vocab)
'''
# Token embedding table shared by the decoder input lookup and, transposed,
# by the output projection in decode().
# NOTE(review): get_token_embeddings comes from `modules` (not visible here);
# zero_pad=True presumably zeroes the row of id 0, while this notebook's PAD id
# is whatever the label encoder assigns -- verify the two agree.
embeddings = get_token_embeddings(trainVocabSize, embedding_size, zero_pad=True)

def encode(xs, training=True):
    '''
    Runs the video-feature encoder stack.
    xs: pair (frame features, sequence lengths); only the features are used.
    training: forwarded to dropout and to the attention layers.
    Returns
    memory: output of the last encoder block.
    '''
    frame_feats, _ = xs
    drop_rate = 1-dropout_keep_prob

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # scale the raw features, add position information, then dropout
        hidden = frame_feats * FEAT_DIM**0.5
        hidden = hidden + positional_encoding(hidden, FRAMES)
        hidden = tf.layers.dropout(hidden, drop_rate, training=training)

        # NUM_BLOCKS x (self-attention -> feed forward)
        for block_idx in range(NUM_BLOCKS):
            block_scope = "num_blocks_{}".format(block_idx)
            with tf.variable_scope(block_scope, reuse=tf.AUTO_REUSE):
                hidden = multihead_attention(queries=hidden,
                                             keys=hidden,
                                             values=hidden,
                                             num_heads=NUM_HEADS,
                                             dropout_rate=drop_rate,
                                             training=training,
                                             causality=False)
                hidden = ff(hidden, num_units=[FEAT_DIM, FEAT_DIM])
    return hidden



In [20]:
def decode(ys, memory, training=True):
    '''
    Runs the caption decoder on top of the encoder output.
    ys: triple (decoder input ids, target ids, sequence lengths); the lengths
        are unpacked but not used here.
    memory: encoder outputs, attended to by every block.
    training: forwarded to dropout and to the attention layers.
    Returns
    logits: scores against every row of the embedding table, per position.
    y_hat: int32 argmax of logits over the last axis.
    y: the target ids from ys, passed through unchanged.
    '''
    token_ids, y, _ = ys
    drop_rate = 1-dropout_keep_prob

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        # embed, scale, add position information, then dropout
        hidden = tf.nn.embedding_lookup(embeddings, token_ids) * NUM_UNITS ** 0.5
        hidden = hidden + positional_encoding(hidden, MAX_LEN-1)
        hidden = tf.layers.dropout(hidden, drop_rate, training=training)

        for block_idx in range(NUM_BLOCKS):
            block_scope = "num_blocks_{}".format(block_idx)
            with tf.variable_scope(block_scope, reuse=tf.AUTO_REUSE):
                # causal self-attention over the tokens produced so far
                hidden = multihead_attention(queries=hidden,
                                             keys=hidden,
                                             values=hidden,
                                             num_heads=NUM_HEADS,
                                             dropout_rate=drop_rate,
                                             training=training,
                                             causality=True,
                                             scope="self_attention")
                # attention over the encoder memory
                hidden = multihead_attention(queries=hidden,
                                             keys=memory,
                                             values=memory,
                                             num_heads=NUM_HEADS,
                                             dropout_rate=drop_rate,
                                             training=training,
                                             causality=False,
                                             scope="vanilla_attention")
                hidden = ff(hidden, num_units=[NUM_UNITS*4,embedding_size])

    # output projection shares weights with the (transposed) embedding table
    logits = tf.einsum('ntd,dk->ntk', hidden, tf.transpose(embeddings))
    y_hat = tf.to_int32(tf.argmax(logits, axis=-1))
    return logits, y_hat, y

In [21]:
# Integer ids the label encoder assigned to the three special tokens.
padID=pre_op.transform([PAD])[0]
startID=pre_op.transform([GO])[0]
stopID=pre_op.transform([STOP])[0]

def train(xs, ys):
    '''
    Builds the training part of the graph.
    xs: encoder inputs, handed to encode().
    ys: decoder inputs / targets, handed to decode().
    Returns
    loss: label-smoothed cross entropy averaged over non-PAD target positions.
    train_op: Adam update that also increments the global step.
    global_step: the global step tensor.
    summaries: merge of all summaries registered so far.
    '''
    # forward pass
    logits, _, targets = decode(ys, encode(xs))

    # masked, label-smoothed cross entropy
    smoothed = label_smoothing(tf.one_hot(targets, depth=trainVocabSize))
    xent = tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=smoothed)
    keep = tf.to_float(tf.not_equal(targets, padID))  # 0.0 on PAD targets
    loss = tf.reduce_sum(xent * keep) / (tf.reduce_sum(keep) + 1e-7)

    # optimisation with the noam learning-rate schedule
    global_step = tf.train.get_or_create_global_step()
    learning_rate = noam_scheme(LR, global_step, WARMUP)
    train_op = tf.train.AdamOptimizer(learning_rate).minimize(loss, global_step=global_step)

    for tag, value in (('lr', learning_rate), ("loss", loss), ("global_step", global_step)):
        tf.summary.scalar(tag, value)

    return loss, train_op, global_step, tf.summary.merge_all()


In [22]:

def evalr(xs, ys):
    '''Builds the autoregressive (greedy) inference graph.

    xs: encoder inputs, handed to encode(); xs[0] supplies the batch size.
    ys: triple (decoder inputs, targets, lengths). The decoder inputs are
        ignored and replaced by a single start token per row; the other two
        entries are only passed through to decode().
    Returns
    y_hat: predictions of the last decode() call, one id per position.
    '''
    _, y, y_seqlen = ys

    # Every row starts from the start token only.
    decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * startID
    ys = (decoder_inputs, y, y_seqlen)

    memory = encode(xs, False)

    print("Inference graph is being built. Please be patient.")
    # The graph is unrolled for a fixed MAX_LEN-1 steps. The previous
    # `if tf.reduce_sum(y_hat, 1) == padID: break` was evaluated while the
    # graph was being built, where a tensor never compares equal to a Python
    # int, so it never fired (the recorded run shows all 32 steps). It is
    # removed rather than left as a misleading no-op; callers cut the output
    # at stopID instead.
    for _ in tqdm(range(MAX_LEN-1)):
        logits, y_hat, y = decode(ys, memory, False)
        # decode() re-predicts every position, so the next input is the start
        # token followed by all predictions so far.
        ys = (tf.concat((decoder_inputs, y_hat), 1), y, y_seqlen)

    return y_hat


In [23]:
# Build the training and the inference graphs on the same placeholders:
# decoder inputs are the captions without their last token, targets are the
# captions without their first token.
loss, train_op, global_step, summaries= train((enc_ph,enc_len_ph),(y_ph[:,:-1],y_ph[:,1:],y_len_ph))
y_hat = evalr((enc_ph,enc_len_ph),(y_ph[:,:-1],y_ph[:,1:],y_len_ph))

  0%|          | 0/32 [00:00<?, ?it/s]

Inference graph is being built. Please be patient.


100%|██████████| 32/32 [01:04<00:00,  2.12s/it]


In [24]:
#makes training batch

SenEmb=np.squeeze(np.load('./LmEmb.npy'))

def getTrainBatch(indexs):
    '''Assemble one training batch.

    indexs: iterable of positions into the training captions.
    Returns (video features, padded caption ids, video frame counts,
    caption lengths, SenEmb rows) as numpy arrays, one entry per index.
    '''
    positions = list(indexs)
    videos = [trainID[pos] for pos in positions]  # video id of every caption
    sourceBatch = np.array([videoFeats[vid] for vid in videos])
    sourceBatchLen = np.array([videoFeatSize[vid] for vid in videos])
    targetBatch = np.array([trainSeqReg[pos] for pos in positions])
    targetBatchLen = np.array([trainCapLen[pos] for pos in positions])
    targetSenEm = np.array([SenEmb[pos] for pos in positions])
    return sourceBatch,targetBatch,sourceBatchLen,targetBatchLen,targetSenEm

#makes dev batch
def getDevBatch(indexs):
    '''Assemble one dev batch.

    indexs: iterable of positions into the dev captions.
    Returns (video features, padded caption ids, video frame counts,
    caption lengths) as numpy arrays, one entry per index.
    '''
    positions = list(indexs)
    videos = [devID[pos] for pos in positions]  # video id of every caption
    sourceBatch = np.array([videoFeats[vid] for vid in videos])
    sourceBatchLen = np.array([videoFeatSize[vid] for vid in videos])
    targetBatch = np.array([devSeqReg[pos] for pos in positions])
    targetBatchLen = np.array([devCapLen[pos] for pos in positions])
    return sourceBatch,targetBatch,sourceBatchLen,targetBatchLen

In [25]:
def calValBleu():
    '''Caption videos 1200-1299, write the captions to gen_dev.txt and score them.

    Relies on the module-level session `sess`, the inference tensor `y_hat`
    and the XML-RPC proxy `s`.
    Returns: whatever s.calcScore returns for the generated file contents
    (the training loop reads element 0 of it).
    '''
    first_vid=1200
    n_vids=100
    data=videoFeats[first_vid:first_vid+n_vids]
    data_len=videoFeatSize[first_vid:first_vid+n_vids]
    if BATCH>n_vids:
        # The placeholders have a fixed batch dimension: top the batch up with
        # the first videos; their captions are dropped again below.
        data=np.concatenate([data,videoFeats[:BATCH-n_vids]])
        data_len=np.concatenate([data_len,videoFeatSize[:BATCH-n_vids]])

    def run_batch(start,stop):
        # y_ph / y_len_ph only need shape-correct dummy values at inference.
        return sess.run(y_hat,feed_dict={enc_ph:data[start:stop],y_ph:np.zeros((BATCH,MAX_LEN)),
                                         enc_len_ph:data_len[start:stop],
                                         y_len_ph:np.zeros((BATCH))})

    gen_sum=[]
    # `//` keeps the batch count an integer regardless of the division semantics.
    for i in range(len(data)//BATCH):
        gen_sum.extend(run_batch(i*BATCH,(i+1)*BATCH))

    # Leftover videos: re-run the last full window and keep only its new rows.
    # Guarding on remaining>0 avoids `y[-0:]`, which selects the whole batch.
    remaining=len(data)-len(gen_sum)
    if BATCH<n_vids and remaining>0:
        gen_sum.extend(run_batch(len(data)-BATCH,len(data))[-remaining:])

    gen_sum=gen_sum[:n_vids]

    #processing summaries
    summs=[]
    for seq in gen_sum:
        words=[]
        for tok in seq:
            if tok==stopID:
                break
            # Index classes_ directly: inverse_transform on a scalar is
            # rejected by some scikit-learn versions.
            words.append(pre_op.classes_[tok])
        summs.append(' '.join(words))

    pred=''.join('beam_size_5'+'\tvid'+str(first_vid+k)+'\t'+text+'\n'
                 for k,text in enumerate(summs))
    with open('gen_dev.txt','w+') as fle:
        fle.write(pred)
    # The scorer receives exactly what was written to the file.
    return s.calcScore(pred)

In [26]:
# Checkpoint helper used below to save and restore the model variables.
saver=tf.train.Saver()

In [27]:
# InteractiveSession installs itself as the default session, which is what
# lets `.run()` be called on the initializer op directly.
sess=tf.InteractiveSession()
tf.global_variables_initializer().run()

In [28]:
# Same filter as in the first cell, applied again before training.
warnings.filterwarnings('ignore')

In [None]:
#training starts here
# Trains for up to EPOCHS epochs; after every epoch the captions of the
# validation videos are scored with calValBleu(). With PATIENCE_MONITOR on,
# the best-scoring model is checkpointed and training stops after PATIENCE
# consecutive epochs without a new best score.
bestVal=0
patience=PATIENCE

print('starting training')
training_losses=[]
valid_losses=[]
valid_bleu=[]
tData=np.arange(len(trainSeqReg))
dData=np.arange(len(devSeqReg))
# Integer division: only full batches are used (the placeholders have a fixed
# batch dimension), and `//` stays an int whatever the division semantics.
stepsPerEpoch=len(trainSeqReg)//BATCH
for j in range(EPOCHS):
    np.random.shuffle(tData) #makes them iid
    training_loss=0
    for i in range(stepsPerEpoch):
        start=i*BATCH
        stop=(i+1)*BATCH
        # senTargetBatch is returned by getTrainBatch but not fed to the graph.
        sourceBatch,targetBatch,sourceBatchLen,targetBatchLen,senTargetBatch=getTrainBatch(tData[start:stop])
        
        
        _,lost=sess.run([train_op,loss],feed_dict={enc_ph:sourceBatch,y_ph:targetBatch,enc_len_ph:sourceBatchLen,
                                                y_len_ph:targetBatchLen})
        
        training_loss+=lost
        #print lost,
    #calculate t_loss
    training_losses.append(training_loss/stepsPerEpoch)
    
    #calculate v_loss
    #disabled
    '''
    validation_loss=0
    for k in range(len(devSeqReg)/BATCH):
        start=k*BATCH
        stop=(k+1)*BATCH
        sourceBatch,targetBatch,sourceBatchLen,targetBatchLen=getDevBatch(dData[start:stop])
        lost=sess.run(loss,feed_dict={enc_ph:sourceBatch,y_ph:targetBatch,enc_len_ph:sourceBatchLen,
                                                y_len_ph:targetBatchLen})
        validation_loss += lost
    
    
    valid_losses.append(validation_loss/len(devSeqReg))
    '''
    
    valBleu=calValBleu()
    valid_bleu.append(valBleu[0])
    
    
    if(valid_bleu[-1]>bestVal) and PATIENCE_MONITOR==True:
        bestVal=valid_bleu[-1]
        saver.save(sess, "transModels/best.ckpt")
        print("saving model best")
        patience=PATIENCE
    else:
        # Only epochs without a new best consume patience. Previously the
        # counter was also decremented right after being reset, so training
        # stopped after PATIENCE-1 stale epochs instead of PATIENCE.
        patience-=1
    
    print("Epoch:%d training loss:%.4f: valid loss:%.4f valid bleu:%.4f"% (j,training_losses[-1],0.0,valid_bleu[-1]))
    if patience==0 and PATIENCE_MONITOR==True:
        break
    

In [None]:
# Run inference on the batch variables left over from the last training step.
# NOTE(review): depends on sourceBatch/targetBatch/... leaking out of the
# training loop, so it only works after that cell has run.
tmp=sess.run(y_hat,feed_dict={enc_ph:sourceBatch,y_ph:targetBatch,enc_len_ph:sourceBatchLen,
                                                y_len_ph:targetBatchLen})

In [None]:
# Map row 6 of the sampled predictions back to tokens.
pre_op.inverse_transform(tmp[6])

#restore model
# NOTE(review): this restores "BestModel/model.ckpt", a different path from the
# "transModels/best.ckpt" written by the training loop -- confirm it is intended.
saver.restore(sess, "BestModel/model.ckpt")

In [None]:
#save model
# Writes the current session state to a separate checkpoint path.
saver.save(sess, "BestTrans/resume.ckpt")

In [29]:
# Reload the best-scoring checkpoint written by the training loop (it is only
# written when PATIENCE_MONITOR is on) before running test inference.
if PATIENCE_MONITOR:
    saver.restore(sess, "transModels/best.ckpt")

INFO:tensorflow:Restoring parameters from transModels/best.ckpt


In [None]:
# Display the per-epoch validation scores collected by the training loop.
valid_bleu

In [42]:
# Caption every video from index 1300 onwards, BATCH videos at a time.
data=videoFeats[1300:]
data_len=videoFeatSize[1300:]
if BATCH==1024:
    # Special case kept from the original: top the data up with its first 354 rows.
    data=np.concatenate([data,data[:354]])
    data_len=np.concatenate([data_len,data_len[:354]])
gen_sum=[]
# `//` keeps the batch count an integer regardless of the division semantics.
for i in range(len(data)//BATCH):
    start=i*BATCH
    stop=(i+1)*BATCH
    
    # y_ph / y_len_ph only need shape-correct dummy values at inference.
    y=sess.run(y_hat,feed_dict={enc_ph:data[start:stop],y_ph:np.zeros((BATCH,MAX_LEN)),
                                              enc_len_ph:data_len[start:stop],
                                                y_len_ph:np.zeros((BATCH))})
        
    
    for t in y:
        gen_sum.append(t)

# Leftover videos: re-run the last full window and keep only its new rows.
# Guarding on remaining>0 avoids `y[-0:]`, which would select the whole batch
# and append BATCH duplicate captions when BATCH divides len(data).
remaining=len(data)-len(gen_sum)
if BATCH<1024 and remaining>0:
    start=len(data)-BATCH
    stop=len(data)
    y=sess.run(y_hat,feed_dict={enc_ph:data[start:stop],y_ph:np.zeros((BATCH,MAX_LEN)),
                                              enc_len_ph:data_len[start:stop],
                                                y_len_ph:np.zeros((BATCH))})
    
    y=y[-remaining:]
    for t in y:
        gen_sum.append(t)


In [43]:
# Keep at most the first 670 generated captions.
gen_sum=gen_sum[:670]

# NOTE(review): the rest of this cell resets gen_sum and then references
# `trs`, `source_seq`, `source_seq_len`, `batch_size` and `keep_prob`, none of
# which is defined in the visible notebook. If executed it discards the
# captions generated above and then fails with NameError. Presumably a
# disabled leftover from an earlier model -- confirm and delete.
data=videoFeats[1300:1364]
data_len=videoFeatSize[1300:1364]
gen_sum=[]
for i in range(len(data)/BATCH):
    start=i*BATCH
    stop=(i+1)*BATCH
    load_trs=trs
    y=sess.run(load_trs,feed_dict={source_seq:data[start:stop],
                                               source_seq_len:data_len[start:stop],
                                              batch_size:BATCH,
                                                keep_prob:1.0
                                                })
    for t in y:
        gen_sum.append(t)


In [44]:
#processing summaries
# Turn every predicted id sequence into a space-separated sentence, cutting it
# at the first STOP id.
summs=[]
for i in gen_sum:
    words=[]
    for j in i:
        if j==stopID:
            break
        # Index classes_ directly: inverse_transform on a scalar is rejected
        # by some scikit-learn versions.
        words.append(pre_op.classes_[j])
    summs.append(' '.join(words))

# NOTE(review): `vops` is read in this loop but only assigned further down, and
# `ref_test` is not defined anywhere in the visible notebook; run top to bottom
# these lines raise NameError. Presumably disabled scratch cells -- confirm.
for i in range(len(summs)):
    print i,summs[i],vops[i]

vops=np.load('../MSVD/clip_index/testIndex.npy')[:64]
ref_test

#write the dev reference captions, one "vid<id>\t<caption>" line each
# i[1][1:-1] drops the GO/STOP markers added by processToken.
with open('ref_dev.txt','w+') as fle:
    for i in devCaptions:
        fle.write('vid'+str(i[0])+'\t'+' '.join(i[1][1:-1]))
        fle.write('\n')

In [45]:
# Write the generated captions, one per line, numbering videos from 1300
# (the first index used for test inference). Each caption gets a trailing '.'.
vdo=1300
with open('genTrans.txt','w+') as fle:
    for i in summs:
        fle.write('beam_size_1'+'\tvid'+str(vdo)+'\t'+i+'.')
        fle.write('\n')
        vdo+=1

In [None]:
# NOTE(review): bare integer literal; it only echoes its own value -- looks like scratch input.
37013