In [1]:
import torch.utils.data as data
from PIL import Image
import torch
import torch.utils.data
import torch.nn as nn
from torch.nn import Module
import torch.nn.functional as F
from torch.autograd import Variable
import torch.backends.cudnn as cudnn
import torch.optim as optim
import numpy as np
import h5py
import json
import time
import pdb
import random
import sys
import math
import os

In [2]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '0'

In [3]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2016 NVIDIA Corporation
Built on Sun_Sep__4_22:14:01_CDT_2016
Cuda compilation tools, release 8.0, V8.0.44


In [4]:
torch.cuda.is_available()

False

In [5]:
torch.cuda.get_device_name(0)

RuntimeError: cuda runtime error (35) : CUDA driver version is insufficient for CUDA runtime version at torch/csrc/cuda/Module.cpp:131

In [6]:
cudnn.benchmark = True

In [7]:
manualSeed = random.randint(1, 10000) # fix seed
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)
np.random.seed(manualSeed)
torch.cuda.manual_seed(manualSeed)

('Random Seed: ', 5013)


In [8]:
class _netE(nn.Module):
    """Container module with an encoder, a recurrent module, and a decoder."""

    def __init__(self, rnn_type, ninp, nhid, nlayers, dropout, img_feat_size):
        super(_netE, self).__init__()

        self.d = dropout
        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.nhid = nhid
        self.ninp = ninp
        self.img_feat_size = img_feat_size

        self.img_embed = nn.Linear(img_feat_size, nhid)
        self.ques_rnn = nn.LSTM(self.ninp, self.nhid, self.nlayers)
        self.his_rnn = nn.LSTM(self.ninp, self.nhid, self.nlayers)

        self.Wq_1 = nn.Linear(self.nhid, self.nhid)
        self.Wh_1 = nn.Linear(self.nhid, self.nhid)
        self.Wi_1 = nn.Linear(self.img_feat_size, self.nhid)
        self.Wa_1 = nn.Linear(self.nhid, 1)

        self.fc1 = nn.Linear(self.nhid*3, self.ninp)

    def forward(self, ques_emb, his_emb, img_raw, ques_hidden, his_hidden, rnd):

        img_emb = F.tanh(self.img_embed(img_raw))
        ques_feat, ques_hidden = self.ques_rnn(ques_emb, ques_hidden)
        ques_feat = ques_feat[-1]
        his_feat, his_hidden = self.his_rnn(his_emb, his_hidden)
        his_feat = his_feat[-1]

        ques_emb_1 = self.Wq_1(ques_feat).view(-1, 1, self.nhid)
        his_emb_1 = self.Wh_1(his_feat).view(-1, rnd, self.nhid)
        his_cat = his_emb_1.mean(1)

        img_cat = img_emb.view(-1,49,self.nhid)
        img_cat = img_cat.mean(1)
        
        concat_feat = torch.cat((ques_feat, his_cat.view(-1, self.nhid), \
                                 img_cat.view(-1, self.nhid)),1)
        
        encoder_feat = F.tanh(self.fc1(F.dropout(concat_feat, self.d, training=self.training)))

        return encoder_feat, ques_hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
#             e = 1
#             return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())


In [9]:
####CHANGE CUDA

class _netW(nn.Module):
    def __init__(self, ntoken, ninp, dropout):
        super(_netW, self).__init__()
        self.word_embed = nn.Embedding(ntoken+1, ninp).cuda()
        self.word_embed = nn.Embedding(ntoken+1, ninp)
        self.Linear = share_Linear(self.word_embed.weight).cuda()
        self.word_embed = nn.Embedding(ntoken+1, ninp)
        self.Linear = share_Linear(self.word_embed.weight)
        self.init_weights()
        self.d = dropout

    def init_weights(self):
        initrange = 0.1
        self.word_embed.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, format ='index'):
        if format == 'onehot':
            out = F.dropout(self.Linear(input), self.d, training=self.training)
        elif format == 'index':
            out = F.dropout(self.word_embed(input), self.d, training=self.training)

        return out

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            #netW
            w = 1
            return (Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.nhid).zero_())


In [10]:
class _netD(nn.Module):
    """
    Given the real/wrong/fake answer, use a RNN (LSTM) to embed the answer.
    """
    def __init__(self, rnn_type, ninp, nhid, nlayers, ntoken, dropout):
        super(_netD, self).__init__()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.ntoken = ntoken
        self.ninp = ninp
        self.d = dropout

        self.ans_rnn = nn.LSTM(self.ninp, self.ninp, self.nlayers)
        self.W2 = nn.Linear(self.nhid, 1)
        self.fc = nn.Linear(nhid, ninp)

    def forward(self, input_feat, hidden, opt_ans_emb, vocab_size):

        # opt_ans_emb = self.ans_emb(opt_ans.view(-1,200,9))
        output, _ = self.ans_rnn(opt_ans_emb, hidden)
        output = output[-1]
        output = output.view(100,-1,self.ninp)        
        # redOutput = output.mean(1).view(100,self.ninp,-1)
        output_feat = output.view(100,self.ninp,-1)
        expand_feat = input_feat.view(-1,1,self.ninp)
        print('expand feat', expand_feat.size())
        print('output_feat',output_feat.size())
        prob = F.softmax(torch.bmm(expand_feat,output_feat)).view(100,-1)

        return prob
    
    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        if self.rnn_type == 'LSTM':
            #netD
            d = 1
            return (Variable(weight.new(self.nlayers, bsz, self.ninp).zero_()),
                    Variable(weight.new(self.nlayers, bsz, self.ninp).zero_()))
        else:
            return Variable(weight.new(self.nlayers, bsz, self.ninp).zero_())



In [11]:
def repackage_hidden(h, batch_size):
    """Wraps hidden states in new Variables, to detach them from their history."""
    if type(h) == Variable:
        return Variable(h.data.resize_(h.size(0), batch_size, h.size(2)).zero_())
    else:
        return tuple(repackage_hidden(v, batch_size) for v in h)


In [12]:
class train(data.Dataset): # torch wrapper
    def __init__(self, input_img_h5, input_ques_h5, input_json, negative_sample, num_val, data_split):

        print('DataLoader loading: %s' %data_split)
        print('Loading image feature from %s' %input_img_h5)

        if data_split == 'test':
            split = 'val'
        else:
            split = 'train' # train and val split both corresponding to 'train'

        f = json.load(open(input_json, 'r'))
        self.itow = f['itow']
        self.img_info = f['img_'+split]

        # get the data split.
        total_num = len(self.img_info)
        if data_split == 'train':
            s = 0
            e = total_num - num_val
        elif data_split == 'val':
            s = total_num - num_val
            e = total_num
        else:
            s = 0
            e = total_num
            
        self.img_info = self.img_info[s:e]

        print('%s number of data: %d' %(data_split, e-s))
        # load the data.
        f = h5py.File(input_img_h5, 'r')
        self.imgs = f['images_'+split][s:e]
        f.close()

        print('Loading txt from %s' %input_ques_h5)
        f = h5py.File(input_ques_h5, 'r')
        self.ques = f['ques_'+split][s:e]
        self.ans = f['ans_'+split][s:e]
        self.cap = f['cap_'+split][s:e]

        self.ques_len = f['ques_len_'+split][s:e]
        self.ans_len = f['ans_len_'+split][s:e]
        self.cap_len = f['cap_len_'+split][s:e]

        self.ans_ids = f['ans_index_'+split][s:e]
        self.opt_ids = f['opt_'+split][s:e]
        self.opt_list = f['opt_list_'+split][:]
        self.opt_len = f['opt_len_'+split][:]
        f.close()

        self.ques_length = self.ques.shape[2]
        self.ans_length = self.ans.shape[2]
        self.his_length = self.ques_length + self.ans_length
        self.vocab_size = len(self.itow)+1

        print('Vocab Size: %d' % self.vocab_size)
        self.split = split
        self.rnd = 10
        self.negative_sample = negative_sample

        
    def __getitem__(self, index):
        # get the image
        img = torch.from_numpy(self.imgs[index])

        # get the history
        his = np.zeros((self.rnd, self.his_length))
        his[0,self.his_length-self.cap_len[index]:] = self.cap[index,:self.cap_len[index]]

        ques = np.zeros((self.rnd, self.ques_length))
        ans = np.zeros((self.rnd, self.ans_length+1))
        ans_target = np.zeros((self.rnd, self.ans_length+1))
        ques_ori = np.zeros((self.rnd, self.ques_length))

        opt_ans = np.zeros((self.rnd, self.negative_sample, self.ans_length+1))
        ans_len = np.zeros((self.rnd))
        opt_ans_len = np.zeros((self.rnd, self.negative_sample))
        ans_ids = np.zeros(self.rnd)
        ans_idx = np.zeros((self.rnd))
        opt_ans_idx = np.zeros((self.rnd, self.negative_sample))

        for i in range(self.rnd):
            # get the index
            q_len = self.ques_len[index, i]
            a_len = self.ans_len[index, i]
            qa_len = q_len + a_len

            if i+1 < self.rnd:
                his[i+1, self.his_length-qa_len:self.his_length-a_len] = self.ques[index, i, :q_len]
                his[i+1, self.his_length-a_len:] = self.ans[index, i, :a_len]

            ques[i, self.ques_length-q_len:] = self.ques[index, i, :q_len]

            ques_ori[i, :q_len] = self.ques[index, i, :q_len]
            ans[i, 1:a_len+1] = self.ans[index, i, :a_len]
            ans[i, 0] = self.vocab_size

            ans_target[i, :a_len] = self.ans[index, i, :a_len]
            ans_target[i, a_len] = self.vocab_size
            ans_len[i] = self.ans_len[index, i]
            ans_ids[i] = self.ans_ids[index, i] # since python start from 0
            opt_ids = self.opt_ids[index, i] # since python start from 0
            # random select the negative samples.
            ans_idx[i] = opt_ids[self.ans_ids[index, i]]
            # exclude the gt index.
            opt_ids = np.delete(opt_ids, ans_idx[i], 0)
            random.shuffle(opt_ids)
            for j in range(self.negative_sample):
                ids = opt_ids[j]
                opt_ans_idx[i,j] = ids

                opt_len = self.opt_len[ids]

                opt_ans_len[i, j] = opt_len
                opt_ans[i, j, :opt_len] = self.opt_list[ids,:opt_len]
                opt_ans[i, j, opt_len] = self.vocab_size

        his = torch.from_numpy(his)
        ques = torch.from_numpy(ques)
        ans = torch.from_numpy(ans)
        ans_target = torch.from_numpy(ans_target)
        ques_ori = torch.from_numpy(ques_ori)
        ans_len = torch.from_numpy(ans_len)
        opt_ans_len = torch.from_numpy(opt_ans_len)
        opt_ans = torch.from_numpy(opt_ans)
        ans_idx = torch.from_numpy(ans_idx)
        ans_ids = torch.from_numpy(ans_ids)
        opt_ans_idx = torch.from_numpy(opt_ans_idx)
        return img, his, ques, ans, ans_target, ans_len, ans_idx, ans_ids, ques_ori, \
                opt_ans, opt_ans_len, opt_ans_idx

    def __len__(self):
        return self.ques.shape[0]

In [13]:
class validate(data.Dataset): # torch wrapper
    def __init__(self, input_img_h5, input_ques_h5, input_json, negative_sample, num_val, data_split):

        print('DataLoader loading: %s' %data_split)
        print('Loading image feature from %s' %input_img_h5)

        if data_split == 'test':
            split = 'val'
        else:
            split = 'train' # train and val split both corresponding to 'train'

        f = json.load(open(input_json, 'r'))
        self.itow = f['itow']
        self.img_info = f['img_'+split]

        # get the data split.
        total_num = len(self.img_info)
        if data_split == 'train':
            s = 0
            e = total_num - num_val
        elif data_split == 'val':
            s = total_num - num_val
            e = total_num
        else:
            s = 0
            e = total_num

        self.img_info = self.img_info[s:e]
        print('%s number of data: %d' %(data_split, e-s))

        # load the data.
        f = h5py.File(input_img_h5, 'r')
###########################################################################################
#CHANGE THIS HERE FOR NON DEMO TRAINING SET
        split = 'train'
###########################################################################################
        
        self.imgs = f['images_'+split][s:e]

        f.close()

        print('Loading txt from %s' %input_ques_h5)
        f = h5py.File(input_ques_h5, 'r')
        self.ques = f['ques_'+split][s:e]
        self.ans = f['ans_'+split][s:e]
        self.cap = f['cap_'+split][s:e]

        self.ques_len = f['ques_len_'+split][s:e]
        self.ans_len = f['ans_len_'+split][s:e]
        self.cap_len = f['cap_len_'+split][s:e]

        self.ans_ids = f['ans_index_'+split][s:e]
        self.opt_ids = f['opt_'+split][s:e]
        self.opt_list = f['opt_list_'+split][:]
        self.opt_len = f['opt_len_'+split][:]
        f.close()

        self.ques_length = self.ques.shape[2]
        self.ans_length = self.ans.shape[2]
        self.his_length = self.ques_length + self.ans_length
        self.vocab_size = len(self.itow)+1

        print('Vocab Size: %d' % self.vocab_size)
        self.split = split
        self.rnd = 10
        self.negative_sample = negative_sample

    def __getitem__(self, index):

        # get the image
        img_id = self.img_info[index]['imgId']
        img = torch.from_numpy(self.imgs[index])
        # get the history
        his = np.zeros((self.rnd, self.his_length))
        his[0,self.his_length-self.cap_len[index]:] = self.cap[index,:self.cap_len[index]]

        ques = np.zeros((self.rnd, self.ques_length))
        ans = np.zeros((self.rnd, self.ans_length+1))
        ans_target = np.zeros((self.rnd, self.ans_length+1))
        quesL = np.zeros((self.rnd, self.ques_length))

        opt_ans = np.zeros((self.rnd, 100, self.ans_length+1))
        ans_ids = np.zeros(self.rnd)
        opt_ans_target = np.zeros((self.rnd, 100, self.ans_length+1))

        ans_len = np.zeros((self.rnd))
        opt_ans_len = np.zeros((self.rnd, 100))


        for i in range(self.rnd):
            # get the index
            q_len = self.ques_len[index, i]
            a_len = self.ans_len[index, i]
            qa_len = q_len + a_len

            if i+1 < self.rnd:
                ques_ans = np.concatenate([self.ques[index, i, :q_len], self.ans[index, i, :a_len]])
                his[i+1, self.his_length-qa_len:] = ques_ans

            ques[i, self.ques_length-q_len:] = self.ques[index, i, :q_len]
            quesL[i, :q_len] = self.ques[index, i, :q_len]
            ans[i, 1:a_len+1] = self.ans[index, i, :a_len]
            ans[i, 0] = self.vocab_size

            ans_target[i, :a_len] = self.ans[index, i, :a_len]
            ans_target[i, a_len] = self.vocab_size

            ans_ids[i] = self.ans_ids[index, i] # since python start from 0
            opt_ids = self.opt_ids[index, i] # since python start from 0
            ans_len[i] = self.ans_len[index, i]
            ans_idx = self.ans_ids[index, i]

            for j, ids in enumerate(opt_ids):
                opt_len = self.opt_len[ids]
                opt_ans[i, j, 1:opt_len+1] = self.opt_list[ids,:opt_len]
                opt_ans[i, j, 0] = self.vocab_size

                opt_ans_target[i, j,:opt_len] = self.opt_list[ids,:opt_len]
                opt_ans_target[i, j,opt_len] = self.vocab_size
                opt_ans_len[i, j] = opt_len

        opt_ans = torch.from_numpy(opt_ans)
        opt_ans_target = torch.from_numpy(opt_ans_target)
        ans_ids = torch.from_numpy(ans_ids)

        his = torch.from_numpy(his)
        ques = torch.from_numpy(ques)
        ans = torch.from_numpy(ans)
        ans_target = torch.from_numpy(ans_target)
        quesL = torch.from_numpy(quesL)

        ans_len = torch.from_numpy(ans_len)
        opt_ans_len = torch.from_numpy(opt_ans_len)

        return img, his, ques, ans, ans_target, quesL, opt_ans, \
                    opt_ans_target, ans_ids, ans_len, opt_ans_len, img_id


    def __len__(self):
        return self.ques.shape[0]

In [14]:
def trainModel(epoch):
    netW.train()
    netE.train()
    netD.train()

#     lr = adjust_learning_rate(optimizer, epoch, opt.lr)

    ques_hidden = netE.init_hidden(batchSize)
    hist_hidden = netE.init_hidden(batchSize)

    opt_hidden = netD.init_hidden(batchSize)
#     wrong_hidden = netD.init_hidden(batchSize)

    data_iter = iter(dloader)

    average_loss = 0
    count = 0
    i = 0

    while i < len(dloader):

        t1 = time.time()
        data = data_iter.next()
        image, history, question, answer, answerT, answerLen, answerIdx, answerIds, questionL, \
                                    opt_answerT, opt_answerLen, opt_answerIdx = data

        batch_size = question.size(0)
        image = image.view(-1, img_feat_size)
        img_input.data.resize_(image.size()).copy_(image)

        for rnd in range(10):
            netW.zero_grad()
            netE.zero_grad()
            netD.zero_grad()
            # get the corresponding round QA and history.
            ques = question[:,rnd,:].t()
            his = history[:,:rnd+1,:].clone().view(-1, his_length).t()

            ans = answer[:,rnd,:].t()
            tans = answerT[:,rnd,:].t()
#             wrong_ans = opt_answerT[:,rnd,:].clone().view(-1, ans_length).t()

#             real_len = answerLen[:,rnd]
#             wrong_len = opt_answerLen[:,rnd,:].clone().view(-1)

            ques_input.data.resize_(ques.size()).copy_(ques)
            his_input.data.resize_(his.size()).copy_(his)

            ans_input.data.resize_(ans.size()).copy_(ans)
            ans_target.data.resize_(tans.size()).copy_(tans)
#             wrong_ans_input.data.resize_(wrong_ans.size()).copy_(wrong_ans)

            # sample in-batch negative index
#             batch_sample_idx.data.resize_(batch_size, neg_batch_sample).zero_()
#             sample_batch_neg(answerIdx[:,rnd], opt_answerIdx[:,rnd,:], batch_sample_idx, neg_batch_sample)

            ques_emb = netW(ques_input, format = 'index')
            his_emb = netW(his_input, format = 'index')

            ques_hidden = repackage_hidden(ques_hidden, batch_size)
            hist_hidden = repackage_hidden(hist_hidden, his_input.size(1))
            print('img input size:', img_input.size())
            featD, ques_hidden = netE(ques_emb, his_emb, img_input, \
                                                ques_hidden, hist_hidden, rnd+1)

#             ans_real_emb = netW(ans_target, format='index')
#             ans_wrong_emb = netW(wrong_ans_input, format='index')

#             real_hidden = repackage_hidden(real_hidden, batch_size)
#             wrong_hidden = repackage_hidden(wrong_hidden, ans_wrong_emb.size(1))

#             real_feat = netD(ans_real_emb, ans_target, real_hidden, vocab_size)
#             wrong_feat = netD(ans_wrong_emb, wrong_ans_input, wrong_hidden, vocab_size)

#             batch_wrong_feat = wrong_feat.index_select(0, batch_sample_idx.view(-1))
#             wrong_feat = wrong_feat.view(batch_size, -1, ninp)
#             batch_wrong_feat = batch_wrong_feat.view(batch_size, -1, ninp)

#             nPairLoss = critD(featD, real_feat, wrong_feat, batch_wrong_feat)

            opt_ans = opt_answerT[:,rnd,:].clone().view(-1, ans_length).t()
            opt_ans_input.data.resize_(opt_ans.size()).copy_(opt_ans)

            opt_ans_emb = netW(opt_ans_input, format = 'index')

            opt_hidden = netD.init_hidden(batchSize)
            opt_hidden = repackage_hidden(opt_hidden, opt_ans_input.size(1))
            # print(opt_hidden.shape())
            prob = netD(featD,opt_hidden,opt_ans_emb,vocab_size)
            optIdx = opt_answerIdx[:,rnd,:]
            # print(optIdx.size())
            ansIds = answerIds[:,rnd]
            ansIds = ansIds.long()
            ansIds = Variable(ansIds, requires_grad = False)
            ansIds = ansIds.cuda()
#             print(ansIdx.size())
#             b_no = 0
#             ans_list = []
            
#             for elem in ansIdx:
#                 t = ((optIdx[b_no,:]==elem).nonzero())
#             #     print(t.size())
#                 if (t.size()!=torch.Size([])):
#                     ans_list.append(t[0][0])
#                 else:
#                     ans_list.append((-1))
    
#                 b_no = b_no + 1

#             corr_ans_ind = torch.LongTensor(ans_list)
            

#             corr_ans_ind = Variable(corr_ans_ind)
#             corr_ans_ind = corr_ans_ind.cuda()

            currLoss = critD(prob,ansIds)
#             print(currLoss)
#             average_loss += currLoss.data[0]
            currLoss.backward()
            optimizer.step()
            count += 1

        i += 1
        if i % log_interval == 0:
            average_loss /= count
            print("step {} / {} (epoch {}), g_loss {:.3f}, lr = {:.6f}"\
                .format(i, len(dataloader), epoch, average_loss, lr))
            average_loss = 0
            count = 0

    return average_loss



In [15]:
def valModel():
    netE.eval()
    netW.eval()
    netD.eval()

#     n_neg = 100
    data_iter_val = iter(dataloader_val)
    ques_hidden = netE.init_hidden(batchSize)
    hist_hidden = netE.init_hidden(batchSize)

    opt_hidden = netD.init_hidden(batchSize)
    i = 0

    average_loss = 0
    rank_all_tmp = []

    while i < len(dataloader_val):
        data = data_iter_val.next()
        image, history, question, answer, answerT, questionL, opt_answer, \
                opt_answerT, answer_ids, answerLen, opt_answerLen, img_id  = data

        batch_size = question.size(0)
        image = image.view(-1, img_feat_size)
        #image = l2_norm(image)
        img_input.data.resize_(image.size()).copy_(image)

        for rnd in range(10):
            # get the corresponding round QA and history.
            ques = question[:,rnd,:].t()
            his = history[:,:rnd+1,:].clone().view(-1, his_length).t()

            opt_ans = opt_answerT[:,rnd,:].clone().view(-1, ans_length).t()
            gt_id = answer_ids[:,rnd]

            ques_input.data.resize_(ques.size()).copy_(ques)
            his_input.data.resize_(his.size()).copy_(his)

            opt_ans_input.data.resize_(opt_ans.size()).copy_(opt_ans)
            gt_index.data.resize_(gt_id.size()).copy_(gt_id)
            opt_len = opt_answerLen[:,rnd,:].clone().view(-1)

            ques_emb = netW(ques_input, format = 'index')
            his_emb = netW(his_input, format = 'index')

            ques_hidden = repackage_hidden(ques_hidden, batch_size)
            hist_hidden = repackage_hidden(hist_hidden, his_input.size(1))

            featD, ques_hidden = netE(ques_emb, his_emb, img_input, \
                                                ques_hidden, hist_hidden, rnd+1)

            opt_ans_emb = netW(opt_ans_input, format = 'index')
            opt_hidden = repackage_hidden(opt_hidden, opt_ans_input.size(1))
            score = netD(featD,opt_hidden,opt_ans_emb,vocab_size)
            
#             opt_feat = opt_feat.view(batch_size, -1, ninp)

            #ans_emb = ans_emb.view(ans_length, -1, 100, opt.nhid)
#             featD = featD.view(-1, ninp, 1)
#             score = torch.bmm(opt_feat, featD)
#             score = score.view(-1, 100)

            for b in range(batch_size):
                gt_index.data[b] = gt_index.data[b] + b*100

            gt_score = score.view(-1).index_select(0, gt_index)
            sort_score, sort_idx = torch.sort(score, 1, descending=True)

            count = sort_score.gt(gt_score.view(-1,1).expand_as(sort_score))
            rank = count.sum(1) + 1
            rank_all_tmp += list(rank.view(-1).data.cpu().numpy())
            
        i += 1
        sys.stdout.write('Evaluating: {:d}/{:d}  \r' \
          .format(i, len(dataloader_val)))
        sys.stdout.flush()

    return rank_all_tmp

In [16]:
class share_Linear(Module):
    r"""Applies a linear transformation to the incoming data: :math:`y = Ax + b`
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to False, the layer will not learn an additive bias. Default: True
    Shape:
        - Input: :math:`(N, in\_features)`
        - Output: :math:`(N, out\_features)`
    Attributes:
        weight: the learnable weights of the module of shape (out_features x in_features)
        bias:   the learnable bias of the module of shape (out_features)
    Examples::
        >>> m = nn.Linear(20, 30)
        >>> input = autograd.Variable(torch.randn(128, 20))
        >>> output = m(input)
        >>> print(output.size())
    """

    def __init__(self, weight):
        super(share_Linear, self).__init__()
        self.in_features = weight.size(0)
        self.out_features = weight.size(1)
        self.weight = weight.t()
        self.register_parameter('bias', None)

    def forward(self, input):
        return F.linear(input, self.weight, self.bias)

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
            + str(self.in_features) + ' -> ' \
            + str(self.out_features) + ')'


In [17]:
model = 'LSTM'
ninp = 300
nhid = 512
nlayers = 1
dropout = 0.5
margin = 2 

## Change Here for input files

In [18]:
# input_img_h5 = 'vdl_img_vgg_demo.h5'
# input_ques_h5 = 'visdial_data_demo.h5'
input_img_h5 = 'vdl_img_vgg.h5'
input_ques_h5 = 'visdial_data.h5'
input_json = 'visdial_params.json'
negative_sample = 20
num_val = 1000
dataset = train(input_img_h5=input_img_h5, input_ques_h5=input_ques_h5,
                input_json=input_json, negative_sample = negative_sample,
                num_val = num_val, data_split = 'train')


DataLoader loading: train
Loading image feature from vdl_img_vgg.h5
train number of data: 81783
Loading txt from visdial_data.h5
Vocab Size: 8964


In [19]:
dataset_val = validate(input_img_h5=input_img_h5, input_ques_h5=input_ques_h5,
                input_json=input_json, negative_sample = negative_sample,
                num_val = num_val, data_split = 'test')

DataLoader loading: test
Loading image feature from vdl_img_vgg.h5
test number of data: 40504
Loading txt from visdial_data.h5
Vocab Size: 8964


In [20]:
batchSize = 100
num_workers = 0
dloader = torch.utils.data.DataLoader(dataset, batch_size=batchSize,
                                         shuffle=True, num_workers=int(num_workers))
dataloader_val = torch.utils.data.DataLoader(dataset_val, batch_size=1,
                                         shuffle=False, num_workers=int(num_workers))

In [21]:
data_iter1 = iter(dloader)
data = data_iter1.next()

In [22]:
image, history, question, answer, answerT, answerLen, answerIdx, answerIds, questionL, \
        opt_answerT, opt_answerLen, opt_answerIdx = data

In [23]:
rnd = 3
ques_length = dataset.ques_length
negative_sample = 20
n_neg = negative_sample
vocab_size = dataset.vocab_size
ques_length = dataset.ques_length
ans_length = dataset.ans_length + 1
his_length = dataset.ans_length + dataset.ques_length
itow = dataset.itow
img_feat_size = 512

In [24]:
vocab_size

8964

In [25]:
netW = _netW(vocab_size, ninp, dropout)

AssertionError: 
The NVIDIA driver on your system is too old (found version 8000).
Please update your GPU driver by downloading and installing a new
version from the URL: http://www.nvidia.com/Download/index.aspx
Alternatively, go to: http://pytorch.org to install
a PyTorch version that has been compiled with your version
of the CUDA driver.

In [None]:
netE = _netE(model, ninp, nhid, nlayers, dropout, img_feat_size)

In [None]:
netE.cuda()
netW.cuda()

In [26]:
ques_input = torch.LongTensor(ques_length, batchSize)
ques_input = Variable(ques_input)
ques = question[:,rnd,:].t()
ques_input.data.resize_(ques.size()).copy_(ques)
ques_input = ques_input.cuda()
ques_emb = netW(ques_input, format = 'index')

AssertionError: 
The NVIDIA driver on your system is too old (found version 8000).
Please update your GPU driver by downloading and installing a new
version from the URL: http://www.nvidia.com/Download/index.aspx
Alternatively, go to: http://pytorch.org to install
a PyTorch version that has been compiled with your version
of the CUDA driver.

In [25]:
image = image.view(-1, img_feat_size)
img_input = torch.FloatTensor(batchSize)
img_input = Variable(img_input)
img_input.data.resize_(image.size()).copy_(image)

his_input = torch.LongTensor(his_length, batchSize)
his_input = Variable(his_input)
his = history[:,:rnd+1,:].clone().view(-1, his_length).t()
his_input.data.resize_(his.size()).copy_(his)
his_input = his_input.cuda()
his_emb = netW(his_input, format = 'index')

batch_size = question.size(0)

ques_hidden = netE.init_hidden(batchSize)
hist_hidden = netE.init_hidden(batchSize)

ques_hidden = repackage_hidden(ques_hidden, batch_size)
hist_hidden = repackage_hidden(hist_hidden, his_input.size(1))

In [26]:
img_input.size()
# image.size()

torch.Size([4900, 512])

In [27]:
netE.cuda()
netW.cuda()
img_input = img_input.cuda()
ques_emb = ques_emb.cuda()
his_emb = his_emb.cuda()

featD, ques_hidden = netE(ques_emb, his_emb, img_input, ques_hidden, hist_hidden, rnd+1)

  "PyTorch was compiled without cuDNN support. To use cuDNN, rebuild "


In [28]:
print(opt_answerT.size())
# t = torch.LongTensor(opt_answerT)
# print('t size',t[0].view(9,-1).size())
t = opt_answerT.long()
temp = t.view(-1,9).t()
print(temp.size())
emb = netW(temp, format = 'index')
emb.size()

torch.Size([100, 10, 20, 9])
torch.Size([9, 20000])


RuntimeError: expected a Variable argument, but got torch.LongTensor

In [44]:
opt_answerLen.size()
print(ques_input.size())
# his_input.size()
ques_emb = netW(ques_input, format = 'index')
print(ques_emb.size())
# his_emb = netW(his_input, format = 'index')

torch.Size([16, 100])
torch.Size([16, 100, 300])


In [527]:
a = torch.Tensor([[2,2],[2,2]])
b = a - 1
b = a.unsqueeze(-1)
b = a.expand(2,2,2)
b[0][1] *=2
# a[0]
# print(b)
c = b.unsqueeze(-1)
c = b.expand(3,2,2,2)
# c.mean(1)

In [355]:
opt_answerIdx.size()
print(opt_answerIdx[4])



Columns 0 to 5 
 1.8902e+04  7.1412e+04  5.4282e+04  1.2387e+05  1.7612e+05  1.6521e+05
 2.2308e+05  3.5196e+04  2.1391e+05  7.8387e+04  1.2205e+05  2.2836e+04
 7.4224e+04  1.5295e+05  1.2205e+05  3.5710e+04  2.3314e+05  4.7583e+04
 1.1093e+05  1.5046e+05  4.4150e+04  9.1056e+04  1.6145e+05  2.6702e+04
 1.7095e+05  2.2009e+05  7.4224e+04  9.8037e+04  1.1651e+05  2.0549e+05
 4.9191e+04  4.0882e+04  1.0184e+05  1.7653e+05  2.3744e+05  1.6056e+05
 1.0410e+05  8.2426e+04  2.1403e+05  2.5105e+05  1.8959e+05  1.7604e+05
 2.0889e+05  8.0082e+04  7.9823e+04  1.4115e+05  7.2807e+04  9.9828e+04
 1.4078e+05  9.8046e+04  1.6878e+04  2.4621e+05  1.2452e+05  5.2650e+04
 2.1887e+05  6.0361e+04  1.9780e+05  2.0731e+05  1.8762e+05  4.0882e+04

Columns 6 to 11 
 6.9593e+04  6.6072e+04  5.4696e+04  4.6490e+03  9.4076e+04  8.0082e+04
 7.4224e+04  5.3300e+04  2.2901e+05  2.2082e+05  8.5776e+04  1.9780e+05
 7.8387e+04  1.2489e+05  1.9592e+05  2.2273e+05  2.1123e+05  2.2116e+05
 1.6182e+05  2.1553e+05  7.1

In [354]:
import h5py
filename = input_ques_h5
f = h5py.File(filename, 'r')

# List all groups
print("Keys: %s" % f.keys())
a_group_key = list(f.keys())[0]


Keys: [u'ans_index_train', u'ans_len_train', u'ans_train', u'cap_len_train', u'cap_train', u'opt_len_train', u'opt_list_train', u'opt_train', u'ques_len_train', u'ques_train']


In [435]:
t = opt_answerT.long()
t.size()

torch.Size([100, 10, 20, 9])

# Decoder

In [29]:
netD = _netD(model, ninp, nhid, nlayers, vocab_size, dropout)
netD.cuda()

_netD (
  (ans_rnn): LSTM(300, 300)
  (W2): Linear (512 -> 1)
  (fc): Linear (512 -> 300)
)

In [30]:
opt_ans_input = torch.LongTensor(ans_length, batchSize)
opt_ans_input = Variable(opt_ans_input)

opt_ans = opt_answerT[:,rnd,:].clone().view(-1, ans_length).t()
opt_ans_input.data.resize_(opt_ans.size()).copy_(opt_ans)
opt_ans_input = opt_ans_input.cuda()

opt_ans_emb = netW(opt_ans_input, format = 'index')

opt_hidden = netD.init_hidden(batchSize)
opt_hidden = repackage_hidden(opt_hidden, opt_ans_input.size(1))
# print(opt_hidden.shape())
prob = netD(featD,opt_hidden,opt_ans_emb,vocab_size)

('expand feat', torch.Size([100, 1, 300]))
('output_feat', torch.Size([100, 300, 20]))


In [31]:
tans = answerT[:,rnd,:].t()
ans_target.data.resize_(tans.size()).copy_(tans)


NameError: name 'ans_target' is not defined

In [51]:
optIdx = opt_answerIdx[:,rnd,:]
# print(optIdx.size())
ansIdx = answerIds[:,rnd]
print(ansIdx.size())
b_no = 0
ans_list = []
for elem in ansIdx:
    t = ((optIdx[b_no,:]==elem).nonzero())
#     print(t.size())
    if (t.size()!=(torch.Size([]))):
        ans_list.append(t[0][0])
    else:
        ans_list.append((-1))
    
    b_no = b_no + 1

temp = torch.LongTensor(ans_list)
# temp

torch.Size([100])



-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
-1
[torch.LongTensor of size 100]

In [97]:
# optIdx[b_no,:]
elem = 23
t = (optIdx[b_no,:]==elem).nonzero()
t.size()

torch.Size([])

In [86]:
t.size()
print(ans_list)
# torch.LongTensor([])

[-1, -1, -1]


In [55]:
m = nn.LogSoftmax()
loss = nn.NLLLoss()
temp = Variable(temp)
output = loss(prob,temp)

NameError: name 'temp' is not defined

# Training & Val Trial

In [32]:
img_input = torch.FloatTensor(batchSize)
ques_input = torch.LongTensor(ques_length, batchSize)
his_input = torch.LongTensor(his_length, batchSize)

# answer input
ans_input = torch.LongTensor(ans_length, batchSize)
ans_target = torch.LongTensor(ans_length, batchSize)
wrong_ans_input = torch.LongTensor(ans_length, batchSize)
sample_ans_input = torch.LongTensor(1, batchSize)
opt_ans_input = torch.LongTensor(ans_length, batchSize)

batch_sample_idx = torch.LongTensor(batchSize)
fake_diff_mask = torch.ByteTensor(batchSize)
fake_len = torch.LongTensor(batchSize)
noise_input = torch.FloatTensor(batchSize)
gt_index = torch.LongTensor(batchSize)

In [33]:
ques_input, his_input, img_input = ques_input.cuda(), his_input.cuda(), img_input.cuda()
ans_input, ans_target = ans_input.cuda(), ans_target.cuda()
wrong_ans_input = wrong_ans_input.cuda()
sample_ans_input = sample_ans_input.cuda()

fake_len = fake_len.cuda()
noise_input = noise_input.cuda()
batch_sample_idx = batch_sample_idx.cuda()
fake_diff_mask = fake_diff_mask.cuda()
opt_ans_input = opt_ans_input.cuda()
gt_index = gt_index.cuda()

In [34]:
ques_input = Variable(ques_input)
img_input = Variable(img_input)
his_input = Variable(his_input)

ans_input = Variable(ans_input)
ans_target = Variable(ans_target)
wrong_ans_input = Variable(wrong_ans_input)
sample_ans_input = Variable(sample_ans_input)

noise_input = Variable(noise_input)
batch_sample_idx = Variable(batch_sample_idx)
fake_diff_mask = Variable(fake_diff_mask)
opt_ans_input = Variable(opt_ans_input)
gt_index = Variable(gt_index)

In [35]:
img_input.size()

torch.Size([100])

In [36]:
critD = nn.NLLLoss()
critD.cuda()
netW.cuda()
epoch = 0

trainModel(epoch)

('img input size:', torch.Size([4900, 512]))
('expand feat', torch.Size([100, 1, 300]))
('output_feat', torch.Size([100, 300, 20]))


RuntimeError: cublas runtime error : the GPU program failed to execute at /opt/conda/conda-bld/pytorch_1501972792122/work/pytorch-0.1.12/torch/lib/THC/THCBlas.cu:246

In [90]:
rank_all = val()
R1 = np.sum(np.array(rank_all)==1) / float(len(rank_all))
R5 =  np.sum(np.array(rank_all)<=5) / float(len(rank_all))
R10 = np.sum(np.array(rank_all)<=10) / float(len(rank_all))
ave = np.sum(np.array(rank_all)) / float(len(rank_all))
mrr = np.sum(1/(np.array(rank_all, dtype='float'))) / float(len(rank_all))
print ('%d/%d: mrr: %f R1: %f R5 %f R10 %f Mean %f' %(epoch, len(dataloader_val), mrr, R1, R5, R10, ave))

('expand feat', torch.Size([1, 1, 300]))
('output_feat', torch.Size([100, 300, 1]))


RuntimeError: equal number of batches expected at /opt/conda/conda-bld/pytorch_1501972792122/work/pytorch-0.1.12/torch/lib/THC/generic/THCTensorMathBlas.cu:443

# Main 

In [None]:
lr = 0.0004
beta1 = 0.8
niter = 5
neg_batch_sample = 30 
log_interval = 50
save_iter = 10000000
save_path = '~/notebooks/saved_checkpoints'
optimizer = optim.Adam([{'params': netW.parameters()},
                        {'params': netE.parameters()},
                        {'params': netD.parameters()}], lr=lr, betas=(beta1, 0.999))


history = []

for epoch in range(niter):
    t = time.time()
    train_loss = train(epoch)
    print ('Epoch: %d learningRate %4f train loss %4f Time: %3f' % (epoch, lr, train_loss, time.time()-t))
    train_his = {'loss': train_loss}

    print('Evaluating ... ')
    rank_all = val()
    R1 = np.sum(np.array(rank_all)==1) / float(len(rank_all))
    R5 =  np.sum(np.array(rank_all)<=5) / float(len(rank_all))
    R10 = np.sum(np.array(rank_all)<=10) / float(len(rank_all))
    ave = np.sum(np.array(rank_all)) / float(len(rank_all))
    mrr = np.sum(1/(np.array(rank_all, dtype='float'))) / float(len(rank_all))
    print ('%d/%d: mrr: %f R1: %f R5 %f R10 %f Mean %f' %(epoch, len(dataloader_val), mrr, R1, R5, R10, ave))
    val_his = {'R1': R1, 'R5':R5, 'R10': R10, 'Mean':ave, 'mrr':mrr}
    history.append({'epoch':epoch, 'train': train_his, 'val': val_his})

    saving the model.
    if epoch % save_iter == 0:
        torch.save({'epoch': epoch,
                    'netW': netW.state_dict(),
                    'netD': netD.state_dict(),
                    'netE': netE.state_dict()},
                    '%s/epoch_%d.pth' % (save_path, epoch))

        json.dump(history, open('%s/log.json' %(save_path), 'w'))

