In [1]:
import os
import sys
import tqdm
import h5py
import yaml
import torch
import random
import datetime
import torchvision

import numpy as np
import pandas as pd
import torch.nn as nn
import matplotlib.pyplot as plt
import torch.nn.functional as F
import torchvision.transforms as transforms

from itertools import cycle
from skimage import transform
from torch.autograd import Variable
from collections import OrderedDict
from tensorboardX import SummaryWriter
from mpl_toolkits.axes_grid1 import ImageGrid
from torch.utils.data import Dataset, DataLoader
from torchvision.utils import make_grid, save_image

from preprocess_3 import preprocess
from dataset import VQADataset, VQABatchSampler

In [2]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [3]:
# Seed the Python, NumPy and PyTorch random number generators with the same value.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7f08391e2a70>

# Arguments

In [4]:
# Load the experiment configuration.
# `yaml.load` without an explicit Loader is deprecated since PyYAML 5.1 and rejected by
# PyYAML 6; `safe_load` only builds plain Python objects. The context manager closes the
# file handle that the previous bare `open(...)` call left open.
config_path = './config_vqa_sgd.yml'
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

  


In [5]:
# Run-time switches read by later cells.
fusion = 'block-attn'                    # VQAModel picks the fusion module by substring ('mutan' / 'block')
use_att = True                           # QuesEmbedding pools LSTM outputs with attention when True
preprocess_data = True                   # run preprocess(...) before building the datasets
path_to_weights = './vqa_block_100.pth'  # checkpoint loaded in the Evaluation section

# Dataloader

In [6]:
# From here on `config` refers only to the 'data' section of the loaded configuration.
# NOTE: this rebinding means the cell cannot be re-run without reloading the config first.
config = config['data']
# Split names used as keys of every per-phase dict built below.
phases = ['train', 'val']

In [7]:
# Optionally run the preprocessing step, passing it the data directory and the
# question/answer file entries for both splits from `config`.
if preprocess_data:
    print('Preprocessing datasets')
    preprocess_kwargs = dict(
        data_dir=config['dir'],
        train_ques_file=config['train']['ques'],
        train_ans_file=config['train']['ans'],
        val_ques_file=config['val']['ques'],
        val_ans_file=config['val']['ans'],
    )
    preprocess(**preprocess_kwargs)

Preprocessing datasets
Preprocessing with data root dir: /home/apoorv/Documents/Practice/CV/VQA/Visual-Question-Answering/vqadata/
Creating tsv datasets: train.tsv, val.tsv
TRAIN TSV
[('yes', 84978), ('no', 82516), ('1', 12540), ('2', 12215), ('white', 8916), ('3', 6536), ('blue', 5455), ('red', 5201), ('black', 5066), ('0', 4977), ('4', 4118), ('brown', 3814), ('green', 3750), ('yellow', 2792), ('5', 2367), ('gray', 2113), ('nothing', 1814), ('right', 1766), ('frisbee', 1641), ('baseball', 1597), ('left', 1565), ('none', 1563), ('tennis', 1502), ('6', 1455), ('wood', 1449)]
Dumping ans-to-idx map to /home/apoorv/Documents/Practice/CV/VQA/Visual-Question-Answering/vqadata/ans_itos.tsv
VAL TSV
Creating loaders...
vocabulary size: 3662
Dumping vocabulary to /home/apoorv/Documents/Practice/CV/VQA/Visual-Question-Answering/vqadata/ques_stoi.tsv
Dumping train dataset to /home/apoorv/Documents/Practice/CV/VQA/Visual-Question-Answering/vqadata/train.pkl
Dumping val dataset to /home/apoorv/Doc

In [8]:
# One '<phase>.pkl' file name and one image directory per phase.
datafiles = dict((phase, '{}.pkl'.format(phase)) for phase in phases)
img_dir = dict((phase, config[phase]['img_dir']) for phase in phases)

# Build one VQADataset per phase; images are requested raw and scaled/cropped per config.
datasets = {
    phase: VQADataset(
        data_dir=config['dir'],
        qafile=datafiles[phase],
        img_dir=img_dir[phase],
        phase=phase,
        img_scale=config['images']['scale'],
        img_crop=config['images']['crop'],
        raw_images=True,
    )
    for phase in phases
}

In [9]:
# Each phase gets its own batch sampler, configured with that phase's batch size.
batch_samplers = {
    phase: VQABatchSampler(datasets[phase], config[phase]['batch_size'])
    for phase in phases
}

# The loaders delegate batching entirely to the samplers above.
dataloaders = {
    phase: DataLoader(
        datasets[phase],
        batch_sampler=batch_samplers[phase],
        num_workers=config['loader']['workers'],
    )
    for phase in phases
}

# Number of examples per phase.
dataset_sizes = {phase: len(datasets[phase]) for phase in phases}

In [10]:
# Display the number of examples in each split.
dataset_sizes

{'train': 5000, 'val': 1000}

In [11]:
# Sanity check: print the shapes of the five tensors in the first batch of the first
# phase, followed by the number of batches that loader yields.
first_phase_loader = dataloaders[phases[0]]
for i, d in enumerate(first_phase_loader):
    print(*[d[field].shape for field in range(5)])
    print(len(first_phase_loader))
    break

torch.Size([32, 7]) torch.Size([32, 3, 224, 224]) torch.Size([32]) torch.Size([32]) torch.Size([32])
157


# Model

### Attention

In [12]:
class Attn(nn.Module):
    """Scores each position with a two-layer MLP and softmax-normalises the scores.

    Args:
        h_dim: size of the feature vectors that are scored; the input to ``forward``
            is flattened to rows of this width.
    """

    def __init__(self, h_dim):
        super().__init__()
        self.h_dim = h_dim
        half_dim = h_dim // 2
        self.main = nn.Sequential(
            nn.Linear(h_dim, half_dim),
            nn.ReLU(inplace=True),
            nn.Linear(half_dim, 1),
        )

    def forward(self, encoder_outputs):
        """Return weights of shape ``(encoder_outputs.size(0), -1, 1)`` that sum to 1 over dim 1."""
        num_rows = encoder_outputs.size(0)
        # Flatten to (N, h_dim) so the MLP scores every feature vector independently.
        flat_states = encoder_outputs.reshape(-1, self.h_dim)
        scores = self.main(flat_states).view(num_rows, -1)
        weights = F.softmax(scores, dim=1)
        return weights.unsqueeze(2)

### Block

In [13]:
class BlockFusion(nn.Module):
    """Chunk-wise multiplicative fusion of a question embedding and an image embedding.

    Both inputs are cut into ``chunks`` equal slices along dim 1. Each pair of slices is
    passed through its own Dropout + Linear projection to ``num_layers`` copies of the
    slice width, multiplied element-wise, summed over the ``num_layers`` copies, passed
    through a signed square root and L2-normalised. The per-chunk results are
    concatenated along dim 1.

    Args:
        input_dim: stored on the module; not otherwise used by this class.
        output_dim: width that is split into chunks; the inputs to ``forward`` are
            sliced up to this width along dim 1.
        num_layers: number of projections summed per chunk.
        dropout: dropout probability applied before every chunk projection.
        chunks: number of slices; must evenly divide ``output_dim``.

    Raises:
        ValueError: if ``chunks`` is not positive or does not evenly divide ``output_dim``.
    """

    def __init__(self, input_dim, output_dim, num_layers, dropout = 0.1, chunks = 16):
        super(BlockFusion, self).__init__()
        # Enforce the divisibility requirement instead of silently dropping the
        # trailing `output_dim % chunks` features.
        if chunks <= 0 or output_dim % chunks != 0:
            raise ValueError(
                '`chunks` ({}) must be positive and evenly divide `output_dim` ({})'.format(
                    chunks, output_dim))
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.num_layers = num_layers
        self.chunks = chunks
        self.dropout = dropout
        self.split_size = self.output_dim // self.chunks
        self.size_list = [self.split_size] * self.chunks

        v_layers = []
        q_layers = []
        # Image and question projections are created alternately, chunk by chunk, so the
        # parameter initialisation order matches the original implementation.
        for each_size in self.size_list:
            v_layers.append(nn.Sequential(
                nn.Dropout(p=self.dropout),
                nn.Linear(each_size, each_size * self.num_layers)))
            q_layers.append(nn.Sequential(
                nn.Dropout(p=self.dropout),
                nn.Linear(each_size, each_size * self.num_layers)))

        self.v_layers = nn.ModuleList(v_layers)
        self.q_layers = nn.ModuleList(q_layers)

    def get_chunks(self, embeds, size_list):
        """Slice ``embeds`` along dim 1 into consecutive pieces of the widths in ``size_list``.

        Returns:
            list of tensors, one per entry of ``size_list`` (views produced by ``narrow``).
        """
        out = []
        begin = 0
        for each_size in size_list:
            out.append(embeds.narrow(1, begin, each_size))
            begin += each_size
        return out

    def forward(self, ques_emb, img_emb):
        """Fuse the two embeddings; the result has ``output_dim`` features along dim 1."""
        batch_size = img_emb.size(0)
        img_emb_chunks = self.get_chunks(img_emb, self.size_list)
        ques_emb_chunks = self.get_chunks(ques_emb, self.size_list)
        x_mm = []
        for v_chunk, q_chunk, v_lin, q_lin in zip(
                img_emb_chunks, ques_emb_chunks, self.v_layers, self.q_layers):
            m = v_lin(v_chunk) * q_lin(q_chunk)
            # (batch, num_layers * width) -> (batch, num_layers, width), then sum the copies.
            z = m.view(batch_size, self.num_layers, -1).sum(dim=1)
            # Signed square root: sqrt(|z|) carrying the sign of z.
            z = torch.sqrt(F.relu(z)) - torch.sqrt(F.relu(-z))
            z = F.normalize(z, p=2, dim=1)
            x_mm.append(z)
        return torch.cat(x_mm, 1)

### MUTAN

In [14]:
class MutanFusion(nn.Module):
    """Sum of ``num_layers`` element-wise products of projected question/image embeddings.

    Each of the ``num_layers`` terms projects both inputs with their own
    Dropout(0.5) + Linear(input_dim, out_dim) + Tanh stack and multiplies the two
    projections element-wise. The terms are summed and passed through a final tanh.

    Args:
        input_dim: feature width of both inputs to ``forward``.
        out_dim: feature width of the fused output.
        num_layers: number of product terms that are summed.
    """

    def __init__(self, input_dim, out_dim, num_layers):
        super(MutanFusion, self).__init__()
        self.input_dim = input_dim
        self.out_dim = out_dim
        self.num_layers = num_layers

        # All image projections are created before the question projections so the
        # parameter initialisation order matches the original implementation.
        self.image_transformation_layers = nn.ModuleList([
            nn.Sequential(nn.Dropout(p=0.5), nn.Linear(input_dim, out_dim), nn.Tanh())
            for _ in range(self.num_layers)])
        self.ques_transformation_layers = nn.ModuleList([
            nn.Sequential(nn.Dropout(p=0.5), nn.Linear(input_dim, out_dim), nn.Tanh())
            for _ in range(self.num_layers)])

    def forward(self, ques_emb, img_emb):
        """Return the fused embedding with ``out_dim`` features along dim 1."""
        x_mm = []
        for img_layer, ques_layer in zip(self.image_transformation_layers,
                                         self.ques_transformation_layers):
            # Image branch first, then question branch (same call order as before).
            x_hv = img_layer(img_emb)
            x_hq = ques_layer(ques_emb)
            x_mm.append(torch.mul(x_hq, x_hv))
        x_mm = torch.stack(x_mm, dim=1).sum(dim=1)
        # torch.tanh replaces the deprecated torch.nn.functional.tanh.
        return torch.tanh(x_mm)

### Extract Image Embedding

In [15]:
class Normalize(nn.Module):
    """Scale every row (dim 1) of the input to unit ``p``-norm.

    Args:
        p: order of the norm (default 2).
    """

    def __init__(self, p=2):
        super(Normalize, self).__init__()
        self.p = p

    def forward(self, x):
        # F.normalize divides by max(norm, eps), so an all-zero row stays zero instead of
        # becoming NaN through 0 / 0 as with a plain `x / x.norm(...)`.
        return F.normalize(x, p=self.p, dim=1)


class ImageEmbedding(nn.Module):
    """Image branch: frozen pretrained VGG16 followed by a trainable Linear + Tanh projection.

    Args:
        image_channel_type: when it equals 'normi' (case-insensitive) a ``Normalize(p=2)``
            layer is appended to the truncated VGG classifier.
        output_size: width of the projected image embedding.
        mode: stored on the module; not read by this class.
        extract_features: when True, ``forward`` runs its input through VGG first;
            when False, the input is fed straight into the projection.
        features_dir: stored on the module; not read by this class.
    """

    def __init__(self, image_channel_type='I', output_size=1024, mode='train',
                 extract_features=False, features_dir=None):
        super().__init__()
        vgg = torchvision.models.vgg16(pretrained=True)
        # The VGG weights are never updated.
        for weight in vgg.parameters():
            weight.requires_grad_(False)

        # Keep every classifier layer except the last one.
        head_layers = list(vgg.classifier.children())[:-1]
        if image_channel_type.lower() == 'normi':
            head_layers.append(Normalize(p=2))
        vgg.classifier = nn.Sequential(*head_layers)
        self.extractor = vgg

        # Trainable projection from 4096 input features to `output_size`.
        self.fflayer = nn.Sequential(
            nn.Linear(4096, output_size),
            nn.Tanh())

        # TODO: Get rid of this hack
        self.mode = mode
        self.extract_features = extract_features
        self.features_dir = features_dir

    def forward(self, image, image_ids):
        """Return the projected image embedding; ``image_ids`` is accepted but unused."""
        features = self.extractor(image) if self.extract_features else image
        return self.fflayer(features)

### Question Embedding

In [16]:
class QuesEmbedding(nn.Module):
    """Question branch: an LSTM over word embeddings, pooled into a single vector.

    The notebook-level flag ``use_att`` selects the pooling:

    * ``use_att`` true: the per-step LSTM outputs are combined with ``Attn`` weights.
      The result has ``num_directions * hidden_size`` features; ``fflayer`` and
      ``output_size`` are not applied on this path.
    * ``use_att`` false: the final hidden and cell states are concatenated and projected
      to ``output_size`` by ``fflayer``.

    Args:
        input_size: width of the word embeddings fed to the LSTM.
        hidden_size: LSTM hidden size.
        output_size: width produced by ``fflayer`` (non-attention path only).
        num_layers: 1 builds a single-layer bidirectional LSTM; any other value builds
            a unidirectional LSTM with that many layers.
        batch_first: forwarded to ``nn.LSTM``.
    """

    def __init__(self, input_size=300, hidden_size=512, output_size=1024, num_layers=2, batch_first=True):
        super(QuesEmbedding, self).__init__()
        # TODO: take as parameter
        self.bidirectional = True
        if num_layers == 1:
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                                batch_first=batch_first, bidirectional=self.bidirectional)

            if self.bidirectional:
                self.fflayer = nn.Sequential(
                    nn.Linear(2 * num_layers * hidden_size, output_size),
                    nn.Tanh())
        else:
            self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                                num_layers=num_layers, batch_first=batch_first)
            self.fflayer = nn.Sequential(
                nn.Linear(2 * num_layers * hidden_size, output_size),
                nn.Tanh())
        if use_att:
            # Size the attention from the real LSTM output width rather than a fixed
            # 1024: the deep LSTM above is unidirectional, so its outputs are only
            # `hidden_size` wide and a fixed width would silently mis-reshape them.
            num_directions = 2 if self.lstm.bidirectional else 1
            self.attn = Attn(num_directions * hidden_size)

    def forward(self, ques):
        """Encode a batch of embedded questions into one vector per question."""
        output, hx = self.lstm(ques)
        if use_att:
            # Attn expects the batch on dim 0; only swap axes when the LSTM emitted
            # (seq_len, batch, features).
            if not self.lstm.batch_first:
                output = output.permute(1,0,2)
            attention_weights = self.attn(output)
            ques_embedding = (output * attention_weights).sum(dim=1)
        else:
            # hx = (h_n, c_n); join hidden and cell state along the feature axis.
            lstm_embedding = torch.cat([hx[0], hx[1]], dim=2)
            ques_embedding = lstm_embedding[0]
            if self.lstm.num_layers > 1 or self.bidirectional:
                for i in range(1, self.lstm.num_layers):
                    ques_embedding = torch.cat(
                        [ques_embedding, lstm_embedding[i]], dim=1)
                ques_embedding = self.fflayer(ques_embedding)
        return ques_embedding

### Complete Model

In [17]:
class VQAModel(nn.Module):
    """Full VQA network: image branch, question branch, fusion and an answer classifier.

    The fusion step is chosen by the notebook-level string ``fusion``: a value containing
    'mutan' uses ``MutanFusion``, one containing 'block' uses ``BlockFusion``, anything
    else multiplies the two embeddings element-wise. ``use_mutan`` is accepted but not read.

    Raises:
        Exception: if ``ques_channel_type`` is neither 'lstm' nor 'deeplstm'
            (case-insensitive).
    """

    def __init__(self, vocab_size=10000, word_emb_size=300, emb_size=1024, output_size=1000, image_channel_type='I', ques_channel_type='lstm', use_mutan=True, mode='train', extract_img_features=True, features_dir=None):
        super().__init__()
        self.mode = mode
        self.word_emb_size = word_emb_size
        self.image_channel = ImageEmbedding(
            image_channel_type,
            output_size=emb_size,
            mode=mode,
            extract_features=extract_img_features,
            features_dir=features_dir)

        # No padding_idx is passed to the embedding table.
        self.word_embeddings = nn.Embedding(vocab_size, word_emb_size)

        # 'lstm' -> single-layer question encoder, 'deeplstm' -> two-layer encoder.
        lstm_depth = {'lstm': 1, 'deeplstm': 2}.get(ques_channel_type.lower())
        if lstm_depth is None:
            msg = 'ques channel type not specified. please choose one of -  lstm or deeplstm'
            print(msg)
            raise Exception(msg)
        self.ques_channel = QuesEmbedding(
            input_size=word_emb_size, output_size=emb_size, num_layers=lstm_depth, batch_first=False)

        # Fusion module (if any) is registered before the classifier head.
        has_fusion_module = True
        if 'mutan' in fusion:
            self.mutan = MutanFusion(emb_size, emb_size, 5)
        elif 'block' in fusion:
            self.block = BlockFusion(emb_size, emb_size, 5)
        else:
            has_fusion_module = False

        if has_fusion_module:
            self.mlp = nn.Sequential(nn.Linear(emb_size, output_size))
        else:
            self.mlp = nn.Sequential(
                nn.Linear(emb_size, 1000),
                nn.Dropout(p=0.5),
                nn.Tanh(),
                nn.Linear(1000, output_size))

    def forward(self, images, questions, image_ids):
        """Return answer scores for a batch of images and token-id questions."""
        image_embeddings = self.image_channel(images, image_ids)
        ques_embeddings = self.ques_channel(self.word_embeddings(questions))
        if 'mutan' in fusion:
            fused = self.mutan(ques_embeddings, image_embeddings)
        elif 'block' in fusion:
            fused = self.block(ques_embeddings, image_embeddings)
        else:
            fused = image_embeddings * ques_embeddings
        return self.mlp(fused)

In [18]:
# Size the word-embedding table and the answer head from the vocabularies exposed on VQADataset.
num_question_tokens = len(VQADataset.ques_vocab)
num_answers = len(VQADataset.ans_vocab)
model = VQAModel(vocab_size=num_question_tokens, output_size=num_answers)
model = model.to(device)

# Training Code

In [19]:
# All artefacts of a run live under ./vqa_results/<fusion name>/.
ROOT_DIR = './vqa_results/'
now = fusion

LOG_DIR = ROOT_DIR + now + '/logs/'
OUTPUTS_DIR = ROOT_DIR  + now + '/outputs/'
MODEL_DIR = ROOT_DIR + now + '/models/'

# makedirs creates missing parents, and exist_ok=True makes the call a no-op for
# directories that already exist, so no separate existence checks are needed.
for run_dir in (LOG_DIR, OUTPUTS_DIR, MODEL_DIR):
    os.makedirs(run_dir, exist_ok=True)

summary_writer = SummaryWriter(LOG_DIR)

In [20]:
# Optimise only the parameters that still require gradients.
trainable_params = [param for param in model.parameters() if param.requires_grad]
optimizer = torch.optim.SGD(trainable_params, lr=0.001, momentum=0.9, weight_decay=5.0e-4)

# Scale the learning rate by 0.1 at each milestone (50 and 75 scheduler steps).
scheduler_optimizer = torch.optim.lr_scheduler.MultiStepLR(
    optimizer, milestones=[50, 75], gamma=0.1)

In [21]:
# Cross-entropy between the answer scores and the target answer indices.
criterion = nn.CrossEntropyLoss()
criterion = criterion.to(device)

In [22]:
# Number of epochs the training loop below runs for.
max_epoch = 100

In [None]:
# Train for `max_epoch` epochs; after each epoch run the validation split, save a
# checkpoint and advance the learning-rate schedule.
# Every scalar is logged with an explicit, monotonically increasing global step;
# without one, all points are written to the same TensorBoard step and no curve appears.
for epoch in range(max_epoch):
    model.train()
    num_train_batches = len(dataloaders['train'])
    for i, d in enumerate(dataloaders['train']):
        questions, images, image_ids, answers, ques_ids = d[0], d[1], d[2], d[3], d[4]
        images = images.to(device)
        # Swap the two question axes: the question LSTM is built with batch_first=False.
        questions = questions.permute(1,0).to(device)
        answers = answers.to(device)

        optimizer.zero_grad()

        ans_scores = model(images, questions, image_ids)
        _, preds = torch.max(ans_scores, 1)
        loss = criterion(ans_scores, answers)
        loss.backward()

        optimizer.step()

        # Number of correct predictions in this batch.
        running_corrects = torch.sum(preds == answers)

        print('Epoch: {}, Iteration: {}/{}'.format(epoch, i, num_train_batches))
        print('Loss: {}'.format(loss.item()))
        print('Correct Answers: {}'.format(running_corrects.item()))
        print('\n')

        train_step = epoch * num_train_batches + i
        summary_writer.add_scalar("Training Loss", loss.item(), train_step)
        summary_writer.add_scalar("Training Running Correct", running_corrects.item(), train_step)

    model.eval()
    num_val_batches = len(dataloaders['val'])
    for i, d in enumerate(dataloaders['val']):
        questions, images, image_ids, answers, ques_ids = d[0], d[1], d[2], d[3], d[4]
        images = images.to(device)
        questions = questions.permute(1,0).to(device)
        answers = answers.to(device)

        # No gradients are needed for validation.
        with torch.no_grad():
            ans_scores = model(images, questions, image_ids)
            _, preds = torch.max(ans_scores, 1)
            loss = criterion(ans_scores, answers)

        running_corrects = torch.sum(preds == answers)

        print('Epoch: {}, Iteration: {}/{}'.format(epoch, i, num_val_batches))
        print('Validation Loss: {}'.format(loss.item()))
        print('Validation Correct Answers: {}'.format(running_corrects.item()))
        print('\n')

        val_step = epoch * num_val_batches + i
        summary_writer.add_scalar("Validation Loss", loss.item(), val_step)
        summary_writer.add_scalar("Validation Running Correct", running_corrects.item(), val_step)

    # One checkpoint per epoch, named after the epoch index and the run name.
    torch.save(model.state_dict(), MODEL_DIR+'vqa_{}_{}.pth'.format(epoch, now))
    scheduler_optimizer.step()

# Evaluation

In [None]:
# Rebuild the network and restore the weights stored at `path_to_weights`.
model = VQAModel(vocab_size=len(VQADataset.ques_vocab), output_size=len(VQADataset.ans_vocab)).to(device)
# map_location remaps the stored tensors onto the current device, so a checkpoint
# saved on a GPU can also be loaded on a CPU-only machine.
model.load_state_dict(torch.load(path_to_weights, map_location=device))

In [None]:
def evaluate(mode='val'):
    """Compute and print the accuracy of the notebook-level ``model`` on one split.

    Args:
        mode: key into ``dataloaders`` and ``datasets`` (default 'val').

    Returns:
        float: number of correct predictions divided by ``len(datasets[mode])``,
        or 0.0 when that dataset is empty.
    """
    model.eval()
    total_corrects = 0
    with torch.no_grad():
        for d in dataloaders[mode]:
            questions, images, image_ids, answers = d[0], d[1], d[2], d[3]
            images = images.to(device)
            # Swap the two question axes: the question LSTM is built with batch_first=False.
            questions = questions.permute(1,0).to(device)
            answers = answers.to(device)

            ans_scores = model(images, questions, image_ids)
            _, preds = torch.max(ans_scores, 1)
            # .item() keeps the running total a plain Python int rather than a tensor.
            total_corrects += torch.sum(preds == answers).item()

    num_examples = len(datasets[mode])
    acc = total_corrects / num_examples if num_examples else 0.0
    # The message reports the split instead of the training-loop variable `epoch`,
    # which does not exist when only the Evaluation section is run.
    print('Mode: {}, Accuracy: {}'.format(mode, acc))
    return acc

In [None]:
# Evaluate on the validation split (the default `mode`).
evaluate()