In [0]:
!git clone https://github.com/howsam/basic_vqa.git

fatal: destination path 'basic_vqa' already exists and is not an empty directory.


In [0]:
%cd basic_vqa/utils

/content/basic_vqa/utils


In [0]:
#!/bin/tcsh

#########################################################

# One may need to change directory for datasets like this.
#set DATASETS_DIR = "/run/media/hoosiki/WareHouse3/mtb/datasets/VQA"

!mkdir -p "../datasets"
DATASETS_DIR = "../datasets"

##########################################################

ANNOTATIONS_DIR = DATASETS_DIR+"/Annotations"
QUESTIONS_DIR = DATASETS_DIR+"/Questions"
IMAGES_DIR = DATASETS_DIR+"/Images"

##########################################################

!mkdir -p {ANNOTATIONS_DIR}
!mkdir -p {QUESTIONS_DIR}
!mkdir -p {IMAGES_DIR}

##########################################################

# Download datasets from VQA official url: https://visualqa.org/download.html

# VQA Annotations
# !wget -O {ANNOTATIONS_DIR}/v2_Annotations_Train_mscoco.zip "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip"
!wget -O {ANNOTATIONS_DIR}/v2_Annotations_Val_mscoco.zip "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip"

# VQA Input Questions
# !wget -O {QUESTIONS_DIR}/v2_Questions_Train_mscoco.zip "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip"
!wget -O {QUESTIONS_DIR}/v2_Questions_Val_mscoco.zip "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip"
# !wget -O {QUESTIONS_DIR}/v2_Questions_Test_mscoco.zip "https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip"

# VQA Input Images (COCO)
# !wget -O {IMAGES_DIR}/train2014.zip "http://images.cocodataset.org/zips/train2014.zip"
!wget -O {IMAGES_DIR}/val2014.zip "http://images.cocodataset.org/zips/val2014.zip"
# !wget -O {IMAGES_DIR}/test2015.zip "http://images.cocodataset.org/zips/test2015.zip"

##########################################################

# !unzip {ANNOTATIONS_DIR}/v2_Annotations_Train_mscoco.zip -d {ANNOTATIONS_DIR}
!unzip {ANNOTATIONS_DIR}/v2_Annotations_Val_mscoco.zip -d {ANNOTATIONS_DIR}

# !unzip {QUESTIONS_DIR}/v2_Questions_Train_mscoco.zip -d {QUESTIONS_DIR}
!unzip {QUESTIONS_DIR}/v2_Questions_Val_mscoco.zip -d {QUESTIONS_DIR}
# !unzip {QUESTIONS_DIR}/v2_Questions_Test_mscoco.zip -d {QUESTIONS_DIR}

# !unzip {IMAGES_DIR}/train2014.zip -d {IMAGES_DIR}
!unzip {IMAGES_DIR}/val2014.zip -d {IMAGES_DIR}
# !unzip {IMAGES_DIR}/test2015.zip -d {IMAGES_DIR}

##########################################################

# Remove unnecessary zip files.

# !rm {ANNOTATIONS_DIR}/v2_Annotations_Train_mscoco.zip
!rm {ANNOTATIONS_DIR}/v2_Annotations_Val_mscoco.zip

# !rm {QUESTIONS_DIR}/v2_Questions_Train_mscoco.zip
!rm {QUESTIONS_DIR}/v2_Questions_Val_mscoco.zip
# !rm {QUESTIONS_DIR}/v2_Questions_Test_mscoco.zip

# !rm {IMAGES_DIR}/train2014.zip
!rm {IMAGES_DIR}/val2014.zip
# !rm {IMAGES_DIR}/test2015.zip

##########################################################

In [0]:
!python resize_images.py --input_dir='../datasets/Images' --output_dir='../datasets/Resized_Images'  
!python make_vacabs_for_questions_answers.py --input_dir='../datasets'

[1000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[2000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[3000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[4000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[5000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[6000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[7000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[8000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[9000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[10000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[11000/40504] Resized the images and saved into '../datasets/Resized_Images/val2014'.
[12000/40504] Resized the images and saved into '../datasets/Re

In [0]:
import numpy as np
import json
import os
import argparse
import text_helper as text_processing
from collections import defaultdict


def extract_answers(q_answers, valid_answer_set):
    all_answers = [answer["answer"] for answer in q_answers]
    valid_answers = [a for a in all_answers if a in valid_answer_set]
    return all_answers, valid_answers


def vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, image_set):
    print('building vqa %s dataset' % image_set)
    if image_set in ['val2014']:
        load_answer = True
        with open(annotation_file % image_set) as f:
            annotations = json.load(f)['annotations']
            qid2ann_dict = {ann['question_id']: ann for ann in annotations}
    else:
        load_answer = False
    with open(question_file % image_set) as f:
        questions = json.load(f)['questions']
    coco_set_name = image_set.replace('-dev', '')
    abs_image_dir = os.path.abspath(image_dir % coco_set_name)
    image_name_template = 'COCO_'+coco_set_name+'_%012d'
    dataset = [None]*len(questions)
    
    unk_ans_count = 0
    for n_q, q in enumerate(questions):
        if (n_q+1) % 10000 == 0:
            print('processing %d / %d' % (n_q+1, len(questions)))
        image_id = q['image_id']
        question_id = q['question_id']
        image_name = image_name_template % image_id
        image_path = os.path.join(abs_image_dir, image_name+'.jpg')
        question_str = q['question']
        question_tokens = text_processing.tokenize(question_str)
        
        iminfo = dict(image_name=image_name,
                      image_path=image_path,
                      question_id=question_id,
                      question_str=question_str,
                      question_tokens=question_tokens)
        
        if load_answer:
            ann = qid2ann_dict[question_id]
            all_answers, valid_answers = extract_answers(ann['answers'], valid_answer_set)
            if len(valid_answers) == 0:
                valid_answers = ['<unk>']
                unk_ans_count += 1
            iminfo['all_answers'] = all_answers
            iminfo['valid_answers'] = valid_answers
            
        dataset[n_q] = iminfo
    print('total %d out of %d answers are <unk>' % (unk_ans_count, len(questions)))
    return dataset



input_dir = '../datasets'
output_dir = '../datasets'

image_dir = input_dir+'/Resized_Images/%s/'
annotation_file = input_dir+'/Annotations/v2_mscoco_%s_annotations.json'
question_file = input_dir+'/Questions/v2_OpenEnded_mscoco_%s_questions.json'

vocab_answer_file = output_dir+'/vocab_answers.txt'
answer_dict = text_processing.VocabDict(vocab_answer_file)
valid_answer_set = set(answer_dict.word_list)    

# train = vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, 'train2014')
valid = vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, 'val2014')
# test = vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, 'test2015')
# test_dev = vqa_processing(image_dir, annotation_file, question_file, valid_answer_set, 'test-dev2015')

np.save(output_dir+'/train.npy', np.array(valid))
np.save(output_dir+'/valid.npy', np.array(valid))
# np.save(output_dir+'/train_valid.npy', np.array(train+valid))
# np.save(output_dir+'/test.npy', np.array(test))
# np.save(output_dir+'/test-dev.npy', np.array(test_dev))

In [0]:
%cd ..

/content/basic_vqa


In [0]:
!python train.py

Traceback (most recent call last):
  File "train.py", line 178, in <module>
    main(args)
  File "train.py", line 27, in main
    num_workers=args.num_workers)
  File "/content/basic_vqa/data_loader.py", line 71, in get_loader
    transform=transform['train']),
  File "/content/basic_vqa/data_loader.py", line 14, in __init__
    self.vqa = np.load(input_dir+'/'+input_vqa)
  File "/usr/local/lib/python3.6/dist-packages/numpy/lib/npyio.py", line 447, in load
    pickle_kwargs=pickle_kwargs)
  File "/usr/local/lib/python3.6/dist-packages/numpy/lib/format.py", line 696, in read_array
    raise ValueError("Object arrays cannot be loaded when "
ValueError: Object arrays cannot be loaded when allow_pickle=False


In [0]:
import torch
import torch.nn as nn
import torchvision.models as models


class ImgEncoder(nn.Module):

    def __init__(self, embed_size):
        """(1) Load the pretrained model as you want.
               cf) one needs to check structure of model using 'print(model)'
                   to remove last fc layer from the model.
           (2) Replace final fc layer (score values from the ImageNet)
               with new fc layer (image feature).
           (3) Normalize feature vector.
        """
        super(ImgEncoder, self).__init__()
        model = models.resnet50(pretrained=True)
        in_features = model.classifier[-1].in_features  # input size of feature vector
        model.classifier = nn.Sequential(
            *list(model.classifier.children())[:-1])    # remove last fc layer

        self.model = model                              # loaded model without last fc layer
        self.fc = nn.Linear(in_features, embed_size)    # feature vector of image

    def forward(self, image):
        """Extract feature vector from image vector.
        """
        with torch.no_grad():
            img_feature = self.model(image)                  # [batch_size, vgg16(19)_fc=4096]
        img_feature = self.fc(img_feature)                   # [batch_size, embed_size]
        # img_feature = torch.Normalize(img_feature)
        l2_norm = img_feature.norm(p=2, dim=1, keepdim=True).detach()
        img_feature = img_feature.div(l2_norm)               # l2-normalized feature vector

        return img_feature


class QstEncoder(nn.Module):

    def __init__(self, qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size):

        super(QstEncoder, self).__init__()
        self.word2vec = nn.Embedding(qst_vocab_size, word_embed_size)
        self.tanh = nn.Tanh()
        self.lstm = nn.LSTM(word_embed_size, hidden_size, num_layers)
        self.fc = nn.Linear(2*num_layers*hidden_size, embed_size)     # 2 for hidden and cell states

    def forward(self, question):

        qst_vec = self.word2vec(question)                             # [batch_size, max_qst_length=30, word_embed_size=300]
        qst_vec = self.tanh(qst_vec)
        qst_vec = qst_vec.transpose(0, 1)                             # [max_qst_length=30, batch_size, word_embed_size=300]
        _, (hidden, cell) = self.lstm(qst_vec)                        # [num_layers=2, batch_size, hidden_size=512]
        qst_feature = torch.cat((hidden, cell), 2)                    # [num_layers=2, batch_size, 2*hidden_size=1024]
        qst_feature = qst_feature.transpose(0, 1)                     # [batch_size, num_layers=2, 2*hidden_size=1024]
        qst_feature = qst_feature.reshape(qst_feature.size()[0], -1)  # [batch_size, 2*num_layers*hidden_size=2048]
        qst_feature = self.tanh(qst_feature)
        qst_feature = self.fc(qst_feature)                            # [batch_size, embed_size]

        return qst_feature


class VqaModel(nn.Module):

    def __init__(self, embed_size, qst_vocab_size, ans_vocab_size, word_embed_size, num_layers, hidden_size):

        super(VqaModel, self).__init__()
        self.img_encoder = ImgEncoder(embed_size)
        self.qst_encoder = QstEncoder(qst_vocab_size, word_embed_size, embed_size, num_layers, hidden_size)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.5)
        self.fc1 = nn.Linear(embed_size, ans_vocab_size)
        self.fc2 = nn.Linear(ans_vocab_size, ans_vocab_size)

    def forward(self, img, qst):

        img_feature = self.img_encoder(img)                     # [batch_size, embed_size]
        qst_feature = self.qst_encoder(qst)                     # [batch_size, embed_size]
        combined_feature = torch.mul(img_feature, qst_feature)  # [batch_size, embed_size]
        combined_feature = self.tanh(combined_feature)
        combined_feature = self.dropout(combined_feature)
        combined_feature = self.fc1(combined_feature)           # [batch_size, ans_vocab_size=1000]
        combined_feature = self.tanh(combined_feature)
        combined_feature = self.dropout(combined_feature)
        combined_feature = self.fc2(combined_feature)           # [batch_size, ans_vocab_size=1000]

        return combined_feature

In [0]:
import torch
import torch.nn as nn
import torchvision.models as models

class imgencoder(nn.Module):
    def __init__(self):
        super(imgencoder, self).__init__()
        self.model = models.resnet18(True)
        self.model.fc = nn.Linear(512, 1024)

    def forward(self, img):
        img_feature = self.model(img)
        return img_feature

In [0]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

img_rand = torch.rand(10, 3, 224, 244).to(device)
img_model = imgencoder().to(device)
feature_rand = img_model(img_rand)

print(feature_rand.shape)

torch.Size([10, 1024])


In [0]:
class txtencoder(nn.Module):
    def __init__(self):
        super(txtencoder, self).__init__()
        self.word2vec = nn.Embedding(1000, 300)
        self.lstm = nn.LSTM(300, 512, 2, batch_first=True)
        self.fc = nn.Linear(2048, 1024)

    def forward(self, txt):
        embedd = self.word2vec(txt)
        _, (hidden, cell) = lstm(embedd)
        vec = torch.cat(hidden, cell)
        vec = vec.view(txt.size(0), -1)
        txt_feature = self.fc(vec)
        return txt_feature

In [0]:
class combine(nn.Module):
    def __init__(self):
        super(combine, self).__init__()
        self.img_model = imgencoder()
        self.txt_model = txtencoder()
        self.fc1 = nn.Linear(1024, 1000)
        self.fc2 = nn.Linear(1024, 1000)

    def forward(self, img, qst):
        img_feature = self.img_model(img)
        qst_feature = self.txt_model(qst)
        combined_feature = img_feature * qst_feature
        combined_feature = self.fc1(combined_feature)
        combined_feature = torch.nn.functional.relu(combined_feature)
        combined_feature = self.fc2(combined_feature)
        return combined_feature