<a href="https://colab.research.google.com/github/caoscott/nlp-final-project/blob/master/colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<p><img alt="Colaboratory logo" height="45px" src="https://colab.research.google.com/img/colab_favicon.ico" align="left" hspace="10px" vspace="0px"></p>

<h1>Welcome to Colaboratory!</h1>


Colaboratory is a free Jupyter notebook environment that requires no setup and runs entirely in the cloud.

With Colaboratory you can write and execute code, save and share your analyses, and access powerful computing resources, all for free from your browser.

To execute the code in the above cell, select it with a click and then either press the ▷ button to the left of the code, or use the keyboard shortcut "⌘/Ctrl+Enter".

All cells modify the same global state, so variables that you define by executing a cell can be used in other cells:

In [0]:
import os

if not 'train2014' in os.listdir('.'):
  !sudo apt install -f aria2
  
  !wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip -q --show-progress
  !wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip -q --show-progress
  !wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip -q --show-progress
  !wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip -q --show-progress
  !aria2c -x6 http://images.cocodataset.org/zips/train2014.zip 
  !aria2c -x6 http://images.cocodataset.org/zips/val2014.zip 

  !unzip -q '*.zip'

In [2]:
!rm -rf nlp-final-project
!git clone https://github.com/caoscott/nlp-final-project.git
!cd nlp-final-project && git checkout 9fa14f5a9846eb5bbd56256f55ece101f7d9f54a
!mv nlp-final-project/* .

Cloning into 'nlp-final-project'...
remote: Enumerating objects: 24, done.[K
remote: Counting objects:   4% (1/24)   [Kremote: Counting objects:   8% (2/24)   [Kremote: Counting objects:  12% (3/24)   [Kremote: Counting objects:  16% (4/24)   [Kremote: Counting objects:  20% (5/24)   [Kremote: Counting objects:  25% (6/24)   [Kremote: Counting objects:  29% (7/24)   [Kremote: Counting objects:  33% (8/24)   [Kremote: Counting objects:  37% (9/24)   [Kremote: Counting objects:  41% (10/24)   [Kremote: Counting objects:  45% (11/24)   [Kremote: Counting objects:  50% (12/24)   [Kremote: Counting objects:  54% (13/24)   [Kremote: Counting objects:  58% (14/24)   [Kremote: Counting objects:  62% (15/24)   [Kremote: Counting objects:  66% (16/24)   [Kremote: Counting objects:  70% (17/24)   [Kremote: Counting objects:  75% (18/24)   [Kremote: Counting objects:  79% (19/24)   [Kremote: Counting objects:  83% (20/24)   [Kremote: Counting objects:  87% 

In [3]:
# memory footprint support libraries/code
!ln -sf /opt/bin/nvidia-smi /usr/bin/nvidia-smi
!pip install gputil
!pip install psutil
!pip install humanize
import psutil
import humanize
import os
import GPUtil as GPU
GPUs = GPU.getGPUs()
# XXX: only one GPU on Colab and isn’t guaranteed
gpu = GPUs[0]
def printm():
 process = psutil.Process(os.getpid())
 print("Gen RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ), " | Proc size: " + humanize.naturalsize( process.memory_info().rss))
 print("GPU RAM Free: {0:.0f}MB | Used: {1:.0f}MB | Util {2:3.0f}% | Total {3:.0f}MB".format(gpu.memoryFree, gpu.memoryUsed, gpu.memoryUtil*100, gpu.memoryTotal))
printm() 

Gen RAM Free: 12.9 GB  | Proc size: 120.3 MB
GPU RAM Free: 15079MB | Used: 0MB | Util   0% | Total 15079MB


In [0]:
import torch 
import torch.nn as nn
import torch.optim as optim 
from torch.utils import data
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms

In [5]:
import embedding

normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])

transform_train = transforms.Compose([
    transforms.RandomHorizontalFlip(),
    transforms.RandomResizedCrop(224),
    transforms.ToTensor(),
    normalize,
])

transform_test = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    normalize,
])

word_embedding_file = 'glove.6B.300d-relativized.txt'
word_embeddings = embedding.read_word_embeddings(word_embedding_file)

Read in 17615 vectors of size 300


In [0]:
import time
from collections import defaultdict
import json
import copy
import cv2
import io
import os
from PIL import Image
import h5py
import random

class VQADataset(data.Dataset):

    def __init__(self, dataset_path: str, transform, 
            word_embeddings: embedding.WordEmbeddings, mode: str = 'train'):
        self.word_embeddings = word_embeddings
        answer_frequency = defaultdict(int)
        questions_dict = \
        json.loads(open('v2_OpenEnded_mscoco_{}2014_questions.json'.format(mode)).read())['questions']
        annotations_dict = \
        json.loads(open('v2_mscoco_{}2014_annotations.json'.format(mode)).read())['annotations']
        print("JSON loaded.")

        year = '2015' if mode == 'test' else '2014'
        img_prefix = "COCO_" + mode + year + "_"
        dataset_dict = {}
        while questions_dict:
            question = questions_dict.pop()
            image_id = '{:012d}'.format(question['image_id'])
            image_name = img_prefix + image_id + ".jpg"
            dataset_dict[question['question_id']] = {'question': question['question'], 
                                                     'image_name': image_name}
        while annotations_dict:
            annotation = annotations_dict.pop()
            dataset_dict[annotation['question_id']]['multiple_choice_answer'] = copy.deepcopy(annotation['multiple_choice_answer'])
            answer_frequency[annotation['multiple_choice_answer']] += 1
        print("Combined questions and answers.")
        del questions_dict, annotations_dict

        top_answers = sorted([(v, k) for k, v in answer_frequency.items()], reverse=True)[:1000]
        print("Done sorting.")
        answer_to_idx = {ans: idx for idx, (_, ans) in enumerate(top_answers)}
        del top_answers, answer_frequency
        self.dataset = defaultdict(list)
        
        while dataset_dict:
            k, data = dataset_dict.popitem()
            if data['multiple_choice_answer'] in answer_to_idx:
                data['answer_index'] = torch.tensor(answer_to_idx[data['multiple_choice_answer']])
                self.dataset[data['image_name']].append(data)

        del dataset_dict
        print('Pruned examples that\'s not part of top 1000 answer choices')
        self.image_names = [k for k in self.dataset.keys()]
        self.mode = mode
        self.dataset_path = dataset_path
        self.transform = transform
        self.keys = None
        self.shuffle()
        self.last_image_name = ''
        self.last_img = None

    def __len__(self) -> int:
        return len(self.keys)

    def __getitem__(self, idx: int):
        image_name, data_idx = self.keys[idx]
        data = self.dataset[image_name][data_idx]
        question_embedding = torch.tensor([self.word_embeddings.get_embedding(word) for word in data['question']], dtype=torch.float)
        if image_name == self.last_image_name:
          img = self.last_img
        else:
          with open(os.path.join(self.dataset_path, image_name), 'rb') as f:
            img = Image.open(f).convert('RGB')
            self.last_img = img
            self.last_image_name = image_name
  #         f = os.path.join(self.dataset_path, image_name)
  #         img = Image.fromarray(cv2.cvtColor(cv2.imread(f), cv2.COLOR_BGR2RGB))
        question_embedding = F.pad(question_embedding, pad=(0, 0, 60-question_embedding.shape[0], 0))
        t_img = self.transform(img)
        return t_img, question_embedding, data['answer_index']
    
    def shuffle(self):
        random.shuffle(self.image_names)
        self.keys = [(image_name, idx)
                     for image_name in self.image_names
                     for idx in range(len(self.dataset[image_name]))]


In [7]:
vqa_train = VQADataset('train2014', transform_train, word_embeddings, 'train')
vqa_test = VQADataset('val2014', transform_test, word_embeddings, 'val')

JSON loaded.
Combined questions and answers.
Done sorting.
Pruned examples that's not part of top 1000 answer choices
JSON loaded.
Combined questions and answers.
Done sorting.
Pruned examples that's not part of top 1000 answer choices


In [0]:
train_loader = DataLoader(
    vqa_train,
    batch_size=512, shuffle=False, 
    num_workers=4, drop_last=False
)

test_loader = DataLoader(
    vqa_test,
    batch_size=512, shuffle=False, 
    num_workers=4, drop_last=False
)

In [0]:
import models
model = models.FeedForward().cuda()

In [10]:
model.load_state_dict(torch.load('start2268.pth'))

IncompatibleKeys(missing_keys=[], unexpected_keys=[])

In [0]:
criterion = nn.CrossEntropyLoss().cuda()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=5e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.1, patience=7, verbose=True)

In [0]:
def accuracy(loader):
  with torch.no_grad():
    total = 0
    correct = 0
    for image, question, answer in loader:
      image, question, answer = image.cuda(), question.cuda(), answer.cuda()
      out = model(image, question)
      _, predicted = torch.max(out.data, 1)
      total += answer.size(0)
      correct += predicted.eq(answer.data).sum().item()
      del image, question, answer, out, predicted
    return correct/total

In [0]:
best_acc = 0

In [20]:
import time
from google.colab import files

for epoch in range(0, 100):
  train_correct = 0
  train_total = 0
  train_loss = 0
  
  epoch_start_time = time.time()
  start_time = time.time()
  for batch_idx, (image, question, answer) in enumerate(train_loader):
    optimizer.zero_grad()
    image, question, answer = image.cuda(), question.cuda(), answer.cuda()
    
    out = model(image, question)
    loss = criterion(out, answer)
    loss.backward()
    optimizer.step()
    
    train_loss += loss.item()
    _, predicted = torch.max(out.data, 1)
    train_total += answer.size(0)
    train_correct += predicted.eq(answer.data).sum().item()
    
    del image, question, answer, out, predicted
    
    print('\r| Epoch [%3d] Iter[%3d] Time: [%.3f] Avg Time: [%.3f]'
          '\t\tLoss: %.4f Acc@1: %.3f' 
              % (epoch, batch_idx+1, time.time()-start_time, 
                 (time.time()-epoch_start_time)/(batch_idx+1),
                 loss.item(), 100.*train_correct/train_total), end='')
    
    start_time = time.time()
    
  epoch_start_time = time.time()
  test_acc = accuracy(test_loader)
  print('\n| Epoch [%3d] Time: [%.3f] Avg Time: [%.3f] \tTest Acc: %.3f' 
        % (epoch, time.time()-epoch_start_time, 
           (time.time()-epoch_start_time)/len(test_loader), test_acc))
  
  scheduler.step(test_acc)
  
  if best_acc < test_acc:
    best_acc = test_acc
    file = 'epoch[%d]acc[%d].pth' %(epoch, int(test_acc * 10000))
    torch.save(model.state_dict(), file)
    
  vqa_train.shuffle()

| Epoch [  0] Iter[759] Time: [0.132] Avg Time: [2.134]		Loss: 3.7415 Acc@1: 22.260
| Epoch [  0] Time: [951.404] Avg Time: [2.599] 	Test Acc: 0.221


MessageError: ignored