In [1]:
%matplotlib notebook
%run preprocess.ipynb
%run models.ipynb

In [2]:
import os
import numpy as np
import shutil
import time
import json
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torchvision as tv
import nntools as nt
import torch

In [204]:
def train_and_validation(model, dataloader, lr=0.01, rho=0.9, mode='training'):
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    
    running_loss, running_accuracy, num_updates = 0.0, 0.0, 0.0
    model.train() 
    
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=rho)
    
    # Iterate over data.
    for image, question, answer in dataloader:
        if (device == 'cuda'):
            image, question, answer = image.cuda(), question.cuda(), answer.cuda()

        image, question, answer = Variable(image), Variable(question), Variable(answer)

        # zero grad
        optimizer.zero_grad()
        predicted_answer = model(image, question)
        

        answer = answer.squeeze().transpose(0, 1)
        predicted_answer = predicted_answer.squeeze().transpose(0,1)
        _, y_pred = torch.max(predicted_answer, 1)
        _, class_indices = torch.max(answer, 1)        

        answer = torch.tensor(answer, dtype=torch.long, device=device)
        
        loss = criterion(predicted_answer, class_indices)
        print(loss.item())
        
        if(mode=='training'):
            # backward + optimize
            loss.backward()
            optimizer.step()

        # statistics
        with torch.no_grad():
            running_loss += loss.item()
            running_accuracy += torch.sum((y_pred == class_indices).data)
    
        num_updates += 1

    loss = running_loss / num_updates
    acc = (running_accuracy / len(dataloader.dataset)) * 100
    print('Train Loss: {:.4f} Acc: {:2.3f} ({}/{})'.format(loss, acc, running_accuracy, num_updates))
    
    return loss, acc

In [205]:
def train_model(model, train_loader, validation_loader, save_dir, num_epochs=25, best_accuracy=0, start_epoch=0):
    best_model_wts = model.state_dict()
    best_acc = best_accuracy
    
    training_loss_history = []
    training_accuracy_history = []

    validation_loss_history = []
    validation_accuracy_history = []

    for epoch in range(start_epoch, num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)
        
        train_begin = time.time()
        train_loss, train_acc = train_and_validation(model, train_loader, mode='training') 
        training_loss_history.append(train_loss)
        training_accuracy_history.append(train_acc)    
        print('Epoch= ' + str(epoch) + ' Train Loss= ' + str(train_loss))

        validation_begin = time.time()
        val_loss, val_acc = train_and_validation(model, train_loader, mode='validation')
        validation_loss_history.append(train_loss)
        validation_accuracy_history.append(train_acc)  
        print('Epoch= ' + str(epoch) + ' Validation Loss= ' + str(train_loss))

        # deep copy the model
        is_best = val_acc > best_acc
        if is_best:
            best_acc = val_acc
            best_model_wts = model.state_dict()

        save_checkpoint(save_dir, {
            'epoch': epoch,
            'best_acc': best_acc,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'training_history': [training_loss_history, training_accuracy_history],
            'validation_history': [validation_loss_history, validation_accuracy_history]
        }, is_best)

    print('Best val Acc: {:4f}'.format(best_acc))
    
    # load best model weights
    model.load_state_dict(best_model_wts)

    return model

In [206]:
def save_checkpoint(save_dir, state, is_best):
    savepath = save_dir + '/' + 'checkpoint.pth.tar'
    torch.save(state, savepath)
    if is_best:
        shutil.copyfile(savepath, save_dir + '/' + 'model_best.pth.tar')

In [207]:
indices = np.random.permutation(len(train))
train_ind = indices[:int(len(train)*0.8)]
val_ind = indices[int(len(train)*0.8):]

train_sampler = torch.utils.data.sampler.SubsetRandomSampler(train_ind)
val_sampler = torch.utils.data.sampler.SubsetRandomSampler(val_ind)

In [208]:
train_loader = torch.utils.data.DataLoader(train, batch_size=10, pin_memory=True, sampler=train_sampler)
val_loader = torch.utils.data.DataLoader(train, batch_size=10, pin_memory=True, sampler=val_sampler)

In [209]:
%run models.ipynb
san = SAN(num_classes=1000, batch_size=10, vocab_size=len(train.vocab_q), embedding_dim=1000,
          output_vgg=1024, input_attention=1024, output_attention=512)
san = san.to('cuda')

In [210]:
for i in train_loader: 
    y = san(i[0].to('cuda'),i[1].to('cuda'))
    break
print(y.shape)
_, y_pred = torch.max(y, 2)
print(y_pred.shape)
print(y_pred)

torch.Size([10, 1, 1000])
torch.Size([10, 1])
tensor([[674],
        [517],
        [954],
        [ 56],
        [743],
        [ 78],
        [143],
        [185],
        [171],
        [ 47]], device='cuda:0')


In [211]:
train_model(san, train_loader, val_loader, 'test', num_epochs=1, best_accuracy=0, start_epoch=0)

Epoch 0/0
----------




6.409224987030029
6.999348163604736
6.444126129150391
6.424279689788818
5.944341659545898
6.297928333282471
6.364412307739258
6.029438018798828
6.444243431091309
5.758861541748047
7.448818206787109
5.716835021972656
5.683062553405762
6.019190311431885
5.53284215927124
6.581063747406006
6.639585018157959
6.936239719390869
6.322841167449951
6.184286594390869
6.697512626647949
6.6117987632751465
6.448841094970703
6.53084659576416
6.412446022033691
7.214198112487793
7.837355136871338
8.112953186035156
7.638538837432861
8.678786277770996
7.842965602874756
9.308819770812988
7.269656658172607
7.902585983276367
9.852928161621094
9.378305435180664
10.131352424621582
11.932426452636719
10.703460693359375
9.725846290588379
10.950457572937012
14.05360221862793
17.588022232055664
13.436834335327148
13.856095314025879
17.214458465576172
16.363698959350586
14.676942825317383
14.202319145202637
13.759724617004395
15.607967376708984
17.28377342224121
16.198196411132812
24.730485916137695
16.85327720642

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



62.40434265136719
Traceback (most recent call last):
  File "/opt/conda/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2961, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-211-7ed13cd64066>", line 1, in <module>
    train_model(san, train_loader, val_loader, 'test', num_epochs=1, best_accuracy=0, start_epoch=0)
  File "<ipython-input-205-b7708ecb0f9c>", line 16, in train_model
    train_loss, train_acc = train_and_validation(model, train_loader, mode='training')
  File "<ipython-input-204-61a92009b27e>", line 11, in train_and_validation
    for image, question, answer in dataloader:
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 615, in __next__
    batch = self.collate_fn([self.dataset[i] for i in indices])
  File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 615, in <listcomp>
    batch = self.collate_fn([self.dataset[i] for i in indices])
  File "<ipyt

KeyboardInterrupt: 