In [6]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import variable
import torchvision as tv
import nntools as nt
import torch

In [7]:
class NNClassifier(nt.NeuralNetwork):
    """Base class for classification networks trained with cross-entropy.

    Subclasses inherit ``criterion`` and only need to provide ``forward``.
    """

    def __init__(self):
        super(NNClassifier, self).__init__()
        # The loss module is stored as an attribute; criterion() delegates to it.
        self.cross_entropy = nn.CrossEntropyLoss()

    def criterion(self, y, d):
        """Return the cross-entropy loss of predictions ``y`` against targets ``d``."""
        loss = self.cross_entropy(y, d)
        return loss

In [8]:
class VGGNet(nn.Module):
    """Image encoder: VGG16-BN convolutional features projected per region.

    The convolutional feature map is treated as a set of H*W image regions,
    each described by a 512-channel vector. Every region vector is mapped by
    a shared linear layer to ``output_features`` dimensions, followed by tanh
    and dropout.

    Args:
        output_features: size of each projected region vector.
        fine_tuning: if True the pretrained VGG weights stay trainable;
            if False (default) they are frozen.
    """

    def __init__(self, output_features, fine_tuning=False):
        super(VGGNet, self).__init__()
        vgg = tv.models.vgg16_bn(pretrained=True)

        # Freeze the pretrained weights unless fine-tuning was requested.
        for param in vgg.parameters():
            param.requires_grad = fine_tuning

        self.features = vgg.features

        # Channel count of the feature map consumed by the projection below.
        self.num_fts = 512
        self.output_features = output_features

        # Shared per-region projection: num_fts -> output_features.
        self.classifier = nn.Linear(self.num_fts, self.output_features)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Encode a batch of images into per-region feature vectors.

        Args:
            x: image batch accepted by ``self.features``; the resulting map
               is expected to be (batch, num_fts, H, W).

        Returns:
            Tensor of shape (batch, H*W, output_features). The earlier code
            assumed 196 regions (a 14x14 map); the region count is now taken
            from the feature map itself.
        """
        h = self.features(x)
        # Move channels last so that each row really is one spatial location.
        # A plain view(-1, num_fts) on a (B, C, H, W) tensor would reinterpret
        # memory and mix values from different channels and positions.
        h = h.reshape(h.size(0), self.num_fts, -1).transpose(1, 2)
        # nn.Linear acts on the last dimension, i.e. independently per region.
        h = self.classifier(h)
        y = self.dropout(self.tanh(h))

        return y

In [9]:
class LSTM(nn.Module):
    """Question encoder: linear word embedding followed by an LSTM.

    Args:
        vocab_size: length of each input word vector (e.g. a one-hot vector).
        embedding_dim: size of the word embedding fed to the LSTM.
        num_layers: number of stacked LSTM layers.
        batch_size: kept for backward compatibility only; the batch size is
            now taken from the input, so this value is not used in forward.
        hidden_dim: size of the LSTM hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, num_layers=1, batch_size=100, hidden_dim=1024):
        super(LSTM, self).__init__()
        self.vocab_size = vocab_size
        self.batch_size = batch_size
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers

        # Bias-free linear map from vocabulary-sized vectors to embeddings.
        self.linear = nn.Linear(vocab_size, embedding_dim, bias=False)

        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim,
                            num_layers=num_layers)

    def forward(self, question_vec):
        """Encode a batch of questions.

        Args:
            question_vec: tensor of shape (batch, seq_len, vocab_size).

        Returns:
            Final hidden state of shape (num_layers, batch, hidden_dim).
        """
        # nn.LSTM (batch_first=False) expects (seq_len, batch, features).
        q = question_vec.transpose(0, 1)

        # The linear layer acts on the last dimension, so h is already
        # (seq_len, batch, embedding_dim); no reshape is needed.
        h = self.linear(q)

        # Omitting the initial state makes nn.LSTM start from zeros with the
        # right (num_layers, batch, hidden_dim) shape on the input's device.
        # The previous random h0/c0 made outputs non-deterministic and only
        # worked for num_layers == 1 and batch == self.batch_size.
        _, (hidden_state, _) = self.lstm(h)

        return hidden_state

In [11]:
# vocab = 13744*[0]
# embedded_dim = 1000
# dropout_ratio = 0.5
# question_vec = torch.ones(len(vocab))
# print(question_vec.shape)
# lstm = LSTM(vocab, embedded_dim, dropout_ratio)
# y = lstm(question_vec)

In [10]:
class AttentionNet(nn.Module):
    """Two stacked attention layers over image regions, then an answer softmax.

    Notation (d = input_features, k = output_features, m = number of regions):
      * image regions v_i: m vectors of size d
      * question / query vector u: size d
      * each attention layer scores region i with
        fc(tanh(W_i v_i + W_q u + b)), normalises the scores over the m
        regions with a softmax, and returns the weighted sum of the regions.
      * the query is refined as u_{t+1} = u_t + attended_t.

    Args:
        numClasses: number of answer classes.
        input_features: d, size of question and region vectors (default 1024).
        output_features: k, size of the attention hidden layer (default 512).
    """

    def __init__(self, numClasses, input_features=1024, output_features=512):
        super(AttentionNet, self).__init__()
        self.num_classes = numClasses
        self.input_features = input_features
        self.output_features = output_features

        # First attention layer.
        self.q_transform1 = nn.Linear(input_features, output_features)
        self.image_transform1 = nn.Linear(input_features, output_features, bias=False)
        self.fc31 = nn.Linear(output_features, 1)

        # Second attention layer.
        self.q_transform2 = nn.Linear(input_features, output_features)
        self.image_transform2 = nn.Linear(input_features, output_features, bias=False)
        self.fc32 = nn.Linear(output_features, 1)

        self.answerDist = nn.Linear(input_features, self.num_classes)

        self.tanh = nn.Tanh()
        # An explicit dim is required: the implicit-dim softmax picks its axis
        # from the tensor rank and would not normalise over the regions.
        self.softmax = nn.Softmax(dim=-1)
        # Defined for completeness; forward() does not apply dropout.
        self.dropout = nn.Dropout(0.5)

    def _attend(self, regions, query, image_transform, q_transform, score):
        """Run one attention layer; returns the attended vector, shape (B, d)."""
        # (B, m, k) + (B, 1, k): the query projection is broadcast over regions.
        hA = self.tanh(image_transform(regions) + q_transform(query).unsqueeze(1))
        # (B, m): one attention weight per region, summing to 1 per sample.
        pI = self.softmax(score(hA).squeeze(-1))
        # Weighted sum of the region vectors.
        return (pI.unsqueeze(-1) * regions).sum(dim=1)

    def forward(self, question_vec, image_vec):
        """Compute the answer distribution.

        Args:
            question_vec: tensor whose last dimension is d; leading dimensions
                (if any) are flattened into the batch dimension B.
            image_vec: tensor of shape (..., m, d) whose leading dimensions
                flatten to the same B.

        Returns:
            Softmax probabilities with the leading shape of ``question_vec``
            and a last dimension of ``num_classes`` (a 1-D question yields
            shape (1, num_classes)).

        Raises:
            ValueError: if the feature size or batch sizes do not match.
        """
        d = self.input_features
        if question_vec.shape[-1] != d or image_vec.shape[-1] != d:
            raise ValueError(
                "expected last dimension %d, got question %s and image %s"
                % (d, tuple(question_vec.shape), tuple(image_vec.shape)))

        lead_shape = tuple(question_vec.shape[:-1]) or (1,)
        u_0 = question_vec.reshape(-1, d)                           # (B, d)
        regions = image_vec.reshape(-1, image_vec.shape[-2], d)     # (B, m, d)
        if regions.shape[0] != u_0.shape[0]:
            raise ValueError(
                "batch mismatch: %d question vectors vs %d images"
                % (u_0.shape[0], regions.shape[0]))

        # First attention layer, queried with the raw question vector.
        v_0 = self._attend(regions, u_0, self.image_transform1,
                           self.q_transform1, self.fc31)
        u_1 = v_0 + u_0

        # Second attention layer, queried with the refined vector u_1.
        v_1 = self._attend(regions, u_1, self.image_transform2,
                           self.q_transform2, self.fc32)
        u_2 = v_1 + u_1

        # Final answer distribution over the classes.
        pI = self.softmax(self.answerDist(u_2))

        return pI.reshape(lead_shape + (self.num_classes,))

In [11]:
class SAN(nn.Module):
    """Image question-answering model built from three sub-networks.

    An image encoder (VGGNet), a question encoder (LSTM) and the attention /
    answer layers (AttentionNet) are chained in ``forward``.

    Args:
        num_classes: number of answer classes.
        output_features: size of each image-region vector produced by the
            image encoder. NOTE(review): this has to agree with the LSTM
            hidden size and the attention input size, which are left at their
            defaults here (1024 in this notebook) - confirm when changing it.
        vocab_size: length of each input word vector.
        embedding_dim: size of the word embedding.
        fine_tuning: forwarded to the image encoder; if True the pretrained
            VGG weights stay trainable.
    """

    def __init__(self, num_classes, output_features, vocab_size, embedding_dim, fine_tuning=False):
        super(SAN, self).__init__()
        # fine_tuning was previously accepted but never passed on.
        self.imageEmbedder = VGGNet(output_features, fine_tuning=fine_tuning)
        self.questionEmbedder = LSTM(vocab_size, embedding_dim)
        self.attentionLayers = AttentionNet(num_classes)

    def forward(self, question, image):
        """Return the answer distribution for a (question, image) batch."""
        # Per-region image features.
        imageEmbedding = self.imageEmbedder(image)
        # The question encoder returns one final hidden state per LSTM layer,
        # shaped (num_layers, batch, hidden); keep the top layer -> (batch, hidden).
        questionEmbedding = self.questionEmbedder(question)[-1]
        return self.attentionLayers(questionEmbedding, imageEmbedding)
        
        

In [12]:
# Instantiate the full model; keyword arguments make the sizes self-describing.
san = SAN(
    num_classes=1000,
    output_features=1024,
    vocab_size=13766,
    embedding_dim=1000,
)


In [2]:
# for name, param in san.named_parameters():
#     print(name, param.size(), param.requires_grad)

In [3]:
# san.state_dict

In [13]:
# Display the module tree of the instantiated network as the cell output.
san

SAN(
  (imageEmbedder): VGGNet(
    (features): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace)
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace)
      (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (9): ReLU(inplace)
      (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (12): ReLU(inplace)
      (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_