In [58]:
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import variable
import torchvision as tv
import nntools as nt
import torch

In [52]:
class NNClassifier(nt.NeuralNetwork):
    def __init__(self):
        super(NNClassifier, self).__init__()
        self.cross_entropy = nn.CrossEntropyLoss()
    
    def criterion(self, y, d):
        return self.cross_entropy(y, d)

In [72]:
class VGGNet(NNClassifier):
    def __init__(self, num_classes, fine_tuning=False):
        super(VGGNet, self).__init__()
        vgg = tv.models.vgg16_bn(pretrained=True)
        
        for param in vgg.parameters():
            param.requires_grad = fine_tuning 
            
        self.features = vgg.features
        
        self.num_fts = 512
        self.num_classes = num_classes
        
        # Linear layer goes from 512 to 1024
        print(self.num_fts)
        print(self.num_classes)
        self.classifier = nn.Linear(self.num_fts, self.num_classes)
        self.tanh = nn.Tanh()
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, x):
        print('in forward')
        h = self.features(x)
        h = self.classifier(h.view(-1, self.num_fts)).view(-1, 196, self.num_classes)
        y = self.dropout(self.tanh(h))
        
        return y

In [190]:
class LSTM(NNClassifier): 
    def __init__(self, vocab, embedding_dim, dropout_ratio,num_layers=1,hidden_dim=1024):
        super(LSTM,self).__init__()
        self.vocab = vocab
        self.vocab_size = len(vocab)
        
        self.embed = nn.Embedding(self.vocab_size, embedding_dim=embedded_dim)        
        self.lstm = nn.LSTM(input_size=embedded_dim, hidden_size=hidden_dim, 
                            num_layers=num_layers, dropout=dropout_ratio)
    
    def forward(self,question_vec):
        question_embed = self.embed(question_vec)
        
        _, states = self.lstm(question_embed)
        hidden_state,_ = states
        embedding = hidden_state[0]
        
        return embedding

In [191]:
vocab = 13744*[0]
embedded_dim = 1000
dropout_ratio = 0.5
question_vec = torch.ones(len(vocab))
print(question_vec.shape)
lstm = LSTM(vocab, embedded_dim, dropout_ratio)
y = lstm(question_vec)

torch.Size([13744])


  "num_layers={}".format(dropout, num_layers))


RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.FloatTensor instead (while checking arguments for embedding)

In [183]:
class AttentionNet(NNClassifier):
    def __init__(self):
        #v_i in dxm => 1024x196 vec
        #v_q in d => 1024x1 vec
        #Wia v_i in kxm => kx196
        #will choose k => 512
        super(AttentionNet,self).__init__()
        input_features = 1024 
        output_features = 512 #k 
        image_region_size = 196
        num_classes = 1000
        
        self.q_transform1 = nn.Linear(input_features,output_features)
        self.image_transform1 = nn.Linear(input_features,output_features, bias=False)
        self.fc31 = nn.Linear(output_features,1)
        
        self.q_transform2 = nn.Linear(input_features,output_features)
        self.image_transform2 = nn.Linear(input_features,output_features, bias=False)
        self.fc32 = nn.Linear(output_features,1)
        
        self.answerDist = nn.Linear(input_features,num_classes)
        
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax()
        self.dropout = nn.Dropout(0.5)
        
    def forward(self, question_vec, image_vec):
        #do linear on fc1
        u_0 = question_vec
        q_transformation = self.q_transform1(u_0)
        #do linear on fc2 
        image_transformation = self.image_transform1(image_vec)
        #perform addition of a matrix and a vector 
        hA = self.tanh(image_transformation + q_transformation)
        #perform softmax on fc3 with result of tanh 
        x = self.fc31(hA)
        pI = self.softmax(x) #196x1
        v_0 = image_vec.view(1024, 196).matmul(pI).view(1, 1024)
        u_1 = v_0 + u_0 
        
        q_transformation2 = self.q_transform2(u_0)
        #do linear on fc2 
        image_transformation2 = self.image_transform2(image_vec)
        #perform addition of a matrix and a vector 
        hA = self.tanh(image_transformation2 + q_transformation2)
        #perform softmax on fc3 with result of tanh 
        x = self.fc32(hA)
        pI = self.softmax(x) #196x1
        v_1 = image_vec.view(1024, 196).matmul(pI).view(1, 1024)
        u_2 = v_1 + u_1 
        
        #perform softmax to get a final answer distribution
        pI = self.softmax(self.answerDist(u_2))

        return pI                

In [179]:
v_i = torch.rand(196, 1024)
print(v_i.shape)
v_q = torch.rand(1, 1024)
print(v_q.shape)
att = AttentionNet()
att(v_q, v_i)

torch.Size([196, 1024])
torch.Size([1, 1024])
q_transformation.shape torch.Size([1, 512])
image_transformation.shape torch.Size([196, 512])
hA.shape: torch.Size([196, 512])
x.shape torch.Size([196, 1])
pI.shape torch.Size([196, 1])
v_0.shape: torch.Size([1, 1024])
u_1.shape: torch.Size([1, 1024])
q_transformation2.shape torch.Size([1, 512])
image_transformation2.shape torch.Size([196, 512])
hA.shape: torch.Size([196, 512])
x.shape torch.Size([196, 1])
pI.shape torch.Size([196, 1])
v_1.shape: torch.Size([1, 1024])
u_2.shape: torch.Size([1, 1024])




tensor([[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 1.0279e-25,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 9.8555e-32,
         0.0000e+00, 0.0000e

In [14]:
class SAN(NNClassifier):
    def __init__(self, num_classes, fine_tuning=False):
        super(SAN, self).__init__()
        
        

In [133]:
input_features = 1024 
output_features = 512 #k 
image_region_size = 196
num_classes = 1000
q_transform1 = nn.Linear(in_features=input_features,out_features=output_features)


AttributeError: 'Linear' object has no attribute 'weight_matrix'