In [None]:
# https://discuss.pytorch.org/t/call-backward-on-function-inside-a-backpropagation-step/3793
# https://discuss.pytorch.org/t/implementing-a-custom-convolution-using-conv2d-input-and-conv2d-weight/18556
# https://discuss.pytorch.org/t/implementing-a-custom-convolution-using-conv2d-input-and-conv2d-weight/18556/21

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms

import numpy as np

In [3]:

class Conv2dFunction(torch.autograd.Function):
    @staticmethod
    def forward(ctx, input, weight, bias=None, stride=1, padding=1, dilation=1, groups=1):
        # Save arguments to context to use on backward
        # WARNING : if stride, padding, dilation etc is array, this will not work properly!!!!
#         print('stride', stride)
        if weight.shape[2] == 1 :
            padding = 0
        elif weight.shape[2] == 5 :
            padding = 2
        elif weight.shape[2] == 7 :
            padding = 3
        confs = torch.from_numpy(np.array([stride, padding, dilation, groups]))
        out = F.conv2d(input, weight, bias=bias, stride=stride, padding=padding, dilation=dilation, groups=groups)
        ctx.save_for_backward(input, out, weight, bias, confs)

        # Compute Convolution
        return out
    
    @staticmethod
    def backward(ctx, grad_output):
        # Load saved tensors
        input, out, weight, bias, confs = ctx.saved_variables
        confs = confs.numpy()
        stride, padding, dilation, groups= confs[0], confs[1], confs[2], confs[3]

        # Calculate Gradient
        grad_input = grad_weight = grad_bias = None
#         print('grad_output', grad_output.shape)
#         print('out', out.shape)
#         print(out[0][0][0])
        grad_output = grad_output * 2*torch.sigmoid(out)
        if ctx.needs_input_grad[0]:
            grad_input = torch.nn.grad.conv2d_input(input.shape, weight, grad_output, stride, padding, dilation, groups)
            
        if ctx.needs_input_grad[1]:
            grad_weight = torch.nn.grad.conv2d_weight(input, weight.shape, grad_output, stride, padding, dilation, groups)
                
        # WARNING : Bias maybe buggy, remove if it is buggy
        if bias is not None and ctx.needs_input_grad[2]:
            grad_bias = grad_output.sum(0).squeeze(0)


        # WARNING : Bias maybe buggy, remove if it is buggy
        if bias is not None:
            return grad_input, grad_weight, grad_bias, None, None, None, None
        else:
            return grad_input, grad_weight, None, None, None, None, None

In [4]:
from torch.autograd import gradcheck
conv = Conv2dFunction.apply
# gradcheck takes a tuple of tensors as input, check if your gradient
# evaluated with these tensors are close enough to numerical
# approximations and returns True if they all verify this condition.
# input = (torch.randn(20,20,dtype=torch.double,requires_grad=True), torch.randn(30,20,dtype=torch.double,requires_grad=True))
# test = gradcheck(linear, input, eps=1e-6, atol=1e-4)
# print(test)


In [5]:
# device = torch.device("cpu")
device = torch.device("cuda")
dtype = torch.float
transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

batch_size = 64

trainset = torchvision.datasets.CIFAR10(root='../data', train=True,
                                        download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size,
                                          shuffle=True, num_workers=4)

testset = torchvision.datasets.CIFAR10(root='../data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=batch_size,
                                         shuffle=False, num_workers=4)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Files already downloaded and verified
Files already downloaded and verified


In [6]:
# N, C_in, C_out, K_size = batch_size, 3, 12, 3
# Create random Tensors for weights.
conw1 = torch.randn(8,3,5,5, device=device, dtype=dtype, requires_grad=True)
conw2 = torch.randn(32,8,3,3, device=device, dtype=dtype, requires_grad=True)
conw3 = torch.randn(128,32,3,3, device=device, dtype=dtype, requires_grad=True)
conw4 = torch.randn(128,128,3,3, device=device, dtype=dtype, requires_grad=True)
conw5 = torch.randn(10,128,1,1, device=device, dtype=dtype, requires_grad=True)

conw1 = torch.nn.init.xavier_uniform_(conw1, gain=1.0)
conw2 = torch.nn.init.xavier_uniform_(conw2, gain=1.0)
conw3 = torch.nn.init.xavier_uniform_(conw3, gain=1.0)
conw4 = torch.nn.init.xavier_uniform_(conw4, gain=1.0)
conw5 = torch.nn.init.xavier_uniform_(conw5, gain=1.0)

# print(conw1[0][0])
# print(torch.nn.init.xavier_uniform_(conw1, gain=1.0)[0][0])
# print(conw2)

In [7]:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = Conv2dFunction.apply
        self.conv2 = Conv2dFunction.apply
        self.conv3 = Conv2dFunction.apply
        self.conv4 = Conv2dFunction.apply
        self.conv5 = Conv2dFunction.apply
        self.avgpool = torch.nn.AvgPool2d((16,16) ,stride=(16,16))
        self.maxpool = torch.nn.MaxPool2d((2,2), stride=(2,2))
        self.linear = torch.nn.Linear(128, 10)
        self.act = torch.nn.ReLU()

    def forward(self, x, w1, w2, w3, w4, w5):
#         x = self.conv1(x, w1)
#         x = self.act(x)
#         x = self.conv2(x, w2)
#         x = self.maxpool(x)
#         x = self.conv3(x, w3)
#         x = self.act(x)
#         x = self.conv4(x, w4)
#         x = self.avgpool(x)
#         x = torch.squeeze(x)
#         x = self.linear(x)
#         x = torch.nn.Softmax(dim=1)(x)

        x = self.conv1(x, w1)
        x = self.act(x)
        x = torch.nn.BatchNorm2d(8).to(device)(x)
        x = self.act(self.conv2(x, w2))
        x = self.maxpool(x)
        x = torch.nn.BatchNorm2d(32).to(device)(x)
        x = self.conv3(x, w3)
        x = self.act(x)
        x = torch.nn.BatchNorm2d(128).to(device)(x)
        x = self.act(self.conv4(x, w4))
        x = torch.nn.BatchNorm2d(128).to(device)(x)
        x = self.conv5(x, w5)
        x = self.avgpool(x)
        x = torch.squeeze(x)
#         x = self.linear(x)
#         x = torch.nn.Softmax(dim=1)(x)
        x = torch.sigmoid(x)
        
        return x

In [8]:
net = Net().to(device)
criterion = nn.CrossEntropyLoss().to(device)

In [9]:
image, label = iter(trainloader).next()
print(image.shape)

torch.Size([64, 3, 32, 32])


In [10]:
image, labels = iter(trainloader).next()
outputs = net(image.to(device), conw1, conw2, conw3, conw4, conw5).to(device)
print(outputs.sum(dim=1))
# print(outputs)
loss = criterion(outputs, labels.to(device))
loss.backward()    


tensor([4.8589, 5.2288, 5.2041, 4.9250, 4.7712, 4.7179, 5.4285, 5.1389, 4.7442,
        4.7521, 4.9298, 5.0724, 5.5307, 4.8354, 5.1092, 5.3463, 4.9261, 4.8634,
        5.1043, 4.9204, 5.0538, 5.0692, 4.8275, 4.8279, 5.0406, 4.9396, 4.7499,
        4.7144, 5.0970, 5.1474, 4.9150, 4.9293, 4.8275, 5.0183, 5.0233, 4.8837,
        5.0943, 5.3667, 4.8771, 4.7986, 5.2493, 5.1001, 4.8170, 5.2058, 5.3425,
        4.6808, 5.0651, 5.1137, 5.5452, 5.0938, 4.8380, 4.8467, 4.5852, 5.0486,
        5.0078, 4.7921, 4.8303, 4.9594, 4.8958, 5.4426, 4.9001, 5.1166, 4.8557,
        5.1766], device='cuda:0', grad_fn=<SumBackward1>)




In [11]:
# conw5.grad

In [12]:
def test (model, w1, w2, w3, w4, w5) :
    correct = 0
    total = 0
    # since we're not training, we don't need to calculate the gradients for our outputs
    with torch.no_grad():
        for data in testloader:
            inputs, labels = data

            outputs = model(inputs.to(device), w1, w2, w3, w4, w5)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels.to(device)).sum().item()

    print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))

In [13]:
# lr_list = [0.05, 0.01, 0.01, 0.002, 0.001, 0.001, 0.0002, 0.0001, 0.00002, 0.00001]
lr_list = [0.05] * 10
criterion = nn.CrossEntropyLoss().to(device)
for epoch in range(10) :    
    running_loss = 0.0
    learning_rate = lr_list[epoch]
    for i, data in enumerate(trainloader, 0):
        inputs, labels = data

        outputs = net(inputs.to(device), conw1, conw2, conw3, conw4, conw5)
#         print(outputs.shape)
#         print(labels.shape)
        loss = criterion(outputs, labels.to(device))
#         print(loss)
        loss.backward()    

        # print statistics
        running_loss += loss.item()
        if i % 200 == 0:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 200))
            running_loss = 0.0 
#             print(conw1[0][0][0])
#             print(conw5[0][0][0])
#             print(conw4.grad)
            
        with torch.no_grad():
            # Update weights using gradient descent
            conw1 -= learning_rate * conw1.grad
            conw2 -= learning_rate * conw2.grad
            conw3 -= learning_rate * conw3.grad
            conw4 -= learning_rate * conw4.grad
            conw5 -= learning_rate * conw5.grad

            # Manually zero the gradients after running the backward pass
            conw1.grad.zero_()
            conw2.grad.zero_()   
            conw3.grad.zero_()
            conw4.grad.zero_()       
            conw5.grad.zero_()                   
            
    test(net, conw1, conw2, conw3, conw4, conw5)



[1,     1] loss: 0.012
tensor([0.1362], device='cuda:0', grad_fn=<SelectBackward>)
[1,   201] loss: 2.084
tensor([0.1382], device='cuda:0', grad_fn=<SelectBackward>)
[1,   401] loss: 2.026
tensor([0.1388], device='cuda:0', grad_fn=<SelectBackward>)
[1,   601] loss: 2.005
tensor([0.1449], device='cuda:0', grad_fn=<SelectBackward>)
Accuracy of the network on the 10000 test images: 42 %
[2,     1] loss: 0.010
tensor([0.1475], device='cuda:0', grad_fn=<SelectBackward>)
[2,   201] loss: 1.977
tensor([0.1512], device='cuda:0', grad_fn=<SelectBackward>)
[2,   401] loss: 1.973
tensor([0.1548], device='cuda:0', grad_fn=<SelectBackward>)
[2,   601] loss: 1.967
tensor([0.1580], device='cuda:0', grad_fn=<SelectBackward>)
Accuracy of the network on the 10000 test images: 44 %
[3,     1] loss: 0.010
tensor([0.1602], device='cuda:0', grad_fn=<SelectBackward>)
[3,   201] loss: 1.952
tensor([0.1626], device='cuda:0', grad_fn=<SelectBackward>)
[3,   401] loss: 1.950
tensor([0.1648], device='cuda:0', gra

In [14]:
# filters = torch.randn(12, 3, 3, 3)
# inputs = torch.randn(2, 3, 32, 32)
# F.conv2d(inputs, filters, padding=1).shape

In [15]:
import torch
x = torch.randn(4,2)
print(x.shape)
torch.nn.Softmax(dim=1)(x)

torch.Size([4, 2])


tensor([[0.2950, 0.7050],
        [0.2994, 0.7006],
        [0.3669, 0.6331],
        [0.6751, 0.3249]])