In [1]:
import numpy as np
from Models import *
from Layers import *

import Activations as a
import Loss_Functions as lf

import torch
import torch.nn as nn

# Display config for the comparisons below: show array entries with 4 decimal
# places and suppress scientific notation for small values, so the NumPy and
# PyTorch arrays printed by the later cells can be compared by eye.
np.set_printoptions(precision = 4,suppress = True)

In [2]:
def printdif(paramname,torchval, myval):
    """Print a NumPy-side value next to its PyTorch counterpart, plus their gap.

    Args:
        paramname: Label printed as the heading of the report.
        torchval: PyTorch tensor; converted with ``.detach().numpy()`` before
            it is printed and compared.
        myval: Value produced by the NumPy implementation; printed as-is.

    Returns:
        None. The report is written to stdout; the "Sum Difference" figure is
        the sum of absolute element-wise differences between the two values.
    """
    print("#"*50)
    # Convert once and reuse for both the display and the difference.
    reference = torchval.detach().numpy()
    print(f"{paramname}:\nMine:\n{myval}\n\nPytorch's:\n{reference}", end="\n\n")
    total_abs_gap = np.sum(np.abs(myval - reference))
    print(f"Sum Difference:\n{total_abs_gap}", end="\n\n")
    

class SimpleConv(nn.Module):
    """Small reference network: Conv2d -> flatten -> ReLU -> Linear.

    Args:
        imwidth, imheight: Spatial size of the input images.
        inchannels, outchannels: Channel counts of the convolution.
        kernelsize: (height, width) of the convolution kernel.
        outsize: Number of output features of the final linear layer.
        stride: (vertical, horizontal) stride of the convolution.
        padding: (vertical, horizontal) zero padding of the convolution.
    """

    def __init__(self, imwidth, imheight, 
                 inchannels, outchannels, 
                 kernelsize, outsize, 
                 stride = (1,1), padding = (0,0)):
        super().__init__()

        def conv_extent(size, axis):
            # Output length of the convolution along one spatial axis.
            return int((size + 2 * padding[axis] - kernelsize[axis]) / stride[axis] + 1)

        # Number of features the linear layer sees once the conv output is flattened.
        flat_features = outchannels * conv_extent(imheight, 0) * conv_extent(imwidth, 1)

        # Submodules are created in the order conv, fc, relu.
        self.conv = nn.Conv2d(inchannels, outchannels, kernelsize, stride = stride, padding = padding)
        self.fc = nn.Linear(flat_features, outsize)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Run the network on a batch ``x`` and return the linear layer's output."""
        feature_maps = self.conv(x)
        # Keep the batch axis and collapse everything else into one feature axis.
        flattened = feature_maps.reshape(feature_maps.size(0), -1)
        return self.fc(self.relu(flattened))

## Comparing outputs and gradients of a model with a conv2d layer and linear layer

In [3]:
#Set hyperparams
imwidth = 6
imheight = 5
inchannels = 3
outchannels = 2
kernelsize = (4,4)
outsize = 3
batchsize = 2
padding = (0,0)
stride = (1,1)
# Spatial size of the conv output (same formula SimpleConv uses internally)
out_h = int((imheight + 2 * padding[0]- kernelsize[0]) / stride[0] + 1)
out_w = int((imwidth + 2 * padding[1] - kernelsize[1]) / stride[1] + 1)

#Instantiate Torch Model
# stride/padding are forwarded explicitly so the torch model follows the
# hyperparameters above instead of silently falling back to SimpleConv's defaults.
simpleconv = SimpleConv(imwidth, imheight,
                        inchannels, outchannels,
                        kernelsize, outsize,
                        stride = stride, padding = padding)
  
#Create Numpy Model
# NOTE(review): the NumPy Conv2d is built without stride/padding. Its signature
# is not visible in this notebook; if the hyperparameters above are changed from
# (1,1)/(0,0), pass them here as well - TODO confirm the argument names.
model = Model()
model.addLayer(Conv2d(inchannels,outchannels, kernelsize), activation = a.ReLU())
model.addLayer(Flatten())
# The flattened conv output has outchannels * out_h * out_w features per sample
# (this is what SimpleConv.fc uses). The previous batchsize * out_h * out_w only
# worked because batchsize happened to equal outchannels.
model.addLayer(Linear(outchannels * out_h * out_w, outsize))
model.set_loss_fn(lf.CrossEntropyLoss())

#Load Torch weights into Numpy Model
model.torch_weighted(simpleconv)

#Input and ytrue
torchin = torch.randn((batchsize,inchannels,imheight,imwidth))
torchtrue = torch.randint(outsize,(batchsize,))

npin = torchin.detach().numpy()
nptrue = torchtrue.detach().numpy()

#Get model outputs
torchout = simpleconv(torchin)
# torchout is a non-leaf tensor; retain_grad() keeps its .grad after backward()
torchout.retain_grad()
npout = model(npin)
printdif("outputs", torchout, npout)

#Calculate losses
torchloss = nn.CrossEntropyLoss()(torchout, torchtrue)
nploss = model.calculate_loss(nptrue)
printdif("loss", torchloss, nploss)

#Calculate gradients
torchloss.backward(retain_graph = True)
model.backward()
# Layer indices in the NumPy model: 0 = Conv2d, 1 = Flatten, 2 = Linear.
# gradients[0] is compared against the weight grad, gradients[1] against the bias grad.
printdif("output gradient", torchout.grad, model.loss_fn.gradient)
printdif("fc weight gradient", simpleconv.fc.weight.grad, model.layers[2].gradients[0])
printdif("fc bias gradient", simpleconv.fc.bias.grad, model.layers[2].gradients[1])
printdif("conv weight gradient", simpleconv.conv.weight.grad, model.layers[0].gradients[0])
printdif("conv bias gradient", simpleconv.conv.bias.grad, model.layers[0].gradients[1])

##################################################
outputs:
Mine:
[[-0.1782  0.0566  0.0256]
 [-0.0877  0.3313  0.017 ]]

Pytorch's:
[[-0.1782  0.0566  0.0256]
 [-0.0877  0.3313  0.017 ]]

Sum Difference:
1.510139682886491e-07

##################################################
loss:
Mine:
1.0602596228002306

Pytorch's:
1.0602595806121826

Sum Difference:
4.218804794398068e-08

##################################################
output gradient:
Mine:
[[-0.3568  0.1811  0.1756]
 [ 0.1377 -0.2906  0.1529]]

Pytorch's:
[[-0.3568  0.1811  0.1756]
 [ 0.1377 -0.2906  0.1529]]

Sum Difference:
9.22599028285731e-08

##################################################
fc weight gradient:
Mine:
[[-0.0696  0.      0.      0.0423  0.     -0.2924 -0.2428 -0.0089  0.0411
   0.     -0.1306 -0.193 ]
 [-0.2022  0.      0.     -0.0893  0.      0.0895  0.1085  0.0045 -0.2084
   0.      0.0663  0.0765]
 [ 0.2718  0.      0.      0.047   0.      0.2029  0.1343  0.0044  0.1673
   0.      0.0643  0.1165]]

Py

## Comparing outputs and gradients of a 2 layer LSTM

In [4]:
# Problem dimensions
batch_size, input_size, hidden_size = 2, 3, 5
time_steps, num_layers = 1, 2

# Reference PyTorch LSTM, and the NumPy LSTM initialised from its weights
torchrnn = nn.LSTM(input_size, hidden_size, num_layers)
myrnn = LSTM(input_size, hidden_size, num_layers)
myrnn.torch_weighted(torchrnn)

# Random input sequence and initial (hidden, cell) states, shared by both models
torchin = torch.randn(time_steps, batch_size, input_size)
htorch = torch.randn(num_layers, batch_size, hidden_size)
ctorch = torch.randn(num_layers, batch_size, hidden_size)
npin, hnp, cnp = (t.detach().numpy() for t in (torchin, htorch, ctorch))

# Random class targets, one per batch element
torchtrue = torch.randint(hidden_size,(batch_size,))
nptrue = torchtrue.detach().numpy()

# Forward pass through both models; only the last time step's output is compared
torchout, (htorch, ctorch) = torchrnn(torchin,(htorch, ctorch))
torchout = torchout[-1]
torchout.retain_grad()  # non-leaf tensor: keep its .grad for the comparison below
npout, (hnp, cnp) = myrnn(npin, (hnp, cnp))
npout = npout[-1]
printdif("Outputs",torchout,npout)

# Cross-entropy loss on both sides
torch_loss_fn, np_loss_fn = nn.CrossEntropyLoss(), lf.CrossEntropyLoss()
torchloss = torch_loss_fn(torchout,torchtrue)
nploss = np_loss_fn(npout, nptrue)
printdif("Loss", torchloss, nploss)

# Backward pass: autograd on the torch side, the loss gradient fed to the NumPy LSTM
torchloss.backward(retain_graph=True)
myrnn.backward(np_loss_fn.gradient)

# Gradient of the loss with respect to the compared output
printdif("Output gradient", torchout.grad, np_loss_fn.gradient)

# Per-layer parameter gradients, last layer first.
# Index order on both sides: 0 = weight_ih, 1 = weight_hh, 2 = bias_ih, 3 = bias_hh.
for l in reversed(range(num_layers)):
    torch_params, my_grads = torchrnn.all_weights[l], myrnn.layers[l].gradients
    printdif(f"layer {l} weight_ih gradient", torch_params[0].grad, my_grads[0])
    printdif(f"layer {l} weight_hh gradient", torch_params[1].grad, my_grads[1])
    printdif(f"layer {l} bias_ih gradient", torch_params[2].grad, my_grads[2])
    printdif(f"layer {l} bias_hh gradient", torch_params[3].grad, my_grads[3])

##################################################
Outputs:
Mine:
[[-0.21   -0.3226 -0.1606 -0.1621  0.485 ]
 [-0.4966  0.1495 -0.0338  0.0855  0.2625]]

Pytorch's:
[[-0.21   -0.3226 -0.1606 -0.1621  0.485 ]
 [-0.4966  0.1495 -0.0338  0.0855  0.2625]]

Sum Difference:
1.2293457984924316e-07

##################################################
Loss:
Mine:
1.637831211090088

Pytorch's:
1.637831211090088

Sum Difference:
0.0

##################################################
Output gradient:
Mine:
[[ 0.0834 -0.4255  0.0876  0.0875  0.1671]
 [ 0.0594  0.1133  0.0943  0.1062 -0.3732]]

Pytorch's:
[[ 0.0834 -0.4255  0.0876  0.0875  0.1671]
 [ 0.0594  0.1133  0.0943  0.1062 -0.3732]]

Sum Difference:
5.21540641784668e-08

##################################################
layer 1 weight_ih gradient:
Mine:
[[-0.0015  0.0006  0.0007  0.0002  0.0004]
 [ 0.0016 -0.0023 -0.0006 -0.0006  0.0001]
 [-0.0021  0.0005  0.0011  0.0001  0.0007]
 [-0.0011  0.0007  0.0005  0.0002  0.0002]
 [ 0.0025  0.0027 

## Comparing outputs and gradients of a 2 layer GRU
- Note: the cell below currently runs with `num_layers = 1`.
- Doesn't work yet: for some reason `grad_z` doesn't seem to be accurate, even though it's a simple gradient to calculate.

In [7]:
batch_size = 2
input_size = 3
hidden_size = 5
time_steps = 1
num_layers = 1

#Pytorch Model
torchrnn = nn.GRU(input_size, hidden_size, num_layers)

#My Model (initialised from the PyTorch model's weights)
myrnn = GRU(input_size, hidden_size, num_layers)
myrnn.torch_weighted(torchrnn)

#Inputs: random sequence and initial hidden state, shared by both models
torchin = torch.randn(time_steps, batch_size, input_size)
htorch = torch.randn(num_layers, batch_size, hidden_size)
npin = torchin.detach().numpy()
hnp = htorch.detach().numpy()

#targets
torchtrue = torch.randint(hidden_size,(batch_size,))
nptrue = torchtrue.detach().numpy()

#Outputs (only the last time step is compared)
torchout, htorch = torchrnn(torchin,htorch)
torchout = torchout[-1]
# torchout is a non-leaf tensor; retain_grad() keeps its .grad after backward()
torchout.retain_grad()
npout, hnp = myrnn(npin, hnp)
npout = npout[-1]

printdif("Outputs",torchout,npout)

#Loss
torch_loss_fn = nn.CrossEntropyLoss()
torchloss = torch_loss_fn(torchout,torchtrue)
np_loss_fn = lf.CrossEntropyLoss()
nploss = np_loss_fn(npout, nptrue)
printdif("Loss", torchloss, nploss)

#Calculate gradients
torchloss.backward(retain_graph=True)
myrnn.backward(np_loss_fn.gradient)

#Gradients

# Restored to match the LSTM and RNN cells: checking the gradient that is fed
# into myrnn.backward rules out the loss as the source of any mismatch in the
# parameter gradients below.
printdif("Output gradient", torchout.grad, np_loss_fn.gradient)

# Per-layer parameter gradients, last layer first.
# Index order on both sides: 0 = weight_ih, 1 = weight_hh, 2 = bias_ih, 3 = bias_hh.
for l in reversed(range(num_layers)):
    printdif(f"layer {l} weight_ih gradient",
             torchrnn.all_weights[l][0].grad,
             myrnn.layers[l].gradients[0])
    printdif(f"layer {l} weight_hh gradient",
             torchrnn.all_weights[l][1].grad,
             myrnn.layers[l].gradients[1])
    printdif(f"layer {l} bias_ih gradient",
             torchrnn.all_weights[l][2].grad,
             myrnn.layers[l].gradients[2])
    printdif(f"layer {l} bias_hh gradient",
             torchrnn.all_weights[l][3].grad,
             myrnn.layers[l].gradients[3])

##################################################
Outputs:
Mine:
[[ 0.7246  0.4534  0.5055  0.6272 -0.8576]
 [ 0.7988  0.1712  0.1131  0.4587 -0.0219]]

Pytorch's:
[[ 0.7246  0.4534  0.5055  0.6272 -0.8576]
 [ 0.7988  0.1712  0.1131  0.4587 -0.0219]]

Sum Difference:
2.60770320892334e-07

##################################################
Loss:
Mine:
1.279862403869629

Pytorch's:
1.279862642288208

Sum Difference:
2.384185791015625e-07

h
[[ 0.7246  0.4534  0.5055  0.6272 -0.8576]
 [ 0.7988  0.1712  0.1131  0.4587 -0.0219]]
[ 1.5234  0.6246  0.6186  1.086  -0.8795]

dh
[[ 0.1359  0.1036  0.1092 -0.3767  0.0279]
 [-0.3432  0.0837  0.079   0.1116  0.069 ]]
[-0.2073  0.1873  0.1881 -0.2651  0.0969]

r
[[0.5449 0.5337 0.6016 0.2643 0.2659]
 [0.4064 0.4697 0.5425 0.2926 0.4221]]
[0.9513 1.0034 1.1441 0.5569 0.688 ]

dr
[[0.248  0.2489 0.2397 0.1945 0.1952]
 [0.2412 0.2491 0.2482 0.207  0.2439]]
[0.4892 0.498  0.4879 0.4014 0.4391]

n
[[ 0.7529  0.4885  0.5455  0.0623 -0.1133]
 [ 0.7819  0.

## Comparing outputs and gradients of a 2 layer RNN

In [8]:
# Problem dimensions
batch_size, input_size, hidden_size = 2, 3, 5
time_steps, num_layers = 1, 2

# Reference PyTorch RNN, and the NumPy RNN initialised from its weights
torchrnn = nn.RNN(input_size, hidden_size, num_layers)
myrnn = RNN(input_size, hidden_size, num_layers)
myrnn.torch_weighted(torchrnn)

# Random input sequence and initial hidden state, shared by both models
torchin = torch.randn(time_steps, batch_size, input_size)
htorch = torch.randn(num_layers, batch_size, hidden_size)
npin, hnp = (t.detach().numpy() for t in (torchin, htorch))

# Random class targets, one per batch element
torchtrue = torch.randint(hidden_size,(batch_size,))
nptrue = torchtrue.detach().numpy()

# Forward pass through both models; only the last time step's output is compared
torchout, htorch = torchrnn(torchin,htorch)
torchout = torchout[-1]
torchout.retain_grad()  # non-leaf tensor: keep its .grad for the comparison below
npout, hnp = myrnn(npin, hnp)
npout = npout[-1]
printdif("Outputs",torchout,npout)

# Cross-entropy loss on both sides
torch_loss_fn, np_loss_fn = nn.CrossEntropyLoss(), lf.CrossEntropyLoss()
torchloss = torch_loss_fn(torchout,torchtrue)
nploss = np_loss_fn(npout, nptrue)
printdif("Loss", torchloss, nploss)

# Backward pass: autograd on the torch side, the loss gradient fed to the NumPy RNN
torchloss.backward(retain_graph=True)
myrnn.backward(np_loss_fn.gradient)

# Gradient of the loss with respect to the compared output
printdif("Output gradient", torchout.grad, np_loss_fn.gradient)

# Per-layer parameter gradients, last layer first.
# Index order on both sides: 0 = weight_ih, 1 = weight_hh, 2 = bias_ih, 3 = bias_hh.
for l in reversed(range(num_layers)):
    torch_params, my_grads = torchrnn.all_weights[l], myrnn.layers[l].gradients
    printdif(f"layer {l} weight_ih gradient", torch_params[0].grad, my_grads[0])
    printdif(f"layer {l} weight_hh gradient", torch_params[1].grad, my_grads[1])
    printdif(f"layer {l} bias_ih gradient", torch_params[2].grad, my_grads[2])
    printdif(f"layer {l} bias_hh gradient", torch_params[3].grad, my_grads[3])

##################################################
Outputs:
Mine:
[[-0.6362  0.6754  0.3392  0.8744  0.265 ]
 [ 0.0648  0.3008  0.6824 -0.7481  0.6173]]

Pytorch's:
[[-0.6362  0.6754  0.3392  0.8744  0.265 ]
 [ 0.0648  0.3008  0.6824 -0.7481  0.6173]]

Sum Difference:
2.3096799850463867e-07

##################################################
Loss:
Mine:
1.4972327947616577

Pytorch's:
1.4972327947616577

Sum Difference:
0.0

##################################################
Output gradient:
Mine:
[[ 0.0348  0.1293  0.0924 -0.3423  0.0858]
 [-0.4207  0.1005  0.1471  0.0352  0.1379]]

Pytorch's:
[[ 0.0348  0.1293  0.0924 -0.3423  0.0858]
 [-0.4207  0.1005  0.1471  0.0352  0.1379]]

Sum Difference:
4.470348358154297e-08

##################################################
layer 1 weight_ih gradient:
Mine:
[[-0.009  -0.2445  0.0207 -0.3033 -0.2654]
 [ 0.025   0.0137 -0.0177 -0.002   0.023 ]
 [ 0.0281  0.0006 -0.019  -0.021   0.0099]
 [-0.0243  0.0513  0.0133  0.0839  0.047 ]
 [ 0.0277  0.00