# []

In [1]:
# inbuilt 
import os
import sys
import math

# most common
import numpy as np
import matplotlib.pyplot as plt

# pytorch
import torch as tt
import torch.nn as nn
import torch.functional as ff
import torch.distributions as dd
import torch.utils.data as ud

# custom
import known
import known.ktorch as kt

import known.ktorch.rnns.rnn_0 as rnn_0
import known.ktorch.rnns.rnn_1 as rnn_1
import known.ktorch.rnns.rnn_2 as rnn_2
print(f'{sys.version=}\n{np.__version__=}\n{tt.__version__=}\n{known.__version__=}')

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

sys.version='3.8.10 (tags/v3.8.10:3d8993a, May  3 2021, 11:48:03) [MSC v.1928 64 bit (AMD64)]'
np.__version__='1.22.2'
tt.__version__='1.10.1+cu102'
known.__version__='0.0.1'


In [2]:
input_size = 2
seq_len = 4
hidden_size = 3
batch_size=1

rng = np.random.default_rng(10)

xx = rng.uniform(size=(batch_size, seq_len, input_size)).astype(np.float32)
print(xx.shape)

(1, 4, 2)


In [3]:
gru_keras = layers.GRU(hidden_size, return_sequences=True)
out_gru_keras = gru_keras(xx)
out_gru_keras

<tf.Tensor: shape=(1, 4, 3), dtype=float32, numpy=
array([[[-0.13218383, -0.11781129, -0.20121543],
        [-0.18715054, -0.17592502, -0.29832956],
        [-0.17149174, -0.15530027, -0.31345797],
        [-0.19957122, -0.01634341, -0.511578  ]]], dtype=float32)>

In [4]:
gru_keras_weights = gru_keras.get_weights()
print(f'{len(gru_keras_weights)=}')

len(gru_keras_weights)=3


In [5]:
for w in gru_keras_weights:
    print(w.shape, w.dtype, w)

(2, 9) float32 [[ 0.24508941  0.5015604   0.13833356 -0.06305367 -0.64629054 -0.10404563
  -0.33334228 -0.505028   -0.3268228 ]
 [ 0.54777604  0.32911712 -0.08986628 -0.22636646  0.60320705  0.32333213
  -0.05923879  0.7194287  -0.69014966]]
(3, 9) float32 [[-0.12936139 -0.316025    0.49045676  0.168047   -0.13731022  0.08267243
  -0.21405011  0.427722    0.6001281 ]
 [-0.23423842 -0.10378698 -0.20747174  0.63924444  0.4949848   0.1764111
  -0.02985656 -0.38566563  0.23858431]
 [ 0.20267263 -0.37664112 -0.10306689  0.34862775 -0.5382743  -0.5365092
  -0.05048047 -0.32365122 -0.00462342]]
(2, 9) float32 [[0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.]]


In [6]:
gru_torch = nn.GRU(
    input_size=input_size,
    hidden_size=hidden_size,
    bias=True,
    batch_first=True,
    num_layers=1,
    dropout=0.0,
    bidirectional=False,
    dtype=tt.float32
)
sd_gru_torch = gru_torch.state_dict()
for k,v in sd_gru_torch.items():
    print(f'{k}, {v.shape}')

weight_ih_l0, torch.Size([9, 2])
weight_hh_l0, torch.Size([9, 3])
bias_ih_l0, torch.Size([9])
bias_hh_l0, torch.Size([9])


In [7]:
with tt.no_grad():
    gru_torch.get_parameter('weight_ih_l0').copy_(tt.tensor(gru_keras_weights[0].T))
    gru_torch.get_parameter('weight_hh_l0').copy_(tt.tensor(gru_keras_weights[1].T))
    gru_torch.get_parameter('bias_ih_l0').copy_(tt.tensor(gru_keras_weights[2].T[:,0]))
    gru_torch.get_parameter('bias_hh_l0').copy_(tt.tensor(gru_keras_weights[2].T[:,1]))

In [8]:
with tt.no_grad():
    for p in gru_torch.parameters():
        print(p.shape, p)

torch.Size([9, 2]) Parameter containing:
tensor([[ 0.2451,  0.5478],
        [ 0.5016,  0.3291],
        [ 0.1383, -0.0899],
        [-0.0631, -0.2264],
        [-0.6463,  0.6032],
        [-0.1040,  0.3233],
        [-0.3333, -0.0592],
        [-0.5050,  0.7194],
        [-0.3268, -0.6901]], requires_grad=True)
torch.Size([9, 3]) Parameter containing:
tensor([[-0.1294, -0.2342,  0.2027],
        [-0.3160, -0.1038, -0.3766],
        [ 0.4905, -0.2075, -0.1031],
        [ 0.1680,  0.6392,  0.3486],
        [-0.1373,  0.4950, -0.5383],
        [ 0.0827,  0.1764, -0.5365],
        [-0.2141, -0.0299, -0.0505],
        [ 0.4277, -0.3857, -0.3237],
        [ 0.6001,  0.2386, -0.0046]], requires_grad=True)
torch.Size([9]) Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)
torch.Size([9]) Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)


In [9]:
with tt.no_grad():
    out_gru_torch, _ = gru_torch(tt.tensor(xx))
print (out_gru_torch.shape, out_gru_torch)

torch.Size([1, 4, 3]) tensor([[[-0.1683, -0.1996, -0.2168],
         [-0.2151, -0.2336, -0.3181],
         [-0.1707, -0.1600, -0.3287],
         [-0.2118,  0.0547, -0.4854]]])


In [10]:
out_gru_torch.numpy() - out_gru_keras.numpy()

array([[[-0.0360751 , -0.08179066, -0.01554832],
        [-0.02790597, -0.05766286, -0.01975289],
        [ 0.0007631 , -0.00474924, -0.01525414],
        [-0.01224101,  0.07103702,  0.026187  ]]], dtype=float32)

In [11]:
np.sum(np.abs(out_gru_torch.numpy() - out_gru_keras.numpy()))

0.36896732

In [12]:
gru_custom = rnn_0.GRU(
    input_bias=True,
    hidden_bias=True,
    actF=tt.tanh,
    input_size=input_size,         # input features
    hidden_sizes=(hidden_size,),       # hidden features at each layer
    dropout=0.0,        # dropout after each layer, only if hidden_sizes > 1
    batch_first=True,  # if true, excepts input as (batch_size, seq_len, input_size) else (seq_len, batch_size, input_size)
    stack_output=True, # if true, stack output from all timesteps, else returns a list of outputs
    dtype=tt.float32,
    device=None,)

In [13]:
gru_custom.copy_torch(gru_torch)

In [14]:
with tt.no_grad():
    for p in gru_custom.parameters():
        print(p.shape, p)

torch.Size([3, 2]) Parameter containing:
tensor([[ 0.2451,  0.5478],
        [ 0.5016,  0.3291],
        [ 0.1383, -0.0899]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
torch.Size([3, 3]) Parameter containing:
tensor([[-0.1294, -0.2342,  0.2027],
        [-0.3160, -0.1038, -0.3766],
        [ 0.4905, -0.2075, -0.1031]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
torch.Size([3, 2]) Parameter containing:
tensor([[-0.0631, -0.2264],
        [-0.6463,  0.6032],
        [-0.1040,  0.3233]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
torch.Size([3, 3]) Parameter containing:
tensor([[ 0.1680,  0.6392,  0.3486],
        [-0.1373,  0.4950, -0.5383],
        [ 0.0827,  0.1764, -0.5365]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
torch.Size([3, 2]) Parameter containing:
tenso

In [15]:
with tt.no_grad():
    out_gru_custom, _ = gru_custom(tt.tensor(xx))
print (out_gru_custom.shape, out_gru_custom)

torch.Size([1, 4, 3]) tensor([[[-0.1683, -0.1996, -0.2168],
         [-0.2151, -0.2336, -0.3181],
         [-0.1707, -0.1600, -0.3287],
         [-0.2118,  0.0547, -0.4854]]])


In [16]:
out_gru_custom.numpy() - out_gru_torch.numpy()

array([[[ 0.0000000e+00,  0.0000000e+00,  0.0000000e+00],
        [ 1.4901161e-08, -1.4901161e-08,  0.0000000e+00],
        [ 0.0000000e+00, -1.4901161e-08,  0.0000000e+00],
        [ 0.0000000e+00, -1.4901161e-08,  0.0000000e+00]]], dtype=float32)

In [17]:
np.sum(np.abs(out_gru_custom.numpy() - out_gru_torch.numpy()))

5.9604645e-08

In [18]:
out_gru_custom.numpy() - out_gru_keras.numpy()

array([[[-0.0360751 , -0.08179066, -0.01554832],
        [-0.02790596, -0.05766287, -0.01975289],
        [ 0.0007631 , -0.00474925, -0.01525414],
        [-0.01224101,  0.071037  ,  0.026187  ]]], dtype=float32)

In [19]:
np.sum(np.abs(out_gru_custom.numpy() - out_gru_keras.numpy()))

0.3689673

#LSTM

In [20]:
lstm_keras = layers.LSTM(hidden_size, return_sequences=True)
out_lstm_keras = lstm_keras(xx)
out_lstm_keras

<tf.Tensor: shape=(1, 4, 3), dtype=float32, numpy=
array([[[0.07615212, 0.03750532, 0.03732687],
        [0.12489272, 0.06556307, 0.04804401],
        [0.14298427, 0.074992  , 0.04078937],
        [0.22945048, 0.05844793, 0.03792481]]], dtype=float32)>

In [21]:
lstm_keras_weights = lstm_keras.get_weights()
print(f'{len(lstm_keras_weights)=}')

len(lstm_keras_weights)=3


In [22]:
for w in lstm_keras_weights:
    print(w.shape, w.dtype, w)

(2, 12) float32 [[-0.16593987 -0.562787    0.51564646  0.13148141 -0.03364134 -0.62979895
   0.25195098  0.2638734   0.15455168  0.25584412  0.04808438 -0.4428633 ]
 [ 0.07767743 -0.19603553  0.43247926 -0.46720794  0.01474583  0.17917931
   0.24092597 -0.18939957  0.04401672  0.43179798 -0.2470138  -0.42046884]]
(3, 12) float32 [[-0.3884369   0.09312984 -0.35791096 -0.18676479  0.29619458  0.34487045
   0.5095191   0.23359406 -0.24399294 -0.26757216  0.14015505 -0.0764764 ]
 [ 0.37449586  0.07638407 -0.23299094  0.26163605 -0.03129346  0.12542845
   0.13972591 -0.04758845 -0.5246375   0.08506239 -0.41515207  0.4876979 ]
 [ 0.05110543  0.6813131  -0.17572173  0.39505234  0.34311208 -0.24711327
  -0.15951802 -0.19148712  0.0113897  -0.13146505  0.29318908 -0.04448877]]
(12,) float32 [0. 0. 0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]


In [23]:
lstm_torch = nn.LSTM(
    input_size=input_size,
    hidden_size=hidden_size,
    bias=True,
    batch_first=True,
    num_layers=1,
    dropout=0.0,
    bidirectional=False,
    dtype=tt.float32
)
sd_lstm_torch = lstm_torch.state_dict()
for k,v in sd_lstm_torch.items():
    print(f'{k}, {v.shape}')

weight_ih_l0, torch.Size([12, 2])
weight_hh_l0, torch.Size([12, 3])
bias_ih_l0, torch.Size([12])
bias_hh_l0, torch.Size([12])


In [24]:
with tt.no_grad():
    lstm_torch.get_parameter('weight_ih_l0').copy_(tt.tensor(lstm_keras_weights[0].T))
    lstm_torch.get_parameter('weight_hh_l0').copy_(tt.tensor(lstm_keras_weights[1].T))
    lstm_torch.get_parameter('bias_ih_l0').copy_(tt.tensor(lstm_keras_weights[2]))
    lstm_torch.get_parameter('bias_hh_l0').copy_(tt.zeros(lstm_keras_weights[2].shape))

In [25]:
with tt.no_grad():
    for p in lstm_torch.parameters():
        print(p.shape, p)

torch.Size([12, 2]) Parameter containing:
tensor([[-0.1659,  0.0777],
        [-0.5628, -0.1960],
        [ 0.5156,  0.4325],
        [ 0.1315, -0.4672],
        [-0.0336,  0.0147],
        [-0.6298,  0.1792],
        [ 0.2520,  0.2409],
        [ 0.2639, -0.1894],
        [ 0.1546,  0.0440],
        [ 0.2558,  0.4318],
        [ 0.0481, -0.2470],
        [-0.4429, -0.4205]], requires_grad=True)
torch.Size([12, 3]) Parameter containing:
tensor([[-0.3884,  0.3745,  0.0511],
        [ 0.0931,  0.0764,  0.6813],
        [-0.3579, -0.2330, -0.1757],
        [-0.1868,  0.2616,  0.3951],
        [ 0.2962, -0.0313,  0.3431],
        [ 0.3449,  0.1254, -0.2471],
        [ 0.5095,  0.1397, -0.1595],
        [ 0.2336, -0.0476, -0.1915],
        [-0.2440, -0.5246,  0.0114],
        [-0.2676,  0.0851, -0.1315],
        [ 0.1402, -0.4152,  0.2932],
        [-0.0765,  0.4877, -0.0445]], requires_grad=True)
torch.Size([12]) Parameter containing:
tensor([0., 0., 0., 1., 1., 1., 0., 0., 0., 0., 0., 0.]

In [26]:
with tt.no_grad():
    out_lstm_torch, _ = lstm_torch(tt.tensor(xx))
print (out_lstm_torch.shape, out_lstm_torch)

torch.Size([1, 4, 3]) tensor([[[0.0762, 0.0375, 0.0373],
         [0.1249, 0.0656, 0.0480],
         [0.1430, 0.0750, 0.0408],
         [0.2295, 0.0584, 0.0379]]])


In [27]:
out_lstm_torch.numpy() - out_lstm_keras.numpy()

array([[[7.4505806e-09, 1.1175871e-08, 3.7252903e-09],
        [2.2351742e-08, 1.4901161e-08, 0.0000000e+00],
        [1.4901161e-08, 2.2351742e-08, 7.4505806e-09],
        [2.9802322e-08, 2.6077032e-08, 0.0000000e+00]]], dtype=float32)

In [28]:
np.sum(np.abs(out_lstm_torch.numpy() - out_lstm_keras.numpy()))

1.6018748e-07

In [29]:
lstm_custom = rnn_0.LSTM(
    input_bias=True,
    hidden_bias=True,
    actF=tt.tanh, actC=tt.tanh,
    input_size=input_size,         # input features
    hidden_sizes=(hidden_size,),       # hidden features at each layer
    dropout=0.0,        # dropout after each layer, only if hidden_sizes > 1
    batch_first=True,  # if true, excepts input as (batch_size, seq_len, input_size) else (seq_len, batch_size, input_size)
    stack_output=True, # if true, stack output from all timesteps, else returns a list of outputs
    dtype=tt.float32,
    device=None,)

In [30]:
lstm_custom.copy_torch(lstm_torch)

In [31]:
with tt.no_grad():
    for p in lstm_custom.parameters():
        print(p.shape, p)

torch.Size([3, 2]) Parameter containing:
tensor([[-0.1659,  0.0777],
        [-0.5628, -0.1960],
        [ 0.5156,  0.4325]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
torch.Size([3, 3]) Parameter containing:
tensor([[-0.3884,  0.3745,  0.0511],
        [ 0.0931,  0.0764,  0.6813],
        [-0.3579, -0.2330, -0.1757]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
torch.Size([3, 2]) Parameter containing:
tensor([[ 0.1315, -0.4672],
        [-0.0336,  0.0147],
        [-0.6298,  0.1792]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([1., 1., 1.], requires_grad=True)
torch.Size([3, 3]) Parameter containing:
tensor([[-0.1868,  0.2616,  0.3951],
        [ 0.2962, -0.0313,  0.3431],
        [ 0.3449,  0.1254, -0.2471]], requires_grad=True)
torch.Size([3]) Parameter containing:
tensor([0., 0., 0.], requires_grad=True)
torch.Size([3, 2]) Parameter containing:
tenso

In [32]:
with tt.no_grad():
    out_lstm_custom, _ = lstm_custom(tt.tensor(xx))
print (out_lstm_custom.shape, out_lstm_custom)

torch.Size([1, 4, 3]) tensor([[[0.0762, 0.0375, 0.0373],
         [0.1249, 0.0656, 0.0480],
         [0.1430, 0.0750, 0.0408],
         [0.2295, 0.0584, 0.0379]]])


In [33]:
out_lstm_custom.numpy() - out_lstm_torch.numpy()

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]], dtype=float32)

In [34]:
np.sum(np.abs(out_lstm_custom.numpy() - out_lstm_torch.numpy()))

0.0

In [35]:
out_lstm_custom.numpy() - out_lstm_keras.numpy()

array([[[7.4505806e-09, 1.1175871e-08, 3.7252903e-09],
        [2.2351742e-08, 1.4901161e-08, 0.0000000e+00],
        [1.4901161e-08, 2.2351742e-08, 7.4505806e-09],
        [2.9802322e-08, 2.6077032e-08, 0.0000000e+00]]], dtype=float32)

In [36]:
np.sum(np.abs(out_lstm_custom.numpy() - out_lstm_keras.numpy()))

1.6018748e-07