# []

In [1]:
# inbuilt 
import os
import sys
import math

# most common
import numpy as np
import matplotlib.pyplot as plt

# pytorch
import torch as tt
import torch.nn as nn
import torch.functional as ff
import torch.distributions as dd
import torch.utils.data as ud

import random
import time
# custom
import known
import known.ktorch as kt
print(f'{sys.version=}\n{np.__version__=}\n{tt.__version__=}\n{known.__version__=}')


sys.version='3.10.2 (tags/v3.10.2:a58ebcc, Jan 17 2022, 14:12:15) [MSC v.1929 64 bit (AMD64)]'
np.__version__='1.24.1'
tt.__version__='1.13.1+cpu'
known.__version__='0.0.1'


# Sample Data

In [2]:
# set seed
tt.manual_seed(281703975047300) # manually sets a seed for random sampling creation ops
print('Manual-Seed:', tt.initial_seed()) # current seed for default rng

batch_size = 5 
input_size = 4
seq_len = 3

dt=tt.float32
batch_first=False
dropout=0.0
num_layers = 2

num_samples=50
num_loops=10

xx = [tt.rand(size=(batch_size, seq_len, input_size), dtype=dt) for _ in range(num_samples)] \
            if batch_first else \
    [tt.rand(size=(seq_len, batch_size, input_size), dtype=dt) for _ in range(num_samples) ]
len(xx)

Manual-Seed: 281703975047300


50

# Elman

In [None]:
rnn = kt.ELMANX(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=None,
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_gate=tt.sigmoid,
    activation_out=None
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

In [None]:
rnnc = kt.ELMAN(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    has_bias=True,
    dtype=dt,
    actF=tt.sigmoid,
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

In [None]:
with tt.no_grad():
    print(tt.sum(tt.abs(yc - y)))
    for ih,ihc in zip(h,hc):
        assert ih.shape==ihc.shape
        print(tt.sum(tt.abs(ihc - ih)))

In [None]:
rnn = kt.ELMANX(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=(16, 8, input_size),
    dropout=0.0,
    batch_first=True,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_gate=tt.sigmoid,
    activation_out=None,
    activation_last=(nn.LogSoftmax,{'dim':-1})
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

In [3]:
rnnc = kt.ELMANX(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=(10, 8, 5),
    output_sizes2=(16, 8, input_size),
    dropout=0.0,
    batch_first=True,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_gate=tt.sigmoid,
    activation_out=None,
    activation_out2=None,
    activation_last=(nn.LogSoftmax,{'dim':-1})
)

kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

#[1]	Shape[torch.Size([32, 36])]	Params: 1152
#[2]	Shape[torch.Size([32])]	Params: 32
#[3]	Shape[torch.Size([16, 32])]	Params: 512
#[4]	Shape[torch.Size([16])]	Params: 16
#[5]	Shape[torch.Size([8, 16])]	Params: 128
#[6]	Shape[torch.Size([8])]	Params: 8
#[7]	Shape[torch.Size([10, 36])]	Params: 360
#[8]	Shape[torch.Size([10])]	Params: 10
#[9]	Shape[torch.Size([8, 32])]	Params: 256
#[10]	Shape[torch.Size([8])]	Params: 8
#[11]	Shape[torch.Size([5, 16])]	Params: 80
#[12]	Shape[torch.Size([5])]	Params: 5
#[13]	Shape[torch.Size([16, 42])]	Params: 672
#[14]	Shape[torch.Size([16])]	Params: 16
#[15]	Shape[torch.Size([8, 24])]	Params: 192
#[16]	Shape[torch.Size([8])]	Params: 8
#[17]	Shape[torch.Size([4, 13])]	Params: 52
#[18]	Shape[torch.Size([4])]	Params: 4
Total Parameters: 3511
yc.shape=torch.Size([3, 5, 4])
h: 3
0::torch.Size([3, 32])
1::torch.Size([3, 16])
2::torch.Size([3, 8])


In [None]:
rnnc

In [None]:
print(y.shape, y)

# GRU

In [None]:
rnn = kt.GRUX2    (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=None,#[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_r_gate=tt.sigmoid, activation_z_gate=tt.sigmoid, activation_n_gate=tt.sigmoid, 
    activation_out=None
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

In [None]:
rnnc = kt.GRU    (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    has_bias=True,
    dtype=dt,
    actF=tt.sigmoid,
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

In [None]:
with tt.no_grad():
    print(tt.sum(tt.abs(yc - y)))
    for ih,ihc in zip(h,hc):
        assert ih.shape==ihc.shape
        print(tt.sum(tt.abs(ihc - ih)))

In [4]:
rnn = kt.GRUX    (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    output_sizes2=(16, 8, input_size),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_r_gate=tt.sigmoid, activation_z_gate=tt.sigmoid, activation_n_gate=tt.sigmoid, 
    activation_last=(nn.LogSoftmax,{'dim':-1})
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

#[1]	Shape[torch.Size([32, 36])]	Params: 1152
#[2]	Shape[torch.Size([32])]	Params: 32
#[3]	Shape[torch.Size([16, 32])]	Params: 512
#[4]	Shape[torch.Size([16])]	Params: 16
#[5]	Shape[torch.Size([8, 16])]	Params: 128
#[6]	Shape[torch.Size([8])]	Params: 8
#[7]	Shape[torch.Size([32, 36])]	Params: 1152
#[8]	Shape[torch.Size([32])]	Params: 32
#[9]	Shape[torch.Size([16, 32])]	Params: 512
#[10]	Shape[torch.Size([16])]	Params: 16
#[11]	Shape[torch.Size([8, 16])]	Params: 128
#[12]	Shape[torch.Size([8])]	Params: 8
#[13]	Shape[torch.Size([32, 36])]	Params: 1152
#[14]	Shape[torch.Size([32])]	Params: 32
#[15]	Shape[torch.Size([16, 32])]	Params: 512
#[16]	Shape[torch.Size([16])]	Params: 16
#[17]	Shape[torch.Size([8, 16])]	Params: 128
#[18]	Shape[torch.Size([8])]	Params: 8
#[19]	Shape[torch.Size([4, 36])]	Params: 144
#[20]	Shape[torch.Size([4])]	Params: 4
#[21]	Shape[torch.Size([4, 32])]	Params: 128
#[22]	Shape[torch.Size([4])]	Params: 4
#[23]	Shape[torch.Size([4, 16])]	Params: 64
#[24]	Shape[torch.Si

In [None]:
rnnc = kt.GRUX2    (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_r_gate=tt.sigmoid, activation_z_gate=tt.sigmoid, activation_n_gate=tt.sigmoid, 
    activation_last=(nn.LogSoftmax,{'dim':-1})
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

In [None]:
with tt.no_grad():
    print(tt.sum(tt.abs(yc - y)))
    for ih,ihc in zip(h,hc):
        assert ih.shape==ihc.shape
        print(tt.sum(tt.abs(ihc - ih)))

In [None]:
print(y.shape, y)

# LSTM

In [None]:
rnn = kt.LSTMX2    (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=None,#[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_i_gate=tt.sigmoid, activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid, 
    activation_o_gate=tt.sigmoid, activation_cell=tt.tanh, activation_out=None
)
kt.show_parameters(rnn)
y, (h,c) = rnn(xx[0])

print(f'{y.shape=}')
print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

print(f'h: {len(c)}')
for i,t in enumerate(c):
    print (f'{i}::{t.shape}')

In [None]:
rnnc = kt.LSTM    (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    has_bias=True,
    dtype=dt,
    actF=tt.sigmoid,
    actC=tt.tanh
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,cc) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

print(f'c: {len(cc)}')
for i,t in enumerate(cc):
    print (f'{i}::{t.shape}')

In [None]:
with tt.no_grad():
    print(tt.sum(tt.abs(yc - y)))
    for ih,ihc in zip(h,hc):
        assert ih.shape==ihc.shape
        print(tt.sum(tt.abs(ihc - ih)))
    for ih,ihc in zip(c,cc):
        assert ih.shape==ihc.shape
        print(tt.sum(tt.abs(ihc - ih)))

In [5]:
rnn = kt.LSTMX    (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    output_sizes2=(16, 8, input_size),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_i_gate=tt.sigmoid, activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid, 
    activation_o_gate=tt.sigmoid, activation_cell=tt.tanh, activation_out=None
)
kt.show_parameters(rnn)
y, (h,c) = rnn(xx[0])

print(f'{y.shape=}')
print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

print(f'h: {len(c)}')
for i,t in enumerate(c):
    print (f'{i}::{t.shape}')

#[1]	Shape[torch.Size([32, 36])]	Params: 1152
#[2]	Shape[torch.Size([32])]	Params: 32
#[3]	Shape[torch.Size([16, 32])]	Params: 512
#[4]	Shape[torch.Size([16])]	Params: 16
#[5]	Shape[torch.Size([8, 16])]	Params: 128
#[6]	Shape[torch.Size([8])]	Params: 8
#[7]	Shape[torch.Size([32, 36])]	Params: 1152
#[8]	Shape[torch.Size([32])]	Params: 32
#[9]	Shape[torch.Size([16, 32])]	Params: 512
#[10]	Shape[torch.Size([16])]	Params: 16
#[11]	Shape[torch.Size([8, 16])]	Params: 128
#[12]	Shape[torch.Size([8])]	Params: 8
#[13]	Shape[torch.Size([32, 36])]	Params: 1152
#[14]	Shape[torch.Size([32])]	Params: 32
#[15]	Shape[torch.Size([16, 32])]	Params: 512
#[16]	Shape[torch.Size([16])]	Params: 16
#[17]	Shape[torch.Size([8, 16])]	Params: 128
#[18]	Shape[torch.Size([8])]	Params: 8
#[19]	Shape[torch.Size([32, 36])]	Params: 1152
#[20]	Shape[torch.Size([32])]	Params: 32
#[21]	Shape[torch.Size([16, 32])]	Params: 512
#[22]	Shape[torch.Size([16])]	Params: 16
#[23]	Shape[torch.Size([8, 16])]	Params: 128
#[24]	Shape[

In [None]:
rnnc = kt.LSTMX2   (
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_i_gate=tt.sigmoid, activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid, 
    activation_o_gate=tt.sigmoid, activation_cell=tt.tanh, activation_out=None
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,cc) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

print(f'c: {len(cc)}')
for i,t in enumerate(cc):
    print (f'{i}::{t.shape}')

# JANET

In [None]:
rnn = kt.JANETX2(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=None,#[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid,  activation_out=None, beta=0.0
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

In [None]:
rnnc = kt.JANET(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    has_bias=True,
    dtype=dt,
    actF=tt.sigmoid, beta=0.0
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

In [None]:
with tt.no_grad():
    print(tt.sum(tt.abs(yc - y)))
    for ih,ihc in zip(h,hc):
        assert ih.shape==ihc.shape
        print(tt.sum(tt.abs(ihc - ih)))

In [6]:
rnn = kt.JANETX(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    output_sizes2=(16, 8, input_size),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid,  activation_out=None
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

#[1]	Shape[torch.Size([32, 36])]	Params: 1152
#[2]	Shape[torch.Size([32])]	Params: 32
#[3]	Shape[torch.Size([16, 32])]	Params: 512
#[4]	Shape[torch.Size([16])]	Params: 16
#[5]	Shape[torch.Size([8, 16])]	Params: 128
#[6]	Shape[torch.Size([8])]	Params: 8
#[7]	Shape[torch.Size([32, 36])]	Params: 1152
#[8]	Shape[torch.Size([32])]	Params: 32
#[9]	Shape[torch.Size([16, 32])]	Params: 512
#[10]	Shape[torch.Size([16])]	Params: 16
#[11]	Shape[torch.Size([8, 16])]	Params: 128
#[12]	Shape[torch.Size([8])]	Params: 8
#[13]	Shape[torch.Size([4, 36])]	Params: 144
#[14]	Shape[torch.Size([4])]	Params: 4
#[15]	Shape[torch.Size([4, 32])]	Params: 128
#[16]	Shape[torch.Size([4])]	Params: 4
#[17]	Shape[torch.Size([4, 16])]	Params: 64
#[18]	Shape[torch.Size([4])]	Params: 4
#[19]	Shape[torch.Size([16, 36])]	Params: 576
#[20]	Shape[torch.Size([16])]	Params: 16
#[21]	Shape[torch.Size([8, 20])]	Params: 160
#[22]	Shape[torch.Size([8])]	Params: 8
#[23]	Shape[torch.Size([4, 12])]	Params: 48
#[24]	Shape[torch.Size([4

In [None]:
rnnc = kt.JANETX2(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid,  activation_out=None
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

# MGU

In [None]:
rnn = kt.MGUX(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=None,#[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid,  activation_out=None
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

In [None]:
rnnc = kt.MGU(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    has_bias=True,
    dtype=dt,
    actF=tt.sigmoid,
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')

In [None]:
with tt.no_grad():
    print(tt.sum(tt.abs(yc - y)))
    for ih,ihc in zip(h,hc):
        assert ih.shape==ihc.shape
        print(tt.sum(tt.abs(ihc - ih)))

In [7]:
rnn = kt.MGUX(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    output_sizes2=(16, 8, input_size),
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid,  activation_out=None
)
kt.show_parameters(rnn)
y, (h,) = rnn(xx[0])

print(f'{y.shape=}')

print(f'h: {len(h)}')
for i,t in enumerate(h):
    print (f'{i}::{t.shape}')

#[1]	Shape[torch.Size([32, 36])]	Params: 1152
#[2]	Shape[torch.Size([32])]	Params: 32
#[3]	Shape[torch.Size([16, 32])]	Params: 512
#[4]	Shape[torch.Size([16])]	Params: 16
#[5]	Shape[torch.Size([8, 16])]	Params: 128
#[6]	Shape[torch.Size([8])]	Params: 8
#[7]	Shape[torch.Size([32, 36])]	Params: 1152
#[8]	Shape[torch.Size([32])]	Params: 32
#[9]	Shape[torch.Size([16, 32])]	Params: 512
#[10]	Shape[torch.Size([16])]	Params: 16
#[11]	Shape[torch.Size([8, 16])]	Params: 128
#[12]	Shape[torch.Size([8])]	Params: 8
#[13]	Shape[torch.Size([4, 36])]	Params: 144
#[14]	Shape[torch.Size([4])]	Params: 4
#[15]	Shape[torch.Size([4, 32])]	Params: 128
#[16]	Shape[torch.Size([4])]	Params: 4
#[17]	Shape[torch.Size([4, 16])]	Params: 64
#[18]	Shape[torch.Size([4])]	Params: 4
#[19]	Shape[torch.Size([16, 36])]	Params: 576
#[20]	Shape[torch.Size([16])]	Params: 16
#[21]	Shape[torch.Size([8, 20])]	Params: 160
#[22]	Shape[torch.Size([8])]	Params: 8
#[23]	Shape[torch.Size([4, 12])]	Params: 48
#[24]	Shape[torch.Size([4

In [None]:
rnnc = kt.MGUX2(
    input_size=input_size,
    hidden_sizes=(32, 16, 8),
    output_sizes=[input_size for _ in range(3)],
    dropout=0.0,
    batch_first=False,
    stack_output=True,
    cell_bias=True,
    out_bias=True,
    dtype=dt,
    activation_f_gate=tt.sigmoid, activation_g_gate=tt.sigmoid,  activation_out=None
)
kt.copy_parameters(rnn, rnnc)
kt.show_parameters(rnnc)
yc, (hc,) = rnnc(xx[0])

print(f'{yc.shape=}')

print(f'h: {len(hc)}')
for i,t in enumerate(hc):
    print (f'{i}::{t.shape}')