In [1]:
import sys
import os

# Make the directory two levels above the current working directory importable.
# Presumably this is where the local `deeppy` package lives — confirm against the repo layout.
sys.path.append(os.path.abspath('../..'))

import warnings
# NOTE(review): this silences every warning category for the whole kernel session,
# including deprecation warnings raised by torch or deeppy.
warnings.filterwarnings('ignore')


import deeppy as dp

import torch
import torch.optim as optim
import torch.nn as nn
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# NOTE(review): numpy, matplotlib, tiktoken and load_dataset are not used in the
# cells visible here — confirm whether later cells need them.
import numpy as np
import matplotlib.pyplot as plt

import tiktoken
from datasets import load_dataset


# NOTE(review): the model below is built through `dp.cv.Sane`; these direct
# imports are not referenced in the cells visible here.
from deeppy import LearnFrame,LayerGenerator,FromLoader
from deeppy import Network
from deeppy.models.cv import Sane


In [2]:
# --- Data dimensions -------------------------------------------------------
batch_size = 64      # number of samples in the demo batch
vocab_size = 200     # width of each input token (last dimension of the input)
context_size = 50    # forwarded to Sane as `context_size`

# --- Model dimensions ------------------------------------------------------
embed_dim = 1024     # width used inside the transformer layers
latent_dim = 128     # width of the latent representation
num_layers = num_heads = 4   # transformer depth and attention heads (both 4)
dropout = 0.1        # dropout probability

In [3]:
# Optimisation settings handed to the model through `optimizer_params`:
# AdamW (lr 3e-4, amsgrad on), gradient-norm clipping at 1.0, no scheduler.
# How these are consumed is up to Sane — presumably during training; not shown here.
Optimizer_params = dict(
    optimizer=optim.AdamW,
    optimizer_args=dict(lr=3e-4, amsgrad=True),
    clipper=nn.utils.clip_grad_norm_,
    clipper_params=dict(max_norm=1.0),
    scheduler_params=None,
)

# Constructor arguments for Sane, assembled from the hyperparameters defined
# in the previous cell plus the target device and an MSE reconstruction loss.
Sane_params = dict(
    optimizer_params=Optimizer_params,
    vocab_size=vocab_size,
    embed_dim=embed_dim,
    latent_dim=latent_dim,
    num_heads=num_heads,
    num_layers=num_layers,
    context_size=context_size,
    dropout=dropout,
    device=device,
    criterion=nn.MSELoss(),
)

# Instantiate the autoencoder used throughout the rest of the notebook.
model = dp.cv.Sane(**Sane_params)

In [4]:
# Show the module tree of the underlying network; the layer indices printed
# here are the ones used for the step-by-step walk-throughs below.
model.net

Network(
  (model): Sequential(
    (0): Linear(in_features=200, out_features=1022, bias=True)
    (1): DummyPositionalEmbedding()
    (2): Dropout(p=0.1, inplace=False)
    (3): MaskedTransformerEncoder(
      (encoder): TransformerEncoder(
        (layers): ModuleList(
          (0-3): 4 x TransformerEncoderLayer(
            (self_attn): MultiheadAttention(
              (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
            )
            (linear1): Linear(in_features=1024, out_features=2048, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
            (linear2): Linear(in_features=2048, out_features=1024, bias=True)
            (norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (dropout1): Dropout(p=0.1, inplace=False)
            (dropout2): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (4): Lin

In [5]:
# Build a random stand-in for a tokenized, already-flattened NN layer.
cout = 10           # number of tokens per sample (second dimension)
cr = vocab_size     # width of each token; matches the model's input size

# Fixed seed so the demo tensor is identical on every Restart & Run All.
torch.manual_seed(0)

# Allocate on `device`: the model was constructed with that device, and later
# cells feed this tensor straight into individual sub-layers, which requires
# input and weights to live on the same device.
tokenized_input = torch.rand(size = (batch_size, cout, cr), device = device)

print("Assume that layer of a NN is already flattened and the following tensor\n is batch_size x cout x cr")
print(f"{tokenized_input.shape}")


Assume that layer of a NN is already flattened and the following tensor
 is batch_size x cout x cr
torch.Size([64, 10, 200])


# Encode

In [13]:
# Run the encoder half of the model in one call; the last dimension of the
# result is the latent width (see the printed shape).
latent = model.encode(tokenized_input)
print(f"Latent space : {latent.shape}")

Latent space : torch.Size([64, 10, 128])


In [14]:
# Walk through the encoder one stage at a time, using the layer indices
# printed by `model.net` above.

# Layer 0: linear projection of each token (200 -> 1022 features).
T = model.net.model[0](tokenized_input)
print(f"Tokenized input : {T.shape}")

# Layers 1-2: positional embedding followed by dropout.
Tp = model.net.model[1:3](T)
print(f"Position encoding + dropout : {Tp.shape}")

# Layer 3: masked transformer encoder.
Tr = model.net.model[3](Tp)
print(f"After transformer encoder : {Tr.shape}")

# Layer 4: project the transformer output `Tr` into latent space. Feeding the
# pre-transformer `Tp` here would bypass layer 3 without any shape error,
# because both tensors have the same shape.
latent = model.net.model[4](Tr)
print(f"Latent space : {latent.shape}")

Tokenized input : torch.Size([64, 10, 1022])
Position encoding + dropout : torch.Size([64, 10, 1024])
After transformer encoder : torch.Size([64, 10, 1024])
Latent space : torch.Size([64, 10, 128])


# Decode

In [16]:
# Run the decoder half of the model in one call on the latent tensor from the
# encode section; the printed shape matches the original input shape.
z = model.decode(latent)
print(f"Output : {z.shape}")

Output : torch.Size([64, 10, 200])


In [17]:
# Walk through the decoder one stage at a time (layers 5-9 of `model.net.model`).

# Layer 5: expand the latent representation back towards the embedding width.
T = model.net.model[5](latent)
print(f"Decoder compression : {T.shape}")

# Layers 6-7: positional embedding followed by dropout. The slice must end at
# 8 to include both; `[6:7]` would select layer 6 only and silently skip
# layer 7. (Layer 7 is presumably the dropout, mirroring encoder layers 1-2 —
# confirm against the full `model.net` printout.)
T = model.net.model[6:8](T)
print(f"Decoder position encoding + dropout : {T.shape}")

# Layer 8: decoder transformer stack.
T = model.net.model[8](T)
print(f"Decoder transformer : {T.shape}")

# Layer 9: final projection back to the input token width.
z = model.net.model[9](T)
print(f"Output : {z.shape}")

Decoder compression : torch.Size([64, 10, 1022])
Decoder position encoding + dropout : torch.Size([64, 10, 1024])
Decoder transformer : torch.Size([64, 10, 1024])
Output : torch.Size([64, 10, 200])


# Autoencoder Pass

In [22]:
# Full pass: call the model directly on the input (presumably encode followed
# by decode — confirm in Sane's forward). The output shape equals the input shape.
z = model(tokenized_input)
print(f"Output : {z.shape}")

Output : torch.Size([64, 10, 200])
