In [1]:
import sys
import os

# Add the path to the parent module
# (resolved as two directories above the current working directory),
# presumably so the local `deeppy` package is importable -- confirm the layout.
sys.path.append(os.path.abspath('../..'))

import warnings
# Blanket filter: every warning category is ignored from here on.
# NOTE(review): this also hides deprecation warnings from torch/deeppy;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')


import deeppy as dp

import torch
import torch.optim as optim
import torch.nn as nn
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# NOTE(review): np, plt, tiktoken, load_dataset, LearnFrame, LayerGenerator,
# FromLoader and Network are not referenced by any cell visible in this notebook.
import numpy as np
import matplotlib.pyplot as plt

import tiktoken
from datasets import load_dataset


from deeppy import LearnFrame,LayerGenerator,FromLoader
from deeppy import Network
from deeppy.models.cv import Sane


In [2]:
# Print the nested parameter template for Sane; the recorded output below lists
# the *_params dictionaries and their default values.
Sane.print_args()

        arch_params = {
            "layers":[],
            "blocks":[],
            "block_args":[],
            "out_act":<class 'torch.nn.modules.linear.Identity'>,
            "out_params":{},
            "weight_init":None,
        }
            Scheduler_params = {
                "scheduler",
                "auto_step":True,
                "**kwargs",
            }
        Optimizer_params = {
            "configure_optimizer":None,
            "optimizer":<class 'torch.optim.adamw.AdamW'>,
            "optimizer_args":{},
            "clipper":None,
            "clipper_params":{},
            "scheduler_params":None,
        }
    Network_params = {
        "arch_params",
        "decoder_params":None,
        "task":'reg',
        "optimizer_params":None,
    }
Sane_params = {
    "optimizer_params",
    "max_positions",
    "input_dim":201,
    "latent_dim":128,
    "projection_dim":30,
    "embed_dim":1024,
    "num_heads":4,
    "num_layers":4,
    "dropout":0.1,
    "c

In [34]:
# Reproducibility: seed PyTorch's global RNG (CPU and CUDA) before any weights
# or random tensors are created, so that Restart & Run All repeats the same
# random draws on the same hardware/software stack.
seed = 0
torch.manual_seed(seed)

# --- Tensor / model dimensions -------------------------------------------
batch_size = 32
input_dim = 20        # passed to Sane as "input_dim"; also the last axis (`cr`) of the demo input
embed_dim = 64        # passed to Sane as "embed_dim"
latent_dim = 128      # passed to Sane as "latent_dim"
num_heads = 4         # passed to Sane as "num_heads"; also scales the demo mask's first axis
num_layers = 4        # passed to Sane as "num_layers"
window_size = 15      # passed to Sane as "context_size"; also the token axis (`cout`) of the demo input
dropout = 0.1
bias = False
projection_dim = 10   # passed to Sane as "projection_dim"

# --- Schedule length -------------------------------------------------------
one_epoch_length = 1000
# Despite its name, `epochs` is 50 * one_epoch_length; it is handed to
# OneCycleLR as `total_steps` in the scheduler configuration.
epochs = 50 * one_epoch_length

In [35]:
# Learning-rate schedule: PyTorch's one-cycle policy.
# The `Sane.print_args()` template lists "scheduler", "auto_step" and free-form
# **kwargs for Scheduler_params; the remaining keys are presumably forwarded to
# the scheduler constructor -- confirm against deeppy.
Scheduler_params = {
    "scheduler": optim.lr_scheduler.OneCycleLR,
    "auto_step": True,
    "max_lr": 3e-4,
    "total_steps": epochs,
    "pct_start": 0.3,
    "anneal_strategy": "cos",
    "cycle_momentum": True,
    "base_momentum": 0.85,
    "max_momentum": 0.95,
    "div_factor": 25.0,
    "final_div_factor": 10000.0,
    "three_phase": False,
    "last_epoch": -1,
    # "verbose" is intentionally not passed: the argument is deprecated in
    # recent PyTorch releases and False was the default behaviour anyway.
}

# Optimizer: AdamW with gradient-norm clipping and the schedule above.
Optimizer_params = {
    "optimizer": optim.AdamW,
    "optimizer_args": {
        "lr": 3e-4,
        "amsgrad": True,
        "weight_decay": 3e-4,
        # Only request the fused kernel on CUDA: `device` may fall back to the
        # CPU, where fused AdamW is rejected by PyTorch builds that lack a CPU
        # fused implementation.
        "fused": device.type == "cuda",
    },
    "clipper": nn.utils.clip_grad_norm_,
    "clipper_params": {"max_norm": 5.0},
    "scheduler_params": Scheduler_params,
}

# Model configuration, assembled from the hyperparameter cell.
Sane_params = {
    "optimizer_params": Optimizer_params,
    "max_positions": [500, 500, 500],
    "input_dim": input_dim,
    "latent_dim": latent_dim,
    "projection_dim": projection_dim,
    "embed_dim": embed_dim,
    "num_heads": num_heads,
    "num_layers": num_layers,
    "context_size": window_size,
    "dropout": dropout,
    "bias": bias,
    "device": device,
    "gamma": 0.5,
    "ntx_temp": 0.1,
    "torch_compile": False,
}

model = dp.cv.Sane(**Sane_params)

# Inputs

In [36]:
# Token axis and feature axis of the demo tensors.
cout = window_size
cr = input_dim


def _random_view():
    """Draw one random (input, mask, positions) triple and move it to `device`.

    The draws happen in the order input -> mask -> positions.
    """
    tokens = torch.rand(size = (batch_size, cout, cr)).to(device)
    # log of a {0, 1} integer tensor gives {-inf, 0}; presumably an additive
    # attention mask with one (cout x cout) slice per sample and head -- confirm.
    keep = torch.randint(0,2,size = (batch_size*num_heads, cout, cout))
    attn = torch.log(keep).to(device)
    # Three integer indices per token, each drawn from [0, 500).
    pos = torch.randint(0,500, size = (batch_size,cout,3)).to(device)
    return tokens, attn, pos


# Two independently drawn views of the same shapes.
tokenized_input, mask, positions = _random_view()
tokenized_input2, mask2, positions2 = _random_view()

print("Assume that layer of a NN is already flattened and the following tensor\n is batch_size x cout x cr")


print(f"Inp shape : {tokenized_input.shape}")
print(f"mask shape : {mask.shape}")
print(f"positions shape : {positions.shape}")

Assume that layer of a NN is already flattened and the following tensor
 is batch_size x cout x cr
Inp shape : torch.Size([32, 15, 20])
mask shape : torch.Size([128, 15, 15])
positions shape : torch.Size([32, 15, 3])


# Autoencoder

In [37]:
# Display the autoencoder's module tree (the recorded output below is truncated).
model.autoencoder

Network(
  (model): Sequential(
    (0): LinearBeforePosition(
      (linear): Linear(in_features=20, out_features=64, bias=True)
    )
    (1): SanePositionalEmbedding(
      (pe1): Embedding(500, 32)
      (pe2): Embedding(500, 32)
      (pe3): Embedding(500, 32)
    )
    (2): Dropout(p=0.1, inplace=False)
    (3): TransformerEncoder(
      (layers): ModuleList(
        (0-3): 4 x TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=64, out_features=64, bias=False)
          )
          (linear1): Linear(in_features=64, out_features=256, bias=False)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=256, out_features=64, bias=False)
          (norm1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=False)
          (dropout2): Dropout(p=0.1, 

## Encode

In [38]:
# Encode the first view; the encoder takes the (input, positions) pair as one tuple.
encoder_input = (tokenized_input, positions)
latent = model.encode(encoder_input)
print(f"Latent space : {latent.shape}")

Latent space : torch.Size([32, 15, 128])


In [39]:
# Walk through the first five stages of the autoencoder's Sequential one at a
# time, printing the shape after each step.
stages = model.autoencoder.model

# Stage 0 returns an indexable result; its first element is the projected input.
embedded = stages[0]((tokenized_input, positions))
print(f"Tokenized input : {embedded[0].shape}")

# Stage 1 (positional embedding) followed by stage 2 (dropout).
hidden = stages[2](stages[1](embedded))
print(f"Position encoding + dropout : {hidden.shape}")

# Stage 3: transformer encoder.
hidden = stages[3](hidden)
print(f"After transformer encoder : {hidden.shape}")

# Stage 4 produces the latent representation.
latent = stages[4](hidden)
print(f"Latent space : {latent.shape}")

Tokenized input : torch.Size([32, 15, 64])
Position encoding + dropout : torch.Size([32, 15, 64])
After transformer encoder : torch.Size([32, 15, 64])
Latent space : torch.Size([32, 15, 128])


## Decode

In [40]:
# Decode the latent tensor; the decoder takes the (latent, positions) pair as one tuple.
decoder_input = (latent, positions)
z = model.decode(decoder_input)
print(f"Output : {z.shape}")

Output : torch.Size([32, 15, 20])


In [41]:
# Walk through stages 5-9 of the autoencoder's Sequential (the decoder half of
# this demo) one at a time, printing the shape after each step.
stages = model.autoencoder.model

# Stage 5 returns an indexable result; its first element is the compressed latent.
compressed = stages[5]((latent, positions))
print(f"Decoder compression : {compressed[0].shape}")

# Stage 6 followed by stage 7 (positional encoding + dropout, per the label below).
hidden = stages[7](stages[6](compressed))
print(f"Decoder position encoding + dropout : {hidden.shape}")

# Stage 8: decoder transformer.
hidden = stages[8](hidden)
print(f"Decoder transformer : {hidden.shape}")

# Stage 9 maps back to the input feature size.
z = stages[9](hidden)
print(f"Output : {z.shape}")

Decoder compression : torch.Size([32, 15, 64])
Decoder position encoding + dropout : torch.Size([32, 15, 64])
Decoder transformer : torch.Size([32, 15, 64])
Output : torch.Size([32, 15, 20])


## Autoencoder Full Pass

In [42]:
# Full forward pass: the model returns three values; `y` has the input's shape
# (see the recorded output), so it is presumably the reconstruction -- confirm.
forward_out = model((tokenized_input, positions))
z, y, zp = forward_out
print(f"Output : {y.shape}")

Output : torch.Size([32, 15, 20])


# Projection Head

In [43]:
# Display the projection head's module tree.
model.project

Network(
  (model): Sequential(
    (0): SqueezeLastDimention()
    (1): Linear(in_features=1920, out_features=10, bias=False)
    (2): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
    (3): ReLU()
    (4): Linear(in_features=10, out_features=10, bias=False)
    (5): LayerNorm((10,), eps=1e-05, elementwise_affine=True)
    (6): ReLU()
  )
)

In [44]:
# Encode the first view, then feed the latent tensor to the projection head.
encoder_input = (tokenized_input, positions)
latent = model.encode(encoder_input)
p = model.project(latent)

print(f"Latent size : {latent.shape}")
print(f"Projection head output size : {p.shape}")

Latent size : torch.Size([32, 15, 128])
Projection head output size : torch.Size([32, 10])


# Train a Batch

In [45]:
# Run a few optimisation steps on the same pair of views, redrawing the random
# 0/1 masks on every step.
n_steps = 10
for _ in range(n_steps):
    # Create each mask with the shape and device of the view it belongs to, so
    # the whole batch lives on one device (the views were moved to `device`).
    mask = torch.randint(0, 2, size=tokenized_input.shape, device=tokenized_input.device)
    mask2 = torch.randint(0, 2, size=tokenized_input2.shape, device=tokenized_input2.device)

    # Kept inside the loop as in the original, in case `optimize` changes the mode.
    model.train()
    batch = (tokenized_input, positions, mask, tokenized_input2, positions2, mask2)
    loss = model.optimize(batch)
    print(f"Loss train : {loss}")

Loss train : 3.614069700241089
Loss train : 3.294264078140259
Loss train : 3.525251626968384
Loss train : 3.455906867980957
Loss train : 3.2388103008270264
Loss train : 3.193763256072998
Loss train : 3.1323351860046387
Loss train : 3.2865750789642334
Loss train : 3.2153615951538086
Loss train : 3.0552492141723633


# Test a Batch

In [46]:
# Evaluate one batch in eval mode with freshly drawn random 0/1 masks.
# Each mask takes the shape and device of the view it belongs to, so the whole
# batch lives on one device.
mask = torch.randint(0, 2, size=tokenized_input.shape, device=tokenized_input.device)
mask2 = torch.randint(0, 2, size=tokenized_input2.shape, device=tokenized_input2.device)

model.eval()
batch = (tokenized_input, positions, mask, tokenized_input2, positions2, mask2)
loss = model.test(batch)
# This is the evaluation loss, so label it as such (the original printed "Loss train").
print(f"Loss test : {loss}")

Loss train : 3.0454814434051514
