# Initialize the model
A small snippet that initializes a model for pretraining.

In [10]:
import sys
sys.path.insert(0, '..')
from src.transformer.transformer import Transformer
import argparse

## Hyperparameters

In [47]:
class Args(argparse.Namespace):
    """Hyperparameters used to build the Transformer for pretraining.

    NOTE: every value is a *class* attribute, not an instance attribute, so
    ``vars(Args())`` is empty and ``repr(Args())`` shows no fields. Read the
    settings through attribute access (``hparams.hidden_size``).
    """

    # --- Architecture ------------------------------------------------------
    hidden_size = 96  # width of the hidden layers and of the embeddings
    hidden_ff = 512  # width of the position-wise feed-forward layer
    n_encoders = 4  # how many encoder blocks are stacked
    n_heads = 8  # attention heads in the multi-head attention module
    n_local = 2  # how many of the heads are local-attention heads
    local_window_size = 4  # window size used by local attention

    # --- Input / output sizes ----------------------------------------------
    max_length = 100  # longest input sequence
    vocab_size = 100  # vocabulary size
    num_classes = 3  # SOP target classes (3: original, reversed, shuffled)

    # --- Training setup ----------------------------------------------------
    lr = 0.001
    batch_size = 4
    num_epochs = 30
    device = 'cuda'

    # --- Attention / normalization variants --------------------------------
    attention_type = "performer"
    norm_type = "rezero"
    num_random_features = 32  # random features for the Attention module (used by Performer)
    parametrize_emb = True  # whether to center the token embedding matrix

    # --- Regularization and activation -------------------------------------
    emb_dropout = 0.1  # dropout applied in the embedding block
    fw_dropout = 0.1  # dropout applied in the position-wise feed-forward layer
    att_dropout = 0.1  # dropout applied in the multi-head attention module
    hidden_act = "swish"  # hidden-layer activation (attention layers use ReLU)


hparams = Args()

## Initialization of the model

In [48]:
# Instantiate the Transformer, passing the hyperparameter namespace as `hparams`.
l2v = Transformer(hparams=hparams)

In [44]:
l2v

Transformer(
  (embedding): Embeddings(
    (token): ParametrizedEmbedding(
      100, 96, padding_idx=0
      (parametrizations): ModuleDict(
        (weight): ParametrizationList(
          (0): Center()
        )
      )
    )
    (segment): Embedding(4, 96, padding_idx=0)
    (age): PositionalEmbedding()
    (abspos): PositionalEmbedding()
    (res_age): ReZero()
    (res_abs): ReZero()
    (res_seg): ReZero()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoders): ModuleList(
    (0-3): 4 x EncoderLayer(
      (attention): MultiHeadAttention(
        (attention): CustomSelfAttention(
          (fast_attention): FastAttention(
            (kernel_fn): ReLU()
          )
          (local_attn): LocalAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (rel_pos): None
          )
          (to_q): Linear(in_features=96, out_features=96, bias=False)
          (to_k): Linear(in_features=96, out_features=96, bias=False)
          (to_v): Linear(in_features=