In [5]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

def create_dataset(root_folder="./dataset", seed=1234, debug=False):
    """Split ``dataset.txt`` into train/val/test files inside ``root_folder``.

    The input file is read as a string array using ``=`` as the column
    delimiter. 20% of the rows are held out and that hold-out is then halved,
    giving an 80/10/10 train/val/test split. Both splits use the same
    ``seed`` as their random state.

    Args:
        root_folder: Directory containing ``dataset.txt``; the three output
            files (``train.txt``, ``val.txt``, ``test.txt``) are written here.
        seed: Random state passed to both ``train_test_split`` calls.
        debug: When true, print the shapes of the three splits.
    """
    pairs = np.loadtxt(os.path.join(root_folder, "dataset.txt"), dtype=str, delimiter="=")

    # First carve off 20% of the rows, then divide that hold-out evenly.
    train_part, holdout = train_test_split(pairs, test_size=0.2, random_state=seed)
    val_part, test_part = train_test_split(holdout, test_size=0.5, random_state=seed)

    outputs = [
        ("train.txt", train_part),
        ("val.txt", val_part),
        ("test.txt", test_part),
    ]

    if debug:
        print(*(part.shape for _, part in outputs))

    # Write each split back out in the same "="-delimited text format.
    for file_name, part in outputs:
        np.savetxt(os.path.join(root_folder, file_name), part, fmt="%s", delimiter='=')

In [6]:
# The split shapes are only printed when debug=True; pass it explicitly so a
# fresh Restart & Run All reproduces the shapes shown in this cell's output.
create_dataset(debug=True)

(800000, 2) (100000, 2) (100000, 2)


In [1]:
import torch
# Path of the saved full-model checkpoint, relative to the notebook's working directory.
checkpoint_path = "models/transformer/nlayers3hdim256nhead10/best_model_full_epoch45.pth"

# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
model = torch.load(checkpoint_path)

# Show the module hierarchy of the loaded model.
print(model)

Transformer(
  (encoder): Encoder(
    (tok_embedding): Embedding(36, 256)
    (pos_embedding): Embedding(100, 256)
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (ff_layer_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (self_attention): MultiHeadAttentionLayer(
          (fc_q): Linear(in_features=256, out_features=256, bias=True)
          (fc_k): Linear(in_features=256, out_features=256, bias=True)
          (fc_v): Linear(in_features=256, out_features=256, bias=True)
          (fc_o): Linear(in_features=256, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (positionwise_feedforward): PositionwiseFeedforwardLayer(
          (fc_1): Linear(in_features=256, out_features=512, bias=True)
          (fc_2): Linear(in_features=512, out_features=256, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
   

In [2]:
def count_parameters(model):
    """Return the number of trainable scalar values in ``model``.

    Only parameters with ``requires_grad`` set are counted; each contributes
    its element count (``numel()``).
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# Report the trainable-parameter count of the loaded model, formatted with thousands separators.
print(f"Number of model parameters: {count_parameters(model):,}")

Number of model parameters: 4,032,548


In [5]:
from torchsummaryX import summary

# Build the two dummy token-id batches (presumably source and target -- confirm
# against the model's forward signature) directly as int64 tensors on the
# model's own device. This avoids hard-coding .cuda() and the float -> long
# round trip, so the summary also works for a CPU model or a non-default GPU.
model_device = next(model.parameters()).device
dummy_src = torch.zeros((1, 21), dtype=torch.long, device=model_device)
dummy_trg = torch.zeros((1, 21), dtype=torch.long, device=model_device)

# summary() prints the per-layer table; its return value is kept in `df`.
df = summary(model, dummy_src, dummy_trg)

                                                   Kernel Shape  \
Layer                                                             
0_encoder.Embedding_tok_embedding                     [256, 36]   
1_encoder.Embedding_pos_embedding                    [256, 100]   
2_encoder.Dropout_dropout                                     -   
3_encoder.layers.0.self_attention.Linear_fc_q        [256, 256]   
4_encoder.layers.0.self_attention.Linear_fc_k        [256, 256]   
5_encoder.layers.0.self_attention.Linear_fc_v        [256, 256]   
6_encoder.layers.0.self_attention.Dropout_dropout             -   
7_encoder.layers.0.self_attention.Linear_fc_o        [256, 256]   
8_encoder.layers.0.Dropout_dropout                            -   
9_encoder.layers.0.LayerNorm_self_attn_layer_norm         [256]   
10_encoder.layers.0.positionwise_feedforward.Li...   [256, 512]   
11_encoder.layers.0.positionwise_feedforward.Dr...            -   
12_encoder.layers.0.positionwise_feedforward.Li...   [512, 256