In [2]:
import os

import pandas as pd
import torch

from Code.Utils.Read import read_config
from Code.Utils.Logging import setup_logging
from Code.Model.Model import Transformer
from Code.Train.Train import train_epoch
from Code.SFT.DataClean.CleanAlpacaGpt4 import cleanAlpacaGpt4File
from Code.DataSet.Dataset import PretrainDataset
from Code.Tokenizer.Tokenizer import DataPreProcess

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Scratch tensors for a quick shape check: a 3x5 float tensor that tracks
# gradients, and a length-3 int64 tensor filled with random integers in [0, 5).
# NOTE(review): `input` shadows the Python builtin of the same name; the name
# is kept as-is in case other cells read it.
n_rows, n_cols = 3, 5
input = torch.randn(n_rows, n_cols, requires_grad=True)
target = torch.empty(n_rows, dtype=torch.long).random_(n_cols)

# Print both sizes, `input` first and `target` second.
for tensor in (input, target):
    print(tensor.size())

torch.Size([3, 5])
torch.Size([3])


In [2]:
# Single dictionary of settings read by the data loader, the model
# constructor, the optimizer setup and the training loop in the cells below.
# Key order matches the original literal.
config = dict(
    embed_dim=512,
    max_seq_len=512,
    n_layers=2,
    n_heads=2,
    multiple_of=32,
    dropout=0.0,
    bias=False,
    learning_rate=3e-4,
    weight_decay=1e-1,
    beta1=0.9,
    beta2=0.95,
    grad_clip=1.0,
    batch_size=32,
    vocab_size=64793,
    max_epoch=1,
    device="cpu",
    norm_eps=1e-5,
)

In [3]:
# Configure logging through the project helper, pointing it at the training
# log file (path is relative to the notebook's working directory).
log_file = "./Log/training.log"
setup_logging(log_file)

In [4]:
# Dataset preparation.
# `dtype` is read again later when the gradient scaler is created.
dtype = 'float16'
data_path_list = ['./data/pretrain_data.bin']

train_ds = PretrainDataset(
    data_path_list,
    max_length=config["max_seq_len"],
    use_memmap=True,
)

# Load in the main process when running on CPU; use four workers otherwise.
if config["device"] == 'cpu':
    loader_workers = 0
else:
    loader_workers = 4

train_loader = torch.utils.data.DataLoader(
    train_ds,
    batch_size=config["batch_size"],
    shuffle=False,
    drop_last=False,
    pin_memory=False,
    num_workers=loader_workers,
)

memmap: True train data.shape: (266249, 512)
Downloading finished...


In [5]:
# Build the model from the shared config dictionary.
model = Transformer(config)

# Move the model to the configured device. This call is deliberately the last
# expression in the cell so that its return value is shown as the cell output.
target_device = config["device"]
model.to(target_device)

Transformer(
  (tok_embeddings): Embedding(64793, 512)
  (dropout): Dropout(p=0.0, inplace=False)
  (layers): ModuleList(
    (0-1): 2 x TransformerBlock(
      (attention): Attention(
        (wq): Linear(in_features=512, out_features=512, bias=False)
        (wk): Linear(in_features=512, out_features=512, bias=False)
        (wv): Linear(in_features=512, out_features=512, bias=False)
        (wo): Linear(in_features=512, out_features=512, bias=False)
        (attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_dropout): Dropout(p=0.0, inplace=False)
      )
      (feed_forward): FeedForward(
        (w1): Linear(in_features=512, out_features=1376, bias=False)
        (w2): Linear(in_features=1376, out_features=512, bias=False)
        (w3): Linear(in_features=512, out_features=1376, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (attention_norm): RMSNorm()
      (ffn_norm): RMSNorm()
    )
  )
  (norm): RMSNorm()
  (output): Linear(in_features=512,

In [6]:
# Gradient scaler and optimizer setup.
#
# torch.cuda.amp.GradScaler is CUDA-only: when enabled it rejects tensors that
# do not live on a CUDA device. Enabling it purely on `dtype == 'float16'`
# therefore breaks a CPU run on a machine that happens to have CUDA available,
# and on CUDA-less machines it only emits a warning and disables itself.
# Gate it on the configured device as well so a CPU run always gets a
# disabled (pass-through) scaler.
use_grad_scaler = (dtype == 'float16') and str(config["device"]).startswith("cuda")
scaler = torch.cuda.amp.GradScaler(enabled=use_grad_scaler)

# The model builds its own optimizer from weight decay, learning rate,
# the (beta1, beta2) pair and the device.
optimizer = model.configure_optimizers(
    config["weight_decay"],
    config["learning_rate"],
    (config["beta1"], config["beta2"]),
    config["device"],
)

# Second name for the same model object; the training cell passes it to
# train_epoch and saves its state_dict.
raw_model = model



num decayed parameter tensors: 15, with 39,498,240 parameters
num non-decayed parameter tensors: 5, with 2,560 parameters
using fused AdamW: False


In [7]:
# Train for the configured number of epochs and write a checkpoint after each.
#
# torch.save does not create missing parent directories, so make sure the
# checkpoint directory exists *before* training starts; otherwise a fresh
# checkout would finish a whole epoch and then fail on the save.
weight_dir = "Weight"
os.makedirs(weight_dir, exist_ok=True)

for epoch in range(config["max_epoch"]):
    # Learning rate and gradient clipping come from `config` (the same values
    # the optimizer was built with) rather than duplicated literals, so the
    # config cell stays the single place to tune them.
    train_epoch(epoch, model, raw_model, train_loader, optimizer, scaler,
                learning_rate=config["learning_rate"], decay_lr=None,
                gradient_accumulation_steps=1, grad_clip=config["grad_clip"],
                device=config["device"])

    # One checkpoint per epoch: Weight/epoch_<n>.pth
    torch.save(raw_model.state_dict(), f'{weight_dir}/epoch_{epoch}.pth')

X size torch.Size([32, 511])
Y size torch.Size([32, 511])


2024-06-27 14:38:50,798 - INFO - step: 0, lr,  0.0000, loss:  11.1775


logits size torch.Size([32, 511, 64793])
