# 组件化模型构建+开始训练
使用 lightning 训练 Decoder-Only 的 Transformer 模型

In [6]:
# 使用 lightning 训练 Decoder-Only 的 Transformer 模型
import os
import math
import tiktoken
import torch
import torch.nn as nn
from torch.nn import functional as F
import lightning as L

from dotenv import load_dotenv
import logging
from termcolor import colored

# Load environment variables via python-dotenv (typically from a local .env
# file) so LOG_LEVEL can be configured without editing the code.
load_dotenv()
log_level = os.getenv("LOG_LEVEL", "INFO").upper()

# Resolve the level name to its numeric value. An unknown or misspelled
# LOG_LEVEL falls back to INFO instead of raising AttributeError at startup;
# the isinstance check also rejects upper-case names that exist on the
# logging module but are not levels (e.g. "BASIC_FORMAT").
_resolved_level = getattr(logging, log_level, None)
_level_is_valid = isinstance(_resolved_level, int)
if not _level_is_valid:
    _resolved_level = logging.INFO

logging.basicConfig(
    level=_resolved_level,
    format="%(asctime)s-%(levelname)s: %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

if not _level_is_valid:
    # Report the fallback rather than swallowing the bad setting silently.
    logger.warning("Unknown LOG_LEVEL %r; falling back to INFO", log_level)

In [9]:
# Hyperparameters
device = "cuda" if torch.cuda.is_available() else "cpu"

# Data / batching
batch_size = 4  # batch size
context_length = 16  # text length: each sampled excerpt has at most context_length tokens

# Model size
d_model = 64  # model dimension
num_blocks = 8  # number of transformer blocks
num_heads = 4  # head_size is obtained as d_model / num_heads
dropout = 0.1  # dropout rate

# Optimisation and evaluation schedule
learning_rate = 1e-3  # learning rate (0.001)
max_iters = 500  # number of training iterations
eval_interval = 50  # validate the model every this many iterations
eval_iters = 20  # number of iterations averaged when estimating the loss

# Seed the PyTorch RNG with a fixed value
TORCH_SEED = 1337
torch.manual_seed(TORCH_SEED)

logger.info(colored(f"device:{device}", "green"))

2025-01-12 21:25:50-INFO: device:cpu


In [10]:
# Read the whole training corpus into memory as a single string
with open(
    '../../../z_using_files/txt/Reverend_Insanity.txt',
    mode='r',
    encoding='utf-8',
) as f:
    text = f.read()
text[:200]  # notebook preview: first 200 characters of the corpus

'Information\nTable of Contents URL: https://novelfull.com/reverend-insanity.html\n\n\n\nAuthor:Gu Zhen Ren, 蛊真人\nAlternative names:Cổ chân nhân, Daoist Gu, Gu Zhen Ren, 蛊真人\nGenre:Fantasy, Martial Arts, Acti'

In [11]:
# Tokenize the text with tiktoken (cl100k_base encoding)
import tiktoken
cl100k_base = tiktoken.get_encoding("cl100k_base")
enc = cl100k_base.encode(text)
tokenized_text = torch.tensor(enc, dtype=torch.long, device=device)  # token ids as a tensor
max_token_value = tokenized_text.max().item()  # largest token id present in the corpus
logger.info(colored(f"{len(enc)}", "green"))
logger.info(colored(f"{enc[:20]}", "green"))
logger.info(colored(f"max_token_value:{max_token_value}", "green"))

# Split into training and validation sets: first 90% for training, the rest
# for validation. (The original bare tuple expression that followed the split
# was a no-op -- it was not the last expression of the cell, so nothing was
# displayed -- and has been dropped; the sizes are logged below.)
split_idx = int(len(tokenized_text) * 0.9)
train_data = tokenized_text[:split_idx]
val_data = tokenized_text[split_idx:]
logger.info(colored(f"{len(train_data)}, {len(val_data)}", "green"))

2025-01-12 22:06:56-INFO: 6581059
2025-01-12 22:06:56-INFO: [15218, 198, 2620, 315, 36962, 5665, 25, 3788, 1129, 39142, 491, 620, 916, 10991, 424, 408, 22610, 39370, 2628, 1038]
2025-01-12 22:06:56-INFO: max_token_value:100252
2025-01-12 22:06:56-INFO: 5922953, 658106


In [None]:
class ScaledDotProductAttention(nn.Module):
    """Multi-head scaled dot-product self-attention, without an output projection.

    The input is projected to queries, keys and values with three linear
    layers, split into ``num_heads`` heads of size ``Dk = d_model // num_heads``,
    attended per head, and the heads are concatenated back to ``d_model``.

    Args:
        d_model: Size of the last dimension of the input and output.
        num_heads: Number of attention heads. Must be positive and divide
            ``d_model`` evenly.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``d_model``.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        # Fail early with a clear message: a non-divisible d_model would
        # otherwise floor Dk silently and only blow up later inside
        # forward() with an opaque `view` size-mismatch error.
        if num_heads <= 0 or d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive "
                f"num_heads ({num_heads})"
            )
        self.d_model = d_model
        self.num_heads = num_heads
        self.Dk = self.d_model // self.num_heads
        self.Wq = nn.Linear(self.d_model, self.d_model)
        self.Wk = nn.Linear(self.d_model, self.d_model)
        self.Wv = nn.Linear(self.d_model, self.d_model)

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch_size, seq_length, d_model)``.
            mask: Optional tensor broadcastable to
                ``(batch_size, num_heads, seq_length, seq_length)``.
                Positions where ``mask == 0`` are excluded from attention.

        Returns:
            Tensor of shape ``(batch_size, seq_length, d_model)``.
        """
        batch_size, seq_length, _ = x.size()

        def split_heads(t):
            # (batch, seq, d_model) -> (batch, num_heads, seq, Dk)
            return t.view(batch_size, seq_length, self.num_heads, self.Dk).transpose(1, 2)

        Q = split_heads(self.Wq(x))
        K = split_heads(self.Wk(x))
        V = split_heads(self.Wv(x))

        # Scaled dot-product scores: (batch, num_heads, seq, seq)
        scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.Dk)

        if mask is not None:
            # -inf scores become exactly 0 after softmax.
            # NOTE: a query row whose keys are all masked yields NaN.
            scores = scores.masked_fill(mask == 0, float('-inf'))

        weights = F.softmax(scores, dim=-1)
        context = torch.matmul(weights, V)

        # Merge heads back: (batch, num_heads, seq, Dk) -> (batch, seq, d_model)
        return context.transpose(1, 2).contiguous().view(batch_size, seq_length, self.d_model)

In [None]:
# FFN
class FeedForward(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear -> Dropout.

    The hidden layer is four times wider than the model dimension.

    Args:
        d_model: Size of the last dimension of the input and output.
        dropout: Dropout probability applied after the second linear layer.
    """

    def __init__(self, d_model, dropout=0.1):
        super().__init__()
        self.d_model = d_model
        self.dropout = dropout  # kept as the raw probability, not a module
        self.ffn = nn.Sequential(
            nn.Linear(d_model, d_model * 4),
            nn.ReLU(),
            nn.Linear(d_model * 4, d_model),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        """Apply the feed-forward network to ``x``.

        Args:
            x: Tensor whose last dimension is ``d_model``.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # `self` was missing from the original signature, which made every
        # call to the module fail with a TypeError.
        return self.ffn(x)

In [None]:
class MultiHeadAttention(nn.Module):
    """Self-attention followed by a linear output projection ``Wo``.

    Args:
        d_model: Size of the last dimension of the input and output.
        num_heads: Number of attention heads passed to the inner attention
            module.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.d_model, self.num_heads = d_model, num_heads
        # Sub-modules are created in the same order as before (attention,
        # then Wo) so seeded parameter initialisation is unchanged.
        self.attention = ScaledDotProductAttention(d_model, num_heads)
        self.Wo = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Run attention over ``x`` (optionally masked) and project the result.

        Args:
            x: Input tensor forwarded to the inner attention module.
            mask: Optional attention mask forwarded unchanged.

        Returns:
            The attention output after the ``Wo`` projection.
        """
        return self.Wo(self.attention(x, mask))

In [None]:
class TransformerBlock(nn.Module):
    """One transformer block: attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Size of the last dimension of the input and output.
        num_heads: Number of attention heads.
        dropout: Dropout probability used for the residual dropout and
            passed to the feed-forward network.
    """

    def __init__(self, d_model, num_heads, dropout=0.1):
        super().__init__()
        # Creation order is kept as-is so seeded initialisation is unchanged.
        self.attention = MultiHeadAttention(d_model, num_heads)
        self.ffn = FeedForward(d_model, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Apply the block to ``x``.

        Args:
            x: Input tensor whose last dimension is ``d_model``.
            mask: Optional attention mask forwarded to the attention layer.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Attention sub-layer: dropout -> residual add -> LayerNorm
        x = self.norm1(x + self.dropout(self.attention(x, mask)))

        # Feed-forward sub-layer: dropout -> residual add -> LayerNorm
        x = self.norm2(x + self.dropout(self.ffn(x)))

        return x

In [None]:
# https://github.com/Lightning-AI/pytorch-lightning
class LitDecoderOnlyTransformer(L.LightningModule):
    def __init__(self):
        super().__init__()