In [19]:
# Mount Google Drive at /content/drive so the project files stored under
# MyDrive (used by the following cells) are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
!ls "/content/drive/MyDrive/Colab Notebooks/LLM25/MinimindReproductionGuide/Codes_pure/trainer"

Pretrain.ipynb


In [21]:
import os
import sys
# The project's `trainer` folder on the mounted Drive.
file_path = "/content/drive/MyDrive/Colab Notebooks/LLM25/MinimindReproductionGuide/Codes_pure/trainer"
# Work from that folder so the relative paths used later
# ('../model/', '../out', '../dataset/...') resolve against it.
os.chdir(file_path)
# Put the parent folder on sys.path; presumably this is where the `model` and
# `dataset` packages imported below live — verify against the Drive layout.
parent_path = os.path.join(file_path, '..')
sys.path.append(parent_path)
# NOTE(review): overrides the module-level __package__; the reason is not
# visible in this notebook — confirm it is still needed.
__package__ = "trainer"
import argparse
import time
import math
import warnings
import torch
import torch.distributed as dist
from torch import optim, nn
from torch.nn.parallel import DistributedDataParallel
from torch.utils.data import DataLoader, DistributedSampler
from contextlib import nullcontext
from transformers import AutoTokenizer
from model.model_minimind import MiniMindConfig, MiniMindForCausalLM
from dataset.lm_dataset import PretrainDataset

# Suppress every Python warning from here on (this also hides deprecation
# notices raised by the libraries imported above).
warnings.filterwarnings('ignore')

In [22]:
class Arguments:
    """Plain attribute container for the pre-training settings.

    Every keyword argument is stored unchanged on the instance under the same
    name. Two further attributes are set by the constructor:

    * ``save_dir`` -- initialised to ``None``.
    * ``tokens_per_iter`` -- ``batch_size * max_seq_len``.
    """

    def __init__(
        self,
        out_dir="../out",
        epochs=1,
        batch_size=32,
        learning_rate=5e-4,
        device="cuda" if torch.cuda.is_available() else "cpu",
        dtype="bfloat16",
        accumulation_steps=8,
        grad_clip=1,
        warmup_iters=0,
        log_interval=100,
        save_interval=100,
        local_rank=-1,
        hidden_size=512,
        num_hidden_layers=8,
        max_seq_len=512,
        data_path="../dataset/pretrain_hq.jsonl",
    ):
        # Copy each option onto the instance, keeping the declaration order.
        options = {
            "out_dir": out_dir,
            "epochs": epochs,
            "batch_size": batch_size,
            "learning_rate": learning_rate,
            "device": device,
            "dtype": dtype,
            "accumulation_steps": accumulation_steps,
            "grad_clip": grad_clip,
            "warmup_iters": warmup_iters,
            "log_interval": log_interval,
            "save_interval": save_interval,
            "local_rank": local_rank,
            "hidden_size": hidden_size,
            "num_hidden_layers": num_hidden_layers,
            "max_seq_len": max_seq_len,
            "data_path": data_path,
        }
        for key, value in options.items():
            setattr(self, key, value)

        # Derived / late-bound attributes.
        self.save_dir = None
        self.tokens_per_iter = batch_size * max_seq_len

In [23]:
def Logger(content):
    """Print ``content`` to standard output.

    Distributed training is not considered for now, so this helper and the
    code that follows differ from the upstream source.
    """
    print(content)


def get_lr(current_step, total_steps, lr):
    """Return the cosine-annealed learning rate for ``current_step``.

    The result is ``lr / 10`` plus a half-cosine term that falls from ``lr``
    at step 0 to 0 at ``total_steps``, so the rate shrinks as training
    proceeds without ever reaching zero.

    Args:
        current_step: index of the current optimisation step.
        total_steps: total number of steps in the schedule (must be non-zero).
        lr: base learning rate.
    """
    floor = lr / 10
    phase = math.pi * current_step / total_steps
    cosine_part = 0.5 * lr * (1 + math.cos(phase))
    return floor + cosine_part


def init_model(lm_config):
    """Load the tokenizer and build the MiniMind model.

    Args:
        lm_config: configuration object passed to ``MiniMindForCausalLM``.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been moved to
        ``args.device``.

    Reads the module-level ``args``, which must exist before the call, and
    logs the number of trainable parameters (in millions).
    """
    # from_pretrained reads the tokenizer files found in the target directory.
    tokenizer = AutoTokenizer.from_pretrained('../model/')

    model = MiniMindForCausalLM(lm_config)
    model = model.to(args.device)

    trainable_millions = sum(
        param.numel() for param in model.parameters() if param.requires_grad
    ) / 1e6
    Logger(f'LLM可训练总参数量：{trainable_millions:.3f} 百万')
    return model, tokenizer


def output_logger(step, loss, iter_per_epoch, start_time):
    """Log training progress every ``args.log_interval`` steps.

    Args:
        step: zero-based batch index within the current epoch.
        loss: loss tensor of the current batch; ``loss.item()`` is reported.
        iter_per_epoch: number of batches in one epoch.
        start_time: ``time.time()`` value taken when the epoch started.

    Reads the module-level ``args``, ``epoch`` and ``optimizer``. Nothing is
    printed on steps that are not a multiple of ``args.log_interval``.
    """
    if step % args.log_interval != 0:
        return

    spend_time = time.time() - start_time
    # Projected duration of the whole epoch minus the time already spent,
    # both expressed in whole minutes.
    remaining_min = spend_time / (step + 1) * iter_per_epoch // 60 - spend_time // 60
    Logger(
        'Epoch:[{}/{}]({}/{}) loss:{:.3f} lr:{:.12f} epoch_Time:{}min:'.format(
            epoch + 1,
            args.epochs,
            step,
            iter_per_epoch,
            # The training loop in this notebook never divides the loss by
            # args.accumulation_steps, so multiplying it back here inflated
            # the reported value; log the loss as it is.
            loss.item(),
            optimizer.param_groups[-1]['lr'],
            remaining_min))

In [24]:
def train_epoch_new(epoch, train_loader):
    """Run one pass over ``train_loader``, updating the model on every batch.

    For each batch the function
      1. sets the learning rate from the cosine schedule,
      2. computes the masked token-level cross-entropy plus the model's
         ``aux_loss`` inside the autocast context,
      3. back-propagates and steps the optimizer through the gradient scaler,
      4. hands the loss to ``output_logger``.

    Args:
        epoch: zero-based index of the current epoch (feeds the LR schedule).
        train_loader: iterable of ``(X, Y, loss_mask)`` batches that supports
            ``len()``.

    Reads the module-level ``args``, ``model``, ``optimizer``, ``scaler`` and
    ``ctx``.
    """
    # Keep per-token losses (reduction='none') so loss_mask can be applied.
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    start_time = time.time()

    # Loop-invariant schedule sizes.
    iter_per_epoch = len(train_loader)
    total_steps = args.epochs * iter_per_epoch

    for step, (X, Y, loss_mask) in enumerate(train_loader):
        X = X.to(args.device)
        Y = Y.to(args.device)
        loss_mask = loss_mask.to(args.device)

        # Update the learning rate of every parameter group.
        lr = get_lr(epoch * iter_per_epoch + step, total_steps, args.learning_rate)
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

        # Only the forward pass and the loss computation run under autocast.
        with ctx:
            result = model(X)
            loss = loss_fct(result.logits.view(-1, result.logits.size(-1)), Y.view(-1)).view(Y.size())
            # Average over the positions selected by loss_mask.
            loss = (loss * loss_mask).sum() / loss_mask.sum()
            loss = loss + result.aux_loss

        # Backward pass and optimizer update happen outside the autocast
        # context, as the PyTorch AMP documentation recommends.
        optimizer.zero_grad()
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Report loss, learning rate and remaining time.
        output_logger(step, loss, iter_per_epoch, start_time)

In [25]:
# The training script, broken down step by step, follows.
# The names assigned here (args, ctx, model, scaler, optimizer, epoch) are
# module-level globals that the helper functions above read directly.
if __name__ == "__main__":
    args = Arguments()
    lm_config = MiniMindConfig(hidden_size=args.hidden_size, num_hidden_layers=args.num_hidden_layers)
    # os.path.join with a single argument returns it unchanged, so save_dir is
    # the same directory as out_dir and both makedirs calls target it.
    args.save_dir = os.path.join(args.out_dir)
    os.makedirs(args.save_dir, exist_ok=True)
    os.makedirs(args.out_dir, exist_ok=True)

    # Device selection and mixed-precision context.
    device_type = args.device

    # No-op context on CPU; otherwise CUDA autocast with its default settings
    # (args.dtype is not consulted here).
    ctx = nullcontext() if device_type == "cpu" else torch.cuda.amp.autocast()

    # Not used by the code visible in this notebook.
    tokens_per_iter = args.tokens_per_iter

    # Initialise the model and the tokenizer.
    model, tokenizer = init_model(lm_config)
    train_ds = PretrainDataset(args.data_path, tokenizer, max_length=args.max_seq_len)
    train_loader = DataLoader(
        train_ds,
        batch_size=args.batch_size,
        pin_memory=True,
        drop_last=False,
        shuffle=False
    )

    # Gradient scaler used by train_epoch_new to scale the loss before backward.
    scaler = torch.cuda.amp.GradScaler()
    optimizer = optim.AdamW(model.parameters(), lr=args.learning_rate)
    # `epoch` is also read as a global by output_logger.
    for epoch in range(args.epochs):
        train_epoch_new(epoch, train_loader)

LLM可训练总参数量：25.830 百万


KeyboardInterrupt: 