In [1]:
import torch

# Environment check: list every CUDA device PyTorch can see, or say that none
# were found so the reader knows the later cells would fall back to the CPU.
if not torch.cuda.is_available():
    print("No GPU available. Please check your CUDA installation.")
else:
    num_gpus = torch.cuda.device_count()
    print(f"Number of GPUs: {num_gpus}")

    # One name line and one compute-capability line per device index.
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(f"  CUDA Capability: {torch.cuda.get_device_capability(i)}")

Number of GPUs: 8
GPU 0: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)
GPU 1: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)
GPU 2: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)
GPU 3: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)
GPU 4: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)
GPU 5: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)
GPU 6: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)
GPU 7: NVIDIA A100-PCIE-40GB
  CUDA Capability: (8, 0)


In [2]:
# Switch the kernel's working directory to the minimind checkout and print it.
# Later cells use paths relative to this directory (./dataset/...,
# ./model/minimind_tokenizer) and import the local `model` package.
%cd ../no_git_oic/minimind
!pwd

/mnt/data/llch/for_llm_upgrade/no_git_oic/minimind
/mnt/data/llch/for_llm_upgrade/no_git_oic/minimind


In [None]:
# Run this from a terminal instead; executing it inside the notebook UI is very slow.
# modelscope download --dataset gongjy/minimind_dataset --local_dir ./dataset

Downloading Dataset to directory: /mnt/data/llch/for_llm_upgrade/0_预训练/dataset
Processing 15 items:   0%|                          | 0.00/15.0 [00:00<?, ?it/s]
Downloading [.gitattributes]:   0%|                 | 0.00/3.73k [00:00<?, ?B/s][A

Downloading [images/1.png]:   0%|                    | 0.00/9.00 [00:00<?, ?B/s][A[A


Downloading [images/dataset.jpg]:   0%|              | 0.00/146k [00:00<?, ?B/s][A[A[A



Downloading [dataset_infos.json]:   0%|               | 0.00/165 [00:00<?, ?B/s][A[A[A[A




Downloading [dpo.jsonl]:   0%|                       | 0.00/867M [00:00<?, ?B/s][A[A[A[A[A





Downloading [images/logo.png]:   0%|                 | 0.00/495k [00:00<?, ?B/s][A[A[A[A[A[A






Downloading [lora_identity.jsonl]:   0%|            | 0.00/22.3k [00:00<?, ?B/s][A[A[A[A[A[A[A







Downloading [lora_medical.jsonl]:   0%|             | 0.00/32.4M [00:00<?, ?B/s][A[A[A[A[A[A[A[A

Downloading [images/1.png]: 100%|█████████████| 9.00/9.

In [21]:
# Step 1: Import the required libraries
import os
import math
import warnings
import torch
from torch import optim, nn
import torch.distributed as dist
from torch.utils.tensorboard import SummaryWriter  # 新增TensorBoard
from torch.utils.data import DataLoader, DistributedSampler
from transformers import AutoTokenizer
from model.model import MiniMindLM
from model.LMConfig import LMConfig
from model.dataset import PretrainDataset

# Suppress every warning category for the rest of the kernel session.
# NOTE(review): this also hides deprecation and scheduler-order warnings that
# can be useful while debugging the training loop — consider narrowing it.
warnings.filterwarnings('ignore')

In [None]:
# Step 2: Training configuration
class Args:
    """Plain attribute bag holding every setting the training cells read.

    All values are fixed defaults assigned in ``__init__``; change them on the
    instance after construction to tune a run.
    """

    def __init__(self):
        cuda_ready = torch.cuda.is_available()

        # --- run length, output location and optimisation basics ---
        self.out_dir = "out"
        self.epochs = 3
        self.batch_size = 32
        self.learning_rate = 5e-4

        # --- hardware / precision ---
        # NOTE(review): the original author left an open question here about
        # whether this should be "cuda:0" rather than plain "cuda".
        self.device = "cuda" if cuda_ready else "cpu"
        self.dtype = "bfloat16"
        self.num_workers = 16

        # --- training-loop cadence ---
        self.accumulation_steps = 4
        self.grad_clip = 1.0  # gradient-clipping threshold (max gradient norm)
        self.log_interval = 50
        self.save_interval = 100

        # --- model shape, forwarded to LMConfig ---
        self.dim = 512
        self.n_layers = 8
        self.max_seq_len = 512
        self.use_moe = False

        # --- data and distributed settings ---
        self.data_path = "./dataset/pretrain_hq.jsonl"
        self.ddp = False  # when True the notebook must be launched through torchrun
        self.local_rank = -1

# Build the single configuration object that the cells below read as a global.
args = Args()
# Echo one field as the cell output to confirm the object was created.
args.dim

512

In [14]:
# Step 3: Set up TensorBoard logging
# Event files are written to the "tensorboard" sub-directory of args.out_dir.
tensorboard_dir = os.path.join(args.out_dir, 'tensorboard')
writer = SummaryWriter(log_dir=tensorboard_dir)

In [None]:
# Step 4: Distributed-training initialisation
def init_distributed():
    """Join the NCCL process group and bind this process to its local GPU.

    Does nothing unless ``args.ddp`` is truthy. Otherwise it initialises the
    default process group with the NCCL backend, copies the ``LOCAL_RANK``
    environment variable into ``args.local_rank``, points ``args.device`` at
    ``cuda:<local_rank>`` and makes that device the current CUDA device.

    Raises:
        KeyError: if ``args.ddp`` is set but ``LOCAL_RANK`` is not in the
            environment.
    """
    if args.ddp:
        # NCCL is the backend chosen here for multi-GPU training.
        dist.init_process_group(backend="nccl")

        local_rank = int(os.environ["LOCAL_RANK"])
        args.local_rank = local_rank
        args.device = torch.device(f"cuda:{local_rank}")
        torch.cuda.set_device(args.device)
# Display the current rank/device pair; this cell only defines init_distributed()
# and does not call it, so these are still the values assigned in Args.
args.local_rank, args.device

(-1, 'cuda')

In [17]:
# Step 5: Model and tokenizer initialisation
def init_model():
    """Build the language model and load its tokenizer.

    The model configuration is taken from ``args`` (``dim``, ``n_layers``,
    ``max_seq_len``, ``use_moe``), the tokenizer is loaded from
    ``./model/minimind_tokenizer``, and the model is moved to ``args.device``.
    When ``args.ddp`` is truthy the model is wrapped in
    ``DistributedDataParallel`` pinned to ``args.local_rank``.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    config_fields = dict(
        dim=args.dim,
        n_layers=args.n_layers,
        max_seq_len=args.max_seq_len,
        use_moe=args.use_moe,
    )
    lm_config = LMConfig(**config_fields)

    tokenizer = AutoTokenizer.from_pretrained('./model/minimind_tokenizer')
    model = MiniMindLM(lm_config).to(args.device)

    # Single-process run: hand back the bare model.
    if not args.ddp:
        return model, tokenizer

    # Distributed run: wrap the model so gradients are synchronised across ranks.
    rank = args.local_rank
    ddp_model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[rank], output_device=rank
    )
    return ddp_model, tokenizer

In [18]:
# Step 6: Dataset and DataLoader
def prepare_dataloader(tokenizer):
    """Create the training ``DataLoader`` over the pre-training dataset.

    Args:
        tokenizer: tokenizer handed straight to ``PretrainDataset``.

    Returns:
        DataLoader: batches of ``args.batch_size`` samples with incomplete final
        batches dropped. With ``args.ddp`` a ``DistributedSampler`` drives the
        ordering; otherwise the loader shuffles by itself.
    """
    dataset = PretrainDataset(args.data_path, tokenizer, max_length=args.max_seq_len)

    # A sampler and shuffle=True are mutually exclusive in DataLoader, so only
    # one of the two is ever active.
    if args.ddp:
        sampler = DistributedSampler(dataset)
        shuffle = False
    else:
        sampler = None
        shuffle = True

    loader_options = dict(
        batch_size=args.batch_size,
        shuffle=shuffle,
        sampler=sampler,
        num_workers=args.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    return DataLoader(dataset, **loader_options)

In [19]:
# Step 7: Optimizer and learning-rate scheduler
def create_optimizer_and_scheduler(model, total_steps=None):
    """Create the AdamW optimizer and a cosine-annealing LR scheduler.

    Args:
        model: module whose ``parameters()`` are optimised.
        total_steps: number of ``scheduler.step()`` calls the schedule should
            span. When omitted it is derived from the notebook-level
            ``train_loader`` as ``len(train_loader) * args.epochs``, so that
            global must already exist in that case.

    Returns:
        tuple: ``(optimizer, scheduler)``.

    Raises:
        NameError: if ``total_steps`` is omitted and ``train_loader`` has not
            been defined yet.
    """
    if total_steps is None:
        # Backward-compatible path: fall back to the notebook global.
        total_steps = len(train_loader) * args.epochs

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=args.learning_rate,
        weight_decay=0.01,
    )

    # T_max == 0 makes CosineAnnealingLR divide by zero on its first step
    # (e.g. an empty loader with drop_last=True), so clamp to at least one step.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer,
        T_max=max(1, int(total_steps)),
    )

    return optimizer, scheduler

In [20]:
# Step 8: 训练循环
def train(model, train_loader, optimizer, scheduler):
    model.train()
    loss_fct = nn.CrossEntropyLoss(reduction='none')
    scaler = torch.amp.GradScaler(enabled=args.dtype in ['float16', 'bfloat16'])
    
    for epoch in range(args.epochs):
        if args.ddp:
            train_loader.sampler.set_epoch(epoch)
            
        for step, (X, Y, loss_mask) in enumerate(train_loader):
            X, Y, loss_mask = X.to(args.device), Y.to(args.device), loss_mask.to(args.device)
            
            with torch.cuda.amp.autocast(dtype=args.dtype):
                outputs = model(X)
                loss = loss_fct(
                    outputs.logits.view(-1, outputs.logits.size(-1)),
                    Y.view(-1)
                )
                loss = (loss * loss_mask).sum() / loss_mask.sum()
                loss += outputs.aux_loss  # 添加辅助损失（如MoE损失）
                loss = loss / args.accumulation_steps
                
            scaler.scale(loss).backward()
            
            if (step + 1) % args.accumulation_steps == 0:
                scaler.unscale_(optimizer)
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)
                
            scheduler.step()
            
            # 日志记录
            if step % args.log_interval == 0:
                current_lr = scheduler.get_last_lr()[0]
                writer.add_scalar('Loss/train', loss.item(), epoch * len(train_loader) + step)
                writer.add_scalar('Learning Rate', current_lr, epoch * len(train_loader) + step)
                
                print(f"Epoch {epoch+1}/{args.epochs} | Step {step}/{len(train_loader)} | "
                      f"Loss: {loss.item():.4f} | LR: {current_lr:.6f}")
                
            # 模型保存
            if step % args.save_interval == 0 and (not args.ddp or dist.get_rank() == 0):
                checkpoint = {
                    'model': model.module.state_dict() if args.ddp else model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                    'epoch': epoch,
                    'step': step
                }
                torch.save(checkpoint, os.path.join(args.out_dir, f'checkpoint_{epoch}_{step}.pt'))

In [22]:
# Main execution flow
try:
    init_distributed()
    model, tokenizer = init_model()
    train_loader = prepare_dataloader(tokenizer)
    optimizer, scheduler = create_optimizer_and_scheduler(model)
    train(model, train_loader, optimizer, scheduler)
finally:
    # Close the TensorBoard writer even when a step above raises, so buffered
    # events are flushed to disk and the file handle is released.
    writer.close()

TypeError: set_autocast_dtype(): argument 'dtype' (position 2) must be torch.dtype, not str