# Try DeepSpeed

In [2]:
import torch
import torch.nn as nn
from deepspeed.pipe import PipelineModule
import deepspeed
import argparse

# Define a simple two-layer MLP
class TwoLayerMLP(nn.Module):
    def __init__(self, input_dim=10, hidden_dim=20, output_dim=2):
        super(TwoLayerMLP, self).__init__()
        self.layer1 = nn.Linear(input_dim, hidden_dim)  # First layer
        self.relu = nn.ReLU(inplace=True)              # Activation
        self.layer2 = nn.Linear(hidden_dim, output_dim) # Second layer

    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return x

# Loss function for PipelineModule (required by DeepSpeed)
def loss_fn(output, target):
    return nn.MSELoss()(output, target)

# Argument parser for DeepSpeed configuration
parser = argparse.ArgumentParser()
parser.add_argument('--local_rank', type=int, default=0, help='Local rank for distributed training')
args = parser.parse_args()

# Initialize DeepSpeed distributed environment
deepspeed.init_distributed()

# Model and data parameters
input_dim, hidden_dim, output_dim = 10, 20, 2
batch_size, num_samples = 16, 64
num_gpus = torch.cuda.device_count()  # Assumes at least 2 GPUs
assert num_gpus >= 2, "This example requires at least 2 GPUs for PP and DP"

# Synthetic dataset (random data and targets)
data = torch.randn(num_samples, input_dim).cuda()
targets = torch.randn(num_samples, output_dim).cuda()
dataset = torch.utils.data.TensorDataset(data, targets)
train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define the MLP as a sequence for PipelineModule
# Split into 2 stages: layer1+relu on GPU 0, layer2 on GPU 1
layers = [
    nn.Linear(input_dim, hidden_dim),
    nn.ReLU(inplace=True),
    nn.Linear(hidden_dim, output_dim)
]

# Create PipelineModule with 2 stages (one per layer group)
model = PipelineModule(
    layers=layers,
    loss_fn=loss_fn,
    num_stages=2,  # Two pipeline stages: layer1+relu, layer2
    partition_method='parameters',  # Partition by number of parameters
)

# DeepSpeed configuration (minimal example)
ds_config = {
    "train_micro_batch_size_per_gpu": batch_size,
    "optimizer": {
        "type": "Adam",
        "params": {"lr": 0.001}
    },
    "fp16": {"enabled": True},  # Use mixed precision for efficiency
    "pipeline": {"activation_checkpointing": True}  # Reduce memory usage
}

# Initialize DeepSpeed engine (combines DP and PP)
model_engine, optimizer, _, _ = deepspeed.initialize(
    args=args,
    model=model,
    model_parameters=[p for p in model.parameters() if p.requires_grad],
    config=ds_config
)

# Training loop
num_epochs = 5
for epoch in range(num_epochs):
    model_engine.train()
    for batch_idx, (inputs, targets) in enumerate(train_loader):
        inputs, targets = inputs.cuda(), targets.cuda()
        loss = model_engine(inputs, targets)  # Forward pass with PP
        model_engine.backward(loss)          # Backward pass with PP
        model_engine.step()                  # Optimizer step with DP
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}")

# Finalize
print("Training completed!")

[2025-03-08 08:40:34,469] [INFO] [real_accelerator.py:222:get_accelerator] Setting ds_accelerator to cuda (auto detect)


df: /home/nee7ne/.triton/autotune: No such file or directory
/home/nee7ne/.conda/envs/effi_cot/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/nee7ne/.conda/envs/effi_cot/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::runtime_error::~runtime_error()@GLIBCXX_3.4'
/home/nee7ne/.conda/envs/effi_cot/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `__gxx_personality_v0@CXXABI_1.3'
/home/nee7ne/.conda/envs/effi_cot/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::ostream::tellp()@GLIBCXX_3.4'
/home/nee7ne/.conda/envs/effi_cot/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::chrono::_V2::steady_clock::now()@GLIBCXX_3.4.19'
/home/nee7ne/.conda/envs/effi_cot/compiler_compat/ld: /usr/local/cuda/lib64/libcufile.so: undefined reference to `std::string::_M_replace_aux(unsigned long, unsigned lon

SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
