<a href="https://colab.research.google.com/github/brynelee/deepspeedtrial/blob/main/singleNodeSingleCardTrainDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install torch torchvision torchaudio



single node single card training

In [7]:
"""(SNSC) Single Node Single GPU Card Training"""
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms

BATCH_SIZE = 256
EPOCHS = 5

if __name__ == "__main__":

# 1. define network
    device = "cuda"
    net = torchvision.models.resnet18(num_classes=10)
    net = net.to(device=device)

# 2. define dataloader
    trainset = torchvision.datasets.CIFAR10(
      root="./data",
      train=True,
      download=True,
      transform=transforms.Compose(
          [
              transforms.RandomCrop(32, padding=4),
              transforms.RandomHorizontalFlip(),
              transforms.ToTensor(),
              transforms.Normalize(
                  (0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)
              ),
          ]
      ),
    )
    train_loader = torch.utils.data.DataLoader(
        trainset,
        batch_size=BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=True,
    )

# 3. define loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=0.01,
        momentum=0.9,
        weight_decay=0.0001,
        nesterov=True,
    )

    print("            =======  Training  ======= \n")
# 4. start to train
    net.train()
    for ep in range(1, EPOCHS + 1):
            train_loss = correct = total = 0

    for idx, (inputs, targets) in enumerate(train_loader):
      inputs, targets = inputs.to(device), targets.to(device)
      outputs = net(inputs)

      loss = criterion(outputs, targets)
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      train_loss += loss.item()
      total += targets.size(0)
      correct += torch.eq(outputs.argmax(dim=1), targets).sum().item()

      if (idx + 1) % 10 == 0 or (idx + 1) == len(train_loader):
        print(
          "   == step: [{:3}/{}] [{}/{}] | loss: {:.3f} | acc: {:6.3f}%".format(
          idx + 1,
          len(train_loader),
          ep,
          EPOCHS,
          train_loss / (idx + 1),
          100.0 * correct / total,
              )
        )

    print("\n            =======  Training Finished  ======= \n")



Files already downloaded and verified

   == step: [ 10/196] [5/5] | loss: 2.313 | acc: 15.938%
   == step: [ 20/196] [5/5] | loss: 2.197 | acc: 20.879%
   == step: [ 30/196] [5/5] | loss: 2.108 | acc: 23.776%
   == step: [ 40/196] [5/5] | loss: 2.046 | acc: 26.191%
   == step: [ 50/196] [5/5] | loss: 1.996 | acc: 27.875%
   == step: [ 60/196] [5/5] | loss: 1.954 | acc: 29.206%
   == step: [ 70/196] [5/5] | loss: 1.919 | acc: 30.352%
   == step: [ 80/196] [5/5] | loss: 1.887 | acc: 31.377%
   == step: [ 90/196] [5/5] | loss: 1.857 | acc: 32.378%
   == step: [100/196] [5/5] | loss: 1.833 | acc: 33.180%
   == step: [110/196] [5/5] | loss: 1.811 | acc: 33.860%
   == step: [120/196] [5/5] | loss: 1.791 | acc: 34.603%
   == step: [130/196] [5/5] | loss: 1.775 | acc: 35.219%
   == step: [140/196] [5/5] | loss: 1.757 | acc: 35.742%
   == step: [150/196] [5/5] | loss: 1.742 | acc: 36.289%
   == step: [160/196] [5/5] | loss: 1.725 | acc: 36.875%
   == step: [170/196] [5/5] | loss: 1.710 | acc: 