In [1]:
import torch

In [20]:
torch.cuda.device_count()

4

In [3]:
import os
import sys
import tempfile
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp

from torch.nn.parallel import DistributedDataParallel as DDP

# On Windows platform, the torch.distributed package only
# supports Gloo backend, FileStore and TcpStore.
# For FileStore, set init_method parameter in init_process_group
# to a local file. Example as follow:
# init_method="file:///f:/libtmp/some_file"
# dist.init_process_group(
#    "gloo",
#    rank=rank,
#    init_method=init_method,
#    world_size=world_size)
# For TcpStore, same way as on Linux.

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    # initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

In [4]:
class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic(rank, world_size):
    print(f"Running basic DDP example on rank {rank}.")
    setup(rank, world_size)

    # create model and move it to GPU with id rank
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    outputs = ddp_model(torch.randn(20, 10))
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


def run_demo(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)

In [6]:
run_demo(demo_basic, 4)

ProcessExitedException: process 0 terminated with exit code 1

In [19]:
net = ToyModel()
# net = torch.nn.parallel.DistributedDataParallel(net, device_ids=['cuda:0', 'cuda:1', 'cuda:2', 'cuda:3'])
net = torch.nn.parallel.DistributedDataParallel(net, device_ids=None)

RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.

In [17]:
%%time
out = net(torch.randn(10000000, 10))

CPU times: user 1.12 s, sys: 271 ms, total: 1.39 s
Wall time: 1.39 s


In [2]:
import torch
from torch import nn

In [39]:
parser = argparse.ArgumentParser()
parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                    help='number of data loading workers (default: 4)')
parser.add_argument('-g', '--gpus', default=1, type=int,
                    help='number of gpus per node')
parser.add_argument('-nr', '--nr', default=0, type=int,
                    help='ranking within the nodes')
parser.add_argument('--epochs', default=2, type=int, metavar='N',
                    help='number of total epochs to run')
args = parser.parse_args(args='')

In [50]:
import os
from datetime import datetime
import argparse
import torch.multiprocessing as mp
import torchvision
import torchvision.transforms as transforms
import torch
import torch.nn as nn
import torch.distributed as dist

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N',
                        help='number of data loading workers (default: 4)')
    parser.add_argument('-g', '--gpus', default=1, type=int,
                        help='number of gpus per node')
    parser.add_argument('-nr', '--nr', default=0, type=int,
                        help='ranking within the nodes')
    parser.add_argument('--epochs', default=2, type=int, metavar='N',
                        help='number of total epochs to run')
    args = parser.parse_args('')
    args.world_size = args.gpus * args.nodes
    print(args)
    os.environ['MASTER_ADDR'] = '10.57.23.164'
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    mp.spawn(train, nprocs=args.gpus, args=(args,), join=False)

def train(gpu, args):
    print(f'training on gpu {gpu}')
    rank = args.nr * args.gpus + gpu
    dist.init_process_group(backend='nccl', init_method='env://', world_size=args.world_size, rank=rank)
    torch.manual_seed(0)
    model = ConvNet()
    torch.cuda.set_device(gpu)
    model.cuda(gpu)
    batch_size = 100
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    optimizer = torch.optim.SGD(model.parameters(), 1e-4)
    # Wrap the model
    model = nn.parallel.DistributedDataParallel(model, device_ids=[gpu])
    # Data loading code

    transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
    #     torchvision.transforms.Resize((h, w), ), 
    #     Rearrange('c (nph psh) (npw psw) -> (nph npw) (psh psw c)', nph=8, npw=8),
    ])
    train_dataset = torchvision.datasets.CIFAR10(root='~/datasets/cifar10/', 
                                            train=True, 
                                            download=True, 
                                            transform=transform)
    
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset,
                                                                    num_replicas=args.world_size,
                                                                    rank=rank)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               num_workers=0,
                                               pin_memory=True,
                                               sampler=train_sampler)

    start = datetime.now()
    total_step = len(train_loader)
    for epoch in range(args.epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.cuda(non_blocking=True)
            labels = labels.cuda(non_blocking=True)
            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 100 == 0 and gpu == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, args.epochs, i + 1, total_step,
                                                                         loss.item()))
    if gpu == 0:
        print("Training complete in: " + str(datetime.now() - start))


if __name__ == '__main__':
    main()

Namespace(epochs=2, gpus=1, nodes=1, nr=0, world_size=1)


In [51]:
def hello():
    print('hello')
mp.spawn(hello, nprocs=1, args=(), join=True)

ProcessExitedException: process 0 terminated with exit code 1

In [23]:
from tqdm.notebook import tqdm

In [10]:
import torchvision
import time

In [47]:
class ConvNet(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.fc = nn.Linear(8*8*32, num_classes)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        print(out.device)
        return out



In [48]:
def train(net):
    torch.manual_seed(0)
    batch_size = 1000
    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(net.parameters(), 1e-2)
    # Data loading code
    transform = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
    #     torchvision.transforms.Resize((h, w), ), 
    #     Rearrange('c (nph psh) (npw psw) -> (nph npw) (psh psw c)', nph=8, npw=8),
    ])
    print('creating ds')
    train_dataset = torchvision.datasets.CIFAR10(root='~/datasets/cifar10/', 
                                            train=True, 
                                            download=True, 
                                            transform=transform)
    
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=batch_size,
                                               shuffle=True,)

    start = time.time()
    total_step = len(train_loader)
    print('starting training')
    n_epochs = 3
    for epoch in range(n_epochs):
        for i, (images, labels) in enumerate(train_loader):
            images = images.to('cuda:0')
            labels = labels.to('cuda:0')
            # Forward pass
            outputs = net(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if (i + 1) % 10 == 0:
                print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch + 1, n_epochs, i + 1, total_step,
                                                                         loss.item()))

    print("Training complete in: " + str(time.time() - start))
    
net = ConvNet().to('cuda:0')
net = nn.DataParallel(net, [0, 1, 2, 3])
train(net)

creating ds
Files already downloaded and verified
starting training
cuda:0
cuda:1
cuda:3
cuda:2
cuda:0
cuda:2
cuda:1
cuda:3
cuda:0
cuda:2
cuda:1
cuda:3
cuda:0cuda:2

cuda:1
cuda:3
cuda:0
cuda:2
cuda:1cuda:3

cuda:0
cuda:3
cuda:1
cuda:2
cuda:0
cuda:2
cuda:1cuda:3

cuda:0cuda:1

cuda:3
cuda:2
cuda:2
cuda:0
cuda:3cuda:1

cuda:1cuda:2

cuda:3cuda:0

Epoch [1/3], Step [10/50], Loss: 2.0629
cuda:0
cuda:2
cuda:1cuda:3

cuda:0cuda:2
cuda:1

cuda:3
cuda:0
cuda:1cuda:3

cuda:2
cuda:0cuda:2

cuda:1
cuda:3
cuda:0
cuda:2cuda:1
cuda:3

cuda:0
cuda:2cuda:1

cuda:3
cuda:2cuda:0cuda:1
cuda:3


cuda:0
cuda:1
cuda:2
cuda:3
cuda:0
cuda:2
cuda:1cuda:3

cuda:0cuda:3
cuda:1
cuda:2

Epoch [1/3], Step [20/50], Loss: 1.9158
cuda:0cuda:2
cuda:1

cuda:3
cuda:1cuda:0cuda:2


cuda:3
cuda:0
cuda:2
cuda:1
cuda:3
cuda:0
cuda:1
cuda:2
cuda:3
cuda:0cuda:3
cuda:2

cuda:1
cuda:0
cuda:2
cuda:1cuda:3

cuda:2
cuda:0cuda:3

cuda:1
cuda:0
cuda:2
cuda:1
cuda:3
cuda:0cuda:2

cuda:3
cuda:1
cuda:2cuda:0
cuda:1

cuda:3
Epoch [1/3],