In [1]:
import os
import torch
import torch.distributed as dist
from torch.multiprocessing import Process
from torchvision import datasets, transforms
from train import *
from dataload import *
from model import *
#import time

In [2]:
def distributed_is_initialized():
    """Report whether a default torch.distributed process group is active.

    Returns:
        bool: True only when the distributed package is available in this
        build and a process group has been initialized; False otherwise.
    """
    # Short-circuit: is_initialized() is only queried when the package exists.
    return bool(dist.is_available() and dist.is_initialized())

In [3]:
def run(args):
    """Build the model, optimizer and MNIST loaders, then train.

    Args:
        args: mapping that provides the keys 'lr', 'root', 'batch_size'
            and 'epochs'.
    """
    # Training is pinned to the CPU here.
    device = torch.device('cpu')
    print(device)

    net = Net()
    distributed = distributed_is_initialized()
    print("is_distributed:", distributed)

    # Move the module to the target device first, then wrap it in
    # DistributedDataParallel only when a process group is active.
    net.to(device)
    model = torch.nn.parallel.DistributedDataParallel(net) if distributed else net

    optimizer = torch.optim.Adam(model.parameters(), lr=args['lr'])

    # Train split first, then test split, with otherwise identical settings.
    train_loader, test_loader = [
        MNISTDataLoader(args['root'], args['batch_size'], train=is_train, distributed=distributed)
        for is_train in (True, False)
    ]

    Trainer(model, optimizer, train_loader, test_loader, device).fit(args['epochs'])

In [4]:
def main():
    """Configure the job, set up the process group if needed, and train.

    The configuration is hard-coded in the ``argv`` dict below. When
    ``world_size`` is greater than 1, a torch.distributed process group is
    initialized before training and destroyed again afterwards, so the cell
    can be re-run in the same kernel without a double-initialization error.
    A group that already exists when this function is called is reused and
    left untouched.
    """
    argv = {'world_size': 2,
            'rank': 0,
            'epochs': 40,
            'back_end': 'gloo',
            'init_method': 'tcp://10.1.1.101:23456',
            'lr': 1e-3,
            'root': 'data',
            'batch_size': 32
           }

    print(argv)

    # Only create a group when one is required and none exists yet;
    # init_process_group raises if the default group is initialized twice.
    created_group = False
    if argv['world_size'] > 1 and not dist.is_initialized():
        dist.init_process_group(
            backend=argv['back_end'],
            init_method=argv['init_method'],
            world_size=argv['world_size'],
            rank=argv['rank'],
        )
        created_group = True

    print('Start Run')
    try:
        run(argv)
    finally:
        # Release the group even if training fails, but only if this call
        # was the one that created it.
        if created_group:
            dist.destroy_process_group()

In [5]:
# Setting MASTER_ADDR / MASTER_PORT is left disabled: main() passes an
# explicit tcp:// init_method to init_process_group.
#os.environ['MASTER_ADDR'] = 'localhost'
#os.environ['MASTER_PORT'] = '23456'
# Start the training run with the configuration hard-coded in main().
main()


{'world_size': 2, 'rank': 0, 'epochs': 40, 'back_end': 'nccl', 'init_method': 'tcp://10.1.1.101:23456', 'lr': 0.001, 'root': 'data', 'batch_size': 32}
Start Run
cuda
is_distributed: True
Epoch: 1/40, train loss: 0.610072, train acc: 77.30%, test loss: 0.455231, test acc: 82.95%.
Epoch: 2/40, train loss: 0.397138, train acc: 85.36%, test loss: 0.394385, test acc: 85.62%.
Epoch: 3/40, train loss: 0.346577, train acc: 86.93%, test loss: 0.360034, test acc: 86.95%.
Epoch: 4/40, train loss: 0.315841, train acc: 88.18%, test loss: 0.342529, test acc: 87.59%.
Epoch: 5/40, train loss: 0.294255, train acc: 88.98%, test loss: 0.321948, test acc: 88.41%.
Epoch: 6/40, train loss: 0.277202, train acc: 89.55%, test loss: 0.310230, test acc: 88.48%.
Epoch: 7/40, train loss: 0.262937, train acc: 90.02%, test loss: 0.309009, test acc: 88.73%.
Epoch: 8/40, train loss: 0.252971, train acc: 90.48%, test loss: 0.305233, test acc: 88.93%.
Epoch: 9/40, train loss: 0.243077, train acc: 90.80%, test loss: 0.30