In [1]:
!pip install torch torchvision --quiet


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from torchvision import datasets, transforms
import time

# Define a simple CNN model
class SimpleCNN(nn.Module):
    def __init__(self):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)  # Changed from Dropout2d to Dropout
        self.dropout2 = nn.Dropout(0.5)   # Changed from Dropout2d to Dropout
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = self.conv2(x)
        x = nn.functional.relu(x)
        x = nn.functional.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = nn.functional.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = nn.functional.log_softmax(x, dim=1)
        return output

# Single process training function
def train_single_process(model, device, train_loader, optimizer, epochs):
    model.train()
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = nn.functional.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} '
                      f'({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')

# Function for each process in multi-process training
def train_worker(rank, model, device, train_loader, optimizer, epochs):
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = model(data)
            loss = nn.functional.nll_loss(output, target)
            loss.backward()
            optimizer.step()
            if batch_idx % 100 == 0:
                print(f'Process {rank}, Epoch {epoch}: {batch_idx * len(data)}/{len(train_loader.dataset)}')

# Multi-process training function
def train_multi_process(model, device, train_dataset, optimizer, epochs, num_processes):
    model.share_memory()  # Required for multiprocessing
    processes = []
    for rank in range(num_processes):
        # Create a DataLoader for each process
        train_loader = torch.utils.data.DataLoader(train_dataset,
                                                   batch_size=64,
                                                   shuffle=True,
                                                   num_workers=0)  # Changed to 0 to avoid issues with multiprocessing
        p = mp.Process(target=train_worker, args=(rank, model, device, train_loader, optimizer, epochs))
        p.start()
        processes.append(p)
    
    for p in processes:
        p.join()

# Main function to run both single and multi-process training
def main():
    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")
    
    transform = transforms.Compose([transforms.ToTensor(),
                                    transforms.Normalize((0.1307,), (0.3081,))])
    
    train_dataset = datasets.MNIST('../data', train=True, download=True, transform=transform)
    
    # Single process training
    model_single = SimpleCNN().to(device)
    optimizer_single = optim.SGD(model_single.parameters(), lr=0.01, momentum=0.5)
    train_loader_single = torch.utils.data.DataLoader(train_dataset, batch_size=64, shuffle=True)
    
    start_time = time.time()
    train_single_process(model_single, device, train_loader_single, optimizer_single, epochs=1)
    single_process_time = time.time() - start_time
    
    # Multi-process training
    model_multi = SimpleCNN().to(device)
    optimizer_multi = optim.SGD(model_multi.parameters(), lr=0.01, momentum=0.5)
    
    start_time = time.time()
    train_multi_process(model_multi, device, train_dataset, optimizer_multi, epochs=1, num_processes=4)
    multi_process_time = time.time() - start_time
    
    print(f"Single process training time: {single_process_time:.2f} seconds")
    print(f"Multi-process training time: {multi_process_time:.2f} seconds")
    print(f"Speedup: {single_process_time / multi_process_time:.2f}x")

if __name__ == '__main__':
    mp.set_start_method('spawn')
    main()



Single process training time: 99.38 seconds
Multi-process training time: 0.21 seconds
Speedup: 464.05x


Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/codespace/.python/current/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.python/current/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AttributeError: Can't get attribute 'train_worker' on <module '__main__' (<class '_frozen_importlib.BuiltinImporter'>)>
Traceback (most recent call last):
  File "<string>", line 1, in <module>
  File "/home/codespace/.python/current/lib/python3.12/multiprocessing/spawn.py", line 122, in spawn_main
    exitcode = _main(fd, parent_sentinel)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/codespace/.python/current/lib/python3.12/multiprocessing/spawn.py", line 132, in _main
    self = reduction.pickle.load(from_parent)
           ^