In [None]:
import torch
from torchvision import transforms, models
from datasets import load_dataset
import time
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Fix the random seeds for PyTorch and NumPy so repeated runs are comparable.
SEED = 710

torch.manual_seed(SEED)
# NumPy is seeded last on purpose: this call returns None, so the cell prints nothing.
np.random.seed(SEED)




In [None]:
# Load the "train" split of the "Maysee/tiny-imagenet" dataset.
tiny_imagenet = load_dataset(path="Maysee/tiny-imagenet", split="train")

# Display the first record as a quick sanity check of its structure.
tiny_imagenet[0]

In [None]:
# Use the GPU when CUDA is available; otherwise run everything on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Show which device was picked.
device

In [None]:
# Convert to torch dataset

class TinyImageNet(torch.utils.data.Dataset):
    """Adapt an indexable dataset of ``{"image", "label"}`` records to a torch ``Dataset``.

    Args:
        dataset: Object supporting ``len()`` and integer indexing, where each
            item is a mapping with an ``"image"`` entry (must provide
            ``.convert("RGB")``; presumably a PIL image) and a ``"label"`` entry
            (an integer class id).
        transform: Optional callable applied to the RGB image before it is
            returned. ``None`` disables the transformation.
    """

    def __init__(self, dataset, transform=None):
        self.dataset = dataset
        self.transform = transform

    def __len__(self):
        """Return the number of records in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for record ``idx``.

        The image is converted to RGB and passed through ``transform`` when one
        was given; the label is returned as a 0-dim ``torch.int64`` tensor.
        """
        # Fetch the record once: every lookup on the wrapped dataset may re-read
        # (and re-decode) the sample, so indexing twice doubles the loading cost.
        record = self.dataset[idx]
        x, y = record["image"], record["label"]

        # Force three channels so grayscale / palette images match the others.
        x = x.convert("RGB")
        if self.transform is not None:
            x = self.transform(x)
        y = torch.tensor(y, dtype=torch.int64)
        return x, y

# Preprocessing pipeline: resize every image to 224x224, then convert it to a tensor.
preprocessing_steps = [
    transforms.Resize(size=(224, 224)),
    transforms.ToTensor(),
]
transform = transforms.Compose(preprocessing_steps)

# Wrap the loaded dataset so it can be consumed by a torch DataLoader.
tiny_imagenet_torch = TinyImageNet(dataset=tiny_imagenet, transform=transform)

# Display the Python type of a single dataset item.
type(tiny_imagenet_torch[0])

### Compare the effect of the number of workers on data loading time

In [None]:
def time_load_transfer_train(data_loader, num_batches, rolling_window=20,
                             model=None, optimizer=None, device=None):
    """Time data loading, host-to-device transfer and training steps per batch.

    Iterates over ``data_loader`` for exactly ``num_batches`` batches (fewer if
    the loader is exhausted first). For every batch it records:

    * how long the loader took to yield the batch,
    * how long moving ``x`` and ``y`` to ``device`` took,
    * how long one forward pass, backward pass and optimizer step took.

    Each series is then smoothed with a rolling mean of width ``rolling_window``.

    Args:
        data_loader: Iterable yielding ``(x, y)`` batches of tensors.
        num_batches: Number of batches to time; must be positive.
        rolling_window: Width of the rolling-mean window. It is clamped to the
            number of timed batches so that short runs still produce a value.
        model: Module to train. Defaults to the notebook-level ``model``.
        optimizer: Optimizer stepping ``model``. Defaults to the notebook-level
            ``optimizer``.
        device: Target device. Defaults to the notebook-level ``device``.

    Returns:
        dict with the smoothed series under ``"data_load_times"``,
        ``"gpu_transfer_times"`` and ``"forward_backward_times"`` (NumPy arrays)
        and their means, rounded to two decimals, under ``"data_load_mean"``,
        ``"gpu_transfer_mean"`` and ``"forward_backward_mean"``.

    Raises:
        ValueError: If ``num_batches`` is not positive or the loader yields no
            batch.
        KeyError: If ``model``/``optimizer``/``device`` is omitted and no
            notebook-level object of that name exists.
    """
    if num_batches <= 0:
        raise ValueError("num_batches must be a positive integer")

    # Fall back to the notebook-level objects so existing calls keep working.
    notebook_scope = globals()
    if model is None:
        model = notebook_scope["model"]
    if optimizer is None:
        optimizer = notebook_scope["optimizer"]
    if device is None:
        device = notebook_scope["device"]
    device = torch.device(device)

    def _wait_for_gpu():
        # CUDA copies and kernels are queued asynchronously. Without an explicit
        # synchronization the host clock only measures launch overhead, and the
        # pending work is wrongly charged to whichever later call happens to
        # block (typically the next host-to-device copy).
        if device.type == "cuda":
            torch.cuda.synchronize(device)

    data_load_times = []
    gpu_transfer_times = []
    forward_backward_times = []

    # Drain any work queued before the benchmark (e.g. moving the model).
    _wait_for_gpu()

    data_load_start_time = time.perf_counter()
    for i, (x, y) in enumerate(data_loader):
        # Time spent waiting for the loader to hand over this batch.
        data_load_times.append(time.perf_counter() - data_load_start_time)

        # Host-to-device transfer.
        gpu_transfer_start_time = time.perf_counter()
        x = x.to(device)
        y = y.to(device)
        _wait_for_gpu()
        gpu_transfer_times.append(time.perf_counter() - gpu_transfer_start_time)

        # Reset gradients accumulated by the previous step.
        optimizer.zero_grad()

        forward_backward_start_time = time.perf_counter()
        # Forward pass.
        y_hat = model(x)
        loss = torch.nn.functional.cross_entropy(y_hat, y)
        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()
        _wait_for_gpu()
        forward_backward_times.append(time.perf_counter() - forward_backward_start_time)

        # Stop after exactly `num_batches` batches, before the loader is asked
        # for another one.
        if i + 1 >= num_batches:
            break

        # Restart the loading clock only after the training step has finished.
        data_load_start_time = time.perf_counter()

    num_timed = len(data_load_times)
    if num_timed == 0:
        raise ValueError("data_loader yielded no batches to time")

    # A window wider than the series would make np.convolve swap its operands
    # and return a meaningless result, so clamp it to the available samples.
    window = max(1, min(int(rolling_window), num_timed))
    kernel = np.ones(window) / window

    def _rolling_mean(values):
        # Rolling average over full windows only (mode='valid').
        return np.convolve(np.asarray(values, dtype=float), kernel, mode='valid')

    data_load_times = _rolling_mean(data_load_times)
    gpu_transfer_times = _rolling_mean(gpu_transfer_times)
    forward_backward_times = _rolling_mean(forward_backward_times)

    return {
        "data_load_mean": np.round(np.mean(data_load_times), 2),
        "gpu_transfer_mean": np.round(np.mean(gpu_transfer_times), 2),
        "forward_backward_mean": np.round(np.mean(forward_backward_times), 2),
        "data_load_times": data_load_times,
        "gpu_transfer_times": gpu_transfer_times,
        "forward_backward_times": forward_backward_times,
    }



num_batches = 100
batch_size = 256
batch_ids = None  # x-axis values, computed from the first run and reused afterwards

# Repeat the same timing run while varying only the number of loader workers.
for num_workers in (0, 2, 4, 8):
    # Start every run from a fresh pretrained ResNet-18 and a fresh optimizer.
    # These names are module-level on purpose: the timing function reads them.
    model = models.resnet18(pretrained=True)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    model.to(device)

    data_loader = torch.utils.data.DataLoader(
        tiny_imagenet_torch,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers,
    )
    data_load_times_dict = time_load_transfer_train(
        data_loader=data_loader,
        num_batches=num_batches,
    )

    if batch_ids is None:
        batch_ids = np.arange(len(data_load_times_dict['data_load_times']))

    # One curve per timed phase, labelled with that phase's mean.
    curves = [
        ('data_load_times', f"load time avg {data_load_times_dict['data_load_mean']}"),
        ('gpu_transfer_times', f"gpu transfer time avg {data_load_times_dict['gpu_transfer_mean']}"),
        ('forward_backward_times', f"forward backward time avg {data_load_times_dict['forward_backward_mean']}"),
    ]

    plt.figure(figsize=(10, 5))
    for series_key, legend_label in curves:
        plt.plot(batch_ids, data_load_times_dict[series_key], label=legend_label, marker="o", alpha=0.5)

    plt.xlabel("Batch ID")
    plt.ylabel("Time (s)")
    plt.title(f"Data Load and GPU Transfer Times with {num_workers} workers")
    plt.legend()
    plt.show()

    # Drop this run's objects and release cached GPU memory before the next run.
    del data_loader, model, optimizer
    torch.cuda.empty_cache()
    # Short pause between runs (presumably to let things settle; reason not stated).
    time.sleep(5)






## Concluding remarks
- With more workers, we were able to slash down the data load time
- Interestingly, the bottleneck is the CPU-to-GPU transfer time. I can't use larger batch sizes because my 8 GB of memory is already full
    - 256 * 224 * 224 * 3 * 2 = 77,070,336 (≈ 77 MB per batch, if the final factor of 2 is bytes per value)