
### Demonstrate batch training in pytorch

Motivation: 

for epochs:
    x,y = data -> forward + backward and update weights;

However, computing gradients over the entire dataset in one step is expensive when the dataset is large.

Solution:

Divide data into smaller **batches**:

    for epoch in epochs:
        for batch in batches:
            x_batch, y_batch = train ...

Example: with samples = 100 and batch_size = 20, there are 100/20 = 5 iterations per epoch.


In [54]:
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader # for batch training
import numpy as np
import math

In [78]:
# Store dataset in a class so torch.utils.data.DataLoader can unpack it in batches
class MyDataset(Dataset):
    """In-memory dataset loaded from a comma-separated text file.

    The first row of the file is skipped as a header. In every remaining row,
    column 0 is used as the label and all other columns as the features.

    Args:
        path: File to load. Defaults to './my_data.csv'.
    """

    def __init__(self, path='./my_data.csv'):
        # ndmin=2 keeps the array two-dimensional even when the file holds a
        # single data row; without it the column slicing below raises.
        xy = np.loadtxt(path, delimiter=',', dtype=np.float32, skiprows=1, ndmin=2)
        self.x = torch.from_numpy(xy[:, 1:])   # features: (n_samples, n_columns - 1)
        self.y = torch.from_numpy(xy[:, [0]])  # labels, kept 2-D: (n_samples, 1)
        self.n_samples = xy.shape[0]

    def __getitem__(self, index):
        """Return the (features, label) pair stored at `index`."""
        return self.x[index], self.y[index]

    def __len__(self):
        """Return the number of data rows loaded from the file."""
        return self.n_samples

# Instantiate data class
data = MyDataset()

# Training setup. batch_size is defined once and passed to the DataLoader so
# the reported numbers cannot drift from what the loader actually uses.
num_epocs = 2
batch_size = 4
n_samples = len(data)
# Round up: the last batch of an epoch is smaller than batch_size when
# n_samples is not a multiple of it.
m_iterations = math.ceil(n_samples/batch_size)

# DataLoader takes the dataset and the batch size; shuffle=True reorders the
# samples every time the loader is iterated.
dataloader = DataLoader(dataset=data, batch_size=batch_size, shuffle=True)

# Pull a single batch by hand. Use the builtin next(): the iterator's
# Python 2 style .next() method is not available in current PyTorch releases.
data_iterator = iter(dataloader)
features, labels = next(data_iterator)

# Display the current batch of X-features and their y-labels
print("First batch:")
print(features)
print(labels)

# Display training setup
print("Total number of samples:", n_samples)
print("Batch size:", batch_size)
print("Iterations pr. training loop:", m_iterations)

First batch:
tensor([[1.3050e+01, 1.6500e+00, 2.5500e+00, 1.8000e+01, 9.8000e+01, 2.4500e+00,
         2.4300e+00, 2.9000e-01, 1.4400e+00, 4.2500e+00, 1.1200e+00, 2.5100e+00,
         1.1050e+03],
        [1.2690e+01, 1.5300e+00, 2.2600e+00, 2.0700e+01, 8.0000e+01, 1.3800e+00,
         1.4600e+00, 5.8000e-01, 1.6200e+00, 3.0500e+00, 9.6000e-01, 2.0600e+00,
         4.9500e+02],
        [1.3070e+01, 1.5000e+00, 2.1000e+00, 1.5500e+01, 9.8000e+01, 2.4000e+00,
         2.6400e+00, 2.8000e-01, 1.3700e+00, 3.7000e+00, 1.1800e+00, 2.6900e+00,
         1.0200e+03],
        [1.2370e+01, 1.1700e+00, 1.9200e+00, 1.9600e+01, 7.8000e+01, 2.1100e+00,
         2.0000e+00, 2.7000e-01, 1.0400e+00, 4.6800e+00, 1.1200e+00, 3.4800e+00,
         5.1000e+02]])
tensor([[1.],
        [2.],
        [1.],
        [2.]])
Total number of samples: 178
Batch size: 4
Iterations pr. training loop: 45


In [90]:
# Simulated training loop: nothing is trained, each step only reports progress
for epoch in range(num_epocs):
    # One full pass over the dataloader is one epoch
    for i, batch in enumerate(dataloader):
        # Each batch is a (inputs, labels) pair
        inputs, labels = batch
        print(f'Epoch: {epoch+1}/{num_epocs}, iteration: {i+1}/{m_iterations}, (batch) inputs: {inputs.shape}')
    print(f' Done with epoch {epoch+1}')

# Note: the first dimension of inputs.shape is the number of samples in that
# batch; the last batch of an epoch can hold fewer samples than the others.

Epoch: 1/2, iteration: 1/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 2/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 3/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 4/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 5/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 6/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 7/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 8/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 9/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 10/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 11/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 12/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 13/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 14/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iteration: 15/45, (batch) inputs: torch.Size([4, 13])
Epoch: 1/2, iterati