# Learning the Data Loader in PyTorch

In [18]:
# import modules
from sklearn.datasets import load_wine
import torch, torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np

## Terminology

* `one epoch` : one forward and backward pass over ALL training samples
* `batch size` : the number of samples passed through the network in a single batch
* `iterations` : the number of forward and backward passes, one per batch, needed to complete one epoch

**Example:** with 100 training samples and a batch size of 4, one epoch takes 100/4 = 25 iterations.

In [3]:
# Sketch of how a data loader is consumed inside a training loop.
# The pseudo-code is kept in a string literal, so none of it is executed.

"""
# Training loop
for e in range(number_of_epochs):
    for i in range(number_of_batches):
        # Execution
        x_batches, y_batches = ...
"""
# NOTE(review): presumably printed so that the string above is not the
# cell's last expression and therefore is not echoed as output.
print("")




## Examples

In [4]:
# Fetch the wine features and class labels as NumPy arrays
X, y = load_wine(return_X_y=True)

# Append the labels as the last column of the feature matrix
data = np.column_stack((X, y))
data

array([[  14.23,    1.71,    2.43, ...,    3.92, 1065.  ,    0.  ],
       [  13.2 ,    1.78,    2.14, ...,    3.4 , 1050.  ,    0.  ],
       [  13.16,    2.36,    2.67, ...,    3.17, 1185.  ,    0.  ],
       ...,
       [  13.27,    4.28,    2.26, ...,    1.56,  835.  ,    2.  ],
       [  13.17,    2.59,    2.37, ...,    1.62,  840.  ,    2.  ],
       [  14.13,    4.1 ,    2.74, ...,    1.6 ,  560.  ,    2.  ]])

In [5]:
# element type of the combined feature + label array
data.dtype

dtype('float64')

In [6]:
# (rows, columns) of the combined feature + label array
data.shape

(178, 14)

In [7]:
# create datasets class

class WineDataset(Dataset):
    """Map-style dataset exposing the scikit-learn wine data as float32 tensors.

    Attributes:
        X: feature tensor, one row per sample.
        y: label tensor reshaped to a single column.
        n_samples: number of samples loaded.
    """

    def __init__(self):
        super().__init__()

        # Pull the raw NumPy arrays from scikit-learn
        features, targets = load_wine(return_X_y=True)

        # Cast to float32 first, then wrap the arrays as tensors
        features_f32 = features.astype(np.float32)
        targets_f32 = targets.reshape(-1, 1).astype(np.float32)
        self.X = torch.from_numpy(features_f32)
        self.y = torch.from_numpy(targets_f32)
        self.n_samples = len(features)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        features = self.X[index]
        label = self.y[index]
        return features, label

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.n_samples

In [8]:
# Instantiate the wine dataset
data = WineDataset()

# Wrap it in a DataLoader that serves shuffled mini-batches of 4 samples
dataloader = DataLoader(dataset=data, batch_size=4, shuffle=True)

In [9]:
# inspect the first sample (index 0): a (features, label) tuple
data[0]

(tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
         3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
         1.0650e+03]),
 tensor([0.]))

In [15]:
# A DataLoader cannot be indexed with [].
# Catch the resulting TypeError and report the interpreter's own message
# instead of a hard-coded copy of it.
try:
    dataloader[0]
except TypeError as err:
    print(f"This Execution will produce an Error of: {err}")

This Execution will produce an Error of: 'DataLoader' object is not subscriptable


In [16]:
# A DataLoader cannot be indexed directly.
# Instead, the built-in iter() function turns it into an iterator
# that hands out one batch at a time.
dataiter = iter(dataloader)

In [17]:
# Show only the first three batches produced by the iterator
itersample = 0
for itersample, (x, y) in enumerate(dataiter, start=1):
    print("X = ", x)
    print("y = ", y)
    print("\n")

    # stop once three batches have been displayed
    if itersample == 3:
        break

X =  tensor([[1.4120e+01, 1.4800e+00, 2.3200e+00, 1.6800e+01, 9.5000e+01, 2.2000e+00,
         2.4300e+00, 2.6000e-01, 1.5700e+00, 5.0000e+00, 1.1700e+00, 2.8200e+00,
         1.2800e+03],
        [1.1840e+01, 2.8900e+00, 2.2300e+00, 1.8000e+01, 1.1200e+02, 1.7200e+00,
         1.3200e+00, 4.3000e-01, 9.5000e-01, 2.6500e+00, 9.6000e-01, 2.5200e+00,
         5.0000e+02],
        [1.2880e+01, 2.9900e+00, 2.4000e+00, 2.0000e+01, 1.0400e+02, 1.3000e+00,
         1.2200e+00, 2.4000e-01, 8.3000e-01, 5.4000e+00, 7.4000e-01, 1.4200e+00,
         5.3000e+02],
        [1.2370e+01, 1.1700e+00, 1.9200e+00, 1.9600e+01, 7.8000e+01, 2.1100e+00,
         2.0000e+00, 2.7000e-01, 1.0400e+00, 4.6800e+00, 1.1200e+00, 3.4800e+00,
         5.1000e+02]])
y =  tensor([[0.],
        [1.],
        [2.],
        [1.]])


X =  tensor([[1.3160e+01, 3.5700e+00, 2.1500e+00, 2.1000e+01, 1.0200e+02, 1.5000e+00,
         5.5000e-01, 4.3000e-01, 1.3000e+00, 4.0000e+00, 6.0000e-01, 1.6800e+00,
         8.3000e+02],
     

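**Conclusion:**

Iterating over the data loader is different from indexing the dataset: each item yielded by the iterator is a tuple/list consisting of a batch of input tensors and the matching batch of labels (here, 4 samples per batch).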

In [13]:
# Skeleton of a training loop: visit every batch once per epoch.
n_epoch = 2

# len(dataloader) is the number of batches per epoch. It is an int and follows
# the loader's own batch size, unlike np.ceil(len(data)/4), which returned a
# float (printed as "45.0") and repeated the batch size by hand.
n_iters = len(dataloader)

# print the result every n step
print_step = 5
for epoch in range(n_epoch):
    for idx, (input_tensors, labels) in enumerate(dataloader):
        # the forward pass, backward pass and parameter update would go here

        # report progress every `print_step` batches
        if (idx + 1) % print_step == 0:
            print(f"epoch: {epoch+1}/{n_epoch} - step: {idx+1}/{n_iters} - input_shape: {input_tensors.shape} - label_shape: {labels.shape}")

epoch: 1/2 - step: 5/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 10/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 15/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 20/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 25/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 30/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 35/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 40/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 1/2 - step: 45/45.0 - input_shape: torch.Size([2, 13]) - label_shape: torch.Size([2, 1])
epoch: 2/2 - step: 5/45.0 - input_shape: torch.Size([4, 13]) - label_shape: torch.Size([4, 1])
epoch: 2/2 - step: 10/45.0 - input_shape: 

# Transform

We can combine transforms with our dataset class. The transforms below are custom callables, chained together with `Compose` from torchvision.

In [58]:
# modified WineDataset Class

class WineDataset(Dataset):
    """Wine dataset that optionally applies a transform to every sample.

    Args:
        transform: optional callable that receives the ``(X, y)`` sample tuple
            and returns the transformed sample. With ``None`` the raw sample
            is returned unchanged.
    """

    def __init__(self, transform = None):
        super().__init__()

        # keep the raw arrays as loaded; any conversion is left to the transform
        self.X, self.y = load_wine(return_X_y=True)
        # Measure the array this instance just loaded. The previous len(X)
        # read a notebook-level variable defined in another cell, so it only
        # worked (and only gave the right length) if that cell had been run.
        self.n_samples = len(self.X)
        self.transform = transform

    def __getitem__(self, index):
        """Return the sample at ``index``, transformed when a transform is set."""
        sample = self.X[index], self.y[index]

        # compare against None so a callable that happens to be falsy still runs
        if self.transform is not None:
            sample = self.transform(sample)

        return sample

    def __len__(self):
        """Return the number of samples in the dataset."""
        return self.n_samples


# create custom torch transformer
class ToTensor:
    """Callable transform turning a NumPy ``(X, y)`` sample into float32 tensors."""

    def __call__(self, sample):
        """Convert both elements of ``sample`` and return them as a tuple.

        The label is reshaped to a single column before it is converted.
        """
        features, label = sample
        features_f32 = features.astype(np.float32)
        label_f32 = label.reshape(-1, 1).astype(np.float32)
        return torch.from_numpy(features_f32), torch.from_numpy(label_f32)


# create another torch transformer
class MulTransform:
    """Callable transform that scales the features of a sample by a constant.

    Args:
        factor: multiplier applied to the feature part of each sample.
    """

    def __init__(self, factor):
        self.factor = factor

    def __call__(self, sample):
        """Return ``(X * factor, y)`` without modifying the incoming ``X``.

        Args:
            sample: ``(X, y)`` pair; only ``X`` is scaled.

        Returns:
            A tuple holding the scaled features and the untouched label.
        """
        X, y = sample
        # Multiply out of place: `X *= factor` writes into the caller's buffer.
        # A row taken from a NumPy array is a view, so the in-place form would
        # rescale the stored data again on every access.
        scaled = X * self.factor
        return scaled, y

In [59]:
# build the transform-aware dataset; ToTensor() is applied to each sample on access
data = WineDataset(transform = ToTensor())

In [60]:
# Fetch the first sample once, then describe it
first_sample = data[0]
print(f"return sample type : {type(first_sample)}\n")
print(f"dataset overview: \n{first_sample}\n")
print(f"dataset type: {type(first_sample[0])}")

return sample type : <class 'tuple'>

dataset overview: 
(tensor([1.4230e+01, 1.7100e+00, 2.4300e+00, 1.5600e+01, 1.2700e+02, 2.8000e+00,
        3.0600e+00, 2.8000e-01, 2.2900e+00, 5.6400e+00, 1.0400e+00, 3.9200e+00,
        1.0650e+03]), tensor([[0.]]))

dataset type: <class 'torch.Tensor'>


In [61]:
# using composed transform
from torchvision.transforms import Compose

In [66]:
# Chain the transforms in order: tensor conversion first, then scale the features by 10
composed = Compose([
    ToTensor(),
    MulTransform(10),
])

# Rebuild the dataset so every sample passes through the composed transform
data = WineDataset(transform=composed)

In [67]:
# Fetch the first sample once, then describe it
first_sample = data[0]
print(f"return sample type : {type(first_sample)}\n")
print(f"dataset overview: \n{first_sample}\n")
print(f"dataset type: {type(first_sample[0])}")

return sample type : <class 'tuple'>

dataset overview: 
(tensor([1.4230e+02, 1.7100e+01, 2.4300e+01, 1.5600e+02, 1.2700e+03, 2.8000e+01,
        3.0600e+01, 2.8000e+00, 2.2900e+01, 5.6400e+01, 1.0400e+01, 3.9200e+01,
        1.0650e+04]), tensor([[0.]]))

dataset type: <class 'torch.Tensor'>
