## Data Loader

    1. This is just the abstraction you need to feed in a big dataset in batches, almost like an iterator.
    2. You can override the base class magic functions of __len__ and __getitem__ for your custom dataset.

    A couple technical terms worth knowing:
        1. Pass -> One forward pass of batch + One backward pass of batch
        2. Batch Size -> The number of training examples in one forward/backward pass.
        3. Iterations -> The number of passes required to process all data points in the dataset (say, training data).
        4. Epoch -> One complete sweep over all data points. One or more iterations make up an epoch.
        
        [So a dataset with 1000 examples and a batch size of 500, would need 2 iterations to complete an epoch]
        
    One could hand-code the random shuffling and slicing of the data, but that quickly becomes cumbersome
    and takes time away from the more important stuff. Data Loader comes to our rescue!

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

import numpy as np
from torch.utils.data import Dataset, DataLoader
# Let printed tensors use up to 100 characters per line.
torch.set_printoptions(linewidth=100)

## Creating a Dataset class for the diabetes classification dataset.

    This dataset contains 8 features and the target column contains 0 
    or 1 indicating whether the ith person has diabetes or not.

In [2]:
# we must inherit from the Dataset class of pytorch
class DiabetesDataset(Dataset):
    """Map-style dataset over the diabetes CSV file.

    Every column except the last is served as the feature vector of a sample;
    the last column is served as its label.
    """

    def __init__(self, path='data/diabetes.csv.gz'):
        """Load the whole file into memory as float32 tensors.

        Args:
            path: comma-separated numeric file readable by ``np.loadtxt``.
                Defaults to the location this notebook has always used.
        """
        # loadtxt assumes that each row has the same number of values.
        # (if you'd like to handle missing values, use genfromtxt function instead)
        # ndmin=2 keeps a single-row file two-dimensional, so shape[0] is still the
        # number of samples and the column slicing below still works.
        data = np.loadtxt(path, delimiter=',', dtype=np.float32, ndmin=2)
        self.len = data.shape[0]
        self.x_data = torch.from_numpy(data[:, 0:-1])
        self.y_data = torch.from_numpy(data[:, -1])

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``.

        Raises:
            IndexError: if ``index`` is outside ``[0, len(self))``.
        """
        # Raise IndexError rather than assert: asserts disappear under ``python -O``,
        # and plain iteration over an object that only defines __getitem__
        # (``for sample in dataset``) relies on IndexError to detect the end.
        if not 0 <= index < self.len:
            raise IndexError("That doesn't look like a valid index dudette/dude")
        return self.x_data[index], self.y_data[index]

    def __len__(self):
        """Return the number of samples (rows) loaded from the file."""
        return self.len

## Creating an object from the DataLoader class

In [3]:
# Wrap a freshly loaded DiabetesDataset in a loader that hands out
# shuffled mini-batches of 32 samples (num_workers=2).
train_data_loader = DataLoader(
    DiabetesDataset(),
    batch_size=32,
    shuffle=True,
    num_workers=2,
)

# Using train_data_loader as a data supplier in the Diabetes Classification Task

## Building the class for the nn Classifier (Functional paradigm)

In [4]:
class DiabetesNet(nn.Module):
    """Small fully connected classifier: 8 -> 6 -> 4 -> 1, sigmoid on the output."""

    def __init__(self):
        super().__init__()
        # Layers are created in the order fc1, fc2, out.
        self.fc1 = nn.Linear(8, 6)
        self.fc2 = nn.Linear(6, 4)
        self.out = nn.Linear(4, 1)

    def forward(self, x):
        """Run ``x`` through both ReLU hidden layers and return sigmoid outputs.

        The last dimension of the result has size 1 and every value lies in (0, 1).
        """
        first_hidden = F.relu(self.fc1(x))
        second_hidden = F.relu(self.fc2(first_hidden))
        logits = self.out(second_hidden)
        return torch.sigmoid(logits)

## Training the network

In [5]:
# Set up the three training ingredients: binary cross-entropy averaged over
# the batch, the model itself, and an Adam optimizer (lr=0.1) over its parameters.
loss_criterion = nn.BCELoss(reduction='mean')
net = DiabetesNet()
optimizer = optim.Adam(params=net.parameters(), lr=0.1)

In [6]:
# training loop: two epochs over train_data_loader, one optimizer step per mini-batch
for epoch in range(2):
    # this is where the dataloader object helps out a lot:
    # every iteration hands us one ready-made mini-batch
    for i, data in enumerate(train_data_loader):
        # get input batch of data
        batch_train, batch_labels = data

        # [personal note] Older PyTorch tutorials wrapped every batch in
        # torch.autograd.Variable so that autograd could track it as a node of
        # the computational graph. Since PyTorch 0.4 Tensor and Variable are the
        # same thing: autograd works on plain tensors (x.grad holds the gradient
        # of x with respect to some scalar value), and Variable(t) just returns
        # a tensor. The wrapper is deprecated, so the batch is used as-is.

        # getting predictions from model
        batch_pred = net(batch_train)

        # Compute and print loss
        # squeeze(1) drops the size-1 dimension at index 1 of the predictions
        # before they are compared against batch_labels.
        loss = loss_criterion(batch_pred.squeeze(1), batch_labels)
        # .item() extracts the plain Python float for printing
        print("Epoch : {}\nIteration: {}\nLoss: {}".format(epoch + 1, i + 1, loss.item()))

        # Zero out gradients, do backward pass and further, update the weights.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

Epoch : 1
Iteration: 1
Loss: 0.6892212629318237
Epoch : 1
Iteration: 2
Loss: 0.664035975933075
Epoch : 1
Iteration: 3
Loss: 0.6139621138572693
Epoch : 1
Iteration: 4
Loss: 0.7150481939315796
Epoch : 1
Iteration: 5
Loss: 0.5498839616775513
Epoch : 1
Iteration: 6
Loss: 0.556950032711029
Epoch : 1
Iteration: 7
Loss: 0.4442341923713684
Epoch : 1
Iteration: 8
Loss: 0.4616973400115967
Epoch : 1
Iteration: 9
Loss: 0.3781052529811859
Epoch : 1
Iteration: 10
Loss: 0.6124705076217651
Epoch : 1
Iteration: 11
Loss: 0.461191326379776
Epoch : 1
Iteration: 12
Loss: 0.5773187279701233
Epoch : 1
Iteration: 13
Loss: 0.5445713400840759
Epoch : 1
Iteration: 14
Loss: 0.5822882652282715
Epoch : 1
Iteration: 15
Loss: 0.5311828255653381
Epoch : 1
Iteration: 16
Loss: 0.9326416850090027
Epoch : 1
Iteration: 17
Loss: 0.3803958296775818
Epoch : 1
Iteration: 18
Loss: 0.3509477972984314
Epoch : 1
Iteration: 19
Loss: 0.37157270312309265
Epoch : 1
Iteration: 20
Loss: 0.6579140424728394
Epoch : 1
Iteration: 21
Loss: 0