# Writing Custom Datasets and DataLoaders 

*This tutorial is based on [Writing Custom Datasets, DataLoaders and Transforms](https://pytorch.org/tutorials/beginner/data_loading_tutorial.html#writing-custom-datasets-dataloaders-and-transforms)*

A lot of effort in solving any machine learning problem goes into preparing the data. PyTorch provides many tools to make data loading easy and, hopefully, to make your code more readable. In this tutorial, we will see how to load and preprocess/augment data from a non-trivial dataset.

## [torch.utils.data.Dataset](https://pytorch.org/docs/stable/data.html#torch.utils.data.Dataset)

1.   `torch.utils.data.Dataset` is an abstract class representing a dataset. 
2.   Your custom dataset should inherit `Dataset` and override the following methods:
-   `__len__` so that `len(dataset)` returns the size of the dataset.
-   `__getitem__` to support indexing, such that `dataset[i]` returns the i-th sample.

All datasets are subclasses of `torch.utils.data.Dataset`, i.e., they have the `__getitem__` and `__len__` methods implemented.

### Custom dataset from a `.csv` file

In [1]:
import pandas as pd


# Locations of the train/test CSV splits of the California housing data.
# NOTE(review): these look like Google Colab sample-data paths — adjust them
# when running elsewhere.
TRAIN_DATA_PATH = "/content/sample_data/california_housing_train.csv"
TEST_DATA_PATH = "/content/sample_data/california_housing_test.csv"

# Load each split into its own DataFrame.
df_train = pd.read_csv(TRAIN_DATA_PATH)
df_test = pd.read_csv(TEST_DATA_PATH)

# Preview the first rows of the training split.
df_train.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


In [3]:
df_test.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-122.05,37.37,27.0,3885.0,661.0,1537.0,606.0,6.6085,344700.0
1,-118.3,34.26,43.0,1510.0,310.0,809.0,277.0,3.599,176500.0
2,-117.81,33.78,27.0,3589.0,507.0,1484.0,495.0,5.7934,270500.0
3,-118.36,33.82,28.0,67.0,15.0,49.0,11.0,6.1359,330000.0
4,-119.67,36.33,19.0,1241.0,244.0,850.0,237.0,2.9375,81700.0


The dataset is represented as a table with $9$ columns. The first $8$ columns serve as inputs, while the last column (i.e., `median_house_value`) serves as the target.

We write a dataset `CaliforniaHousingDataset` using this data. 

In [6]:
import torch
from torch.utils.data import Dataset
from typing import Any, Callable, Dict, IO, List, Optional, Tuple, Union
import numpy 


class CalifroniaHousingDataset(Dataset):
    """California housing data exposed as a PyTorch ``Dataset``.

    The CSV file of the requested split is read once at construction time.
    The columns listed in ``features_names`` are stored in ``data`` and the
    column listed in ``target_name`` is stored in ``targets``; indexing the
    dataset returns the matching ``(features, target)`` pair of tensors.
    """

    # CSV file backing each split.
    training_file = TRAIN_DATA_PATH
    test_file = TEST_DATA_PATH

    # Input columns, in the order they appear in each feature tensor.
    features_names = [
        "longitude",
        "latitude",
        "housing_median_age",
        "total_rooms",
        "total_bedrooms",
        "population",
        "households",
        "median_income",
    ]

    # Kept as a one-element list so that selecting it from the DataFrame
    # yields a 2-D block, i.e. each target is a tensor with one entry.
    target_name = ["median_house_value"]

    def __init__(
        self,
        train: bool = True,
    ) -> None:
        """Load the training split if ``train`` is true, else the test split."""
        self.train = train

        csv_path = self.training_file if self.train else self.test_file
        frame = pd.read_csv(csv_path)

        feature_values = numpy.array(frame[self.features_names])
        target_values = numpy.array(frame[self.target_name])

        self.data = torch.tensor(feature_values)
        self.targets = torch.tensor(target_values)

    def __getitem__(self, index) -> Tuple[torch.Tensor, torch.Tensor]:
        """Return the ``(features, target)`` tensors stored at ``index``."""
        features = self.data[index]
        target = self.targets[index]
        return features, target

    def __len__(self) -> int:
        """Return the number of samples (rows of ``data``)."""
        n_samples = self.data.shape[0]
        return n_samples


In [7]:
# Build the training split and inspect its first (features, target) sample.
califronia_housing_train = CalifroniaHousingDataset(train=True)

califronia_housing_train[0]

(tensor([-1.1431e+02,  3.4190e+01,  1.5000e+01,  5.6120e+03,  1.2830e+03,
          1.0150e+03,  4.7200e+02,  1.4936e+00], dtype=torch.float64),
 tensor([66900.], dtype=torch.float64))

In [9]:
len(califronia_housing_train)

17000

In [8]:
# Build the test split and inspect its first (features, target) sample.
califronia_housing_test = CalifroniaHousingDataset(train=False)

califronia_housing_test[0]

(tensor([-122.0500,   37.3700,   27.0000, 3885.0000,  661.0000, 1537.0000,
          606.0000,    6.6085], dtype=torch.float64),
 tensor([344700.], dtype=torch.float64))

In [10]:
len(califronia_housing_test)

3000

## Subset of MNIST

We build a subset of the MNIST dataset. We use the same style as in [`torchvision.datasets.mnist`](https://pytorch.org/vision/stable/_modules/torchvision/datasets/mnist.html#EMNIST) 

In [11]:
import os 
from torchvision.datasets import MNIST
from torchvision.transforms import ToTensor, Normalize, Compose
from PIL import Image


def get_mnist():
    """
    Get the full MNIST dataset (train and test splits concatenated).

    The data is downloaded to ``./data/`` if it is not already there.
    Training samples come first, followed by the test samples, and
    ``targets[i]`` is the label of ``data[i]``.

    :return:
        data: raw images of both splits stacked along the first dimension
            (expected shape ``(70_000, 28, 28)`` for standard MNIST),
        targets: the corresponding labels (expected shape ``(70_000,)``)
    """

    # NOTE: the `transform` is only applied when indexing the MNIST objects;
    # the raw `.data` tensors read below are returned untransformed.
    training_data =\
        MNIST(
            root="./data/",
            train=True,
            download=True,
            transform=ToTensor()
        )

    test_data =\
        MNIST(
            root="./data/",
            train=False,
            download=True,
            transform=ToTensor()
        )

    data =\
        torch.cat([
            training_data.data,
            test_data.data
        ])

    # Labels must be concatenated in the same order as the images
    # (train first, then test) so that they stay aligned with `data`.
    targets =\
        torch.cat([
            training_data.targets,
            test_data.targets
        ])

    return data, targets


class SubMNIST(Dataset):
    """
    Subset of the MNIST dataset selected by a list of sample indices.

    Attributes
    ----------
    indices: iterable of integers
        positions (in the full data/targets tensors) of the samples kept
    transform
        callable applied to each PIL image in `__getitem__`
    data
        images of the selected samples
    targets
        labels of the selected samples

    Methods
    -------
    __init__
    __len__
    __getitem__

    """

    def __init__(self, indices, mnist_data=None, mnist_targets=None, transform=None):
        """
        :param indices: List[int], positions of the samples to keep
        :param mnist_data: MNIST dataset inputs; loaded with `get_mnist`
            when either `mnist_data` or `mnist_targets` is None
        :param mnist_targets: MNIST dataset labels; loaded with `get_mnist`
            when either `mnist_data` or `mnist_targets` is None
        :param transform: callable applied to each PIL image; defaults to
            `ToTensor` followed by `Normalize((0.1307,), (0.3081,))`
        """

        self.indices = indices

        # Always set `self.transform`: a caller-supplied transform must be
        # kept, otherwise `__getitem__` fails on the missing attribute.
        if transform is None:
            self.transform =\
                Compose([
                    ToTensor(),
                    Normalize((0.1307,), (0.3081,))
                ])
        else:
            self.transform = transform

        if mnist_data is None or mnist_targets is None:
            self.data, self.targets = get_mnist()
        else:
            self.data, self.targets = mnist_data, mnist_targets

        # Keep only the requested samples.
        self.data = self.data[self.indices]
        self.targets = self.targets[self.indices]

    def __len__(self):
        """Return the number of samples in the subset."""
        return self.data.size(0)

    def __getitem__(self, index):
        """
        Return the `(img, target)` pair at position `index` of the subset.

        The raw image is converted to a grayscale PIL image before the
        transform is applied; `target` is returned as a Python int.
        """
        img, target = self.data[index], int(self.targets[index])

        img = Image.fromarray(img.numpy(), mode='L')

        if self.transform is not None:
            img = self.transform(img)

        return img, target


**Exercise:** Randomly partition the MNIST dataset into equally sized chunks 

In [17]:
from random import shuffle
def iid_divide(l, g):
    """
    https://github.com/TalwalkarLab/leaf/blob/master/data/utils/sample.py
    Split list `l` into `g` consecutive groups of (almost) equal size.

    Every group holds either `int(len(l)/g)` or `int(len(l)/g)+1` elements;
    the smaller groups come first, the larger ones last.
    Returns the list of groups.
    """
    total = len(l)
    small_size = int(total / g)

    # The elements left over after filling `g` small groups each enlarge
    # one group by a single element.
    n_big = total - g * small_size
    n_small = g - n_big

    groups = [
        l[small_size * k: small_size * (k + 1)]
        for k in range(n_small)
    ]

    # The big groups start right after the last small group.
    offset = small_size * n_small
    big_size = small_size + 1
    groups.extend(
        l[offset + big_size * k: offset + big_size * (k + 1)]
        for k in range(n_big)
    )

    return groups


def partition_mnist(n_chunks=10) -> List[SubMNIST]:
    """
    Randomly partition MNIST samples into `n_chunks` (almost) equally
    sized datasets.

    Only the first `TOTAL_N_SAMPLES` samples returned by `get_mnist` are
    partitioned (the training split comes first in that tensor).

    :param n_chunks: number of datasets to create
    :return: list of `n_chunks` `SubMNIST` datasets with disjoint samples
    """
    TOTAL_N_SAMPLES = 60_000

    all_indices = list(range(TOTAL_N_SAMPLES))

    # `shuffle` permutes the list in place and returns None, so its result
    # must not be assigned.
    shuffle(all_indices)

    indices_list = iid_divide(all_indices, n_chunks)

    # Load MNIST once and share the tensors between all chunks instead of
    # letting every `SubMNIST` reload the whole dataset.
    mnist_data, mnist_targets = get_mnist()

    partitions = [
        SubMNIST(
            chunk_indices,
            mnist_data=mnist_data,
            mnist_targets=mnist_targets
        )
        for chunk_indices in indices_list
    ]

    return partitions


In [18]:
from torch.utils.data import DataLoader

# Split MNIST into 10 random chunks and wrap each chunk in its own
# DataLoader producing shuffled mini-batches of 64 samples.
mnist_partition = partition_mnist(n_chunks=10)
mnist_loaders =\
    [
        DataLoader(partition, batch_size=64, shuffle=True) 
        for partition in mnist_partition
    ]


In [16]:
next(iter(mnist_loaders[0]))

[tensor([[[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           ...,
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],
 
 
         [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           ...,
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
           [-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242]]],
 
 
         [[[-0.4242, -0.4242, -0.4242,  ..., -0.4242, -0.4242, -0.4242],
       