# PyTorch Dataset
Haohang Li  
09/23/2021

### Dependencies

In [None]:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, random_split, Dataset

## Generate Fake Data

For simplicity, we will use random numbers to represent our training features and labels.

In [None]:
# Synthetic stand-in for a real dataset: 1000 observations with 12 features each,
# and one integer class label per observation drawn from {0, 1, 2, 3, 4}.
rng = np.random.default_rng()
fake_features = rng.normal(size=(1000, 12))  # standard-normal features, shape (1000, 12)
fake_label = rng.integers(0, 5, size=1000)  # upper bound 5 is exclusive -> classes 0..4

In [None]:
# Number of labels once converted to an int64 tensor (length of the first dimension).
torch.LongTensor(fake_label).shape[0]

1000

## Define Dataset Class
A custom dataset class should inherit from PyTorch's `Dataset` class and define the following methods:


1.   `__init__`: Initialize your parent class and preprocess
2.   `__getitem__`: Define how to retrieve your data by index, usually we return both data and corresponding label
3.   `__len__`: Define how to get the total number of observations (data points/rows) in your dataset




In [None]:
class MyDataset(Dataset):
    """In-memory dataset that pairs each feature row with its class label.

    Both inputs are converted to tensors once, in the constructor, so indexing
    afterwards is a plain tensor lookup.

    Args:
        features: Array-like of per-sample features; converted with
            ``torch.Tensor`` (floating point).
        labels: Array-like of integer class labels; converted with
            ``torch.LongTensor`` (int64).

    Raises:
        ValueError: If ``features`` and ``labels`` do not contain the same
            number of samples along their first dimension.
    """

    def __init__(self, features, labels):
        super().__init__()
        self.features = torch.Tensor(features)
        self.labels = torch.LongTensor(labels)  # labels should be (long) int (int64)

        # __len__ is derived from the labels alone, so a silent mismatch would
        # only surface later as an index error (or never). Fail early instead.
        n_features = self.features.size(0)
        n_labels = self.labels.size(0)
        if n_features != n_labels:
            raise ValueError(
                f'features and labels must have the same number of samples, '
                f'got {n_features} and {n_labels}'
            )

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        return self.features[index], self.labels[index]

    def __len__(self):
        """Return the number of samples (size of the first label dimension)."""
        return self.labels.size(0)

Create a dataset object:

In [None]:
# Wrap the synthetic arrays in the dataset; arguments are (features, labels).
dataset_example = MyDataset(fake_features, fake_label)

In [None]:
# Indexing the dataset calls __getitem__, which yields a (features, label) pair.
a_feature, a_label = dataset_example[0]
print(
    f'A feature:\n{a_feature}',
    f'A label:\n{a_label}',
    sep='\n',
)

A feature:
tensor([-0.3835, -1.2055, -2.6225,  0.5530,  0.7653,  0.1032, -0.9299,  0.7310,
        -1.2663,  0.0893,  0.9387,  0.7605])
A label:
1


Sometimes your dataset might be too large to fit in memory, and you may need to load and process it "on the fly":

In [None]:
class MyDataset_onthefly(Dataset):
    """Dataset that converts each sample to tensors only when it is requested.

    The constructor keeps references to the raw ``features`` and ``labels`` and
    does no tensor conversion up front; the work happens per item in
    ``__getitem__``. This mirrors the common situation (audio, images, ...)
    where the whole dataset is too large to load and preprocess at once; a real
    implementation would typically read the sample from disk in ``__getitem__``.

    Args:
        features: Indexable collection of per-sample features.
        labels: Indexable collection of integer class labels.

    Raises:
        ValueError: If ``features`` and ``labels`` have different lengths.
    """

    def __init__(self, features, labels):
        super().__init__()
        if len(features) != len(labels):
            raise ValueError(
                f'features and labels must have the same number of samples, '
                f'got {len(features)} and {len(labels)}'
            )
        # Keep the raw data as-is; nothing is converted until it is needed.
        self.features = features
        self.labels = labels
        self.size = len(labels)

    def __getitem__(self, index):
        """Load and convert the sample at ``index``; return ``(features, label)``."""
        # Instead of putting everything in memory as tensors, process one
        # sample at the moment it is actually needed.
        cur_features = torch.tensor(self.features[index], dtype=torch.float32)
        # torch.tensor builds the label from its *value* as an int64 scalar,
        # matching the label dtype used by MyDataset.
        cur_label = torch.tensor(self.labels[index], dtype=torch.long)
        return cur_features, cur_label

    def __len__(self):
        """Return the number of samples recorded at construction time."""
        return self.size

## Split Dataset
To split our dataset into training, validation, and test sets, we can use the `random_split` function.

In [None]:
# Turn the 60% / 20% / 20% fractions into absolute sample counts.
# The builtin `int` replaces the `np.int` alias, which was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
split_size = (np.array([0.6, 0.2, 0.2]) * len(dataset_example)).astype(int)
# astype(int) truncates, so the counts can fall short of the dataset size.
# Give any remainder to the last (test) split so the lengths always sum to
# len(dataset_example), which random_split requires.
split_size[-1] = len(dataset_example) - split_size[:-1].sum()
# Pass plain Python ints rather than a NumPy array.
train_data, valid_data, test_data = random_split(dataset_example, lengths=split_size.tolist())

print(f'Train dataset length: {len(train_data)}')
print(f'Validation dataset length: {len(valid_data)}')
print(f'Test dataset length: {len(test_data)}')

Train dataset length: 600
Validation dataset length: 200
Test dataset length: 200


## Data Loader
Wrap each split in a `DataLoader` so that we can iterate over mini-batches in our training function:

In [None]:
# Build one loader per split; all three use mini-batches of 32 and reshuffle.
train_loader, valid_loader, test_loader = (
    DataLoader(subset, batch_size=32, shuffle=True)
    for subset in (train_data, valid_data, test_data)
)

# Pull a single batch from the training loader to inspect what it yields.
one_batch_features, one_batch_labels = next(iter(train_loader))
print(one_batch_features.shape)  # (batch, num_features)
print(one_batch_labels.shape)  # (batch,) -- one label per sample

torch.Size([32, 12])
torch.Size([32])
