<a href="https://colab.research.google.com/github/aaronjoel/DeepUnderstandingOfDeepLearning/blob/main/DUDL_Data_DatasetLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# import libraries
import numpy as np
import torch
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Create some data in numpy
nObservations = 100
nFeatures= 20

# Draw from a seeded generator so the same matrix is produced on every
# "Restart & Run All". The generator is a local object, so NumPy's global
# random state is left untouched.
rng = np.random.default_rng(seed=42)
data = rng.standard_normal((nObservations, nFeatures))

In [3]:
# (rows, columns) = (nObservations, nFeatures)
data.shape

(100, 20)

In [4]:
# indexing the first axis selects one observation: a 1-D array with nFeatures entries
data[0].shape

(20,)

In [5]:
# feature values of the first observation
data[0]

array([ 1.64033716e-03,  8.53876275e-01,  8.77563430e-01, -1.11264898e+00,
       -6.13146261e-01, -5.40733959e-02,  2.14285854e-01, -9.99703703e-01,
       -5.24580817e-01, -1.02277193e+00, -1.08845805e+00,  1.32127034e+00,
        8.53866178e-01, -1.75535166e+00, -1.31413407e+00,  6.43028731e-01,
        2.84325366e+00, -1.11058455e+00, -1.19829027e+00, -1.06776911e+00])

In [6]:
# Convert to pytorch tensor
# (the tensor has the same shape as the NumPy array)
dataT = torch.tensor(data)
dataT.shape

torch.Size([100, 20])

In [7]:
# first observation again, now as a tensor row; note the dtype shown in the repr
dataT[0]

tensor([ 1.6403e-03,  8.5388e-01,  8.7756e-01, -1.1126e+00, -6.1315e-01,
        -5.4073e-02,  2.1429e-01, -9.9970e-01, -5.2458e-01, -1.0228e+00,
        -1.0885e+00,  1.3213e+00,  8.5387e-01, -1.7554e+00, -1.3141e+00,
         6.4303e-01,  2.8433e+00, -1.1106e+00, -1.1983e+00, -1.0678e+00],
       dtype=torch.float64)

In [8]:
# print out some information
# (header, Python type, shape and dtype of the NumPy array, one item per line)
print('NumPy data:', type(data), data.shape, data.dtype, sep='\n')

NumPy data:
<class 'numpy.ndarray'>
(100, 20)
float64


In [9]:
# Same summary for the tensor, one item per line
# (torch -> .size() reports the shape)
print('Tensor data:', type(dataT), dataT.size(), dataT.dtype, sep='\n')

Tensor data:
<class 'torch.Tensor'>
torch.Size([100, 20])
torch.float64


In [10]:
# Sometimes you need to convert data types
# torch.tensor keeps the NumPy dtype (float64 here), so .float() performs a
# real float64 -> float32 cast. This mirrors the .long() conversion of dataT3.
dataT2 = torch.tensor( data ).float()
print(dataT2.dtype)

torch.float32


In [11]:
# Cast to 64-bit integers (torch.long is the same dtype as torch.int64)
dataT3 = torch.tensor( data ).to(torch.long)
print(dataT3.dtype)

torch.int64


In [12]:
# Convert tensor into PyTorch datasets
# Passing the NumPy array instead of a torch tensor fails. The error is the
# point of this cell, so catch and display it rather than letting it stop
# "Run All".
# NOTE(review): presumably TensorDataset calls .size(...) on its inputs, while
# ndarray.size is a plain int, hence "'int' object is not callable".
try:
  dataset = TensorDataset(data)
except TypeError as err:
  print(f'TypeError: {err}')

TypeError: 'int' object is not callable

In [13]:
# build the dataset from the torch tensor rather than the NumPy array
dataset = TensorDataset(dataT)

In [14]:
# list the dataset's attributes; `tensors` is the only non-underscore entry
dir(dataset)

['__add__',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_is_protocol',
 'tensors']

In [15]:
# the tensors stored in the dataset (displayed as a tuple)
dataset.tensors

(tensor([[ 1.6403e-03,  8.5388e-01,  8.7756e-01,  ..., -1.1106e+00,
          -1.1983e+00, -1.0678e+00],
         [ 5.3528e-01,  1.2617e+00, -2.2205e+00,  ..., -1.5594e+00,
          -1.8863e+00, -3.6895e-01],
         [ 4.5088e-01,  7.8107e-01, -3.1973e-01,  ..., -3.7594e-01,
           1.4896e-01, -1.1421e-01],
         ...,
         [-3.4906e-01, -1.7391e-01, -4.9024e-01,  ...,  1.0912e+00,
           8.4654e-01,  7.9275e-01],
         [-2.3434e-01,  2.6258e-01, -1.4495e+00,  ..., -2.1065e-01,
           2.0551e+00,  9.5737e-01],
         [ 9.2015e-01, -2.7483e-01,  1.5946e+00,  ...,  3.3417e-01,
          -8.7692e-01,  8.0415e-01]], dtype=torch.float64),)

In [16]:
# confirm the container type of the stored tensors
type(dataset.tensors)

tuple

In [17]:
# first element of the tuple: the data tensor passed to TensorDataset
dataset.tensors[0]

tensor([[ 1.6403e-03,  8.5388e-01,  8.7756e-01,  ..., -1.1106e+00,
         -1.1983e+00, -1.0678e+00],
        [ 5.3528e-01,  1.2617e+00, -2.2205e+00,  ..., -1.5594e+00,
         -1.8863e+00, -3.6895e-01],
        [ 4.5088e-01,  7.8107e-01, -3.1973e-01,  ..., -3.7594e-01,
          1.4896e-01, -1.1421e-01],
        ...,
        [-3.4906e-01, -1.7391e-01, -4.9024e-01,  ...,  1.0912e+00,
          8.4654e-01,  7.9275e-01],
        [-2.3434e-01,  2.6258e-01, -1.4495e+00,  ..., -2.1065e-01,
          2.0551e+00,  9.5737e-01],
        [ 9.2015e-01, -2.7483e-01,  1.5946e+00,  ...,  3.3417e-01,
         -8.7692e-01,  8.0415e-01]], dtype=torch.float64)

In [19]:
# shape of the stored data tensor
dataset.tensors[0].shape

torch.Size([100, 20])

In [20]:
# Let's try again with labels
# nObservations evenly spaced values from .01 to 4, each rounded up
labels = torch.linspace(.01, 4, nObservations).ceil()
labels.shape

torch.Size([100])

In [21]:
# transform to an actual matrix (column vector)
# one row per label, a single column
labels = labels.reshape(labels.size(0), 1)
labels.shape
labels.shape

torch.Size([100, 1])

In [23]:
# now another dataset
# this time holding two tensors: the data and the matching labels
dataset = TensorDataset(dataT, labels)
dataset

<torch.utils.data.dataset.TensorDataset at 0x7c6b616d4410>

In [26]:
# container type of the stored tensors in the data + labels dataset
type(dataset.tensors)

tuple

In [27]:
# element 0 is the data tensor (first argument given to TensorDataset)
dataset.tensors[0].shape

torch.Size([100, 20])

In [28]:
# element 1 is the labels tensor (second argument given to TensorDataset)
dataset.tensors[1].shape

torch.Size([100, 1])

In [29]:
# same information via the .size() method instead of the .shape attribute
dataset.tensors[0].size()

torch.Size([100, 20])

In [30]:
# size of the labels tensor via the .size() method
dataset.tensors[1].size()

torch.Size([100, 1])

In [31]:
# shape of a 1-D array holding nObservations random integers
np.random.randint(5, size=nObservations).shape

(100,)

In [32]:
# create a dataloader object
batchsize = 25
# wrap the dataset so it can be iterated in batches of `batchsize` samples
dataloader = DataLoader(dataset, batch_size=batchsize)
# list everything the loader exposes
dir(dataloader)

['_DataLoader__initialized',
 '_DataLoader__multiprocessing_context',
 '_IterableDataset_len_called',
 '__annotations__',
 '__class__',
 '__class_getitem__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__orig_bases__',
 '__parameters__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_auto_collation',
 '_dataset_kind',
 '_get_iterator',
 '_index_sampler',
 '_is_protocol',
 '_iterator',
 'batch_sampler',
 'batch_size',
 'check_worker_number_rationality',
 'collate_fn',
 'dataset',
 'drop_last',
 'generator',
 'in_order',
 'multiprocessing_context',
 'num_workers',
 'persistent_workers',
 'pin_memory',
 'pin_memory_device',
 'prefetch_factor',
 'sampler',
 'timeout',
 

In [33]:
# the loader keeps a reference to the dataset; element 0 is the data tensor
dataloader.dataset.tensors[0]

tensor([[ 1.6403e-03,  8.5388e-01,  8.7756e-01,  ..., -1.1106e+00,
         -1.1983e+00, -1.0678e+00],
        [ 5.3528e-01,  1.2617e+00, -2.2205e+00,  ..., -1.5594e+00,
         -1.8863e+00, -3.6895e-01],
        [ 4.5088e-01,  7.8107e-01, -3.1973e-01,  ..., -3.7594e-01,
          1.4896e-01, -1.1421e-01],
        ...,
        [-3.4906e-01, -1.7391e-01, -4.9024e-01,  ...,  1.0912e+00,
          8.4654e-01,  7.9275e-01],
        [-2.3434e-01,  2.6258e-01, -1.4495e+00,  ..., -2.1065e-01,
          2.0551e+00,  9.5737e-01],
        [ 9.2015e-01, -2.7483e-01,  1.5946e+00,  ...,  3.3417e-01,
         -8.7692e-01,  8.0415e-01]], dtype=torch.float64)

In [34]:
# element 1 of the wrapped dataset is the labels column
dataloader.dataset.tensors[1]

tensor([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [2.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
        [3.],
      

In [35]:
# the wrapped dataset still holds the full data tensor, not a single batch
dataloader.dataset.tensors[0].size()

torch.Size([100, 20])

In [36]:
# likewise for the labels: one row per observation
dataloader.dataset.tensors[1].size()

torch.Size([100, 1])

In [37]:
# sizes of each batch
for dat, labs in dataloader:
  # header, data size, label size and a spacer, one item per line
  print('BATCH INFO:', dat.size(), labs.size(), ' ', sep='\n')

BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 
BATCH INFO:
torch.Size([25, 20])
torch.Size([25, 1])
 


In [38]:
# inspect the labels
for dat, labs in dataloader:
  # transpose the label column so each batch prints as a row, then a spacer line
  print(labs.T, ' ', sep='\n')

tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
         1., 1., 1., 1., 1., 1., 1.]])
 
tensor([[2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2.,
         2., 2., 2., 2., 2., 2., 2.]])
 
tensor([[3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3., 3.,
         3., 3., 3., 3., 3., 3., 3.]])
 
tensor([[4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4., 4.,
         4., 4., 4., 4., 4., 4., 4.]])
 


In [42]:
# Try again with shuffling (shuffling happens during iterations)
dataloader = DataLoader(dataset, batch_size=batchsize, shuffle=True)

for dat, labs in dataloader:
  # labels of each batch as a row, followed by a spacer line
  print(labs.T, ' ', sep='\n')

tensor([[4., 1., 4., 3., 3., 3., 1., 2., 1., 1., 2., 4., 1., 1., 3., 2., 1., 4.,
         2., 1., 3., 4., 2., 2., 4.]])
 
tensor([[1., 2., 1., 4., 1., 2., 2., 2., 2., 1., 4., 2., 3., 2., 3., 3., 4., 2.,
         1., 2., 3., 4., 3., 1., 4.]])
 
tensor([[1., 4., 3., 3., 4., 3., 3., 1., 4., 3., 1., 3., 2., 2., 2., 2., 3., 4.,
         3., 4., 1., 3., 1., 2., 4.]])
 
tensor([[1., 4., 4., 3., 4., 3., 1., 1., 4., 4., 4., 1., 2., 2., 3., 1., 3., 3.,
         2., 2., 4., 3., 2., 4., 1.]])
 


In [51]:
# Iterate over the existing shuffled dataloader again, without re-creating it

for dat, labs in dataloader:
  print(labs.T)
  print(' ')

tensor([[2., 2., 1., 4., 3., 1., 3., 3., 1., 1., 3., 4., 2., 1., 4., 2., 3., 3.,
         1., 4., 2., 4., 4., 3., 1.]])
 
tensor([[3., 2., 3., 3., 1., 3., 4., 4., 1., 2., 4., 2., 2., 1., 3., 4., 3., 4.,
         4., 2., 2., 1., 4., 3., 2.]])
 
tensor([[3., 3., 3., 2., 2., 1., 2., 3., 4., 2., 1., 1., 4., 4., 2., 3., 3., 1.,
         4., 1., 1., 2., 1., 4., 4.]])
 
tensor([[2., 1., 4., 3., 4., 3., 4., 1., 4., 2., 2., 2., 4., 1., 3., 3., 2., 2.,
         1., 3., 1., 1., 4., 1., 2.]])
 


In [52]:
# To get only one batch(e.g., for testing)
# iter() makes an iterator over the loader and next() pulls its first batch
dat, labs = next(iter(dataloader))
labs

tensor([[3.],
        [4.],
        [3.],
        [3.],
        [4.],
        [2.],
        [1.],
        [1.],
        [1.],
        [3.],
        [4.],
        [4.],
        [2.],
        [1.],
        [4.],
        [4.],
        [1.],
        [3.],
        [2.],
        [2.],
        [3.],
        [2.],
        [3.],
        [1.],
        [2.]])

In [54]:
# the loader's repr shows only the object type and address, not the batches
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7c6b3ddc5990>

In [55]:
# iter() returns a separate iterator object for the loader
iter(dataloader)

<torch.utils.data.dataloader._SingleProcessDataLoaderIter at 0x7c6b3e18c610>

In [56]:
# Build the iterator explicitly and pull one batch from it. IPython's "_"
# (last displayed output) is avoided because it only refers to the iterator
# when the cell that displayed it ran immediately beforehand.
next(iter(dataloader))

[tensor([[ 1.2329e+00, -5.5680e-01, -1.1515e+00,  1.9037e-01,  3.9972e-01,
           2.7523e-01,  9.3487e-01, -1.3376e+00, -1.2866e+00, -9.8949e-01,
          -2.4466e-01,  5.5947e-01,  1.7675e-01, -2.6617e-01, -1.0712e-01,
           1.1420e+00,  3.1702e-01,  8.6744e-01, -1.2531e+00,  1.4057e+00],
         [ 1.7733e-01, -1.1362e+00, -6.2604e-01, -5.7990e-01, -2.2177e-01,
           1.3225e+00,  8.2412e-01, -1.2680e-01, -5.8698e-01, -2.9902e-02,
          -1.4443e-01, -7.5330e-01, -6.9162e-01, -7.6353e-01,  3.6687e-01,
          -2.1813e+00,  3.1268e-01,  1.5809e+00,  2.2610e+00, -1.5746e-03],
         [ 8.6906e-01,  1.0354e+00,  2.3224e+00,  1.5224e+00,  1.1855e+00,
          -8.2245e-01,  8.6381e-01,  1.5907e+00,  1.0913e+00,  1.4903e-01,
           1.1942e+00, -1.0637e-01,  2.4053e+00, -2.3521e-01, -3.3876e-01,
          -4.8154e-01,  1.3918e+00, -2.0129e+00,  4.4431e-01, -1.1097e+00],
         [ 6.0000e-02,  8.8044e-01, -1.1511e+00, -4.6729e-01, -1.9832e+00,
           2.0805e-01,