In [3]:
from sklearn.datasets import make_classification
import torch

In [4]:
# Step 1: build a small synthetic binary-classification dataset.
# Both features are informative (none redundant); the fixed random_state
# makes the generated samples reproducible.
X, y = make_classification(
    n_samples=10, n_features=2, n_informative=2, n_redundant=0,
    n_classes=2, random_state=42,
)

In [5]:
X

array([[ 1.06833894, -0.97007347],
       [-1.14021544, -0.83879234],
       [-2.8953973 ,  1.97686236],
       [-0.72063436, -0.96059253],
       [-1.96287438, -0.99225135],
       [-0.9382051 , -0.54304815],
       [ 1.72725924, -1.18582677],
       [ 1.77736657,  1.51157598],
       [ 1.89969252,  0.83444483],
       [-0.58723065, -1.97171753]])

In [6]:
X.shape

(10, 2)

In [7]:
y

array([1, 0, 0, 0, 0, 1, 1, 1, 1, 0])

In [8]:
# Convert the data to float32 PyTorch tensors.
# torch.as_tensor is used instead of torch.tensor so this cell is safe to
# re-run: X and y are reassigned onto themselves, and calling torch.tensor on
# an existing tensor emits a copy-construct UserWarning. With as_tensor, an
# input that is already a float32 tensor is returned as-is.
X = torch.as_tensor(X, dtype=torch.float32)
y = torch.as_tensor(y, dtype=torch.float32)

In [9]:
X

tensor([[ 1.0683, -0.9701],
        [-1.1402, -0.8388],
        [-2.8954,  1.9769],
        [-0.7206, -0.9606],
        [-1.9629, -0.9923],
        [-0.9382, -0.5430],
        [ 1.7273, -1.1858],
        [ 1.7774,  1.5116],
        [ 1.8997,  0.8344],
        [-0.5872, -1.9717]])

In [10]:
X[1]

tensor([-1.1402, -0.8388])

## Dataset and Dataloader classes

In [11]:
from torch.utils.data import Dataset,DataLoader

Create a custom dataset class

In [12]:
class CustomDataset(Dataset):
    """Map-style dataset that pairs each feature row with its label.

    Args:
        features: Indexable, sized collection of samples; ``features[i]`` is
            the i-th sample.
        labels: Indexable, sized collection of targets, aligned with
            ``features`` (``labels[i]`` belongs to ``features[i]``).

    Raises:
        ValueError: If ``features`` and ``labels`` differ in length.
    """

    def __init__(self, features, labels):
        # Fail fast on misaligned inputs; otherwise the mismatch would only
        # surface later as an IndexError while iterating over the data.
        if len(features) != len(labels):
            raise ValueError(
                f"features and labels must have the same length, "
                f"got {len(features)} and {len(labels)}"
            )
        self.features = features
        self.labels = labels

    def __len__(self):
        """Return the number of samples in the dataset."""
        return len(self.features)

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``.

        This is the place to apply any per-sample transformation
        (e.g. reshaping) before the data is used for training.
        """
        return self.features[index], self.labels[index]

In [13]:
# Wrap the feature and label tensors in the custom dataset.
dataset = CustomDataset(features=X, labels=y)

Since `dataset` is an instance of a class that inherits from `Dataset` and implements `__len__` and `__getitem__`, we can call `len()` on it and index into it directly.

In [14]:
len(dataset)

10

In [15]:
dataset[1]

(tensor([-1.1402, -0.8388]), tensor(0.))

Now create an object of DataLoader class


In [16]:
X

tensor([[ 1.0683, -0.9701],
        [-1.1402, -0.8388],
        [-2.8954,  1.9769],
        [-0.7206, -0.9606],
        [-1.9629, -0.9923],
        [-0.9382, -0.5430],
        [ 1.7273, -1.1858],
        [ 1.7774,  1.5116],
        [ 1.8997,  0.8344],
        [-0.5872, -1.9717]])

In [20]:
y

tensor([1., 0., 0., 0., 0., 1., 1., 1., 1., 0.])

In [27]:
# shuffle=True makes each batch a random selection of batch_size rows;
# with shuffle=False the batches follow the dataset's sequential row order.
dataloader = DataLoader(
    dataset,
    batch_size=3,
    shuffle=True,
)

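Note: with `batch_size=3` and 10 samples, the DataLoader yields three batches of three rows each, followed by a final batch containing the single remaining row (pass `drop_last=True` to discard that incomplete batch).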

In [28]:
dataloader

<torch.utils.data.dataloader.DataLoader at 0x1f90a54f2e0>

`dataloader` is an iterable, so we can loop over it to fetch the batches one at a time.

In [29]:
# Each item yielded by the DataLoader is a (features, labels) pair for one batch.
for batch in dataloader:
    batch_features, batch_labels = batch
    print(batch_features, batch_labels)

tensor([[-2.8954,  1.9769],
        [-0.9382, -0.5430],
        [-1.9629, -0.9923]]) tensor([0., 1., 0.])
tensor([[ 1.7273, -1.1858],
        [ 1.0683, -0.9701],
        [ 1.8997,  0.8344]]) tensor([1., 1., 1.])
tensor([[-0.7206, -0.9606],
        [ 1.7774,  1.5116],
        [-1.1402, -0.8388]]) tensor([0., 1., 0.])
tensor([[-0.5872, -1.9717]]) tensor([0.])
