<a href="https://colab.research.google.com/github/arpitpatelsitapur/my-py-torch-journey/blob/main/Pytorch_Dataset_and_DataLoader.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Problems** when we don't use the Dataset and DataLoader classes
- No batch management
- Sequential loading, no parallelization
- No shuffling or sampling

## **Solution**
- **Dataset** class - a `Blueprint` for loading and returning data.
  - `__init__()` - loads/stores the data (e.g. features and labels)
  - `__len__()` - returns the total number of samples in the dataset
  - `__getitem__(index)` - returns the sample (data and label) at the given index

- **DataLoader** class- handles batching, shuffling and parallel loading
  - `dataset`(mandatory)
  - `batch_size`
  - `shuffle=True` for shuffling the sample indices every epoch
  - `num_workers` for parallelization
  - `pin_memory` to use pinned (page-locked) host memory for faster CPU-to-GPU transfer
  - `collate_fn` for collecting and combining samples into batches, helpful for *padding data of uneven size*
  - `drop_last` whether to drop the last incomplete batch or not
  - `sampler` for a custom sampling/shuffling strategy


In [None]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
from sklearn.datasets import make_classification
import torch
import torch.nn as nn
import torch.optim as optim
from torchinfo import summary

In [None]:
# Tiny synthetic binary-classification problem: 20 samples, 4 features.
# random_state is fixed so the generated data is the same on every run.
X, y = make_classification(
    n_samples=20,
    n_features=4,
    n_classes=2,
    random_state=42,
)
print(f"X.shape = {X.shape}, y.shape = {y.shape}")

X.shape = (20, 4), y.shape = (20,)


In [None]:
# Convert features and labels to float32 tensors.
# as_tensor (instead of torch.tensor) keeps this cell safe to re-run: once X/y
# are already float32 tensors it returns them unchanged rather than emitting a
# copy-construct warning.
X=torch.as_tensor(X,dtype=torch.float32)
y=torch.as_tensor(y,dtype=torch.float32)
X,y

(tensor([[-0.3721,  1.2558,  0.3967, -0.5853],
         [ 1.3306, -0.2071,  1.1705, -0.9834],
         [ 0.5591,  0.9470,  1.1168, -1.1558],
         [ 0.8922, -0.7074,  0.4412, -0.2511],
         [ 0.4750,  1.1577,  1.1623, -1.2356],
         [ 0.9135,  1.4545,  1.7687, -1.8219],
         [-0.1135, -0.8750, -0.6394,  0.7249],
         [-0.8606,  1.4704,  0.0507, -0.3238],
         [ 0.6601,  2.0757,  1.8973, -2.0523],
         [ 1.3973,  0.0405,  1.3852, -1.2181],
         [-0.0456, -1.4569, -0.9250,  1.0851],
         [-1.4088, -0.2828, -1.5427,  1.4018],
         [ 0.7728,  1.9959,  1.9589, -2.0910],
         [ 2.3675,  0.6220,  2.6814, -2.4612],
         [ 1.0926, -0.0154,  1.0546, -0.9186],
         [-0.8299, -0.9640, -1.3908,  1.3985],
         [-0.8359, -0.6550, -1.2098,  1.1816],
         [-1.3137, -1.4257, -2.1410,  2.1417],
         [-0.7697,  1.9632,  0.4370, -0.7549],
         [-0.6568,  1.6113,  0.3343, -0.5983]]),
 tensor([0., 1., 0., 1., 0., 1., 1., 0., 1., 1., 1., 0., 1

## ***Using PyTorch Dataset and DataLoader***

In [None]:
from torch.utils.data import Dataset,DataLoader

class custom_dataset(Dataset):
  """Map-style dataset that pairs each feature row with its label.

  Args:
    X: features, one row per sample; must expose ``.shape[0]`` (e.g. a tensor).
    y: labels, indexable in the same order as ``X``.

  Raises:
    ValueError: if ``X`` and ``y`` do not hold the same number of samples.
  """

  def __init__(self,X,y):
    # Fail fast on mismatched inputs instead of hitting an IndexError (or
    # silently ignoring extra labels) in the middle of an epoch.
    if len(y)!=X.shape[0]:
      raise ValueError(
        f"X and y must have the same number of samples, got {X.shape[0]} and {len(y)}"
      )
    self.X=X
    self.y=y
    self.n_samples=X.shape[0]

  def __len__(self):
    """Return the number of samples."""
    return self.n_samples

  def __getitem__(self,index):
    """Return the ``(features, label)`` pair stored at ``index``."""
    return self.X[index],self.y[index]

# Wrap the toy feature/label tensors in the map-style dataset defined above.
dataset = custom_dataset(X=X, y=y)

In [None]:
# Indexing goes through __getitem__, which returns the (features, label) pair for sample 0.
dataset[0]

(tensor([-0.3721,  1.2558,  0.3967, -0.5853]), tensor(0.))

In [None]:
# Serve the toy dataset in shuffled mini-batches of 5 samples.
dataloader = DataLoader(dataset, batch_size=5, shuffle=True)

# Walk through one pass over the data, showing every batch and its shapes.
for batch_X, batch_y in dataloader:
  print(batch_X, batch_y, sep="\n")
  print(f"batch_X.shape = {batch_X.shape}, batch_y.shape = {batch_y.shape}")
  print("-" * 50)

tensor([[ 0.5591,  0.9470,  1.1168, -1.1558],
        [ 0.4750,  1.1577,  1.1623, -1.2356],
        [-0.0456, -1.4569, -0.9250,  1.0851],
        [-0.8299, -0.9640, -1.3908,  1.3985],
        [ 0.6601,  2.0757,  1.8973, -2.0523]])
tensor([0., 0., 1., 0., 1.])
batch_X.shape = torch.Size([5, 4]), batch_y.shape = torch.Size([5])
--------------------------------------------------
tensor([[-0.6568,  1.6113,  0.3343, -0.5983],
        [ 1.3973,  0.0405,  1.3852, -1.2181],
        [-0.1135, -0.8750, -0.6394,  0.7249],
        [-1.3137, -1.4257, -2.1410,  2.1417],
        [ 2.3675,  0.6220,  2.6814, -2.4612]])
tensor([0., 1., 1., 0., 1.])
batch_X.shape = torch.Size([5, 4]), batch_y.shape = torch.Size([5])
--------------------------------------------------
tensor([[ 0.9135,  1.4545,  1.7687, -1.8219],
        [ 0.7728,  1.9959,  1.9589, -2.0910],
        [ 1.3306, -0.2071,  1.1705, -0.9834],
        [-1.4088, -0.2828, -1.5427,  1.4018],
        [-0.8606,  1.4704,  0.0507, -0.3238]])
tensor([1.,

# **Applying it to Breast Cancer classification data**

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torchinfo import summary
import torch.optim as optim
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# data preprocessing
df=pd.read_csv("https://raw.githubusercontent.com/gscdit/Breast-cancer-Detection/refs/heads/master/data.csv")
# Drop the two columns that are not used as features (non-inplace, so the
# statement does not mutate a frame another name might still reference).
df=df.drop(columns=["Unnamed: 32","id"])

X=df.drop(columns=["diagnosis"])
y=df["diagnosis"]

# Encode the diagnosis labels as integers.
le=LabelEncoder()
y=le.fit_transform(y)

# Split BEFORE scaling: the scaler must learn its mean/std from the training
# rows only, otherwise test-set statistics leak into training.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

sc=StandardScaler()
X_train=sc.fit_transform(X_train)   # fit on the training split only
X_test=sc.transform(X_test)         # reuse the training statistics

# float32 tensors; labels reshaped to (N, 1) to match the model's single output unit.
X_train_t = torch.tensor(X_train, dtype=torch.float32)
y_train_t = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test_t = torch.tensor(X_test, dtype=torch.float32)
y_test_t = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# Define model
class SimpleNN(nn.Module):
    """Binary classifier with a single hidden layer.

    Architecture: Linear(input_size -> hidden_size) -> ReLU ->
    Linear(hidden_size -> 1) -> Sigmoid.

    Args:
        input_size: number of input features per sample.
        hidden_size: number of units in the hidden layer.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 1)

    def forward(self, x):
        """Return one sigmoid-activated output per input row."""
        hidden = self.fc1(x).relu()
        return self.fc2(hidden).sigmoid()

In [None]:
# Sanity check: print the tensor shapes of the train and test splits.
print(f"X_train_t.shape = {X_train_t.shape}, y_train_t.shape = {y_train_t.shape}")
print(f"X_test_t.shape = {X_test_t.shape}, y_test_t.shape = {y_test_t.shape}")

X_train_t.shape = torch.Size([455, 30]), y_train_t.shape = torch.Size([455, 1])
X_test_t.shape = torch.Size([114, 30]), y_test_t.shape = torch.Size([114, 1])


In [None]:
# define dataset and dataloader
from torch.utils.data import Dataset,DataLoader

class custom_dataset(Dataset):
  """Map-style dataset that pairs each feature row with its label.

  Args:
    X: features, one row per sample; must expose ``.shape[0]`` (e.g. a tensor).
    y: labels, indexable in the same order as ``X``.

  Raises:
    ValueError: if ``X`` and ``y`` do not hold the same number of samples.
  """

  def __init__(self,X,y):
    # Fail fast on mismatched inputs instead of hitting an IndexError (or
    # silently ignoring extra labels) in the middle of an epoch.
    if len(y)!=X.shape[0]:
      raise ValueError(
        f"X and y must have the same number of samples, got {X.shape[0]} and {len(y)}"
      )
    self.X=X
    self.y=y
    self.n_samples=X.shape[0]

  def __len__(self):
    """Return the number of samples."""
    return self.n_samples

  def __getitem__(self,index):
    """Return the ``(features, label)`` pair stored at ``index``."""
    return self.X[index],self.y[index]

train_dataset=custom_dataset(X_train_t,y_train_t)
test_dataset=custom_dataset(X_test_t,y_test_t)

# Shuffle the training batches so each epoch sees the samples in a new order.
train_loader=DataLoader(dataset=train_dataset,batch_size=32,shuffle=True)
# Evaluation gains nothing from shuffling; a fixed order keeps per-batch results reproducible.
test_loader=DataLoader(dataset=test_dataset,batch_size=32,shuffle=False)

In [None]:
# Hyperparameters
input_size = X_train_t.shape[1]   # number of feature columns
hidden_size = 16
num_epochs = 50
lr = 0.001
seed = 42

# Seed PyTorch's global RNG before building the model so the weight
# initialisation is reproducible. When the cells are run in order this also
# fixes the shuffling order of any DataLoader that has no generator of its own.
torch.manual_seed(seed)

model = SimpleNN(input_size, hidden_size)
loss = nn.BCELoss()   # the model already applies a sigmoid, so plain BCE is the matching loss
optimizer = optim.Adam(model.parameters(), lr=lr)

In [None]:
# Show torchinfo's per-layer parameter counts for the model.
summary(model)

Layer (type:depth-idx)                   Param #
SimpleNN                                 --
├─Linear: 1-1                            496
├─Linear: 1-2                            17
Total params: 513
Trainable params: 513
Non-trainable params: 0

In [None]:
# Training (mini-batch gradient descent: one optimizer step per batch)
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_X, batch_y in train_loader:
        outputs = model(batch_X)
        batch_loss = loss(outputs, batch_y)

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        # BCELoss returns the mean over the batch; weight it by the batch size
        # so the smaller final batch does not skew the epoch average.
        running_loss += batch_loss.item() * batch_X.size(0)

    # Report once per epoch instead of once per batch, so the log stays
    # readable and the number really is the loss for that epoch.
    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}")



Epoch [1/50], Loss: 0.8264
Epoch [1/50], Loss: 0.7747
Epoch [1/50], Loss: 0.7350
Epoch [1/50], Loss: 0.7626
Epoch [1/50], Loss: 0.6809
Epoch [1/50], Loss: 0.8044
Epoch [1/50], Loss: 0.6721
Epoch [1/50], Loss: 0.6740
Epoch [1/50], Loss: 0.7029
Epoch [1/50], Loss: 0.7089
Epoch [1/50], Loss: 0.6950
Epoch [1/50], Loss: 0.6392
Epoch [1/50], Loss: 0.6438
Epoch [1/50], Loss: 0.7136
Epoch [1/50], Loss: 0.5916
Epoch [2/50], Loss: 0.6967
Epoch [2/50], Loss: 0.6003
Epoch [2/50], Loss: 0.6301
Epoch [2/50], Loss: 0.6238
Epoch [2/50], Loss: 0.6502
Epoch [2/50], Loss: 0.6145
Epoch [2/50], Loss: 0.6218
Epoch [2/50], Loss: 0.5673
Epoch [2/50], Loss: 0.5966
Epoch [2/50], Loss: 0.5826
Epoch [2/50], Loss: 0.5845
Epoch [2/50], Loss: 0.5842
Epoch [2/50], Loss: 0.6059
Epoch [2/50], Loss: 0.5605
Epoch [2/50], Loss: 0.5679
Epoch [3/50], Loss: 0.5722
Epoch [3/50], Loss: 0.5456
Epoch [3/50], Loss: 0.5332
Epoch [3/50], Loss: 0.5377
Epoch [3/50], Loss: 0.5050
Epoch [3/50], Loss: 0.5284
Epoch [3/50], Loss: 0.5015
E

In [None]:
# Testing
model.eval()  # switch the module to evaluation mode
with torch.no_grad():  # no gradients are needed for inference
    outputs = model(X_test_t)
    # Threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
    predicted = outputs.ge(0.5).float()
    n_correct = (predicted == y_test_t).sum().item()
    accuracy = (n_correct / y_test_t.shape[0]) * 100

print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 95.61%


In [None]:
# So we see that our accuracy has increased from 85% to 95%.