# Large Data Processing
Here we will learn how to process a large dataset in batches.

## DataLoader and Dataset

In [1]:
import pandas as pd

# Remote CSV with the wine data; the same file is read again by the
# Dataset classes further down.
url = "https://huggingface.co/datasets/mishrabp/mydatasets/resolve/main/data/wine.csv"

# Read the whole file into a DataFrame and print only its first five rows,
# which is enough to check the column layout (label column first, features after).
df = pd.read_csv(filepath_or_buffer=url)
print(df.head(n=5))


   Wine  Alcohol  Malic.acid   Ash   Acl   Mg  Phenols  Flavanoids  \
0     1    14.23        1.71  2.43  15.6  127     2.80        3.06   
1     1    13.20        1.78  2.14  11.2  100     2.65        2.76   
2     1    13.16        2.36  2.67  18.6  101     2.80        3.24   
3     1    14.37        1.95  2.50  16.8  113     3.85        3.49   
4     1    13.24        2.59  2.87  21.0  118     2.80        2.69   

   Nonflavanoid.phenols  Proanth  Color.int   Hue    OD  Proline  
0                  0.28     2.29       5.64  1.04  3.92     1065  
1                  0.26     1.28       4.38  1.05  3.40     1050  
2                  0.30     2.81       5.68  1.03  3.17     1185  
3                  0.24     2.18       7.80  0.86  3.45     1480  
4                  0.39     1.82       4.32  1.04  2.93      735  


In [None]:
from sys import deactivate_stack_trampoline
import torch
import torchvision
from torch.utils.data import Dataset, DataLoader
import numpy as np
import math

class WineDataset(Dataset):
  """Map-style dataset over a wine CSV that is loaded fully into memory.

  The header row is skipped. The first column becomes the label vector
  ``y`` and all remaining columns become the feature matrix ``X``; both are
  stored as float32 tensors.

  Args:
    source: Anything ``np.loadtxt`` accepts as its first argument (for
      example a path, an open text file or a list of lines). When omitted,
      the hosted wine CSV at ``DEFAULT_SOURCE`` is read, so ``WineDataset()``
      keeps working exactly as before.
  """

  DEFAULT_SOURCE = "https://huggingface.co/datasets/mishrabp/mydatasets/resolve/main/data/wine.csv"

  def __init__(self, source=None):
    if source is None:
      source = self.DEFAULT_SOURCE
    # ndmin=2 keeps the array two-dimensional even when the file holds a
    # single data row; without it the column slices below raise IndexError.
    xy = np.loadtxt(source, delimiter=",", dtype=np.float32, skiprows=1, ndmin=2)
    self.X = torch.from_numpy(xy[:, 1:])  # features: every column but the first
    self.y = torch.from_numpy(xy[:, 0])   # labels: first column
    self.n_samples = xy.shape[0]          # number of records

  def __getitem__(self, index):
    """Return the ``(features, label)`` pair stored at ``index``."""
    return self.X[index], self.y[index]

  def __len__(self):
    """Return the number of records that were loaded."""
    return self.n_samples

# Loop settings. batch_size is defined once and shared by the DataLoader and
# the step counter below, so the two can never disagree.
epochs = 2
batch_size = 4

dataset = WineDataset()
# num_workers=2 asks the DataLoader to fetch batches in two worker subprocesses.
# NOTE(review): for a small in-memory dataset this is not necessarily faster, and
# on platforms that spawn workers a class defined in a notebook cell may not be
# importable by them - fall back to num_workers=0 if the loader hangs or errors.
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=2)

# Steps per epoch: rounded up because the last batch may hold fewer than
# batch_size records.
total_samples = len(dataset)
n_iterations = math.ceil(total_samples/batch_size)
print(total_samples, n_iterations)

# "Training" loop: no model yet, it only walks the batches and reports every 5th step.
for epoch in range(epochs):
  for i, (inputs, labels) in enumerate(dataloader):  # each item is one batch of up to batch_size records
    if (i+1) % 5 == 0:
      print(f"epoch {epoch+1}/{epochs}, step {i+1}/{n_iterations}, inputs {inputs.shape}")


178 45
epoch 1/2, step 5/45, inputs torch.Size([4, 13])
epoch 1/2, step 10/45, inputs torch.Size([4, 13])
epoch 1/2, step 15/45, inputs torch.Size([4, 13])
epoch 1/2, step 20/45, inputs torch.Size([4, 13])
epoch 1/2, step 25/45, inputs torch.Size([4, 13])
epoch 1/2, step 30/45, inputs torch.Size([4, 13])
epoch 1/2, step 35/45, inputs torch.Size([4, 13])
epoch 1/2, step 40/45, inputs torch.Size([4, 13])
epoch 1/2, step 45/45, inputs torch.Size([2, 13])
epoch 2/2, step 5/45, inputs torch.Size([4, 13])
epoch 2/2, step 10/45, inputs torch.Size([4, 13])
epoch 2/2, step 15/45, inputs torch.Size([4, 13])
epoch 2/2, step 20/45, inputs torch.Size([4, 13])
epoch 2/2, step 25/45, inputs torch.Size([4, 13])
epoch 2/2, step 30/45, inputs torch.Size([4, 13])
epoch 2/2, step 35/45, inputs torch.Size([4, 13])
epoch 2/2, step 40/45, inputs torch.Size([4, 13])
epoch 2/2, step 45/45, inputs torch.Size([2, 13])


## Transforms

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
import math
import torchvision.transforms as transforms  # ✅ added import

class WineDataset(Dataset):
    """Map-style wine dataset with an optional per-sample transform.

    The CSV is read with pandas. The first column is taken as the target and
    the remaining columns as features; both are kept as float32 NumPy arrays,
    so any conversion to tensors is left to ``transform``.

    Args:
        transform: Optional callable that receives the ``(inputs, target)``
            tuple of one sample and returns the transformed sample.
        source: Anything ``pd.read_csv`` accepts (path, URL, file-like
            object). When omitted, the hosted wine CSV at ``DEFAULT_SOURCE``
            is read, so existing ``WineDataset(transform=...)`` calls are
            unaffected.
    """

    DEFAULT_SOURCE = "https://huggingface.co/datasets/mishrabp/mydatasets/resolve/main/data/wine.csv"

    def __init__(self, transform=None, source=None):
        if source is None:
            source = self.DEFAULT_SOURCE
        df = pd.read_csv(source)

        # assuming first column is target, rest are features
        self.X = df.iloc[:, 1:].values.astype(np.float32)
        self.y = df.iloc[:, 0].values.astype(np.float32)
        self.n_samples = len(df)
        self.transform = transform

    def __getitem__(self, index):
        """Return sample ``index``, passed through ``transform`` when one is set."""
        sample = (self.X[index], self.y[index])
        # Compare against None explicitly: a transform object that happens to
        # be falsy (e.g. defines __len__ returning 0) must still be applied.
        if self.transform is not None:
            sample = self.transform(sample)
        return sample

    def __len__(self):
        """Return the number of rows read from the CSV."""
        return self.n_samples


class ToTensor:
    """Callable transform that turns an ``(inputs, targets)`` sample into tensors."""

    def __call__(self, sample):
        features, label = sample
        # from_numpy wraps the NumPy array (shared memory, no copy);
        # torch.tensor builds a new tensor from the label value.
        feature_tensor = torch.from_numpy(features)
        label_tensor = torch.tensor(label)
        return feature_tensor, label_tensor


class MulTransform:
    """Callable transform that multiplies a sample's inputs by a fixed factor.

    The targets are passed through unchanged.
    """

    def __init__(self, factor):
        self.factor = factor

    def __call__(self, sample):
        features, label = sample
        # Out-of-place product: the result is a new object, the caller's
        # inputs are not modified.
        return features * self.factor, label


# Loop settings. batch_size is defined once and shared by the DataLoader and
# the step counter below, so the two always agree.
epochs = 2
batch_size = 4

# Compose applies the transforms in list order: MulTransform scales the
# features first, then ToTensor converts the scaled sample to tensors.
composed = transforms.Compose([MulTransform(2), ToTensor()])

# Dataset and DataLoader (num_workers=0 loads the batches in the main process)
dataset = WineDataset(transform=composed)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True, num_workers=0)

# Steps per epoch: rounded up because the last batch may hold fewer than
# batch_size records.
total_samples = len(dataset)
n_iterations = math.ceil(total_samples / batch_size)
print(f"Total samples: {total_samples}, Iterations per epoch: {n_iterations}")

# "Training" loop: no model yet, it only walks the batches and reports every 5th step.
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(dataloader):
        if (i + 1) % 5 == 0:
            print(f"Epoch [{epoch+1}/{epochs}], Step [{i+1}/{n_iterations}], Inputs shape: {inputs.shape}")


Total samples: 178, Iterations per epoch: 45
Epoch [1/2], Step [5/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [10/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [15/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [20/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [25/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [30/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [35/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [40/45], Inputs shape: torch.Size([4, 13])
Epoch [1/2], Step [45/45], Inputs shape: torch.Size([2, 13])
Epoch [2/2], Step [5/45], Inputs shape: torch.Size([4, 13])
Epoch [2/2], Step [10/45], Inputs shape: torch.Size([4, 13])
Epoch [2/2], Step [15/45], Inputs shape: torch.Size([4, 13])
Epoch [2/2], Step [20/45], Inputs shape: torch.Size([4, 13])
Epoch [2/2], Step [25/45], Inputs shape: torch.Size([4, 13])
Epoch [2/2], Step [30/45], Inputs shape: torch.Size([4, 13])
Epoch [2/2], Step [35/45], Inputs shape: t