# **PyTorch Training Pipeline Using Dataset & DataLoader**

## **1. Import Required Libraries**
Import all necessary libraries for data manipulation, preprocessing, and PyTorch utilities.

In [1]:
# Import numpy for numerical operations
import numpy as np
# Import pandas for data manipulation
import pandas as pd

# Import torch for tensor operations and neural networks
import torch

# Import train_test_split for splitting data
from sklearn.model_selection import train_test_split
# Import StandardScaler for feature scaling
from sklearn.preprocessing import StandardScaler
# Import LabelEncoder for encoding categorical labels
from sklearn.preprocessing import LabelEncoder

## **2. Load and Inspect Dataset**
Load the breast cancer dataset from a remote CSV file and inspect its structure.

In [2]:
# Load the breast cancer dataset from a remote CSV file
df = pd.read_csv(
    'https://raw.githubusercontent.com/gscdit/Breast-Cancer-Detection/refs/heads/master/data.csv')

# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [3]:
# Show the shape of the DataFrame (rows, columns)
df.shape

(569, 33)

## **3. Data Cleaning**
Remove unnecessary columns (`id`, `Unnamed: 32`) to keep only relevant features and labels.

In [4]:
# Drop unnecessary columns: 'id' and 'Unnamed: 32'
df.drop(columns=['id', 'Unnamed: 32'], inplace=True)

In [5]:
# Display the first few rows after dropping columns
df.head()

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


## **4. Train-Test Split**
Split the dataset into training and testing sets for supervised learning.

In [6]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    df.iloc[:, 1:],  # Features (all columns except first)
    df.iloc[:, 0],   # Labels (first column)
    test_size=0.2    # 20% for testing
)

## **5. Feature Scaling**
Standardize the features using `StandardScaler` to improve model convergence.

In [7]:
# Initialize the scaler
scaler = StandardScaler()

# Fit scaler on training data and transform
X_train = scaler.fit_transform(X_train)

# Transform test data using the same scaler
X_test = scaler.transform(X_test)

In [8]:
# View the scaled training features
X_train

array([[-1.27203462, -0.55046419, -1.2344535 , ..., -0.48470413,
         0.17803342,  0.8415128 ],
       [ 1.5184503 , -0.25170149,  1.56390743, ...,  1.70130269,
         1.27399793,  0.50436955],
       [-1.23402321, -0.03048026, -1.22270249, ..., -1.02259891,
         0.532854  , -0.17551733],
       ...,
       [-1.54817636, -1.70218299, -1.53025475, ..., -1.04849585,
         0.57264697,  0.39404194],
       [-0.52745811, -0.30415601, -0.55492102, ..., -0.26960715,
         0.96228639, -0.79884032],
       [ 0.84766066,  0.64230597,  0.78996168, ...,  1.30066172,
         0.5660148 , -0.42137429]], shape=(455, 30))

In [9]:
# View the training labels
y_train

341    B
162    M
424    B
96     B
70     M
      ..
312    B
557    B
59     B
409    B
264    M
Name: diagnosis, Length: 455, dtype: object

## **6. Label Encoding**
Convert categorical labels to numeric values using `LabelEncoder` for compatibility with PyTorch.

In [10]:
# Initialize label encoder
encoder = LabelEncoder()

# Fit encoder on training labels and transform
y_train = encoder.fit_transform(y_train)

# Transform test labels using the same encoder
y_test = encoder.transform(y_test)

In [11]:
# View the encoded training labels
y_train

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,

## **7. Convert Numpy Arrays to PyTorch Tensors**
Transform the processed numpy arrays into PyTorch tensors for model input.

In [12]:
# Convert numpy arrays to PyTorch tensors (float32 type)
X_train_tensor = torch.from_numpy(X_train.astype(np.float32))
X_test_tensor = torch.from_numpy(X_test.astype(np.float32))

y_train_tensor = torch.from_numpy(y_train.astype(np.float32))
y_test_tensor = torch.from_numpy(y_test.astype(np.float32))

In [13]:
# View the shape of the training feature tensor
X_train_tensor.shape

torch.Size([455, 30])

In [14]:
# View the shape of the training label tensor
y_train_tensor.shape

torch.Size([455])

In [15]:
# Import PyTorch Dataset and DataLoader classes
from torch.utils.data import Dataset, DataLoader

## **8. Custom Dataset Class**
Define a custom PyTorch `Dataset` class to handle feature and label access for batching.

In [16]:
# Define a custom Dataset class for features and labels
class CustomDataset(Dataset):

    def __init__(self, features, labels):
        # Store features and labels
        self.features = features
        self.labels = labels

    def __len__(self):
        # Return number of samples
        return len(self.features)

    def __getitem__(self, idx):
        # Return feature and label at given index
        return self.features[idx], self.labels[idx]

## **9. Create Dataset Instances**
Instantiate training and testing datasets using the custom Dataset class.

In [17]:
# Create training dataset instance
train_dataset = CustomDataset(
    X_train_tensor,
    y_train_tensor
)

# Create testing dataset instance
test_dataset = CustomDataset(
    X_test_tensor,
    y_test_tensor
)

## **10. DataLoader for Batching**
Wrap the datasets with PyTorch `DataLoader` to enable efficient batching and shuffling during training and evaluation.

In [18]:
# Create DataLoader for training data
train_loader = DataLoader(
    train_dataset,
    batch_size=32,
    shuffle=True
)

# Create DataLoader for testing data
test_loader = DataLoader(
    test_dataset,
    batch_size=32,
    shuffle=True
)

## **11. Define the Neural Network Model**
Create a simple feedforward neural network using PyTorch's `nn.Module`.

In [19]:
# Import PyTorch neural network module
import torch.nn as nn

In [20]:
# Define a simple neural network class
class MySimpleNN(nn.Module):

    def __init__(self, num_features):
        # Call parent constructor
        super().__init__()

        # Linear layer: input features to 1 output
        self.linear = nn.Linear(num_features, 1)
        # Sigmoid activation for output
        self.sigmoid = nn.Sigmoid()

    def forward(self, features):
        # Pass input through linear layer
        out = self.linear(features)
        # Apply sigmoid activation
        out = self.sigmoid(out)
        # Return output
        return out

## **12. Set Training Hyperparameters**
Specify the learning rate and number of epochs for model training.

In [21]:
# Set learning rate for optimizer
learning_rate = 0.1

# Set number of epochs for training
epochs = 50

## **13. Model, Optimizer, and Loss Function**
Instantiate the model, optimizer (SGD), and binary cross-entropy loss function for training.

In [22]:
# Create model instance with number of features as input size
model = MySimpleNN(X_train_tensor.shape[1])

# Define SGD optimizer with model parameters and learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

# Define binary cross entropy loss function
loss_function = nn.BCELoss()

## **14. Training Loop**
Iterate over epochs and batches, performing forward and backward passes, updating model parameters, and printing the loss for each epoch.

In [23]:
# Training loop for specified number of epochs
for epoch in range(epochs):

    for batch_features, batch_labels in train_loader:

        # Forward pass: compute predictions
        y_pred = model(batch_features)

        # Compute loss between predictions and true labels
        loss = loss_function(y_pred, batch_labels.view(-1, 1))

        # Zero gradients before backward pass
        optimizer.zero_grad()

        # Backward pass: compute gradients
        loss.backward()

        # Update model parameters
        optimizer.step()

    # Print loss for current epoch
    print(f'Epoch: {epoch + 1}, Loss: {loss.item()}')

Epoch: 1, Loss: 0.1602795422077179
Epoch: 2, Loss: 0.04105423018336296
Epoch: 3, Loss: 0.15081845223903656
Epoch: 4, Loss: 0.1370118111371994
Epoch: 5, Loss: 0.06466248631477356
Epoch: 6, Loss: 0.1230577602982521
Epoch: 4, Loss: 0.1370118111371994
Epoch: 5, Loss: 0.06466248631477356
Epoch: 6, Loss: 0.1230577602982521
Epoch: 7, Loss: 0.13350827991962433
Epoch: 8, Loss: 0.11811214685440063
Epoch: 9, Loss: 0.022221488878130913
Epoch: 10, Loss: 0.020276715978980064
Epoch: 7, Loss: 0.13350827991962433
Epoch: 8, Loss: 0.11811214685440063
Epoch: 9, Loss: 0.022221488878130913
Epoch: 10, Loss: 0.020276715978980064
Epoch: 11, Loss: 0.02984280325472355
Epoch: 12, Loss: 0.0352652445435524
Epoch: 13, Loss: 0.028315788134932518
Epoch: 11, Loss: 0.02984280325472355
Epoch: 12, Loss: 0.0352652445435524
Epoch: 13, Loss: 0.028315788134932518
Epoch: 14, Loss: 0.017749732360243797
Epoch: 15, Loss: 0.09125881642103195
Epoch: 16, Loss: 0.08134667575359344
Epoch: 17, Loss: 0.05789788439869881
Epoch: 14, Loss:

## **15. Model Evaluation**
Evaluate the trained model on the test set using accuracy as the metric.

In [24]:
# Model evaluation using test_loader
model.eval()  # Set the model to evaluation mode
accuracy_list = []

with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        # Forward pass
        y_pred = model(batch_features)

        # Convert probabilities to binary predictions (threshold = 0.8)
        y_pred = (y_pred > 0.8).float()

        # Calculate accuracy for the current batch
        batch_accuracy = (
            y_pred.view(-1) == batch_labels).float().mean().item()

        accuracy_list.append(batch_accuracy)

# Calculate overall accuracy
overall_accuracy = sum(accuracy_list) / len(accuracy_list)
print(f'Accuracy: {overall_accuracy:.4f}')

Accuracy: 0.9470
