## 1. Dataset & Dataloader

- Download dataset using `huggingface's datasets library`. Easy download
- Build custom dataset using `torch.utils.data.Dataset`

In [4]:
import torch
import torchvision
import datasets as huggingface_datasets

training_dataset    = huggingface_datasets.load_dataset("p2pfl/MNIST", split="train")

class Custom_Dataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset
        self.data_transformations = torchvision.transforms.Compose(
            [
                torchvision.transforms.ToTensor(),
            ]
        )

    def __getitem__(self, index):
        image , label  = self.dataset[index]['image'], self.dataset[index]['label']
        image_tensor   = self.data_transformations(image)
        single_example = (image_tensor, label)
        return single_example
    
    def __len__(self):
        return len(self.dataset)

custom_training_dataset   = Custom_Dataset(training_dataset) # __init__ method called automatically to initalize
training_dataloader       = torch.utils.data.DataLoader(custom_training_dataset, batch_size = 32, shuffle= True)

Downloading readme:   0%|          | 0.00/315 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/60000 [00:00<?, ? examples/s]

In [8]:
validation_dataset        = huggingface_datasets.load_dataset("mnist", split="test" )
custom_validation_dataset = Custom_Dataset(validation_dataset)
validation_dataloader     = torch.utils.data.DataLoader(custom_validation_dataset, batch_size = 32, shuffle= False)

image_tensors, labels = next(iter(training_dataloader))

DataFilesNotFoundError: No (supported) data files found in mnist

## 2. Model Training

In [9]:
import lightning
import torchmetrics

class Lightning_Module(lightning.LightningModule):
    def __init__(self, model):
        super().__init__()
        
        self.model = model
        self.automatic_optimization = False
        self.training_accuracy   = torchmetrics.Accuracy(task="multiclass", num_classes=10)
        self.validation_accuracy = torchmetrics.Accuracy(task="multiclass", num_classes=10)
  
    def training_step (self, batch, batch_idx):
        images_actual, labels_actual        = batch
        predicted_logits  = self.model(images_actual)
        labels_predicted  = torch.argmax(predicted_logits, dim = 1)

        loss             = torch.nn.functional.cross_entropy(predicted_logits, labels_actual)
        
        optimizer        = self.optimizers()
        optimizer.zero_grad()
        self.manual_backward(loss)
        optimizer.step()
        """
        for individual_parameter in self.parameters():
            individual_parameter = individual_parameter - individual_parameter.grad * learning_rate
        """
        self.training_accuracy(labels_predicted, labels_actual)
        self.log("train_loss"     , loss                   , prog_bar = True)
        self.log("train_accuracy" , self.training_accuracy , prog_bar = True)
        
        return loss

    def validation_step (self, batch, batch_idx):
        images_actual, labels_actual        = batch
        predicted_logits  = self.model(images_actual)
        labels_predicted  = torch.argmax(predicted_logits, dim = 1)

        loss = torch.nn.functional.cross_entropy(predicted_logits, labels_actual)
        self.validation_accuracy(labels_predicted, labels_actual)
        self.log("validation_loss"     , loss                     , prog_bar= False)
        self.log("validation_accuracy" , self.validation_accuracy , prog_bar= True)
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

## 3. Model Architecture

- Model Architecture using `keras`. Easy to read architecture
- Using `pytorch backend` for keras

### Simple Neuron

In [10]:
import os
os.environ["KERAS_BACKEND"] = "torch"
import keras
keras.config.set_image_data_format('channels_first')

input_shape 			= (1, 28, 28)
OBJECTS_TO_IDENTIFY 	= 10
HIDDEN_LAYER_NEURONS 	= 100

ARCHITECTURE 			= "1d fully_connected: [ (1,28,28) , 100, 10 ]"

random_initialization_model = keras.Sequential(
    [
        # Input Layer
        keras.layers.Input(shape= (1, 28, 28) ),
        
        # Hidden Layers
        keras.layers.Flatten(),
        keras.layers.Dense(units = HIDDEN_LAYER_NEURONS  , activation="relu"),
        
        # Final Layer: No activation function. Outputs: logits
        keras.layers.Dense(units = OBJECTS_TO_IDENTIFY   , ), 
    ]
)

model            = random_initialization_model

In [11]:
model.summary(show_trainable = True, expand_nested = True)

In [12]:
test_batch       = torch.randn((1,1,28,28))
predicted_logits = model(test_batch)

print(f'output shape {predicted_logits.shape} and total objects to predict = {OBJECTS_TO_IDENTIFY}')

output shape torch.Size([1, 10]) and total objects to predict = 10


In [13]:
epochs  = 10
lightning_model   = Lightning_Module(model)
lightning_trainer = lightning.Trainer( max_epochs= epochs)

lightning_trainer.fit(model=lightning_model, train_dataloaders= training_dataloader, val_dataloaders= training_dataloader)
# for batch_number, data_batch in enumerate(training_dataloader):
# 	x, y = data_batch
# 	loss = lightning_model.training_step(x)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | model               | Sequential         | 79.5 K | train
1 | training_accuracy   | MulticlassAccuracy | 0      | train
2 | validation_accuracy | MulticlassAccuracy | 0      | train
-------------------------------------------------------------------
79.5 K    Trainable params
0         Non-trainable params
79.5 K    Total params
0.318     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:475: Your `val_dataloader`'s sampler has shuffling enabled, it is strongly recommended that you turn shuffling off for val/test dataloaders.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [17]:
from lightning.pytorch.loggers import WandbLogger

wandb_logger = WandbLogger(project="MNIST")

epochs  = 10
lightning_model   = Lightning_Module(model)
lightning_trainer = lightning.Trainer( max_epochs= epochs, logger=wandb_logger)

lightning_trainer.fit(model=lightning_model, train_dataloaders= training_dataloader, val_dataloaders= validation_dataloader)


GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33majinkyakolhe112[0m ([33mm2_mac[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011168399999991783, max=1.0…


  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | model               | Sequential         | 79.5 K | train
1 | training_accuracy   | MulticlassAccuracy | 0      | train
2 | validation_accuracy | MulticlassAccuracy | 0      | train
-------------------------------------------------------------------
79.5 K    Trainable params
0         Non-trainable params
79.5 K    Total params
0.318     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=10` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


[Monitoring Training in Weights and Biases](https://wandb.ai/m2_mac/mnist/reports/Digit-Recognition-Training-Loop-in-Wandb--Vmlldzo4NzA5ODU1)

### Convolution Architecture

In [18]:
ARCHITECTURE = "conv: [ (1,28,28) , 100 ] , fully_connected: [ 10 ]"
keras_convolution_model = keras.Sequential(
    [
        keras.layers.Input (shape= (1, 28, 28) ),
        keras.layers.Conv2D(filters = HIDDEN_LAYER_NEURONS , kernel_size=(28, 28), activation="relu"),
        keras.layers.Flatten(),
        keras.layers.Dense (OBJECTS_TO_IDENTIFY, ),
    ]
)
model = keras_convolution_model

In [19]:
model.summary(show_trainable = True, expand_nested = True)

In [20]:
from lightning.pytorch.loggers import WandbLogger
wandb_logger = WandbLogger(project="MNIST")

epochs  = 10
lightning_model   = Lightning_Module(model)
lightning_trainer = lightning.Trainer( max_epochs= epochs, logger=wandb_logger)

lightning_trainer.fit(model=lightning_model, train_dataloaders= training_dataloader, val_dataloaders= validation_dataloader)
# for batch_number, data_batch in enumerate(training_dataloader):
# 	x, y = data_batch
# 	loss = lightning_model.training_step(x)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:652: Checkpoint directory ./MNIST/0ar9obtg/checkpoints exists and is not empty.

  | Name                | Type               | Params | Mode 
-------------------------------------------------------------------
0 | model               | Sequential         | 79.5 K | train
1 | training_accuracy   | MulticlassAccuracy | 0      | train
2 | validation_accuracy | MulticlassAccuracy | 0      | train
-------------------------------------------------------------------
79.5 K    Trainable params
0         Non-trainable params
79.5 K    Total params
0.318     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=10` reached.


In [None]:
%reload_ext tensorboard
%tensorboard --logdir=lightning_logs/
