# README
1. The code is written in the simplest form possible, to make learning easy.
2. In just `5 Code Cells`, you will have made your Kaggle submission for the "Digit Recognizer" competition.
3. The dataset is downloaded from `Hugging Face Datasets`; the model and the training loop are both written in PyTorch.

# 1. Simple Dataset downloading Pipeline

In [1]:
import torch, torch.nn as nn
import torchvision, torchinfo, torchmetrics
import datasets as huggingface_datasets
from tqdm import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device        = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of samples per batch for both DataLoaders built in DOWNLOAD_DATASETS.
BATCH_SIZE    = 5
# IMAGE_RESIZE  = 28,28

def DOWNLOAD_DATASETS():
    """Download MNIST from Hugging Face and wrap it in PyTorch DataLoaders.

    Every image is converted to a tensor and stored in a new
    'image_tensors' column; the DataLoaders yield dictionaries with the
    keys 'label' and 'image_tensors', batched by the module-level
    BATCH_SIZE.

    Returns:
        A tuple (training_dataset, validation_dataset,
        training_dataloader, validation_dataloader). The two datasets are
        the mapped Hugging Face datasets (before torch formatting); the two
        dataloaders iterate over their torch-formatted views.
    """
    # Download
    dataset            = huggingface_datasets.load_dataset("mnist", ) # streaming = True)
    training_dataset   = dataset['train']
    validation_dataset = dataset['test']  # MNIST's 'test' split is used for validation

    # Transform
    transformations_group = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(), # Converts every pixel into value between 0 & 1.
        # torchvision.transforms.Resize(size=config.IMAGE_RESIZE)
    ])

    def transform_datasets(examples):
        """Add an 'image_tensors' column holding the transformed images of one batch."""
        examples["image_tensors"] = [transformations_group(image) for image in examples['image']]
        return examples

    training_dataset   = training_dataset.map(transform_datasets, batched=True)
    validation_dataset = validation_dataset.map(transform_datasets, batched=True)

    # Convert: expose only the two columns the training loop reads, as torch tensors.
    # NOTE(review): dtype=torch.float32 presumably applies to 'label' as well,
    # which is why the training code casts labels to int64 -- confirm.
    new_training_dataset   = training_dataset.with_format("torch", columns=['label', 'image_tensors'], dtype=torch.float32)
    new_validation_dataset = validation_dataset.with_format("torch", columns=['label', 'image_tensors'], dtype=torch.float32)

    training_dataloader = torch.utils.data.DataLoader(
        dataset=new_training_dataset, batch_size=BATCH_SIZE, shuffle=True,
    )
    # Evaluation does not benefit from shuffling; a fixed order keeps it reproducible.
    validation_dataloader = torch.utils.data.DataLoader(
        dataset=new_validation_dataset, batch_size=BATCH_SIZE, shuffle=False,
    )

    return training_dataset, validation_dataset, training_dataloader, validation_dataloader

# Build the datasets and dataloaders once; later cells reuse these names.
training_dataset, validation_dataset, training_dataloader, validation_dataloader = DOWNLOAD_DATASETS();
# Smoke test: pull one batch from each loader to confirm that iteration works.
assert next(iter(training_dataloader)) is not None
assert next(iter(validation_dataloader)) is not None

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

# 2. Simple Model Training Pipeline

In [2]:
# Hyperparameters read as globals by TRAIN_MODEL / EVALUATE_MODEL.
lr      = 0.001 # learning rate handed to the SGD optimizer
epochs  = 10 # number of full passes over the training dataloader
# Same device selection as in the first cell: GPU when CUDA is available, otherwise CPU.
device  = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def TRAIN_MODEL(model, training_dataloader, validation_dataloader):
    """Train `model` with plain SGD and print metrics after every epoch.

    Reads the module-level `lr`, `epochs` and `device` settings.

    Args:
        model: network mapping a batch of image tensors to 10 class logits.
        training_dataloader: iterable of dicts with the keys
            'image_tensors' and 'label'.
        validation_dataloader: iterable in the same format, passed to
            EVALUATE_MODEL at the end of every epoch.

    Returns:
        None. The model parameters are updated in place and the model is
        left in evaluation mode after each epoch's validation pass.
    """
    OPTIMIZER = torch.optim.SGD(params=model.parameters(), lr=lr)
    metric    = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)

    for epoch_no in range(epochs):
        # EVALUATE_MODEL leaves the model in eval mode, so training mode has
        # to be re-enabled at the start of every epoch, not just the first.
        model.train(mode=True)

        loss_sum, samples_seen = 0.0, 0
        training_accuracy_avg_epoch = float("nan")

        for batch_no, batch_dictionary in enumerate(progress_bar := tqdm(training_dataloader)):
            x_actual = batch_dictionary['image_tensors'].to(device)
            # Class indices must be int64 for cross_entropy.
            y_actual = batch_dictionary['label'].to(device).to(torch.int64)

            y_predicted_LOGITS = model(x_actual)
            # cross_entropy applies log-softmax itself, so it takes raw logits.
            loss = nn.functional.cross_entropy(y_predicted_LOGITS, y_actual)

            OPTIMIZER.zero_grad()
            # Parameters of layer 1 are not dependent on any other parameters,
            # layer 2 depends on layer 1, layer 3 on layer 2, and so on.
            # loss.backward() computes the gradients of these nested
            # parameters automatically (chain rule).
            loss.backward()
            # Conceptually: weight = weight - gradient * learning_rate
            OPTIMIZER.step()

            loss_batch    = loss.item()
            batch_size    = y_actual.shape[0]
            loss_sum     += loss_batch * batch_size
            samples_seen += batch_size

            metric.update(y_predicted_LOGITS.detach(), y_actual)
            # Running accuracy over all batches seen so far in this epoch.
            training_accuracy_avg_epoch = metric.compute().item()

            progress_bar.set_description(f'batch_no = {batch_no},\t loss_batch = {loss_batch:0.4f},\t accuracy_avg = {training_accuracy_avg_epoch:0.4f}')

        metric.reset()

        # Report epoch-level averages rather than whatever the last batch produced.
        training_loss_avg_epoch = loss_sum / samples_seen if samples_seen else float("nan")

        loss_validation, accuracy_validation = EVALUATE_MODEL(model, validation_dataloader)
        print(f'epoch_no = {epoch_no}, training_loss = {training_loss_avg_epoch:0.4f}, validation_loss = {loss_validation:0.4f},\t training_accuracy = {training_accuracy_avg_epoch:0.4f}, validation_accuracy = {accuracy_validation:0.4f}')
        model.train(mode=False)

def EVALUATE_MODEL(model, validation_dataloader):
    """Compute mean cross-entropy loss and accuracy over a dataloader.

    Switches the model to evaluation mode (and leaves it there) and runs
    without gradient tracking. Reads the module-level `device`.

    Args:
        model: network mapping a batch of image tensors to 10 class logits.
        validation_dataloader: iterable of dicts with the keys
            'image_tensors' and 'label'.

    Returns:
        A tuple (loss, accuracy) of Python floats, both averaged over every
        sample in the dataloader. Both are NaN when the dataloader is empty.
    """
    model.eval()
    metric = torchmetrics.Accuracy(task="multiclass", num_classes=10).to(device)

    loss_sum, samples_seen = 0.0, 0
    with torch.no_grad():
        for batch_dictionary in validation_dataloader:
            x_actual = batch_dictionary['image_tensors'].to(device)
            # Class indices must be int64 for cross_entropy.
            y_actual = batch_dictionary['label'].to(device).to(torch.int64)

            y_predicted_LOGITS = model(x_actual)
            # Sum (not mean) per batch so that the final average is weighted
            # correctly even when the last batch is smaller.
            loss_sum     += nn.functional.cross_entropy(y_predicted_LOGITS, y_actual, reduction="sum").item()
            samples_seen += y_actual.shape[0]
            metric.update(y_predicted_LOGITS, y_actual)

    if samples_seen == 0:
        return float("nan"), float("nan")

    return loss_sum / samples_seen, metric.compute().item()

# 3. Simple Model Architecture

In [3]:
# Small fully connected classifier: flatten the image, two hidden layers
# with ReLU, then 10 output logits (one per digit class).
model_random_parameters = torch.nn.Sequential(
    torch.nn.Flatten(start_dim=1),                                          # (B, C, H, W) -> (B, C*H*W)

    torch.nn.Linear(in_features=28 * 28 * 1, out_features=40), torch.nn.ReLU(),  # LAYER 1: 1st Hidden Layer
    torch.nn.Linear(in_features=40, out_features=30), torch.nn.ReLU(),           # LAYER 2: 2nd Hidden Layer

    torch.nn.Linear(in_features=30, out_features=10),                            # OUTPUT LAYER
)

# Module.to() returns the module itself, so both names refer to the same model.
model = model_random_parameters.to(device)  # Model Size / Number of Parameters are important

# Summarize with the same (batch, channel, height, width) layout the
# dataloaders and the Kaggle test tensor feed into the model.
torchinfo.summary(model, input_size=(1, 1, 28, 28), verbose=2);

Layer (type:depth-idx)                   Output Shape              Param #
Sequential                               [1, 10]                   --
├─Flatten: 1-1                           [1, 784]                  --
├─Linear: 1-2                            [1, 40]                   31,400
│    └─weight                                                      ├─31,360
│    └─bias                                                        └─40
├─ReLU: 1-3                              [1, 40]                   --
├─Linear: 1-4                            [1, 30]                   1,230
│    └─weight                                                      ├─1,200
│    └─bias                                                        └─30
├─ReLU: 1-5                              [1, 30]                   --
├─Linear: 1-6                            [1, 10]                   310
│    └─weight                                                      ├─300
│    └─bias                                                

## Details of Problem Complexity
- 0.000784 megapixel images (28 × 28 = 784 pixels each)
- 60,000 such images in the training set
- They are grayscale (a single channel)
- They contain only the digits 0 to 9
- Even this SIMPLE PROBLEM requires a MODEL with a MINIMUM of 10,000 parameters

In [5]:
# Run the full training loop; a progress bar is shown per epoch and
# training/validation metrics are printed after each one.
TRAIN_MODEL (model, training_dataloader, validation_dataloader)

batch_no = 2191,	 loss_batch = 1.9750,	 accuracy_avg = 0.3898:  18%|█▊        | 2176/12000 [24:46<00:46, 213.00it/s] 

# 4. Kaggle Competition Submission

In [None]:
!find /kaggle/input

import pandas as pd
import torch, torchvision
# Each row of Kaggle's test.csv is one flattened 28x28 image (784 pixel columns).
submission_test = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")

# Reshape to (N, 1, 28, 28) and scale the raw 0-255 intensities to [0, 1].
# The training images went through ToTensor(), which performs the same
# scaling, so the model must see inputs in the same range here.
x = torch.tensor(submission_test.values.reshape(submission_test.shape[0], 1, 28, 28), dtype=torch.float32) / 255.0
x = x.to(device)

def tensor_to_images(x):
    """Convert every item along the first dimension of `x` to a PIL image.

    Returns a dict that maps the item's position, as a string ("0", "1",
    ...), to the grayscale (mode "L") PIL image built from it.
    """
    to_pil = torchvision.transforms.ToPILImage(mode = "L" )
    return {str(position): to_pil(item) for position, item in enumerate(x)}

# PIL versions of all test images, keyed by their row position as a string.
images = tensor_to_images(x)

# TODO: Figure Out how to do Transforms same way as training Data
# NOTE(review): this pipeline is defined but not applied anywhere in the
# visible cells; the model is fed `x` directly.
transformations_list = torchvision.transforms.Compose([
    torchvision.transforms.ToPILImage(), # Because Data is not saved as Image, we need to first convert it in Image and then convert back to Tensor
    
    torchvision.transforms.ToTensor(),
])


In [None]:
import numpy as np
# Inference only: switch to eval mode and disable gradient tracking so no
# autograd graph is built for the whole test set.
model.eval()
with torch.no_grad():
    y_prediction_logits = model(x)
y_labels_predicted = torch.argmax(y_prediction_logits, dim=1)

# Kaggle expects a 1-based ImageId column and the predicted digit in Label.
# Convert explicitly to NumPy instead of relying on pandas to coerce tensors.
submission = pd.DataFrame({
    'ImageId': np.arange(1, len(y_labels_predicted) + 1),
    'Label': y_labels_predicted.cpu().numpy(),
})
submission.to_csv('submission.csv', index=False)