In [None]:
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
import torchvision.transforms as T

# pd.set_option('display.max_columns', None)

In [None]:
# Load the competition tables. The directory is named once so that both reads
# stay in sync if the input location changes.
DATA_DIR = '/kaggle/input/digit-recognizer'
train_dr = pd.read_csv(f'{DATA_DIR}/train.csv')
test_dr = pd.read_csv(f'{DATA_DIR}/test.csv')
# Preview the first rows only instead of rendering the whole frame as cell output.
train_dr.head()

In [None]:
# Reproducibility: seed every RNG used by the notebook, then set the cuDNN flags.
SEED = 42


def seed_everything(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA generators) with `seed`."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)


seed_everything(SEED)

# cuDNN: deterministic mode on, benchmark mode off.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Schema summary of the training table, then the total number of NaN cells in it.
train_dr.info()
n_missing = train_dr.isna().sum().sum()
print(f'\nMissing values: {n_missing}')

# CNN

## Default model

In [None]:
# Target: the `label` column. Features: every other column, divided by 255
# (presumably 0-255 pixel intensities, which this maps to [0, 1]).
y = train_dr['label'].to_numpy().astype('int64')
X = train_dr.drop(columns='label').to_numpy().astype('float32') / 255.0

# 80/20 hold-out split, stratified on the label and seeded with SEED.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=SEED,
)

In [None]:
def to_chw(images_flat, height=28, width=28, channels=1):
    """
    Reshape flattened images into an (N, C, H, W) batch.

    Parameters
    ----------
    images_flat : object exposing ``.reshape`` (e.g. ``np.ndarray``)
        Flattened images. The total number of elements must be a multiple of
        ``channels * height * width``, otherwise ``reshape`` raises.
    height, width : int, default 28
        Spatial size of one image.
    channels : int, default 1
        Number of channels per image.

    Returns
    -------
    The result of ``images_flat.reshape(-1, channels, height, width)``; the
    leading (batch) dimension is inferred by ``reshape``.
    """
    return images_flat.reshape(-1, channels, height, width)

# Flat pixel rows -> (N, 1, 28, 28) image batches.
X_train, X_val = to_chw(X_train), to_chw(X_val)

# NumPy arrays -> torch tensors.
X_train_tz, X_val_tz = torch.from_numpy(X_train), torch.from_numpy(X_val)
y_train_tz, y_val_tz = torch.from_numpy(y_train), torch.from_numpy(y_val)

# Pair each image tensor with its label tensor.
train_dataset = TensorDataset(X_train_tz, y_train_tz)
val_dataset = TensorDataset(X_val_tz, y_val_tz)

# A dedicated generator seeded with SEED drives the training shuffle;
# the validation loader keeps the dataset order.
g = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, generator=g)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)

In [None]:
# Test images: same preprocessing as the training images (divide by 255, reshape with to_chw).
X_test = to_chw(test_dr.values.astype('float32') / 255.0)
X_test_tz = torch.from_numpy(X_test)
# No labels here, so the dataset wraps the image tensor only; order is preserved.
test_loader = DataLoader(
    TensorDataset(X_test_tz),
    batch_size=256,
    shuffle=False,
)

In [None]:
# visualization of samples 
def show_sample(dataset, idx=15, denormalize=False):
    """
    Draw one image of `dataset` in grayscale with its label as the title.

    dataset     -- object exposing `.tensors`; element 0 holds the images,
                   element 1 the labels (as built by TensorDataset(images, labels))
    idx         -- position of the sample to draw
    denormalize -- accepted but not used by the body
    """
    images, labels = dataset.tensors[0], dataset.tensors[1]
    # Channel 0 of the selected image: (1, 28, 28) -> (28, 28)
    picture = images[idx][0].cpu().numpy()

    fig, ax = plt.subplots(figsize=(2, 2))
    ax.imshow(picture, cmap='gray')
    ax.set_title(f'Number: {int(labels[idx])}')
    ax.axis('off')
    plt.show()

show_sample(train_dataset, idx=19999)

In [None]:
class CNN_v1(nn.Module):
    """
    Baseline CNN: three 3x3 conv + ReLU layers with two 2x2 max-pools,
    followed by a two-layer fully connected classifier.

    The first Linear layer expects 128*7*7 features, i.e. 28x28 single-channel
    inputs; the output has 10 values per sample.
    """

    def __init__(self):
        super().__init__()
        feature_layers = [
            nn.Conv2d(1, 32, kernel_size=3, padding=1),      # 28 -> 28
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),     # 28 -> 28
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),                     # 28 -> 14
            nn.Conv2d(64, 128, kernel_size=3, padding=1),    # 14 -> 14
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),                     # 14 -> 7
        ]
        classifier_layers = [
            nn.Flatten(),
            nn.Linear(128 * 7 * 7, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        # A single flat Sequential keeps the submodule names net.0 ... net.11.
        self.net = nn.Sequential(*feature_layers, *classifier_layers)

    def forward(self, x):
        """Run the batch through the whole stack and return the raw scores."""
        return self.net(x)

In [None]:
# Use the GPU when CUDA is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device('cuda') if use_cuda else torch.device('cpu')

# Baseline model, its loss and its optimizer.
model = CNN_v1().to(device)
loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Best validation accuracy seen so far and a frozen copy of the weights that produced it.
best_accuracy = 0.0
best_state = None
n_epochs_v1 = 30

for epoch in range(n_epochs_v1):
    # train: one pass over the training batches
    model.train()
    for X_tr, y_tr in train_loader:
        X_tr, y_tr = X_tr.to(device), y_tr.to(device)
        optimizer.zero_grad()
        loss = loss_func(model(X_tr), y_tr)
        loss.backward()
        optimizer.step()

    # eval: count correct argmax predictions on the validation batches
    model.eval()
    correct = total_obj = 0
    with torch.no_grad():
        for X_vl, y_vl in val_loader:
            X_vl, y_vl = X_vl.to(device), y_vl.to(device)
            pred = model(X_vl).argmax(1)
            correct += (pred == y_vl).sum().item()
            total_obj += y_vl.size(0)

    val_accuracy = correct / total_obj
    print(f"epoch {epoch+1}: val_accuracy={val_accuracy:.4f}")

    # save best parameters
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        # state_dict() returns references to the live tensors, which later
        # optimizer steps modify in place. Clone them so the snapshot really
        # holds this epoch's weights rather than whatever the model ends with.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        print(f"New best! Saving model with accuracy={best_accuracy:.4f}")

# load the best weights (best_state stays None only if no epoch scored above 0)
if best_state is not None:
    model.load_state_dict(best_state)

*This model reached an **accuracy** of 0.9926 on the validation set and 0.99207 on the leaderboard.*

## Promoted Model

*Initially, I trained the model on the training set and evaluated its performance on the validation set. For the final stage, however, the model is trained on the entire dataset without a validation split. This notebook contains that final version: the model is trained on all available data and then used to generate the final submission.*

In [None]:
# data preparation with augmentation
# Pixels are kept as raw uint8 here; ToTensor / Normalize in the pipelines
# below handle the conversion to float.
y = train_dr['label'].to_numpy().astype('int64')
X = train_dr.drop(columns='label').to_numpy().astype('uint8')

IM = T.InterpolationMode

# Normalization constants shared by the train and eval pipelines
# (presumably the usual MNIST mean / std -- confirm against the data).
NORM_MEAN, NORM_STD = (0.1307,), (0.3081,)

# The three augmentation branches offered to RandomChoice below.
strong_affine = T.RandomAffine(
    degrees=10,
    translate=(0.1, 0.1),
    scale=(0.95, 1.05),
    shear=5,
    interpolation=IM.BILINEAR,
    fill=0,
)
no_augmentation = T.Compose([])    # empty pipeline: passes the image through
mild_affine = T.RandomAffine(
    degrees=6,
    translate=(0.05, 0.05),
    scale=(0.97, 1.03),
    interpolation=IM.BILINEAR,
    fill=0,
)

# augmentation
train_transform = T.Compose([
    T.ToPILImage(),
    T.RandomChoice(
        [strong_affine, no_augmentation, mild_affine],
        p=[0.5, 0.2, 0.3],
    ),
    T.ToTensor(),    # float32 tensor with a leading channel axis
    T.Normalize(NORM_MEAN, NORM_STD),
])

# Evaluation pipeline: same conversion and normalization, no augmentation.
eval_transform = T.Compose([
    T.ToPILImage(),
    T.ToTensor(),
    T.Normalize(NORM_MEAN, NORM_STD),
])

# class for online augmentation
class FlatAugDataset(Dataset):
    """
    Dataset over flattened 28x28 images that applies `transform` on every access,
    so random augmentations are re-drawn each time a sample is read.

    Parameters
    ----------
    X : array indexable by sample, each row reshapeable to (28, 28)
    y : optional array of integer labels aligned with `X`; when omitted,
        items are returned without a label
    transform : optional callable applied to the (28, 28) image; when omitted,
        the reshaped image is returned unchanged
    """

    def __init__(self, X, y=None, transform=None):
        self.X = X
        self.y = y
        self.transform = transform

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # (784,) -> (28, 28); dtype is whatever the caller stored in X
        x = self.X[idx].reshape(28, 28)
        # The constructor's default is transform=None, so it must not be called blindly.
        if self.transform is not None:
            x = self.transform(x)
        if self.y is None:
            return x
        return x, torch.tensor(self.y[idx], dtype=torch.long)

# Datasets
# Training data goes through the augmenting pipeline, test data through the plain one.
train_dataset = FlatAugDataset(X, y, transform=train_transform)

X_test = test_dr.values.astype('uint8')
test_dataset = FlatAugDataset(X_test, y=None, transform=eval_transform)

# DataLoaders
g = torch.Generator()
g.manual_seed(SEED)

# Pinned host memory only helps host-to-GPU copies. The notebook falls back to
# the CPU when CUDA is missing, so request pinning only when a GPU is present.
use_pinned_memory = torch.cuda.is_available()

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, pin_memory=use_pinned_memory, generator=g)
test_loader  = DataLoader(test_dataset, batch_size=256, shuffle=False, pin_memory=use_pinned_memory)

In [None]:
def conv_block(cin, cout):
    """
    Build an nn.Sequential of: 3x3 convolution (padding 1, no bias) -> BatchNorm2d -> GELU.

    cin  -- number of input channels
    cout -- number of output channels
    """
    conv = nn.Conv2d(in_channels=cin, out_channels=cout, kernel_size=3, padding=1, bias=False)
    norm = nn.BatchNorm2d(num_features=cout)
    activation = nn.GELU()
    return nn.Sequential(conv, norm, activation)

class CNN_v2(nn.Module):
    """
    CNN built from two stages of paired conv_block layers, each followed by a
    2x2 max-pool and channel dropout, then global average pooling and one Linear layer.

    num_classes -- size of the output layer
    p_drop      -- probability passed to both Dropout2d layers
    """

    def __init__(self, num_classes=10, p_drop=0.05):
        super().__init__()
        stage_1 = [
            conv_block(1, 32),
            conv_block(32, 32),
            nn.MaxPool2d(2),          # 28 -> 14
            nn.Dropout2d(p_drop),
        ]
        stage_2 = [
            conv_block(32, 64),
            conv_block(64, 64),
            nn.MaxPool2d(2),          # 14 -> 7
            nn.Dropout2d(p_drop),
        ]
        self.stem = nn.Sequential(*stage_1, *stage_2)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool2d(output_size=1),    # (B,64,1,1)
            nn.Flatten(),                           # (B,64)
            nn.Linear(in_features=64, out_features=num_classes),
        )
        # Re-initialize every Conv2d / Linear submodule via _init_weights.
        self.apply(self._init_weights)

    @staticmethod
    def _init_weights(m):
        """Kaiming-normal weights (relu gain) and zero bias for Conv2d / Linear; other modules are left alone."""
        if not isinstance(m, (nn.Conv2d, nn.Linear)):
            return
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, x):
        """Apply the stem, then the head, and return the head's output."""
        return self.head(self.stem(x))

In [None]:
# Number of training epochs; also used as the period of the cosine schedule below.
epochs = 35

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = CNN_v2().to(device)

# Loss with label smoothing, AdamW with weight decay, cosine-annealed learning rate.
loss_func = nn.CrossEntropyLoss(label_smoothing=0.05)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-3, weight_decay=1e-3)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)

In [None]:
for epoch in range(epochs):
    # One pass over the training batches; no validation happens in this loop.
    model.train()
    for X_tr, y_tr in train_loader:
        X_tr = X_tr.to(device, non_blocking=True)
        y_tr = y_tr.to(device, non_blocking=True)

        optimizer.zero_grad()
        logits = model(X_tr)
        loss = loss_func(logits, y_tr)
        loss.backward()
        optimizer.step()

    # The learning-rate schedule advances once per epoch, not once per batch.
    scheduler.step()

In [None]:
# submit
# Predict a class for every test image, batch by batch, without tracking gradients.
# The loop variable is deliberately not called X_test: that name already holds the
# notebook-level test array and must not be overwritten by the last batch tensor.
batch_preds = []
model.eval()
with torch.no_grad():
    for X_batch in test_loader:
        X_batch = X_batch.to(device)
        batch_preds.append(model(X_batch).argmax(1).cpu().numpy())
preds = np.concatenate(batch_preds)

# ImageId is 1-based and follows the row order of the test loader (shuffle=False).
submission = pd.DataFrame({
    'ImageId': np.arange(1, len(preds)+1),
    'Label': preds
})
submission.to_csv('submission.csv', index=False)

*This model reached an **accuracy** of 0.99560 on the leaderboard.*