In [None]:
import random
import numpy as np
import pandas as pd
import torch
from catboost import CatBoostClassifier, Pool
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split

# pd.set_option('display.max_columns', None)

In [None]:
# loading data
# single place to re-point the notebook at another copy of the competition files
DATA_DIR = '/kaggle/input/digit-recognizer'
train_dr = pd.read_csv(f'{DATA_DIR}/train.csv')
test_dr = pd.read_csv(f'{DATA_DIR}/test.csv')

# show the shapes and a short preview instead of rendering the whole frame
print(f'train shape: {train_dr.shape}, test shape: {test_dr.shape}')
train_dr.head()

In [None]:
# reproducibility package
SEED = 42

# seed every RNG the notebook uses, in order: Python, NumPy, torch CPU, torch CUDA (current and all devices)
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(SEED)
del _seed_rng

# cuDNN: request deterministic kernels and turn off benchmark mode
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [None]:
# column/dtype summary first, then the total number of missing cells in the whole frame
train_dr.info()
n_missing = train_dr.isna().sum().sum()
print(f'\nMissing values: {n_missing}')

# CatBoost baseline

In [None]:
# target is the `label` column, features are all remaining columns
y = train_dr['label']
X = train_dr.drop(columns='label')

# 80/20 hold-out, stratified on the label
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=SEED,
)

In [None]:
# create sparse matrices
def _to_sparse_unit(frame):
    """Cast a frame to float32, divide by 255 and return the result as a CSR matrix."""
    return csr_matrix(frame.astype('float32') / 255.0)


Xtr = _to_sparse_unit(X_train)
Xvl = _to_sparse_unit(X_val)
train_pool = Pool(Xtr, label=y_train)
val_pool = Pool(Xvl, label=y_val)

# The test pool must go through exactly the same preprocessing as the training
# pools: a model fitted on features scaled to [0, 1] would otherwise be asked to
# score raw 0..255 values.
test_dr_pool = Pool(_to_sparse_unit(test_dr))

In [None]:
# model configuration: multiclass objective, accuracy as the tracked metric, GPU training
clf_params = dict(
    iterations=2000,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    task_type='GPU',
    random_state=SEED,
)
clf = CatBoostClassifier(**clf_params)

# fit against the validation pool with early stopping; progress is logged every 50 iterations
fit_params = dict(
    eval_set=val_pool,
    verbose=50,
    use_best_model=True,
    early_stopping_rounds=150,
)
clf.fit(train_pool, **fit_params)

**score 0.96328**

# MLP

## Default Model

In [None]:
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# train data preparation: features as float32 divided by 255, labels as int64
X = train_dr.drop(columns='label').to_numpy(dtype='float32') / 255.0
y = train_dr['label'].to_numpy().astype('int64')

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, shuffle=True, random_state=SEED)

# wrap the four NumPy splits as tensors
X_train_tz, X_val_tz, y_train_tz, y_val_tz = map(
    torch.from_numpy, (X_train, X_val, y_train, y_val))

val_dataset = TensorDataset(X_val_tz, y_val_tz)
train_dataset = TensorDataset(X_train_tz, y_train_tz)

# dedicated generator so the shuffling of the training batches is driven by SEED
g = torch.Generator()
g.manual_seed(SEED)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64, generator=g)

In [None]:
# test data preparation: same float32 / 255 scaling as the training features, no labels
X_test = test_dr.to_numpy(dtype='float32') / 255.0
X_test_tz = torch.from_numpy(X_test)
test_loader = DataLoader(
    TensorDataset(X_test_tz),
    shuffle=False,
    batch_size=256,
)

In [None]:
# create model
class MLP_Default(nn.Module):
    """Fully connected classifier: 784 inputs -> 256 -> 128 -> 10 outputs, ReLU between layers."""

    def __init__(self):
        super().__init__()
        widths = (784, 256, 128, 10)
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        layers.pop()    # the output layer is not followed by an activation
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw output scores of the network for input batch `x`."""
        scores = self.net(x)
        return scores

In [None]:
# use the GPU when one is available, otherwise the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

model = MLP_Default()
model = model.to(device)

loss_func = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

In [None]:
# Train for 30 epochs, measure validation accuracy after each one and keep a
# snapshot of the weights from the best epoch.
best_accuracy = -1.0    # below any reachable accuracy, so the first epoch always stores a snapshot
best_state = None

for epoch in range(30):
    # train
    model.train()
    for X_tr, y_tr in train_loader:
        X_tr, y_tr = X_tr.to(device), y_tr.to(device)
        optimizer.zero_grad()
        loss = loss_func(model(X_tr), y_tr)
        loss.backward()
        optimizer.step()

    # eval
    model.eval()
    correct = total_obj = 0
    with torch.no_grad():
        for X_vl, y_vl in val_loader:
            X_vl, y_vl = X_vl.to(device), y_vl.to(device)
            pred = model(X_vl).argmax(1)
            correct += (pred == y_vl).sum().item()
            total_obj += y_vl.size(0)

    val_accuracy = correct / total_obj
    print(f"epoch {epoch+1}: val_accuracy={val_accuracy:.4f}")

    # save best parameters
    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        # state_dict() hands back references to the live parameter tensors, which the
        # optimizer keeps updating in place; clone them so the snapshot really is the
        # best epoch and not whatever the last epoch produced.
        best_state = {k: v.detach().clone() for k, v in model.state_dict().items()}
        print(f"New best! Saving model with accuracy={best_accuracy:.4f}")

# load the best weights
model.load_state_dict(best_state)

**score 0.97703**

## Promoted Model

*This model is trained with online augmentation (random affine transforms) and improves on the default MLP by adding dropout, BatchNorm1d, a learning-rate scheduler, and so on. Predictions are made with test-time augmentation (TTA): the predicted class probabilities are averaged over several transformed versions of each test image. The final prediction is obtained by averaging these probabilities across models trained with different seeds.*

In [None]:
from torch.utils.data import Dataset
import torchvision.transforms as T

In [None]:
# average with different seeds
SEEDS = [65, 233, 9499]
SPLIT_SEED = 42


def set_global_seed(seed: int):
    """Seed Python's `random`, NumPy and torch (CPU and CUDA) with `seed`.

    Also sets cuDNN to deterministic mode and disables its benchmark mode.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [None]:
# preparing training data: features stay uint8 here (they are reshaped to images for the transforms later)
X = train_dr.drop(columns='label').to_numpy().astype('uint8')
y = train_dr['label'].to_numpy().astype('int64')

# one fixed split (SPLIT_SEED), independent of the per-model training seeds
X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    shuffle=True,
    random_state=SPLIT_SEED,
)

In [None]:
# augmentation
# training pipeline: a random affine transform (rotation, shift, scale, shear) is drawn on every access
train_transform = T.Compose([
    T.ToPILImage(),    # (28, 28) uint8 array [0..255] -> PIL image
    T.RandomAffine(
        degrees=10,
        translate=(0.1, 0.1),
        scale=(0.95, 1.05),
        shear=5,
    ),
    T.ToTensor(),    # PIL image -> (1, 28, 28) torch.float32 [0..1]
])

# evaluation pipeline: the same conversions without the random affine step
eval_transform = T.Compose([T.ToPILImage(), T.ToTensor()])

# class for online augmentation
class FlatAugDataset(Dataset):
    """Dataset over flattened 28x28 images that applies a transform on every access.

    Parameters
    ----------
    X : array of shape (N, 784)
        One flattened image per row; each row is reshaped to (28, 28) before
        the transform is applied.
    y : array of N labels, optional
        When ``None`` (e.g. for test data) ``__getitem__`` returns only the image.
    transform : callable, optional
        Called with the (28, 28) image and expected to return a tensor.
        When ``None`` the image is only converted to float32 and divided by
        255, i.e. no augmentation is applied.
    """

    def __init__(self, X, y=None, transform=None):
        self.X = X
        self.y = y
        self.transform = transform

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the flattened (784,) image for `idx`, plus its label when labels are present."""
        img = self.X[idx].reshape(28, 28)    # (784,) -> (28, 28)
        if self.transform is None:
            # the default transform=None used to fail with "'NoneType' object is not callable";
            # fall back to a plain scaling of the pixel values to [0, 1]
            x = torch.as_tensor(img, dtype=torch.float32).unsqueeze(0) / 255.0
        else:
            x = self.transform(img)
        # reshape (not view) so a transform returning a non-contiguous tensor still works
        x_vec = x.reshape(-1)    # -> (784,)
        if self.y is None:
            return x_vec
        return x_vec, torch.tensor(self.y[idx], dtype=torch.long)

# preparing test data: unlabeled uint8 pixels run through the non-augmenting eval pipeline
X_test = test_dr.to_numpy().astype('uint8')
test_dataset = FlatAugDataset(X_test, transform=eval_transform, y=None)
test_loader = DataLoader(
    test_dataset,
    shuffle=False,
    batch_size=256,
    pin_memory=True,
)

In [None]:
def make_loaders(seed: int):
    """
    Build the train/validation DataLoaders for one seed.

    The training loader shuffles with its own torch.Generator seeded with
    `seed` and applies `train_transform`; the validation loader is not
    shuffled and applies `eval_transform`.

    Returns
    -------
    (train_loader, val_loader)
    """
    shuffle_rng = torch.Generator().manual_seed(seed)
    shared = dict(batch_size=64, pin_memory=True)

    train_loader = DataLoader(
        FlatAugDataset(X_train, y_train, transform=train_transform),
        shuffle=True,
        generator=shuffle_rng,
        **shared,
    )
    val_loader = DataLoader(
        FlatAugDataset(X_val, y_val, transform=eval_transform),
        shuffle=False,
        **shared,
    )
    return train_loader, val_loader

In [None]:
class MLP_Maxed(nn.Module):
    """Wider MLP: three Linear -> BatchNorm1d -> GELU -> Dropout(0.1) stages, then a linear head.

    `fe` maps 784 -> 1024 -> 512 -> 256 features; `head` maps 256 -> 10 outputs.
    """

    def __init__(self):
        super().__init__()
        widths = (784, 1024, 512, 256)
        stages = []
        for n_in, n_out in zip(widths, widths[1:]):
            stages.extend((
                nn.Linear(n_in, n_out),
                nn.BatchNorm1d(n_out),
                nn.GELU(),
                nn.Dropout(0.1),
            ))
        self.fe = nn.Sequential(*stages)
        self.head = nn.Linear(widths[-1], 10)

        # Kaiming-normal weights and zero biases for every Linear layer (head included)
        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for layer in linear_layers:
            nn.init.kaiming_normal_(layer.weight, nonlinearity='relu')
            nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Run the feature extractor, then the head; returns the head's raw outputs."""
        features = self.fe(x)
        return self.head(features)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 35    # also used as T_max of the cosine schedule below


def build_model_and_optim():
    """Create a fresh MLP_Maxed on `device` together with its training objects.

    Returns
    -------
    (model, optimizer, scheduler, loss_func)
        AdamW (lr=2e-3, weight_decay=1e-2), CosineAnnealingLR with
        T_max=EPOCHS, and cross-entropy with label smoothing 0.1.
    """
    net = MLP_Maxed().to(device)
    opt = torch.optim.AdamW(net.parameters(), lr=2e-3, weight_decay=1e-2)
    sched = torch.optim.lr_scheduler.CosineAnnealingLR(opt, T_max=EPOCHS)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    return net, opt, sched, criterion

In [None]:
def _val_accuracy(model, val_loader):
    """Put `model` in eval mode and return its accuracy over `val_loader`."""
    model.eval()
    hits, seen = 0, 0
    with torch.no_grad():
        for X_vl, y_vl in val_loader:
            X_vl = X_vl.to(device, non_blocking=True)
            y_vl = y_vl.to(device, non_blocking=True)
            guesses = model(X_vl).argmax(dim=1)
            hits += (guesses == y_vl).sum().item()
            seen += y_vl.size(0)
    return hits / seen


def train_and_select_best(model, optimizer, scheduler, loss_func,
                          train_loader, val_loader):
    """
    Train for EPOCHS epochs and return the weights of the best epoch.

    After every epoch the scheduler is stepped once and the validation
    accuracy is measured; whenever it strictly improves, a cloned copy of the
    model's state_dict is kept.

    Returns
    -------
    (best_state, best_acc)
    """
    best_acc, best_state = -1.0, None

    for epoch in range(EPOCHS):
        # train
        model.train()
        for X_tr, y_tr in train_loader:
            X_tr = X_tr.to(device, non_blocking=True)
            y_tr = y_tr.to(device, non_blocking=True)
            optimizer.zero_grad(set_to_none=True)
            batch_loss = loss_func(model(X_tr), y_tr)
            batch_loss.backward()
            optimizer.step()
        scheduler.step()

        # eval
        val_acc = _val_accuracy(model, val_loader)
        print(f"epoch {epoch+1}: val_accuracy={val_acc:.4f}")

        # save best parameters (clone: state_dict() references the live tensors)
        if val_acc <= best_acc:
            continue
        best_acc = val_acc
        best_state = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}
        print(f"New best! Saving model with accuracy={best_acc:.4f}")

    return best_state, best_acc

In [None]:
# TTA and averaging by seeds
base_eval_tf = T.Compose([T.ToPILImage(), T.ToTensor()])    # "as is"
tta_transforms = [
    base_eval_tf,
    T.Compose([T.ToPILImage(), T.RandomAffine(degrees=8,  translate=(0.08, 0.08)), T.ToTensor()]),
    T.Compose([T.ToPILImage(), T.RandomAffine(degrees=0,  translate=(0.10, 0.00), scale=(0.98, 1.02)), T.ToTensor()]),
    T.Compose([T.ToPILImage(), T.RandomAffine(degrees=10, translate=(0.00, 0.08), shear=5), T.ToTensor()]),
]


def _predict_tta(model, dataset, loader, transforms):
    """Average the model's softmax probabilities over several transformed views.

    `loader` must iterate over `dataset`: for each transform the dataset's
    `transform` attribute is swapped and the whole loader is run once.
    The dataset's original transform is restored afterwards, so later cells
    that reuse the loader do not silently get a random TTA transform.

    Returns a (N, C) tensor of averaged probabilities on `device`.
    """
    original_tf = dataset.transform
    model.eval()
    sum_probs = None
    try:
        with torch.no_grad():
            for tf in transforms:
                dataset.transform = tf
                # local batch name: the global `X` (training array) must not be clobbered
                probs_chunks = [
                    torch.softmax(model(X_batch.to(device, non_blocking=True)), dim=1)    # (B, C)
                    for X_batch in loader
                ]
                probs = torch.cat(probs_chunks, dim=0)    # (N, C)
                sum_probs = probs if sum_probs is None else (sum_probs + probs)
    finally:
        dataset.transform = original_tf
    return sum_probs / len(transforms)


all_test_probs = []

for seed in SEEDS:
    print(f"\n=== [seed={seed}] training started ===")
    set_global_seed(seed)
    train_loader, val_loader = make_loaders(seed)
    model, optimizer, scheduler, loss_func = build_model_and_optim()

    best_state, best_acc = train_and_select_best(
        model, optimizer, scheduler, loss_func,
        train_loader, val_loader)

    # load the best weights of the model for this seed
    model.load_state_dict(best_state)

    avg_probs_seed = _predict_tta(model, test_dataset, test_loader, tta_transforms)    # (N, C)
    all_test_probs.append(avg_probs_seed.cpu())
    print(f"[seed={seed}] TTA predictions ready.")

# average by seeds
avg_probs = torch.stack(all_test_probs, dim=0).mean(dim=0)  # (len(SEEDS), N, C) -> (N, C)
preds = avg_probs.argmax(dim=1).numpy()
print("\n=== Ensemble across seeds complete. Predictions ready. ===")

In [None]:
# submission table: 1-based image ids next to the predicted labels
image_ids = np.arange(len(preds)) + 1
submission = pd.DataFrame({'ImageId': image_ids, 'Label': preds})
submission.to_csv('submission.csv', index=False)
print("Saved to submission.csv")

**Score 0.99367**