In [1]:
# ============================================================
# Digit Recognizer — CNN in PyTorch vs TensorFlow
# ============================================================

# -------------------------
# Imports & Config
# -------------------------
import os
import random
import numpy as np
import pandas as pd

# PyTorch
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

# TensorFlow / Keras
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split

# -------------------------
# Reproducibility
# -------------------------
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
tf.random.set_seed(SEED)

# -------------------------
# Device Config (PyTorch)
# -------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using PyTorch device:", device)

# ============================================================
# 1. Load Data
# ============================================================

train_df = pd.read_csv("/kaggle/input/digit-recognizer/train.csv")
test_df  = pd.read_csv("/kaggle/input/digit-recognizer/test.csv")

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

# Split features and labels
X = train_df.drop("label", axis=1).values  # (n_samples, 784)
y = train_df["label"].values              # (n_samples,)

# Normalize to [0,1]
X = X.astype("float32") / 255.0
test_data = test_df.values.astype("float32") / 255.0

# Train/Validation split
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.1, random_state=SEED, stratify=y
)

print("Train:", X_train.shape, "Val:", X_val.shape)

# Reshape for CNNs: (N, 28, 28, 1)
X_train_img = X_train.reshape(-1, 28, 28, 1)
X_val_img   = X_val.reshape(-1, 28, 28, 1)
test_img    = test_data.reshape(-1, 28, 28, 1)

# ============================================================
# 2. PyTorch Dataset & Dataloader
# ============================================================

class MNISTTorchDataset(Dataset):
    def __init__(self, images, labels=None):
        """
        images: numpy array (N, 28, 28, 1), [0,1] float32
        labels: optional, numpy array (N,)
        """
        self.images = images
        self.labels = labels

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # Convert to (1, 28, 28) for Conv2d
        img = self.images[idx].transpose(2, 0, 1)  # (1,28,28)
        img_tensor = torch.tensor(img, dtype=torch.float32)

        if self.labels is not None:
            label_tensor = torch.tensor(self.labels[idx], dtype=torch.long)
            return img_tensor, label_tensor
        else:
            return img_tensor

BATCH_SIZE = 128

train_dataset_torch = MNISTTorchDataset(X_train_img, y_train)
val_dataset_torch   = MNISTTorchDataset(X_val_img, y_val)
test_dataset_torch  = MNISTTorchDataset(test_img, labels=None)

train_loader = DataLoader(train_dataset_torch, batch_size=BATCH_SIZE, shuffle=True)
val_loader   = DataLoader(val_dataset_torch, batch_size=BATCH_SIZE, shuffle=False)
test_loader  = DataLoader(test_dataset_torch, batch_size=BATCH_SIZE, shuffle=False)

# ============================================================
# 3. PyTorch CNN Model
# ============================================================

# As given (input: 1x28x28)
# Conv2d(1,32,3) -> (32, 26, 26)
# MaxPool2d(2)   -> (32, 13, 13) => 32*13*13 = 5408

class TorchCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Conv2d(1, 32, 3),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(5408, 128),
            nn.ReLU(),
            nn.Linear(128, 10)
        )

    def forward(self, x):
        return self.model(x)

torch_model = TorchCNN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(torch_model.parameters(), lr=1e-3)

# -------------------------
# Training & Evaluation Loops
# -------------------------
def train_one_epoch(model, loader, optimizer, criterion, device):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for images, labels in loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        outputs = model(images)            # (batch, 10)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item() * labels.size(0)

        preds = outputs.argmax(dim=1)
        correct += (preds == labels).sum().item()
        total += labels.size(0)

    epoch_loss = running_loss / total
    epoch_acc  = correct / total
    return epoch_loss, epoch_acc


def evaluate(model, loader, criterion, device):
    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            loss = criterion(outputs, labels)

            running_loss += loss.item() * labels.size(0)
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    epoch_loss = running_loss / total
    epoch_acc  = correct / total
    return epoch_loss, epoch_acc

# -------------------------
# Train PyTorch Model
# -------------------------
EPOCHS_TORCH = 10

best_val_acc_torch = 0.0
for epoch in range(1, EPOCHS_TORCH + 1):
    train_loss, train_acc = train_one_epoch(torch_model, train_loader, optimizer, criterion, device)
    val_loss, val_acc = evaluate(torch_model, val_loader, criterion, device)

    if val_acc > best_val_acc_torch:
        best_val_acc_torch = val_acc
        torch.save(torch_model.state_dict(), "best_torch_cnn.pth")

    print(f"[PyTorch] Epoch {epoch}/{EPOCHS_TORCH} "
          f"- Train loss: {train_loss:.4f}, acc: {train_acc:.4f} "
          f"- Val loss: {val_loss:.4f}, acc: {val_acc:.4f}")

print("Best PyTorch Val Accuracy:", best_val_acc_torch)

# Load best weights
torch_model.load_state_dict(torch.load("best_torch_cnn.pth"))

# ============================================================
# 4. TensorFlow / Keras CNN Model
# ============================================================

# Build tf.data datasets from numpy arrays
BATCH_SIZE_TF = 128

train_ds_tf = tf.data.Dataset.from_tensor_slices((X_train_img, y_train))
train_ds_tf = train_ds_tf.shuffle(buffer_size=1024, seed=SEED).batch(BATCH_SIZE_TF)

val_ds_tf = tf.data.Dataset.from_tensor_slices((X_val_img, y_val))
val_ds_tf = val_ds_tf.batch(BATCH_SIZE_TF)

test_ds_tf = tf.data.Dataset.from_tensor_slices(test_img)
test_ds_tf = test_ds_tf.batch(BATCH_SIZE_TF)

# -------------------------
# Define TF Model (similar architecture)
# -------------------------
def build_tf_model():
    model = keras.Sequential(
        [
            layers.Conv2D(32, (3, 3), activation="relu", input_shape=(28, 28, 1)),
            layers.MaxPooling2D(pool_size=(2, 2)),
            layers.Flatten(),
            layers.Dense(128, activation="relu"),
            layers.Dense(10, activation="softmax"),
        ]
    )
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=1e-3),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

tf_model = build_tf_model()
tf_model.summary()

# -------------------------
# Train TF Model
# -------------------------
EPOCHS_TF = 10

history = tf_model.fit(
    train_ds_tf,
    validation_data=val_ds_tf,
    epochs=EPOCHS_TF,
    verbose=2
)

# Final validation accuracy from TF
tf_val_acc = history.history["val_accuracy"][-1]
print("TensorFlow Val Accuracy:", tf_val_acc)

# ============================================================
# 5. Compare Metrics: PyTorch vs TensorFlow
# ============================================================
print("===================================")
print("Validation Accuracy Comparison")
print("PyTorch CNN   :", best_val_acc_torch)
print("TensorFlow CNN:", tf_val_acc)
print("Better model  :", "PyTorch" if best_val_acc_torch >= tf_val_acc else "TensorFlow")
print("===================================")

# ============================================================
# 6. Create Submission
# ============================================================


torch_model.eval()
all_preds_torch = []

with torch.no_grad():
    for images in test_loader:
        images = images.to(device)
        outputs = torch_model(images)
        preds = outputs.argmax(dim=1).cpu().numpy()
        all_preds_torch.extend(preds)

submission_torch = pd.DataFrame({
    "ImageId": np.arange(1, len(all_preds_torch) + 1),
    "Label": all_preds_torch
})

submission_torch.to_csv("submission_torch.csv", index=False)
print("Saved PyTorch submission as submission_torch.csv")

2025-12-02 05:56:57.268806: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1764655017.507523      47 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1764655017.576081      47 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

Using PyTorch device: cpu
Train shape: (42000, 785)
Test shape: (28000, 784)
Train: (37800, 784) Val: (4200, 784)
[PyTorch] Epoch 1/10 - Train loss: 0.3916, acc: 0.8908 - Val loss: 0.1465, acc: 0.9552
[PyTorch] Epoch 2/10 - Train loss: 0.1144, acc: 0.9667 - Val loss: 0.0919, acc: 0.9733
[PyTorch] Epoch 3/10 - Train loss: 0.0714, acc: 0.9787 - Val loss: 0.0715, acc: 0.9798
[PyTorch] Epoch 4/10 - Train loss: 0.0509, acc: 0.9851 - Val loss: 0.0656, acc: 0.9795
[PyTorch] Epoch 5/10 - Train loss: 0.0407, acc: 0.9873 - Val loss: 0.0617, acc: 0.9798
[PyTorch] Epoch 6/10 - Train loss: 0.0311, acc: 0.9906 - Val loss: 0.0546, acc: 0.9838
[PyTorch] Epoch 7/10 - Train loss: 0.0257, acc: 0.9922 - Val loss: 0.0574, acc: 0.9819
[PyTorch] Epoch 8/10 - Train loss: 0.0193, acc: 0.9947 - Val loss: 0.0574, acc: 0.9840
[PyTorch] Epoch 9/10 - Train loss: 0.0158, acc: 0.9953 - Val loss: 0.0583, acc: 0.9810
[PyTorch] Epoch 10/10 - Train loss: 0.0137, acc: 0.9961 - Val loss: 0.0607, acc: 0.9831
Best PyTorch Va

2025-12-02 05:59:19.162125: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
296/296 - 11s - 36ms/step - accuracy: 0.9196 - loss: 0.2900 - val_accuracy: 0.9662 - val_loss: 0.1161
Epoch 2/10
296/296 - 9s - 31ms/step - accuracy: 0.9747 - loss: 0.0862 - val_accuracy: 0.9795 - val_loss: 0.0750
Epoch 3/10
296/296 - 10s - 35ms/step - accuracy: 0.9832 - loss: 0.0567 - val_accuracy: 0.9807 - val_loss: 0.0656
Epoch 4/10
296/296 - 9s - 29ms/step - accuracy: 0.9878 - loss: 0.0411 - val_accuracy: 0.9824 - val_loss: 0.0616
Epoch 5/10
296/296 - 9s - 29ms/step - accuracy: 0.9906 - loss: 0.0303 - val_accuracy: 0.9812 - val_loss: 0.0667
Epoch 6/10
296/296 - 8s - 29ms/step - accuracy: 0.9928 - loss: 0.0239 - val_accuracy: 0.9829 - val_loss: 0.0579
Epoch 7/10
296/296 - 9s - 29ms/step - accuracy: 0.9948 - loss: 0.0173 - val_accuracy: 0.9814 - val_loss: 0.0667
Epoch 8/10
296/296 - 8s - 29ms/step - accuracy: 0.9969 - loss: 0.0125 - val_accuracy: 0.9802 - val_loss: 0.0660
Epoch 9/10
296/296 - 8s - 29ms/step - accuracy: 0.9973 - loss: 0.0103 - val_accuracy: 0.9817 - val_los

## 4. Results — PyTorch vs TensorFlow CNN on MNIST

### 4.1 Training Logs (Summary)

**PyTorch CNN**

- Device: `cpu`
- Train shape: `(42000, 785)` → `(37800, 784)` train, `(4200, 784)` validation
- Architecture:
  - `Conv2d(1 → 32, kernel_size=3)` → `ReLU` → `MaxPool2d(2)`
  - `Flatten` → `Linear(5408 → 128)` → `ReLU`
  - `Linear(128 → 10)`

**Training progress (PyTorch)**  
- Epoch 1: **Train acc = 0.8908**, Val acc = 0.9552  
- Epoch 2: **Train acc = 0.9667**, Val acc = 0.9733  
- Epoch 3: **Train acc = 0.9787**, Val acc = 0.9798  
- Epoch 6: Train acc = 0.9906, **Val acc = 0.9838**  
- Epoch 10: Train acc = 0.9961, Val acc = 0.9831  

**Best PyTorch validation accuracy**:  
> **0.9840 (98.40%)**

---

**TensorFlow / Keras CNN**

- Same idea: `Conv2D(32, 3×3)` → `MaxPooling2D(2×2)` → `Flatten` → `Dense(128)` → `Dense(10)`
- Model parameters: **693,962 trainable parameters**
- Trained for 10 epochs with `Adam(lr=1e-3)` and `sparse_categorical_crossentropy`.

**Training progress (TensorFlow)**  
- Epoch 1: **Train acc = 0.9196**, Val acc = 0.9662  
- Epoch 2: **Train acc = 0.9747**, Val acc = 0.9795  
- Epoch 3: Train acc = 0.9832, Val acc = 0.9807  
- Epoch 6: Train acc = 0.9928, **Val acc = 0.9829**  
- Epoch 10: Train acc = 0.9983, Val acc = 0.9821  

**Final TensorFlow validation accuracy**:  
> **0.9821 (98.21%)**

---

### 4.2 Side-by-Side Comparison

| Framework   | Best Val Accuracy |
|------------|-------------------|
| **PyTorch**   | **0.9840 (98.40%)** |
| **TensorFlow** | **0.9821 (98.21%)** |

Both models use almost the **same CNN architecture**, same input processing (28×28 grayscale normalized to `[0, 1]`), and are trained on the same train/validation split.

- The **PyTorch model** slightly edges out TensorFlow on this split by about **0.2%** in validation accuracy.
- The difference is **very small** and can easily be due to:
  - Random weight initialization
  - Mini-batch ordering
  - Small variations in optimization dynamics

So the main takeaway is:

> **Both PyTorch and TensorFlow can reach ~98–99% accuracy on MNIST with a very simple CNN.**  
> The choice of framework is more about developer experience, ecosystem, and personal preference than raw performance for this task.

---

### 4.3 Submission

For this notebook, I used the **PyTorch model** (best validation accuracy: **98.40%**) to generate predictions on the test set and saved them as:

```text
submission_torch.csv