In [2]:
# --- Fixed training + model cell (replace your previous one) ---
import math
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (Input, Conv2D, MaxPooling2D, Dense, Dropout,
                                     Flatten, BatchNormalization, LeakyReLU)
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Read the digit-recognizer CSVs from their Kaggle notebook mount point
train_df = pd.read_csv('/kaggle/input/digit-recognizer/train.csv')
test_df = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

# Targets: the 'label' column as an int32 vector
y = train_df['label'].to_numpy().astype('int32')

# Images: every remaining column reshaped to (N, 28, 28, 1) float32 and divided by 255
X = (
    train_df.drop(columns='label')
    .to_numpy()
    .reshape(-1, 28, 28, 1)
    .astype('float32')
    / 255.0
)
X_test = test_df.to_numpy().reshape(-1, 28, 28, 1).astype('float32') / 255.0

# Hold out 10% of the labelled rows for validation; stratify on the label and
# fix random_state so the split itself is repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
    stratify=y,
)

# Mixup helpers
def mixup_data(x, y, alpha=0.2):
    """Blend each sample of a batch with a randomly chosen partner (mixup).

    Args:
        x: Batch of inputs; indexed along its first axis.
        y: Targets aligned with ``x``; indexed the same way.
        alpha: Both parameters of the Beta distribution the mixing weight is
            drawn from. A value ``<= 0`` disables mixing.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``mixed_x = lam * x + (1 - lam) * x[perm]``,
        ``y_a`` is ``y``, ``y_b`` is ``y[perm]`` and ``lam`` is the single mixing
        weight shared by the whole batch. With mixing disabled the inputs are
        returned untouched as ``(x, y, y, 1.0)``.
    """
    n_samples = x.shape[0]

    # Guard: nothing to mix, hand the batch back as-is with full weight on y.
    if alpha <= 0:
        return x, y, y, 1.0

    # One weight per batch, then a random pairing of samples within the batch.
    lam = np.random.beta(alpha, alpha)
    partner = np.random.permutation(n_samples)

    blended = lam * x + (1 - lam) * x[partner]
    return blended, y, y[partner], lam

def mixup_generator(x, y, batch_size=64, alpha=0.2, num_classes=10):
    """Yield augmented, mixup-blended training batches without end.

    Batches come from ``ImageDataGenerator.flow`` (random rotation, zoom and
    width/height shifts, reshuffled), are blended with ``mixup_data`` and are
    paired with soft one-hot targets built from both label sets.

    Args:
        x: Training images handed to ``ImageDataGenerator.flow``.
        y: Integer class labels aligned with ``x``.
        batch_size: Number of samples requested per batch.
        alpha: Beta-distribution parameter forwarded to ``mixup_data``.
        num_classes: Width of the one-hot target vectors. Defaults to 10,
            the value that was previously hard-coded.

    Yields:
        ``(x_mix, y_mix)``: the blended images and the matching soft labels
        ``lam * onehot(y_a) + (1 - lam) * onehot(y_b)``.
    """
    augmenter = ImageDataGenerator(
        rotation_range=10,
        zoom_range=0.15,
        width_shift_range=0.1,
        height_shift_range=0.1
    )
    batches = augmenter.flow(x, y, batch_size=batch_size, shuffle=True)
    to_one_hot = tf.keras.utils.to_categorical

    # The loop never terminates on its own; the consumer decides how many
    # batches to pull (e.g. via steps_per_epoch in model.fit).
    while True:
        x_batch, y_batch = next(batches)
        x_mix, y_a, y_b, lam = mixup_data(x_batch, y_batch, alpha)
        # Soft targets: the labels are mixed with the same weight as the images.
        y_mix = to_one_hot(y_a, num_classes) * lam + to_one_hot(y_b, num_classes) * (1 - lam)
        yield x_mix, y_mix

# Model (using Input to avoid warning)
def build_model(input_shape=(28, 28, 1), num_classes=10):
    """Build the (uncompiled) CNN classifier.

    Architecture: three convolutional stages with 32, 64 and 128 filters. Each
    stage is two ``Conv2D(3x3, same) -> BatchNormalization -> LeakyReLU(0.1)``
    units followed by ``MaxPooling2D(2, 2)`` and ``Dropout`` (0.25, 0.3, 0.4
    respectively). The head is ``Flatten -> Dense(512) -> BatchNormalization
    -> LeakyReLU(0.1) -> Dropout(0.5) -> Dense(num_classes, softmax)``.

    Args:
        input_shape: Shape of one input sample. Defaults to ``(28, 28, 1)``,
            the value that was previously hard-coded.
        num_classes: Number of softmax outputs. Defaults to 10, the value
            that was previously hard-coded.

    Returns:
        A ``Sequential`` model; the caller is responsible for compiling it.
    """
    # (filters, dropout rate applied after the stage's pooling layer)
    stages = ((32, 0.25), (64, 0.3), (128, 0.4))
    convs_per_stage = 2

    # An explicit Input layer is used rather than an input_shape argument on
    # the first Conv2D.
    stack = [Input(shape=input_shape)]

    for filters, drop_rate in stages:
        for _ in range(convs_per_stage):
            stack.extend([
                Conv2D(filters, (3, 3), padding='same'),
                BatchNormalization(),
                LeakyReLU(0.1),
            ])
        stack.extend([MaxPooling2D(2, 2), Dropout(drop_rate)])

    # Classification head
    stack.extend([
        Flatten(),
        Dense(512),
        BatchNormalization(),
        LeakyReLU(0.1),
        Dropout(0.5),
        Dense(num_classes, activation='softmax'),
    ])
    return Sequential(stack)

# Seed the Python, NumPy and TensorFlow RNGs before any weights are created,
# so weight initialisation and the np.random draws in mixup_data are
# repeatable between runs. GPU kernels may still add small nondeterminism.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

model = build_model()
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    # Targets are one-hot (soft, after mixup) vectors, hence the categorical loss.
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
model.summary()

# Batch size, and how many generator batches count as one epoch
# (the mixup generator never ends, so fit() needs an explicit step count)
batch_size = 64
train_steps = math.ceil(len(X_train) / batch_size)

# Stop after 8 epochs without a val_accuracy gain (restoring the best weights);
# halve the learning rate after 3 epochs without a val_loss improvement.
callbacks = [
    EarlyStopping(
        monitor='val_accuracy',
        patience=8,
        restore_best_weights=True,
        verbose=1,
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        verbose=1,
    ),
]

# Training batches are augmented and mixed; validation data is left
# unaugmented and uses ordinary one-hot labels.
train_batches = mixup_generator(X_train, y_train, batch_size=batch_size, alpha=0.2)
val_data = (X_val, tf.keras.utils.to_categorical(y_val, 10))

history = model.fit(
    train_batches,
    steps_per_epoch=train_steps,
    epochs=50,
    validation_data=val_data,
    callbacks=callbacks,
    verbose=2,
)

# Score the test images and keep the index of the largest output per row
test_probs = model.predict(X_test)
preds = test_probs.argmax(axis=1)

# Submission rows: a 1-based ImageId and the predicted Label
image_ids = np.arange(1, len(preds) + 1)
submission = pd.DataFrame({"ImageId": image_ids, "Label": preds})

submission.to_csv('submission.csv', index=False)
print("Saved submission.csv")


Epoch 1/50


I0000 00:00:1754804217.810104      93 service.cc:148] XLA service 0x7f33f0008bd0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1754804217.811182      93 service.cc:156]   StreamExecutor device (0): Tesla P100-PCIE-16GB, Compute Capability 6.0
I0000 00:00:1754804218.485042      93 cuda_dnn.cc:529] Loaded cuDNN version 90300
I0000 00:00:1754804224.067138      93 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


591/591 - 31s - 52ms/step - accuracy: 0.8145 - loss: 0.9021 - val_accuracy: 0.9798 - val_loss: 0.1242 - learning_rate: 0.0010
Epoch 2/50
591/591 - 10s - 17ms/step - accuracy: 0.9174 - loss: 0.5709 - val_accuracy: 0.9821 - val_loss: 0.0978 - learning_rate: 0.0010
Epoch 3/50
591/591 - 10s - 18ms/step - accuracy: 0.9305 - loss: 0.5569 - val_accuracy: 0.9888 - val_loss: 0.0680 - learning_rate: 0.0010
Epoch 4/50
591/591 - 11s - 18ms/step - accuracy: 0.9307 - loss: 0.4881 - val_accuracy: 0.9900 - val_loss: 0.0741 - learning_rate: 0.0010
Epoch 5/50
591/591 - 10s - 17ms/step - accuracy: 0.9394 - loss: 0.4754 - val_accuracy: 0.9883 - val_loss: 0.0548 - learning_rate: 0.0010
Epoch 6/50
591/591 - 10s - 18ms/step - accuracy: 0.9330 - loss: 0.4624 - val_accuracy: 0.9921 - val_loss: 0.0511 - learning_rate: 0.0010
Epoch 7/50
591/591 - 10s - 18ms/step - accuracy: 0.9336 - loss: 0.4501 - val_accuracy: 0.9900 - val_loss: 0.0544 - learning_rate: 0.0010
Epoch 8/50
591/591 - 10s - 18ms/step - accuracy: 0.9