In [1]:
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.datasets import mnist
import matplotlib.pyplot as plt



In [2]:
(x_train, y_train), (x_test, y_test) = mnist.load_data()

x_train = x_train.astype("float32") / 255.0
x_test  = x_test.astype("float32") / 255.0

x_train = np.expand_dims(x_train, -1)  # (N, 28, 28, 1)
x_test  = np.expand_dims(x_test, -1)

print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)


(60000, 28, 28, 1) (60000,)
(10000, 28, 28, 1) (10000,)


In [3]:
model = models.Sequential([
    layers.Conv2D(32, (3,3), activation="relu", input_shape=(28,28,1)),
    layers.MaxPooling2D((2,2)),
    layers.Conv2D(64, (3,3), activation="relu"),
    layers.MaxPooling2D((2,2)),
    layers.Flatten(),
    layers.Dense(128, activation="relu"),
    layers.Dropout(0.2),
    layers.Dense(10, activation="softmax")
])

model.compile(optimizer="adam",
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])

model.summary()


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [4]:
history = model.fit(x_train, y_train, epochs=3, batch_size=128, validation_split=0.1)

loss, acc = model.evaluate(x_test, y_test, verbose=0)
print("Test Accuracy:", acc)

model.save("mnist_digit_webcam.h5")
print("Saved: mnist_digit_webcam.h5")


Epoch 1/3
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 22ms/step - accuracy: 0.9239 - loss: 0.2536 - val_accuracy: 0.9832 - val_loss: 0.0604
Epoch 2/3
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 23ms/step - accuracy: 0.9789 - loss: 0.0673 - val_accuracy: 0.9892 - val_loss: 0.0406
Epoch 3/3
[1m422/422[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 24ms/step - accuracy: 0.9857 - loss: 0.0460 - val_accuracy: 0.9895 - val_loss: 0.0379




Test Accuracy: 0.9886999726295471
Saved: mnist_digit_webcam.h5


In [5]:
def preprocess_for_mnist(roi_bgr):
    gray = cv2.cvtColor(roi_bgr, cv2.COLOR_BGR2GRAY)
    gray = cv2.GaussianBlur(gray, (5,5), 0)

    # Binary invert so digit is white on black (MNIST style)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Find non-zero pixels (digit)
    coords = cv2.findNonZero(thresh)
    if coords is None:
        return None, thresh

    x, y, w, h = cv2.boundingRect(coords)
    digit = thresh[y:y+h, x:x+w]

    # Make square canvas
    size = max(w, h)
    square = np.zeros((size, size), dtype=np.uint8)
    x_offset = (size - w) // 2
    y_offset = (size - h) // 2
    square[y_offset:y_offset+h, x_offset:x_offset+w] = digit

    # Resize to 20x20 and pad to 28x28 (MNIST-like)
    digit20 = cv2.resize(square, (20,20), interpolation=cv2.INTER_AREA)
    digit28 = cv2.copyMakeBorder(digit20, 4,4,4,4, cv2.BORDER_CONSTANT, value=0)

    # Normalize and reshape for model: (1, 28, 28, 1)
    digit28 = digit28.astype("float32") / 255.0
    digit28 = np.expand_dims(digit28, axis=(0, -1))

    return digit28, thresh


In [7]:
model = tf.keras.models.load_model("mnist_digit_webcam.h5")

cap = cv2.VideoCapture(0)
if not cap.isOpened():
    raise RuntimeError("Webcam not found. Try VideoCapture(1).")

print("Show a digit inside the box. Press 'q' to quit, 's' to save ROI.")

save_count = 0

while True:
    ret, frame = cap.read()
    if not ret:
        continue

    frame = cv2.flip(frame, 1)  # mirror for convenience
    h, w, _ = frame.shape

    # ROI box
    box = 250
    x1 = w//2 - box//2
    y1 = h//2 - box//2
    x2 = x1 + box
    y2 = y1 + box

    roi = frame[y1:y2, x1:x2]
    processed, thresh_img = preprocess_for_mnist(roi)

    label = "No digit"
    if processed is not None:
        probs = model.predict(processed, verbose=0)[0]
        pred = int(np.argmax(probs))
        conf = float(np.max(probs))
        label = f"Digit: {pred} ({conf*100:.1f}%)"

    # Draw ROI + label
    cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)
    cv2.putText(frame, label, (10, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0,255,0), 2)

    cv2.imshow("MNIST Digit Recognition - Webcam", frame)
    cv2.imshow("Threshold ROI", thresh_img)

    key = cv2.waitKey(1) & 0xFF
    if key == ord('q'):
        break
    elif key == ord('s') and thresh_img is not None:
        cv2.imwrite(f"saved_digit_{save_count}.png", thresh_img)
        print("Saved:", f"saved_digit_{save_count}.png")
        save_count += 1

cap.release()
cv2.destroyAllWindows()




Show a digit inside the box. Press 'q' to quit, 's' to save ROI.
