# Digit Recognizer

Created by Zach Brazil, Richard Charles, Adam Kiehl, Zane Perkins

## Setup

In [None]:
# Import packages
import os
import glob
import shutil
import plotnine

import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

from tensorflow import keras
from tensorflow.keras import layers
from keras.models import Sequential
from keras.layers import Conv2D, Dense , MaxPool2D , Lambda, Flatten, Dropout, DepthwiseConv2D

from plotnine import ggplot
from plotnine import aes 
from plotnine import geom_line
from plotnine import labs

In [None]:
# Seed TensorFlow's and NumPy's global random generators with one fixed value
tf.random.set_seed(478)
np.random.seed(478)

In [None]:
# Load the training and test tables from their .csv files (train first)
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./Data/train.csv', './Data/test.csv')
)

# Convert Pandas dataframes to valid Numpy arrays
def convert(X):
    """Reshape a table of flattened images into a 4-D image array.

    Parameters
    ----------
    X : pandas.DataFrame or 2-D array-like
        One image per row, with 784 (= 28 * 28) pixel values per row.

    Returns
    -------
    numpy.ndarray
        A newly allocated float64 array of shape ``(n_rows, 28, 28, 1)``.
        Image ``i`` holds the values of input row ``i`` in row-major order.

    Raises
    ------
    ValueError
        If the rows cannot be reshaped to 28 x 28 x 1 (i.e. they do not hold
        exactly 784 values each).
    """
    # A single vectorized reshape replaces the per-row ``iloc`` loop; the
    # result is the same, but it avoids one Python-level iteration per image.
    # ``np.array`` always copies, so the caller's data is never aliased.
    pixels = np.array(X, dtype=np.float64)
    return pixels.reshape((pixels.shape[0], 28, 28, 1))

# Training set: the 'label' column is the target, every other column is pixels
y_train = train['label']
X_train = convert(train.drop(columns = 'label'))

# Testing set: the table is converted as-is (no column is dropped)
X_test = convert(test)

In [None]:
# Show the first five training images, each titled with its label
for idx in range(5):
    image, digit = X_train[idx], y_train[idx]
    plt.imshow(image)
    plt.title(f"Digit: {digit}")
    plt.show()

In [None]:
# Standardization function
# Mean and standard deviation over every pixel of the training images,
# stored as float32 scalars
mean_px = np.float32(X_train.mean())
std_px = np.float32(X_train.std())

def standardize(x):
    """Center ``x`` on ``mean_px`` and scale it by ``std_px``.

    Both statistics are module-level values computed from ``X_train``; the
    same two scalars are applied to whatever input is passed in.
    """
    centered = x - mean_px
    return centered / std_px

## Modeling

In [None]:
# Define model architecture
#
# Feature-map size per stage (convolutions and pools use the default 'valid'
# padding, and MaxPool2D(3) strides by its pool size):
#   input            28 x 28 x 1
#   Conv2D 3x3       26 x 26 x 32
#   MaxPool2D(3)      8 x  8 x 32
#   DepthwiseConv2D   6 x  6 x 128   (32 channels * depth_multiplier 4)
#   MaxPool2D(3)      2 x  2 x 128
#   Flatten          512
#
# No pooling is applied to the raw input: a 3x3 pool ahead of the first
# convolution would shrink the map to 2 x 2 before the 3x3 depthwise
# convolution, which cannot be applied to an input smaller than its kernel
# (the model would fail to build).
model = Sequential([
    # Standardize pixels inside the model so train and test inputs are
    # scaled identically
    Lambda(standardize, input_shape = (28, 28, 1)),

    Conv2D(filters = 32, kernel_size = 3, activation = 'relu'),
    MaxPool2D(3),

    DepthwiseConv2D(kernel_size = 3, depth_multiplier = 4, activation = 'relu'),
    MaxPool2D(3),

    # Classifier head: one hidden layer with dropout, then 10 class scores
    Flatten(),
    Dense(512, activation = 'relu'),
    Dropout(.3),
    Dense(10, activation = 'softmax')
])

model.summary()

In [None]:
# Number of passes over the training data
EPOCHS = 10

# Configure the optimizer, the loss for integer class labels, and the metric
model.compile(optimizer = 'adam', loss = 'sparse_categorical_crossentropy', metrics = ['accuracy'])

# Fit on 75% of the data and hold out the other 25% for validation
trained = model.fit(x = X_train, y = y_train, epochs = EPOCHS, validation_split = .25)

In [None]:
# Plot training and validation accuracy
# The x-axis is taken from the recorded history instead of the global EPOCHS,
# so the lengths always match even when EPOCHS has been reassigned (it is
# changed in the Prediction section) and this cell is run again.
epochs_run = range(len(trained.history['accuracy']))
(
    ggplot()
    + geom_line(aes(epochs_run, trained.history['accuracy']))
    + geom_line(aes(epochs_run, trained.history['val_accuracy']), color = 'red')
    + labs(title='Training and Validation (red) Accuracy', x='Epoch', y='Accuracy')
)

## Prediction

In [None]:
# Same Keras namespace that Sequential and the layers were imported from
from keras.models import clone_model

# Choose 5 epochs to avoid overfitting
EPOCHS = 5

# `model` has already been fitted on the 75% split; calling fit() on it again
# would continue from those weights instead of retraining. Rebuild it with
# freshly initialized weights so the final model is trained for EPOCHS epochs
# only. The Lambda layer has no weights and is reused as-is (this avoids
# serializing `standardize`); every other layer is re-created from its config.
model = clone_model(
    model,
    clone_function = lambda layer: (
        layer if isinstance(layer, Lambda)
        else layer.__class__.from_config(layer.get_config())
    ),
)

# A cloned model is uncompiled; use the same settings as the first training run
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy']
)

# Retrain model on full training set
trained_full = model.fit(
    X_train,
    y_train,
    epochs = EPOCHS,
    validation_split = 0
)

In [None]:
# Generate competition predictions based on test set: the predicted label is
# the position of the highest score in each row of the model output
class_scores = model.predict(X_test)
pred = pd.Series(np.argmax(class_scores, axis = 1))
submission = pd.DataFrame({'ImageId': range(1, len(pred) + 1), 'Label': pred})
submission.head()

# Write submission results to local .csv file
submission.to_csv('~/Desktop/submission.csv', index=False)