In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/digit-recognizer/sample_submission.csv
/kaggle/input/digit-recognizer/train.csv
/kaggle/input/digit-recognizer/test.csv


In [3]:
# Load the labelled MNIST training file.
# The competition's test.csv ships without y labels (X variables only), so to keep the code
# reusable the labelled train.csv is loaded as 'data' and both the 'train' and the 'test'
# splits are created from it.

data_csv_path = '/kaggle/input/digit-recognizer/train.csv'
data = pd.read_csv(data_csv_path)

# this is the 'competition scoring' data which does not have y labels; only X variables
# X_comp = pd.read_csv('/kaggle/input/digit-recognizer/test.csv')

In [4]:
# RFE - Recursive Feature Elimination

# from sklearn.feature_selection import RFE
# rfe = RFE(logreg, 20)
# rfe = rfe.fit(os_data_X, os_data_y.values.ravel())
# print(rfe.support_)
# print(rfe.ranking_)

In [5]:
# Separate the target column ('label') from the feature columns.
# Reference: https://www.kaggle.com/yassineghouzam/introduction-to-cnn-keras-0-997-top-6
y = data['label']
X = data.drop(columns=['label'])
# del data

In [6]:
# Flag, per column, whether any value is missing, then summarise those flags
# (isna is the alias of isnull).
X.isna().any().describe()
# X_comp.isna().any().describe()

count       784
unique        1
top       False
freq        784
dtype: object

In [7]:
# Rescale pixel intensities into [0, 1] so that models converge faster and are less
# sensitive to illumination differences. This is a rescale, not a grayscale conversion.
# Assumes the raw pixel values lie in 0-255 -- the assertion below verifies the result.

X = X/255.0
# X_comp = X_comp/255.0 # competition scoring set

# Sanity-check the conversion: every scaled value must fall inside [0, 1].
assert np.asarray(X).min() >= 0.0 and np.asarray(X).max() <= 1.0, \
    "pixel values fall outside [0, 1] after scaling"

In [8]:
# Convert the feature table to a NumPy array. np.asarray accepts both a DataFrame and an
# ndarray, so re-running this cell is harmless (X.to_numpy() raises AttributeError once X
# has already been converted to an array).
X = np.asarray(X)

In [9]:
# train-test-split

from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
split_options = dict(test_size=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

## Models

In [53]:
# using LogReg instead of LogRegCV as LogRegCV doesn't seem to converge
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# import xgboost as xgb
from xgboost.sklearn import XGBClassifier

from sklearn import metrics

### fit models ###
# Each classifier is fitted on the flat (n_samples, 784) training split and then used to
# predict the held-out test split.

## Logistic Regression ##
# https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8#:~:text=Logistic%20Regression%20is%20a%20Machine,%2C%20failure%2C%20etc.).
# max_iter is set explicitly to give the solver more iterations to converge.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)
y_pred_logreg = logreg.predict(X_test)

## Support Vector Classifier ##
# https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
# for large datasets (10s of 1000s of samples), might be better to use LinearSVC or SGDClassifier instead

svc = SVC()
svc.fit(X_train, y_train)
y_pred_svc = svc.predict(X_test)

## RandomForest Classifier ##
rf = RandomForestClassifier(
    random_state=0
)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

## XGBoost ##
# The keyword XGBClassifier recognises is `use_label_encoder`; the previous `label_encoder`
# is not an XGBClassifier parameter, so it never disabled the label encoder.
# The labels are already integer class ids (digits 0-9), so no encoder is needed.
xgb = XGBClassifier(use_label_encoder=False) # can XGB be used with X_train_3d?
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)





In [26]:
# from sklearn.metrics import confusion_matrix
# confusion_matrix = confusion_matrix(y_test, y_pred)
# print(confusion_matrix)

# from sklearn.metrics import classification_report
# print(classification_report(y_test, y_pred))

## Keras Deep Neural Network

In [15]:
# The Keras convolutional model takes image tensors, so every flat row of 784 pixels
# (28*28) is reshaped to (height = 28px, width = 28px, channels = 1).
# Grayscale uses a single channel; RGB images would be reshaped to 28*28*3 instead.
image_shape = (28, 28, 1)
X_train_3d = X_train.reshape((-1, *image_shape))
X_test_3d = X_test.reshape((-1, *image_shape))
# X_comp = X_comp.reshape(-1,28,28,1)

In [16]:
# The Keras model below is trained with categorical cross-entropy, which needs the y labels
# one-hot encoded:
# https://blog.fastforwardlabs.com/2016/02/24/hello-world-in-keras-or-scikit-learn-versus-keras.html
# Each label becomes a vector of length 10, because there are 10 classes (not a binary 1/0 target).

# Import from the public tensorflow.keras.utils namespace: `keras.utils.np_utils` is an
# internal module path that newer Keras releases no longer provide.
from tensorflow.keras.utils import to_categorical # convert to one-hot-encoding

y_train_ohe = to_categorical(y_train, num_classes=10)
y_test_ohe = to_categorical(y_test, num_classes=10)

In [18]:
# Compare array shapes before and after the reshape / one-hot steps.
shapes_before = (X_train.shape, X_test.shape, y_train.shape, y_test.shape)
shapes_after = (X_train_3d.shape, X_test_3d.shape, y_train_ohe.shape, y_test_ohe.shape)
print(f'before transformation: {shapes_before}')
print(f'after transformation:{shapes_after}')

before transformation: ((29400, 784), (12600, 784), (29400,), (12600,))
after transformation:((29400, 28, 28, 1), (12600, 28, 28, 1), (29400, 10), (12600, 10))


In [80]:
# set the model for keras
# https://www.kaggle.com/yassineghouzam/introduction-to-cnn-keras-0-997-top-6
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Lambda
# from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

# Convolutional classifier for 28x28x1 digit images, declared as a single layer list.
conv_options = dict(padding='Same', activation='relu')

model = Sequential([
    # Stage 1: two 5x5 convolutions with 32 filters, 2x2 max-pooling, 25% dropout.
    Conv2D(32, (5, 5), input_shape=(28, 28, 1), **conv_options),
    Conv2D(32, (5, 5), **conv_options),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Stage 2: two 3x3 convolutions with 64 filters, strided 2x2 max-pooling, 25% dropout.
    Conv2D(64, (3, 3), **conv_options),
    Conv2D(64, (3, 3), **conv_options),
    MaxPool2D(pool_size=(2, 2), strides=(2, 2)),
    Dropout(0.25),

    # Head: flatten, 256-unit dense layer, 50% dropout, 10-way softmax output.
    Flatten(),
    Dense(256, activation="relu"),
    Dropout(0.5),
    Dense(10, activation="softmax"),
])

model.summary()

Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_49 (Conv2D)           (None, 28, 28, 32)        832       
_________________________________________________________________
conv2d_50 (Conv2D)           (None, 28, 28, 32)        25632     
_________________________________________________________________
max_pooling2d_24 (MaxPooling (None, 14, 14, 32)        0         
_________________________________________________________________
dropout_36 (Dropout)         (None, 14, 14, 32)        0         
_________________________________________________________________
conv2d_51 (Conv2D)           (None, 14, 14, 64)        18496     
_________________________________________________________________
conv2d_52 (Conv2D)           (None, 14, 14, 64)        36928     
_________________________________________________________________
max_pooling2d_25 (MaxPooling (None, 7, 7, 64)        

In [81]:
# Define the optimizer (RMSprop alternative kept for reference)
# optimizer = RMSprop(lr=0.001, rho=0.9, epsilon=1e-08, decay=0.0)

# Compile the model
# model.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["accuracy"])

model.compile(
    optimizer = tf.keras.optimizers.Adam(0.001), # we can also use the RMSprop() as the optimizer
    # Loss Function to minimize
    loss = "categorical_crossentropy", # keras.losses.SparseCategoricalCrossentropy()
    # List of metrics to monitor
    metrics=["accuracy"] # keras.metrics.SparseCategoricalAccuracy()
)

# Set a learning rate annealer: halve the learning rate (factor=0.5) after 3 epochs without
# improvement of the monitored metric, never going below min_lr.
# With metrics=["accuracy"] the validation metric is logged under the key 'val_accuracy'
# (see the training history), not 'val_acc', so that is the key the callback must monitor.
# NOTE: the callback only has an effect once it is passed to model.fit(..., callbacks=[...]).
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                            patience=3,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

In [82]:
# Training hyper-parameters used by the model.fit calls in this notebook.
batch_size, epochs, verbose = 86, 1, 2

In [83]:
# Train the CNN on the reshaped images and one-hot labels; the held-out split is used as
# validation data, so val_* metrics are computed on it after every epoch.
validation_pair = (X_test_3d, y_test_ohe)
history_nn = model.fit(
    x=X_train_3d,
    y=y_train_ohe,
    validation_data=validation_pair,
    epochs=epochs,  # Turn epochs to 30 to get 0.9967 accuracy
    batch_size=batch_size,
    verbose=verbose,
)

342/342 - 57s - loss: 0.3032 - accuracy: 0.9037 - val_loss: 0.0669 - val_accuracy: 0.9787


In [99]:
# Display the metrics recorded by model.fit for the CNN (a dict of metric name -> list of values).
history_nn.history

{'loss': [0.30320146679878235],
 'accuracy': [0.9037414789199829],
 'val_loss': [0.06685896217823029],
 'val_accuracy': [0.9787301421165466]}

## Siamese Networks

In [95]:
# Show the shapes of the image tensor and the one-hot label matrix.
print(*(array.shape for array in (X_train_3d, y_train_ohe)))

(29400, 28, 28, 1) (29400, 10)


In [92]:
# Embedding network: maps a 28x28x1 image to an L2-normalised 256-dimensional vector.
model_sn = tf.keras.Sequential()

# Convolutional stage 1: 64 filters, 2x2 kernel, followed by pooling and dropout.
model_sn.add(tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(28,28,1)))
model_sn.add(tf.keras.layers.MaxPooling2D(pool_size=2))
model_sn.add(tf.keras.layers.Dropout(0.3))

# Convolutional stage 2: 32 filters, 2x2 kernel, followed by pooling and dropout.
model_sn.add(tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'))
model_sn.add(tf.keras.layers.MaxPooling2D(pool_size=2))
model_sn.add(tf.keras.layers.Dropout(0.3))
model_sn.add(tf.keras.layers.Flatten())

# Embedding head: linear projection (no activation on the final dense layer) ...
model_sn.add(tf.keras.layers.Dense(256, activation=None))
# ... followed by L2 normalisation of each embedding row.
model_sn.add(tf.keras.layers.Lambda(lambda x: tf.math.l2_normalize(x, axis=1)))

# model_sn = Sequential([
#     Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(28,28,1)),
#     MaxPool2D(pool_size=2),
#     Dropout(0.3),
#     Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'),
#     MaxPool2D(pool_size=2),
#     Dropout(0.3),
#     Flatten(),
#     Dense(256, activation=None), # No activation on final dense layer
#     Lambda(lambda x: tf.math.l2_normalize(x, axis=1)) # L2 normalize embeddings
# ])

# Print the layer-by-layer summary of the embedding network.
model_sn.summary()

In [100]:
import tensorflow_addons as tfa

# Compile the embedding network with Adam and the semi-hard triplet loss from TensorFlow Addons.
# NOTE(review): the network outputs a 256-d embedding rather than class scores, so the
# "accuracy" metric tracked here is not a meaningful classification accuracy -- it is kept
# only because later cells read it from the training history.
adam_optimizer = tf.keras.optimizers.Adam(0.001)
triplet_loss = tfa.losses.TripletSemiHardLoss()
model_sn.compile(optimizer=adam_optimizer, loss=triplet_loss, metrics=["accuracy"])

In [101]:
# Train the embedding network. The triplet loss is fed the integer class labels
# (y_train / y_test), not the one-hot matrices used for the CNN.
validation_pair_sn = (X_test_3d, y_test)
history_sn = model_sn.fit(
    x=X_train_3d,
    y=y_train,
    validation_data=validation_pair_sn,
    epochs=epochs,
    batch_size=batch_size,
    verbose=verbose,
)

342/342 - 18s - loss: 0.6366 - accuracy: 2.3810e-04 - val_loss: 0.5007 - val_accuracy: 0.0000e+00


In [107]:
# Inspect the label containers: their shapes, their types and the first label of each split.
label_splits = (y_train, y_test)
print(*(labels.shape for labels in label_splits))
print(*(type(labels) for labels in label_splits))
print(*(labels.iloc[0] for labels in label_splits))

(29400,) (12600,)
<class 'pandas.core.series.Series'> <class 'pandas.core.series.Series'>
0 3


In [102]:
# Display the metrics recorded by model_sn.fit (a dict of metric name -> list of values).
history_sn.history

{'loss': [0.6366080045700073],
 'accuracy': [0.0002380952355451882],
 'val_loss': [0.5007310509681702],
 'val_accuracy': [0.0]}

## Scoring

In [1]:
# Report the accuracy of every model on the held-out test split.
# NOTE: this cell relies on the fitted models from the cells above; running it on a fresh
# kernel raises NameError (e.g. "name 'logreg' is not defined").
print('Accuracy on Test Set')

# scikit-learn style estimators expose .score(), which returns the mean accuracy.
sklearn_models = [
    ('Logistic Regression', logreg),
    ('Support Vector Classifier', svc),
    ('RandomForest Classifier', rf),
    ('XGBoost Classifier', xgb),
]
for model_label, estimator in sklearn_models:
    print('{}: {:.2f}'.format(model_label, estimator.score(X_test, y_test)))

# Keras reports metrics on `validation_data` with a 'val_' prefix. The validation data passed
# to model.fit was the held-out test split, so 'val_accuracy' is the test accuracy here.
# Index [-1] takes the last epoch, i.e. the final model, which matters once epochs > 1
# ([0] would report the first epoch).
print("Neural Network: {:.2f}".format(history_nn.history['val_accuracy'][-1]))

# The Siamese/triplet network outputs embeddings, not class scores, so the 'val_accuracy'
# Keras logs for it is meaningless (it came out as 0.0). Score it instead with a 5-nearest-
# neighbour classifier fitted on the training embeddings and evaluated on the test embeddings.
from sklearn.neighbors import KNeighborsClassifier

train_embeddings = model_sn.predict(X_train_3d, verbose=0)
test_embeddings = model_sn.predict(X_test_3d, verbose=0)
embedding_knn = KNeighborsClassifier(n_neighbors=5).fit(train_embeddings, y_train)
print("Siamese Network: {:.2f}".format(embedding_knn.score(test_embeddings, y_test)))

Accuracy on Test Set


NameError: name 'logreg' is not defined

In [34]:
# keras uses .evaluate() instead of .score()
# .evaluate() needs the true labels, so it applies to the held-out test split;
# the 'competition set' (test.csv) has no y labels and can only be used with .predict()

# loss, accuracy = model.evaluate(X_test_3d, y_test_ohe, verbose=0)
# print("Accuracy of Neural Network on test set = {:.2f}".format(accuracy))

# results = model.evaluate(X_test_3d, y_test_ohe, verbose=0)