In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# MNIST configuration: number of target classes and the per-image input
# shape (height, width, channels) expected by the models below.
num_classes = 10
input_shape = (28, 28, 1)

# Fetch MNIST; the loader returns (train, test) pairs of (images, labels).
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz


In [3]:
# Convert the images to float32, divide by 255, and append a trailing channel
# axis so that each image has shape (28, 28, 1).
# NOTE: this cell rebinds x_train / x_test / y_train / y_test, so it must be
# run exactly once after the loading cell.
x_train = np.expand_dims(x_train.astype("float32") / 255, axis=-1)
x_test = np.expand_dims(x_test.astype("float32") / 255, axis=-1)

print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

# One-hot encode the class labels into `num_classes` columns.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

x_train shape: (60000, 28, 28, 1)
60000 train samples
10000 test samples


In [10]:
# MNIST CNN whose convolutions use `sin` as the activation function.
# Layout: three Conv2D -> MaxPooling2D stages (32, 64, 128 filters), followed
# by Flatten, Dropout and a softmax classification layer.
model = keras.Sequential()
model.add(keras.Input(shape=input_shape))

# Stage 1
model.add(layers.Conv2D(32, kernel_size=(3, 3), activation=tf.math.sin, kernel_initializer="he_uniform"))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))

# Stage 2
model.add(layers.Conv2D(64, kernel_size=(3, 3), activation=tf.math.sin, kernel_initializer="he_uniform"))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))

# Stage 3
model.add(layers.Conv2D(128, kernel_size=(3, 3), activation=tf.math.sin, kernel_initializer="he_uniform"))
model.add(layers.MaxPooling2D(pool_size=(2, 2)))

# Classification head
model.add(layers.Flatten())
model.add(layers.Dropout(0.5))
model.add(layers.Dense(num_classes, activation="softmax"))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_6 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_7 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_7 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_8 (Conv2D)            (None, 3, 3, 128)         73856     
_________________________________________________________________
max_pooling2d_8 (MaxPooling2 (None, 1, 1, 128)         0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 128)              

In [11]:
# Training hyper-parameters for the `sin` network.
batch_size = 128
epochs = 15

model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# 10% of the training data is held out for validation during fitting.
model.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f856474fb10>

In [13]:
# Evaluate the `sin` network on the test set.
# score[0] is the loss and score[1] the accuracy; both are printed scaled by 100.
score = model.evaluate(x=x_test, y=y_test, verbose=0)
print("Test loss:", 100 * score[0])
print("Test accuracy:", 100 * score[1])

Test loss: 4.423553496599197
Test accuracy: 98.71000051498413


Setup for `relu` (the baseline used for comparison)

In [5]:
# Baseline MNIST CNN: same layout as the `sin` network, but every convolution
# uses the standard `relu` activation.
model_1 = keras.Sequential()
model_1.add(keras.Input(shape=input_shape))

# Stage 1
model_1.add(layers.Conv2D(32, kernel_size=(3, 3), activation="relu", kernel_initializer="he_uniform"))
model_1.add(layers.MaxPooling2D(pool_size=(2, 2)))

# Stage 2
model_1.add(layers.Conv2D(64, kernel_size=(3, 3), activation="relu", kernel_initializer="he_uniform"))
model_1.add(layers.MaxPooling2D(pool_size=(2, 2)))

# Stage 3
model_1.add(layers.Conv2D(128, kernel_size=(3, 3), activation="relu", kernel_initializer="he_uniform"))
model_1.add(layers.MaxPooling2D(pool_size=(2, 2)))

# Classification head
model_1.add(layers.Flatten())
model_1.add(layers.Dropout(0.5))
model_1.add(layers.Dense(num_classes, activation="softmax"))

model_1.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 26, 26, 32)        320       
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 13, 13, 32)        0         
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 11, 11, 64)        18496     
_________________________________________________________________
max_pooling2d_4 (MaxPooling2 (None, 5, 5, 64)          0         
_________________________________________________________________
conv2d_5 (Conv2D)            (None, 3, 3, 128)         73856     
_________________________________________________________________
max_pooling2d_5 (MaxPooling2 (None, 1, 1, 128)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 128)              

In [7]:
# Training hyper-parameters for the `relu` baseline (same values as for the
# `sin` network, so the two runs are comparable).
batch_size = 128
epochs = 15

model_1.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
# 10% of the training data is held out for validation during fitting.
model_1.fit(
    x_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_split=0.1,
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7f85647073d0>

In [15]:
# Evaluate the `relu` baseline on the test set.
# score_1[0] is the loss and score_1[1] the accuracy; both are printed scaled by 100.
score_1 = model_1.evaluate(x=x_test, y=y_test, verbose=0)
print("Test loss:", 100 * score_1[0])
print("Test accuracy:", 100 * score_1[1])

Test loss: 4.5937638729810715
Test accuracy: 98.8099992275238


**Conclusion:** 

The network with `sin` activations performed on par with the original one with `relu` activations. It achieved a slightly lower loss on the test set `(~0.044 vs ~0.046)`, while the test accuracy is essentially the same `(~98.7% vs ~98.8%)`.

## CIFAR-10

In [16]:
# Model / data parameters
num_classes = 10
# CIFAR-10 images are 32x32 colour images with three channels (the loaded
# x_train has shape (50000, 32, 32, 3)), so the input shape is not
# single-channel as it was for MNIST.
input_shape = (32, 32, 3)


# The data, split between train and test sets:
(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()

# Convert to float32 and divide by 255 to scale the pixel values.
x_train = x_train.astype("float32") / 255
x_test = x_test.astype("float32") / 255

print("x_train shape:", x_train.shape)
print(x_train.shape[0], "train samples")
print(x_test.shape[0], "test samples")

# One-hot encode the class labels into `num_classes` columns.
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
x_train shape: (50000, 32, 32, 3)
50000 train samples
10000 test samples


This example has been taken from the old Keras [website](https://keras.io/examples/cifar10_cnn/), with the following modifications:

1. Change activation function from `relu` to `sin`
2. Change initializer from `glorot_uniform` to `he_uniform`

In [None]:
# CIFAR-10 CNN with `sin` activations and `he_uniform` initialisation.
# Two blocks of (Conv2D "same" -> Conv2D -> MaxPooling2D -> Dropout), followed
# by a 512-unit dense layer and a softmax classification layer.
model = keras.models.Sequential(
    [
        # Block 1: 32 filters. The first layer takes its input shape from the data.
        layers.Conv2D(
            32,
            (3, 3),
            padding="same",
            kernel_initializer="he_uniform",
            activation=tf.math.sin,
            input_shape=x_train.shape[1:],
        ),
        layers.Conv2D(32, (3, 3), kernel_initializer="he_uniform", activation=tf.math.sin),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        # Block 2: 64 filters.
        layers.Conv2D(
            64,
            (3, 3),
            padding="same",
            kernel_initializer="he_uniform",
            activation=tf.math.sin,
        ),
        layers.Conv2D(64, (3, 3), kernel_initializer="he_uniform", activation=tf.math.sin),
        layers.MaxPooling2D(pool_size=(2, 2)),
        layers.Dropout(0.25),
        # Classification head.
        layers.Flatten(),
        layers.Dense(512, kernel_initializer="he_uniform", activation=tf.math.sin),
        layers.Dropout(0.5),
        layers.Dense(num_classes, activation="softmax"),
    ]
)

model.summary()

In [None]:
# Training hyper-parameters for the CIFAR-10 run. `batch_size` is set here
# explicitly (to the same value used by the MNIST training cells) so that this
# cell does not rely on a variable left over from an earlier section.
batch_size = 128
epochs = 25

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
# 10% of the training data is held out for validation during fitting.
model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1)

**Conclusion:** 

Even without any `augmentation`, this network achieved the same validation accuracy `(~74%-75%)`, whereas the original implementation relies on heavy augmentation. Although you could argue that with augmentation the network would take much longer to generalize, as is the case with the original implementation, I would say that the same holds for `overfitting`. The network isn't that bad in this case.