In [9]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import itertools

from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator

import warnings
# Silence every warning for the rest of the session to keep cell outputs short.
warnings.filterwarnings("ignore")

# Single seed shared by the stochastic steps of the notebook
# (it is also passed to train_test_split as random_state).

seed = 42

# Seed NumPy's global RNG so NumPy-driven randomness is repeatable across
# "Restart & Run All". NOTE(review): the Keras backend may keep its own RNG
# for weight initialisation and dropout, which this call does not seed.

np.random.seed(seed)

### Loading the training and test dataset

In [10]:
# Read the labelled training set and the unlabelled test set from the working directory
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [11]:
# Summarise the training frame: number of rows, column range, dtypes and memory footprint
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [12]:
# Same summary for the test frame, to compare its row and column counts with the training frame
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 167.5 MB


### Splitting the dataset

In [13]:
# Separate the target column from the pixel columns; the test frame is used as-is
y_train = df_train['label']
X_train = df_train.drop(columns=['label'])
X_test = df_test

# Drop the original frame names. Note that X_test still refers to the same
# object as df_test did, so only the training frame can actually be released.
del df_train, df_test

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_test :', X_test.shape)

Shape of X_train: (42000, 784)
Shape of y_train: (42000,)
Shape of X_test : (28000, 784)


### Counting the labels

In [14]:
# Tally how many training samples carry each digit label
counter = Counter()
counter.update(y_train)
counter

Counter({1: 4684,
         0: 4132,
         4: 4072,
         7: 4401,
         3: 4351,
         5: 3795,
         8: 4063,
         9: 4188,
         2: 4177,
         6: 4137})

### Normalizing the values of training and test

In [15]:
# Divide every pixel value by 255 (presumably the maximum 8-bit intensity,
# which would map the data into [0, 1] — confirm against the dataset).
# Re-running this cell divides again, so execute it only once per kernel session.
X_train, X_test = (pixels / 255 for pixels in (X_train, X_test))

### Reshaping the images into 3 dimensions (height, width, channels) for use with Keras

In [16]:
# Turn each flat 784-value row into a 28x28 single-channel image:
# (samples, height = 28px, width = 28px, channels = 1).
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell after
# X_train has already been converted no longer fails the way `.values` would.
X_train = np.asarray(X_train).reshape(-1, 28, 28, 1)
X_test = np.asarray(X_test).reshape(-1, 28, 28, 1)

print('Shape of X_train:', X_train.shape)
print('Shape of X_test :', X_test.shape)

Shape of X_train: (42000, 28, 28, 1)
Shape of X_test : (28000, 28, 28, 1)


### Converting y values (labels) to categorical values

In [17]:
# One-hot encode the digit labels into 10 columns (one per class).
# Re-running this cell would encode the already-encoded array again, so execute it only once.
y_train = to_categorical(y_train, num_classes = 10)
y_train

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

### Splitting the values into train and validation

In [18]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the split repeatable.
# The inputs are overwritten, so re-running this cell shrinks the training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=seed,
)

### Define the baseline neural network model

In [19]:
def baseline_model(input_shape = (28, 28, 1), num_classes = 10):
    """Build and compile the baseline convolutional network.

    Architecture, in order:
    two 5x5 convolutions (32 filters) -> 2x2 max-pooling -> dropout 25%,
    two 3x3 convolutions (64 filters) -> 2x2 max-pooling -> dropout 25%,
    flatten -> dense (256 units, ReLU) -> dropout 50% -> softmax output.

    Parameters
    ----------
    input_shape : tuple of int, default (28, 28, 1)
        Shape of one input image as (height, width, channels).
    num_classes : int, default 10
        Number of output classes, i.e. the width of the softmax layer.

    Returns
    -------
    keras.models.Sequential
        A freshly initialised model compiled with RMSprop, categorical
        cross-entropy loss and the accuracy metric.
    """

    model = Sequential([
        #-----------------------------------------------------------------------------------------------
        # 32 filters for the first two Conv2D layers
        Conv2D(filters = 32, kernel_size = (5,5), padding = 'same', activation = 'relu',
               input_shape = input_shape),
        Conv2D(filters = 32, kernel_size = (5,5), padding = 'same', activation = 'relu'),

        # This layer simply acts as a downsampling filter.
        # It looks at each 2x2 window and keeps the maximal value, reducing computational cost
        # and, to some extent, overfitting.
        #
        # IMPORTANT: By combining convolutional and pooling layers, a CNN is able to combine
        # local features and learn more global features of the image.
        MaxPool2D(pool_size = (2,2)),

        # Dropout is a regularization method where a proportion of nodes (25%) in the layer are
        # randomly ignored for each training sample. This forces the network to learn features
        # in a distributed way, which improves generalization and reduces overfitting.
        Dropout(0.25),

        #-----------------------------------------------------------------------------------------------
        # 64 filters for the last two Conv2D layers
        Conv2D(filters = 64, kernel_size = (3,3), padding = 'same', activation = 'relu'),
        Conv2D(filters = 64, kernel_size = (3,3), padding = 'same', activation = 'relu'),

        MaxPool2D(pool_size = (2,2), strides = (2,2)),
        Dropout(0.25),

        #-----------------------------------------------------------------------------------------------
        # The Flatten layer converts the final feature maps into a single 1D vector.
        # IMPORTANT: It combines all the local features found by the previous convolutional layers.
        Flatten(),

        Dense(256, activation = "relu"),
        Dropout(0.5),

        # The net outputs a probability distribution over the classes (10 by default)
        Dense(num_classes, activation = "softmax"),
    ])

    # The optimizer iteratively updates the parameters in order to minimize the loss.
    optimizer = RMSprop(epsilon=1e-08)

    # Compile the model with the optimizer, and track accuracy as the performance metric
    model.compile(optimizer = optimizer, loss = "categorical_crossentropy", metrics = ["accuracy"])

    return model

### Data augmentation

In [20]:
# The idea is to alter the training data with small random transformations that reproduce the
# variations occurring when someone writes a digit. It's a way to reduce overfitting of the model.
# Flips stay disabled — presumably because a mirrored digit is no longer a valid sample
# of the same class (TODO confirm intent).

generator = ImageDataGenerator(featurewise_center = False,
                               samplewise_center = False, 
                               featurewise_std_normalization = False,
                               samplewise_std_normalization = False,
                               zca_whitening = False,
                               rotation_range = 10, # Rotate randomly within a 10-degree range
                               zoom_range = 0.1, # Zoom randomly (10% zoom)
                               width_shift_range = 0.1, # Shift image horizontally (10% of width)
                               height_shift_range = 0.1, # Shift image vertically (10% of height)
                               horizontal_flip = False,
                               vertical_flip = False)

# generator.fit(X_train) is intentionally not called: per the Keras documentation it only computes
# the data statistics required by featurewise_center, featurewise_std_normalization and
# zca_whitening, all of which are disabled above. Add the call back if any of them is enabled.

### Fitting the model

In [23]:
# Training hyper-parameters: number of passes over the data, and images per batch
epochs, batch_size = 2, 80

In [24]:
# Build a fresh, untrained model and train it on augmented batches.
digits = baseline_model()

# One epoch should cover the training set once. With batches of `batch_size` images that is
# ceil(n_samples / batch_size) steps; using n_samples steps would draw batch_size times
# more batches per epoch than a single pass over the data needs.
steps_per_epoch = int(np.ceil(X_train.shape[0] / batch_size))

history = digits.fit_generator(generator.flow(X_train, y_train, batch_size = batch_size),
                               epochs = epochs,
                               validation_data = (X_val, y_val),
                               verbose = 1, 
                               steps_per_epoch = steps_per_epoch)

Epoch 1/2
Epoch 2/2


In [25]:
# Run the trained network on the test images, keep the index of the highest
# probability in each row, and wrap the result as the "Label" column.
label_predicted = pd.Series(
    np.argmax(digits.predict(X_test), axis=1),
    name="Label",
)

In [26]:
# Build the submission frame: a 1-based ImageId column next to the predicted labels.
# The id range is derived from the number of predictions instead of being hard-coded to 28000,
# so both columns always have the same length and no NaN rows are introduced by the concat.
solution = pd.concat([pd.Series(range(1, len(label_predicted) + 1), name = "ImageId"), label_predicted], axis = 1)

# Write the submission file to the working directory, without the pandas index column
solution.to_csv("solution_cnn.csv", index=False)

In [27]:
# Preview the first rows of the submission to check the ImageId / Label layout
solution.head()

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,0
4,5,3
