In [2]:
!pip install imblearn
!pip install numpy
!pip install sklearn



In [0]:
%matplotlib inline

import pandas as pd
import numpy as np

from collections import Counter
from sklearn.model_selection import train_test_split

from keras.utils.np_utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, BatchNormalization
from keras.optimizers import RMSprop
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

from keras.models import model_from_json

import warnings
warnings.filterwarnings("ignore")

from imblearn.over_sampling import SMOTE

import io
from google.colab import files

# My seed

seed = 42

In [0]:
# Open Colab's upload dialog. The result is indexed by file name further down, where both
# 'train.csv' and 'test.csv' are read from it, so both files must be selected here.
uploaded = files.upload()

Saving train.csv to train.csv


### Loading the training and test datasets

In [0]:
def _read_uploaded_csv(filename):
    """Parse one uploaded CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Key to look up in `uploaded` (the result of `files.upload()`).

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the uploaded file.

    Raises
    ------
    KeyError
        If `filename` was not part of the upload; the message lists what was uploaded.
    """
    if filename not in uploaded:
        raise KeyError("'{0}' was not uploaded (got: {1}); re-run the upload cell and select it"
                       .format(filename, sorted(uploaded)))
    # The uploaded payload is raw bytes: decode it as UTF-8 text before handing it to pandas.
    return pd.read_csv(io.StringIO(uploaded[filename].decode('utf-8')))


df_train = _read_uploaded_csv('train.csv')
df_test  = _read_uploaded_csv('test.csv')

In [10]:
# Summary of the training frame: row count, column range, dtypes and memory footprint.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42000 entries, 0 to 41999
Columns: 785 entries, label to pixel783
dtypes: int64(785)
memory usage: 251.5 MB


In [11]:
# Summary of the test frame: row count, column range, dtypes and memory footprint.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28000 entries, 0 to 27999
Columns: 784 entries, pixel0 to pixel783
dtypes: int64(784)
memory usage: 167.5 MB


### Splitting the dataset into features and labels

In [12]:
# Separate the target column from the pixel columns; the test frame is used as-is.
y_train = df_train['label']
X_train = df_train.drop(columns=['label'])
X_test = df_test

# Release the original names. X_test still points at the test frame, so only the
# full training frame (labels + pixels) becomes collectable.
del df_train, df_test

print('Shape of X_train:', X_train.shape)
print('Shape of y_train:', y_train.shape)
print('Shape of X_test :', X_test.shape)

Shape of X_train: (42000, 784)
Shape of y_train: (42000,)
Shape of X_test : (28000, 784)


### Counting the labels before SMOTE

In [13]:
# Number of training samples per label before oversampling.
counter = Counter(y_train)
counter

Counter({0: 4132,
         1: 4684,
         2: 4177,
         3: 4351,
         4: 4072,
         5: 3795,
         6: 4137,
         7: 4401,
         8: 4063,
         9: 4188})

### Using SMOTE to oversample every class up to the size of the largest one

In [14]:
# Remember the pixel column names so the resampled matrix can be wrapped back into a DataFrame.
cols = X_train.columns

# https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html#imblearn.over_sampling.SMOTE

# sampling_strategy='all' resamples every class; synthetic samples are built from
# each sample's 3 nearest neighbours. random_state makes the result repeatable.
sm = SMOTE(sampling_strategy = 'all', k_neighbors = 3, random_state = seed)

# `fit_resample` is the supported method name; the legacy `fit_sample` alias is no longer
# available in current imbalanced-learn releases.
# np.asarray(...).ravel() yields a flat label vector whether y_train is a Series or an array.
X_train, y_train = sm.fit_resample(X_train, np.asarray(y_train).ravel())
X_train = pd.DataFrame(X_train, columns=cols)

print('New shape of X_train:', X_train.shape)
print('New shape of y_train:', y_train.shape)

New shape of X_train: (46840, 784)
New shape of y_train: (46840,)


### Counting the labels after SMOTE

In [15]:
# Number of training samples per label after oversampling.
counter = Counter(y_train)
counter

Counter({0: 4684,
         1: 4684,
         2: 4684,
         3: 4684,
         4: 4684,
         5: 4684,
         6: 4684,
         7: 4684,
         8: 4684,
         9: 4684})

### Normalizing the values of training and test

In [0]:
# Rescale both pixel matrices by 1/255 (presumably 8-bit intensities, which then land in [0, 1]).
X_train, X_test = X_train / 255, X_test / 255

### Reshape the images in 3 dimensions to use with Keras

In [17]:
# Target layout: (samples, height = 28px, width = 28px, channels = 1); -1 lets numpy infer
# the number of samples from the data.
image_shape = (-1, 28, 28, 1)

X_train = X_train.values.reshape(image_shape)
X_test = X_test.values.reshape(image_shape)

print('Shape of X_train:', X_train.shape)
print('Shape of X_test :', X_test.shape)

Shape of X_train: (46840, 28, 28, 1)
Shape of X_test : (28000, 28, 28, 1)


### Converting y values (labels) to categorical values

In [18]:
# One Hot Categories: each integer label becomes a row of 10 values with a single 1.

y_train = to_categorical(y_train, num_classes = 10)
y_train

array([[0., 1., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32)

### Define the baseline neural network model

In [0]:
def baseline_model(input_shape = (28, 28, 1), num_classes = 10):
    """Build and compile one CNN classifier.

    Layer stack, in order:
      1. 3 x [Conv2D(32, 5x5) + BatchNormalization], MaxPool2D(2x2), Dropout(0.25)
      2. 3 x [Conv2D(64, 3x3) + BatchNormalization], MaxPool2D(2x2, strides 2x2), Dropout(0.25)
      3. Conv2D(128, 3x3) + BatchNormalization, Flatten
      4. Dense(256, relu), Dropout(0.4), Dense(num_classes, softmax)

    Every convolution uses 'Same' padding and a ReLU activation.

    Parameters
    ----------
    input_shape : tuple of int, optional
        Shape of one input image as (height, width, channels). Defaults to (28, 28, 1).
    num_classes : int, optional
        Number of units in the softmax output layer. Defaults to 10.

    Returns
    -------
    Sequential
        The model, compiled with RMSprop, categorical cross-entropy loss and an
        accuracy metric reported under the name "acc".
    """

    # Create baseline

    baseline = Sequential()

    #---------------------------------------------------------------------------------------------------

    # 32 filters for the first three Conv2D layers

    baseline.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', activation ='relu',
                     input_shape = input_shape))
    baseline.add(BatchNormalization())
    baseline.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', activation ='relu'))
    baseline.add(BatchNormalization())
    baseline.add(Conv2D(filters = 32, kernel_size = (5,5),padding = 'Same', activation ='relu'))
    baseline.add(BatchNormalization())

    # This layer simply acts as a downsampling filter.
    # It looks at each 2x2 window of neighboring pixels and keeps the maximal value, reducing
    # computational cost and, to some extent, overfitting.

    # IMPORTANT: By combining convolutional and pooling layers, a CNN is able to combine local
    # features and learn more global features of the image.

    baseline.add(MaxPool2D(pool_size=(2,2)))

    # Dropout is a regularization method, where a proportion of nodes (25%) in the layer are randomly
    # ignored for each training sample. This forces the network to learn features in a distributed way,
    # which improves generalization and reduces overfitting.

    baseline.add(Dropout(0.25))
    #---------------------------------------------------------------------------------------------------

    # 64 filters for the next three Conv2D layers

    baseline.add(Conv2D(filters = 64, kernel_size = (3,3), padding = 'Same', activation ='relu'))
    baseline.add(BatchNormalization())
    baseline.add(Conv2D(filters = 64, kernel_size = (3,3), padding = 'Same', activation ='relu'))
    baseline.add(BatchNormalization())
    baseline.add(Conv2D(filters = 64, kernel_size = (3,3), padding = 'Same', activation ='relu'))
    baseline.add(BatchNormalization())

    baseline.add(MaxPool2D(pool_size=(2,2), strides=(2,2)))
    baseline.add(Dropout(0.25))
    #---------------------------------------------------------------------------------------------------

    # One last convolution with 128 filters before the classifier head

    baseline.add(Conv2D(filters = 128, kernel_size = (3,3), padding = 'Same', activation ='relu'))
    baseline.add(BatchNormalization())

    # The Flatten layer is used to convert the final feature maps into a single 1D vector.
    # IMPORTANT: It combines all the local features found by the previous convolutional layers.

    baseline.add(Flatten())
    baseline.add(Dense(256, activation = "relu"))
    baseline.add(Dropout(0.4))

    # The net outputs a probability distribution over the classes --> one unit per output class

    baseline.add(Dense(num_classes, activation = "softmax"))

    # The optimizer will iteratively improve parameters in order to minimize the loss.

    optimizer = RMSprop(epsilon=1e-08)

    # Compile the baseline with the optimizer and track accuracy during training.
    # The metric is requested as "acc" on purpose: the rest of this notebook monitors 'val_acc'
    # and reads history['acc'] / history['val_acc'], and newer Keras versions report a metric
    # under exactly the name passed here (older ones already used 'acc').

    baseline.compile(optimizer = optimizer , loss = "categorical_crossentropy", metrics=["acc"])

    return baseline

### Learning Rate

In [0]:
# Halve the learning rate (factor = 0.5) whenever the monitored validation accuracy has not
# improved for 3 epochs in a row (patience), but never go below min_lr.

lr_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=0.00001,
    verbose=0,
)

### Data augmentation

In [0]:
# The idea is to alter the training data with small transformations that reproduce the variations
# occurring when someone writes a digit. It's a way to reduce overfitting of the model.

generator = ImageDataGenerator(featurewise_center = False,
                               samplewise_center = False,
                               featurewise_std_normalization = False,
                               samplewise_std_normalization = False,
                               zca_whitening = False,
                               rotation_range = 10, # Random rotation of up to 10 degrees
                               zoom_range = 0.10, # Random zoom of up to 10%
                               width_shift_range = 0.10, # Random horizontal shift (up to 10% of width)
                               height_shift_range = 0.10, # Random vertical shift (up to 10% of height)
                               horizontal_flip = False, # Flips are disabled
                               vertical_flip = False)

# No generator.fit(X_train) here: fit() only computes the dataset statistics needed by
# featurewise_center, featurewise_std_normalization and zca_whitening, all of which are
# disabled above. Calling it would only spend time and memory copying the training array.

### Creating 10 nets and training each one

In [0]:
# Ensemble size and the training hyper-parameters shared by every net.
nets = 10
epochs = 40
batch_size = 90

# One placeholder slot per net; the training loop overwrites each 0 with the
# model (digits) and the object returned by its training call (history).
digits = [0 for _ in range(nets)]
history = [0 for _ in range(nets)]

In [3]:
print("Creating {0} CNNs...".format(nets))
for net_idx in range(nets):
    digits[net_idx] = baseline_model()

    # Hold out 10% of the training data as this net's validation set.
    # random_state = seed + net_idx keeps the split reproducible across runs while still
    # giving every net a different split.
    # NOTE(review): the hold-out is drawn from the SMOTE-resampled data, so it can contain
    # synthetic samples -- validation accuracy may be optimistic.

    X_train_aux, X_test_aux, y_train_aux, y_test_aux = train_test_split(
        X_train, y_train, test_size = 0.1, random_state = seed + net_idx)

    # Train on augmented batches; validate on the untouched hold-out split.
    history[net_idx] = digits[net_idx].fit_generator(generator.flow(X_train_aux,
                                                                    y_train_aux,
                                                                    batch_size = batch_size),
                                                     epochs = epochs,
                                                     steps_per_epoch = X_train_aux.shape[0] // batch_size,
                                                     validation_data = (X_test_aux, y_test_aux),
                                                     callbacks=[lr_reduction],
                                                     verbose=0)

    # Depending on the Keras version and the metric name given to compile(), accuracy is
    # logged as 'acc' or 'accuracy' -- accept either spelling.
    logs = history[net_idx].history
    train_acc = logs['acc'] if 'acc' in logs else logs['accuracy']
    val_acc = logs['val_acc'] if 'val_acc' in logs else logs['val_accuracy']

    print("CNN {0:>2d}: Epochs = {1:d}, Train accuracy = {2:.5f}, Validation accuracy = {3:.5f}".format(
        net_idx + 1, # Number of the CNN
        epochs, # Total of epochs
        max(train_acc), # Maximum accuracy on the training split
        max(val_acc))) # Maximum accuracy on the validation split

Creating 10 CNNs...


NameError: ignored

### Ensembling: summing the class probabilities of all nets and keeping the most probable label

In [0]:
# Soft voting: accumulate the (n_images, 10) probability matrices predicted by every net.
summed_probabilities = np.zeros( (X_test.shape[0], 10) )

for net_idx in range(nets):
    summed_probabilities = summed_probabilities + digits[net_idx].predict(X_test)

# The predicted label is the column holding the largest summed probability.
label_predicted = pd.Series(np.argmax(summed_probabilities, axis = 1), name = "Label")

In [0]:
# Submission table: a 1-based ImageId next to the predicted label of the same row.
# The ids are derived from the number of predictions instead of a hard-coded 28001, so a
# test set of a different size still lines up row for row rather than producing NaN rows.
image_ids = pd.Series(range(1, len(label_predicted) + 1), name = "ImageId")
solution = pd.concat([image_ids, label_predicted], axis = 1)
solution.to_csv("solution_cnn_v9.csv", index=False)

In [0]:
# Preview the first 10 rows of the submission table.
solution.head(10)

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,0
4,5,3
5,6,7
6,7,0
7,8,3
8,9,0
9,10,3


In [0]:
# Send the submission file written above to the browser as a download.
# `files` is already imported from google.colab in the imports cell at the top of the notebook.
files.download('solution_cnn_v9.csv')

### Saving and loading the models

In [0]:
# Persist every trained net as two files: its architecture (JSON) and its weights (HDF5).
for model in range(nets):
    net = digits[model]
    json_path = 'model_' + str(model) + '.json'
    weights_path = 'model_' + str(model) + '.h5'

    # Serialize the architecture first, then write it out.
    architecture = net.to_json()
    with open(json_path, 'w') as json_file:
        json_file.write(architecture)

    net.save_weights(weights_path)

In [0]:
# Rebuild every net from the files written by the saving cell: architecture from
# model_<i>.json, then weights from model_<i>.h5.
model_loaded = [0] * nets
for model in range(nets):
    name = 'model_' + str(model) + '.json'
    # `with` closes the handle even if read() raises, matching how the files were written.
    with open(name, 'r') as json_file:
        loaded_model_json = json_file.read()
    model_loaded[model] = model_from_json(loaded_model_json)
    name = 'model_' + str(model) + '.h5'
    model_loaded[model].load_weights(name)