# Building a strong image classification model using keras


**** CONTENTS::****

**** 1 ) Importing Various Modules****

 ****2 ) Prepare the Data****
 
 ****3 ) Modelling****
 
 ****4 ) Evaluating the Model Performance****
 
 ****5) Making Predictions on the Test Set****
 
**** 6) Saving Submissions onto a CSV  ****


## 1)Importing libraries and constants for Preprocessing

In [1]:
import cv2                 # working with, mainly resizing, images
import numpy as np         # dealing with arrays
import os                  # dealing with directories
from random import shuffle # mixing up or currently ordered data that might lead our network astray in training.

train_dir = '../input/train'
test_dir = '../input/test'

## 2)Preparing the data

## 2.1)conversion to one-hot array for cat it is [1,0] and for dog it is [0,1]

In [2]:
def get_label(img):
    label = img.split('.')[0]
    if label == 'cat': 
        return [1,0]
    elif label == 'dog': 
        return [0,1]

## 2.2)Building  another function to fully process the training images and their labels into arrays:-

In [3]:
from tqdm import tqdm      # a nice pretty percentage bar for tasks.

def making_train_data():
    training_data = []
    
    for img in tqdm(os.listdir(train_dir)):
        label = get_label(img)
        path = os.path.join(train_dir,img)
        img = cv2.imread(path,cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img, (50,50))
        training_data.append([np.array(img),np.array(label)])
        
    shuffle(training_data)
    np.save('train_data.npy', training_data)
    return training_data

In [4]:
def making_test_data():
    testing_data = []
    
    for img in tqdm(os.listdir(test_dir)):
        path = os.path.join(test_dir , img)
        img_num = img.split('.')[0]
        img = cv2.imread(path , cv2.IMREAD_GRAYSCALE)
        img = cv2.resize(img , (50,50))
        testing_data.append([np.array(img), img_num])
        
    shuffle(testing_data)
    np.save('test_data.npy', testing_data)
    return testing_data

In [5]:
train_data = making_train_data()

100%|██████████| 25000/25000 [01:39<00:00, 250.21it/s]


## 2.3)Split the train_data into train(having 20,000 images) and test(having 5,000 images) 

In [6]:
train = train_data[0:20000]
test = train_data[20000:25000]

In [7]:
X = np.array([i[0] for i in train]).reshape(-1,1,50,50)
Y = [i[1] for i in train]

test_x = np.array([i[0] for i in test]).reshape(-1,1,50,50)
test_y = [i[1] for i in test]

## 2.4)Data Augmentation to prevent Overfitting

In [8]:
from keras.preprocessing.image import ImageDataGenerator

datagen = ImageDataGenerator(
        featurewise_center=False,  
        samplewise_center=False,  
        featurewise_std_normalization=False,  
        samplewise_std_normalization=False,  
        zca_whitening=False,  
        rotation_range=10,  
        zoom_range = 0.0,  
        width_shift_range=0.1,  
        height_shift_range=0.1,  
        horizontal_flip=False, 
        vertical_flip=False)  

datagen.fit(X)

Using TensorFlow backend.
  ' channels).')


In [9]:
Y = np.asarray(Y)
Y.reshape(len(Y) , 2)

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]])

In [10]:
test_y = np.asarray(test_y)
test_y.reshape(len(test_y) , 2)

array([[1, 0],
       [0, 1],
       [1, 0],
       ...,
       [1, 0],
       [0, 1],
       [0, 1]])

In [11]:
test_x = test_x.reshape(-1, 1, 50, 50)

In [12]:
test_x = test_x / 255
X = X / 255

****We will be using the Sequential model from Keras to form the Neural Network. Sequential Model is used to construct simple models with linear stack of layers.****

****More info on Sequential model and Keras in general at https://keras.io/getting-started/sequential-model-guide/ and https://github.com/keras-team/keras****

## ## 3)Modelling

In [13]:
from keras.models import Sequential
from keras.layers import Dense , Activation
from keras.layers import Dropout
from keras.layers import Flatten
from keras.constraints import maxnorm
from keras.optimizers import SGD
from keras.layers import Convolution2D
from keras.layers import Conv2D , BatchNormalization
from keras.layers import MaxPooling2D
from keras.utils import np_utils
from keras import backend as K
K.set_image_dim_ordering('th')

## 3.1)Building the ConvNet Model

In [14]:
# Initialising the CNN
classifier = Sequential()

# Step 1 - Convolution
classifier.add(Convolution2D(32, 3, 3, input_shape = (1,50,50), activation = 'relu'))

# Step 2 - Pooling
classifier.add(MaxPooling2D(pool_size = (2, 2)))

# Adding a second convolutional layer
classifier.add(Convolution2D(32, 3, 3, activation = 'relu'))
classifier.add(MaxPooling2D(pool_size = (2, 2)))



# Adding a third convolutional layer
classifier.add(Convolution2D(64, 3, 3, activation = 'relu'))
classifier.add(MaxPooling2D(pool_size = (2, 2)))




# Step 3 - Flattening
classifier.add(Flatten())

# Step 4 - Full connection
classifier.add(Dense(output_dim = 64, activation = 'relu'))
classifier.add(Dropout(0.4))
classifier.add(Dense(output_dim = 2, activation = 'sigmoid'))



  """
  # This is added back by InteractiveShellApp.init_path()


## 3.2)Compiling the Keras Model

In [15]:
# Compiling the CNN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

## 3.3)Summary of the Model

In [16]:
classifier.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_1 (Conv2D)            (None, 32, 48, 48)        320       
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 32, 24, 24)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 32, 22, 22)        9248      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 32, 11, 11)        0         
_________________________________________________________________
conv2d_3 (Conv2D)            (None, 64, 9, 9)          18496     
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 64, 4, 4)          0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1024)              0         
__________

## 3.4)Fitting on the Training set and making predcitons on the Validation set

In [17]:
batch_size = 128
epochs = 20

classifier.compile(loss='binary_crossentropy', optimizer='adam' , metrics=['accuracy'])
steps_per_epoch = len(train_data) // batch_size
validation_steps = len((test_x, test_y)) // batch_size

In [18]:
history = classifier.fit_generator(datagen.flow(X, Y, batch_size=batch_size),
                    steps_per_epoch=X.shape[0] // batch_size,
                    validation_data=(test_x, test_y),
                    epochs = epochs, verbose = 2)

  str(self.x.shape[channels_axis]) + ' channels).')


Epoch 1/20
 - 27s - loss: 0.6778 - acc: 0.5572 - val_loss: 0.6393 - val_acc: 0.6317
Epoch 2/20
 - 27s - loss: 0.6291 - acc: 0.6496 - val_loss: 0.6095 - val_acc: 0.6565
Epoch 3/20
 - 27s - loss: 0.5749 - acc: 0.7027 - val_loss: 0.5542 - val_acc: 0.7103
Epoch 4/20
 - 27s - loss: 0.5338 - acc: 0.7361 - val_loss: 0.5229 - val_acc: 0.7452
Epoch 5/20
 - 27s - loss: 0.4942 - acc: 0.7648 - val_loss: 0.4730 - val_acc: 0.7718
Epoch 6/20
 - 27s - loss: 0.4758 - acc: 0.7777 - val_loss: 0.4677 - val_acc: 0.7776
Epoch 7/20
 - 27s - loss: 0.4613 - acc: 0.7844 - val_loss: 0.4453 - val_acc: 0.7904
Epoch 8/20
 - 27s - loss: 0.4471 - acc: 0.7968 - val_loss: 0.4457 - val_acc: 0.7873
Epoch 9/20
 - 27s - loss: 0.4285 - acc: 0.8049 - val_loss: 0.4267 - val_acc: 0.8009
Epoch 10/20
 - 27s - loss: 0.4134 - acc: 0.8092 - val_loss: 0.4123 - val_acc: 0.8125
Epoch 11/20
 - 26s - loss: 0.3974 - acc: 0.8211 - val_loss: 0.4112 - val_acc: 0.8102
Epoch 12/20
 - 27s - loss: 0.3963 - acc: 0.8194 - val_loss: 0.4062 - val_a

In [19]:
test_data = making_test_data()

100%|██████████| 12500/12500 [00:47<00:00, 261.54it/s]


In [20]:
score = classifier.evaluate(test_x, test_y, verbose=0)
print('valid loss:', score[0])
print('valid accuracy:', score[1])

valid loss: 0.3688554729938507
valid accuracy: 0.8385


In [21]:
with open('submission_file.csv','w') as f:
    f.write('id,label\n')
            
with open('submission_file.csv','a') as f:
    for data in tqdm(test_data):
        img_num = data[1]
        img_data = data[0]
        orig = img_data
        data = img_data.reshape(1,1,50,50)
        model_out = classifier.predict([data])[0]
        f.write('{},{}\n'.format(img_num,model_out[1]))

100%|██████████| 12500/12500 [00:18<00:00, 677.94it/s]


In [None]:
import pandas as pd
aa = pd.read_csv('submission_file.csv')
aa


In [None]:
from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))
print(check_output(["ls", "../working"]).decode("utf8"))

In [None]:
data_to_submit = pd.DataFrame({
    'id':aa['id'],
    'label':aa['label']
})
data_to_submit.to_csv('submission_file.csv', index = False)