# Data Augmentation for fixing the problem of Overfitting the model

> In this notebook we will build a CNN to classify flower images. We will also see how our model overfits and how overfitting can be addressed using data augmentation. 

> Data augmentation is a process of generating new training samples from current training dataset using transformations such as zoom, rotations, change in contrast etc

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import cv2
import os
import PIL
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential

In [2]:
dataset_url = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_dir = tf.keras.utils.get_file('flower_photos', origin=dataset_url,  cache_dir='.', untar=True)
# cache_dir indicates where to download data. I specified . which means current directory
# untar true will unzip it

Downloading data from https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz


In [3]:
data_dir

'./datasets/flower_photos'

In [4]:
import pathlib
data_dir=pathlib.Path(data_dir)
data_dir

PosixPath('datasets/flower_photos')

In [5]:
list(data_dir.glob('*/*.jpg'))

[PosixPath('datasets/flower_photos/sunflowers/200557981_f800fa1af9.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/4821232343_7e0bcfbfdf_n.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/3568925290_faf7aec3a0.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/14646283472_50a3ae1395.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/4805544785_a63241f6d0_n.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/8543642705_b841b0e5f6.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/7510262868_cf7d6f6f25_n.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/22478719251_276cb094f9_n.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/5917253022_4e3142d48b_n.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/9216286162_6ceefdd1b4_m.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/3912497870_a2f91c3a65_n.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/2894191705_a1d2d80c80.jpg'),
 PosixPath('datasets/flower_photos/sunflowers/5437996076_cf7e2ac32e_n.jpg'),
 PosixPa

In [7]:
len(list(data_dir.glob('*/*.jpg')))

3670

In [10]:
flowers_images_dict = {
    'roses': list(data_dir.glob('roses/*')),
    'daisy': list(data_dir.glob('daisy/*')),
    'dandelion': list(data_dir.glob('dandelion/*')),
    'sunflowers': list(data_dir.glob('sunflowers/*')),
    'tulips': list(data_dir.glob('tulips/*')),
}

In [13]:
len(flowers_images_dict)

5

In [14]:
len(flowers_images_dict['roses'])

641

In [15]:
flowers_labels_dict = {
    'roses': 0,
    'daisy': 1,
    'dandelion': 2,
    'sunflowers': 3,
    'tulips': 4,
}

In [19]:
cv2.imread(str(flowers_images_dict['roses'][0]))

array([[[235, 215, 174],
        [235, 216, 173],
        [234, 215, 170],
        ...,
        [173, 160, 134],
        [164, 153, 133],
        [152, 145, 126]],

       [[238, 214, 172],
        [236, 215, 170],
        [236, 216, 169],
        ...,
        [177, 160, 134],
        [166, 152, 130],
        [154, 142, 124]],

       [[239, 214, 170],
        [238, 215, 169],
        [239, 217, 169],
        ...,
        [173, 160, 134],
        [160, 152, 129],
        [149, 142, 122]],

       ...,

       [[ 45,  66,  58],
        [ 42,  65,  57],
        [ 42,  65,  57],
        ...,
        [ 21,  21,  21],
        [ 20,  20,  20],
        [ 17,  17,  17]],

       [[ 40,  62,  57],
        [ 40,  63,  55],
        [ 40,  63,  55],
        ...,
        [ 28,  26,  25],
        [ 25,  23,  22],
        [ 21,  19,  19]],

       [[ 38,  60,  55],
        [ 38,  61,  53],
        [ 38,  62,  54],
        ...,
        [ 45,  42,  37],
        [ 41,  38,  34],
        [ 36,  32,  31]]

In [18]:
img=cv2.imread(str(flowers_images_dict['roses'][0]))

In [20]:
img.shape

(333, 500, 3)

In [21]:
cv2.resize(img,(180,180)).shape

(180, 180, 3)

In [22]:
X, y = [], []

for flower_name, images in flowers_images_dict.items():
    for image in images:
        img = cv2.imread(str(image))
        resized_img = cv2.resize(img,(180,180))
        X.append(resized_img)
        y.append(flowers_labels_dict[flower_name])

In [23]:
flowers_labels_dict['roses']

0

In [24]:
X=np.array(X)
y=np.array(y)

In [25]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [26]:
len(X_train)

2936

In [27]:
len(X_test)

734

In [28]:
X_train[:2]

array([[[[245, 236, 232],
         [244, 235, 232],
         [243, 235, 235],
         ...,
         [205, 200, 202],
         [203, 198, 200],
         [207, 200, 203]],

        [[245, 236, 232],
         [244, 236, 233],
         [243, 235, 235],
         ...,
         [207, 202, 204],
         [206, 201, 203],
         [209, 202, 205]],

        [[245, 238, 233],
         [243, 236, 234],
         [243, 235, 235],
         ...,
         [210, 205, 207],
         [211, 204, 207],
         [213, 206, 209]],

        ...,

        [[193, 195, 195],
         [195, 195, 195],
         [195, 195, 195],
         ...,
         [  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0]],

        [[195, 198, 197],
         [197, 198, 197],
         [199, 197, 197],
         ...,
         [  0,   0,   0],
         [  0,   0,   0],
         [  0,   0,   0]],

        [[196, 200, 198],
         [199, 200, 198],
         [201, 199, 198],
         ...,
         [  0,   0,   0],
        

In [29]:
X_train_scaled=X_train/255
X_test_scaled=X_test/255

In [30]:
X_train_scaled[:2]

array([[[[0.96078431, 0.9254902 , 0.90980392],
         [0.95686275, 0.92156863, 0.90980392],
         [0.95294118, 0.92156863, 0.92156863],
         ...,
         [0.80392157, 0.78431373, 0.79215686],
         [0.79607843, 0.77647059, 0.78431373],
         [0.81176471, 0.78431373, 0.79607843]],

        [[0.96078431, 0.9254902 , 0.90980392],
         [0.95686275, 0.9254902 , 0.91372549],
         [0.95294118, 0.92156863, 0.92156863],
         ...,
         [0.81176471, 0.79215686, 0.8       ],
         [0.80784314, 0.78823529, 0.79607843],
         [0.81960784, 0.79215686, 0.80392157]],

        [[0.96078431, 0.93333333, 0.91372549],
         [0.95294118, 0.9254902 , 0.91764706],
         [0.95294118, 0.92156863, 0.92156863],
         ...,
         [0.82352941, 0.80392157, 0.81176471],
         [0.82745098, 0.8       , 0.81176471],
         [0.83529412, 0.80784314, 0.81960784]],

        ...,

        [[0.75686275, 0.76470588, 0.76470588],
         [0.76470588, 0.76470588, 0.76470588]

# Training the model using CNN

In [37]:
num_classes=5
cnn= Sequential([
                 layers.Conv2D(16,3,padding='same', activation='relu'),
                 layers.MaxPooling2D(),
                 layers.Conv2D(32,3, padding='same', activation='relu'),
                 layers.MaxPooling2D(),
                 layers.Conv2D(64,3,padding='same',activation='relu'),
                 layers.MaxPooling2D(),

                 layers.Flatten(),
                 layers.Dense(128, activation='relu'),
                 layers.Dense(num_classes)
])

cnn.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])

In [38]:
cnn.fit(X_train_scaled,y_train,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f38103cf350>

In [39]:
cnn.evaluate(X_test_scaled,y_test)



[0.9455143809318542, 0.6457765698432922]

In [41]:
predictions=cnn.predict(X_test_scaled)

In [42]:
predictions

array([[-0.12969632,  0.8424153 ,  3.902874  , -1.4062488 ,  1.9552941 ],
       [ 8.620602  , -1.8787518 , -5.5569186 , -3.6326866 ,  5.545794  ],
       [-0.28404972,  2.7293873 ,  8.184557  , -4.0392084 ,  1.4131677 ],
       ...,
       [ 1.4712942 , -0.10070462, -0.5958167 ,  0.2648753 ,  3.803593  ],
       [-1.7362819 , -0.6667512 ,  2.636825  ,  0.31220308, -0.19403559],
       [ 0.44144887,  0.4917045 , -0.5271903 ,  0.89175993,  1.4350162 ]],
      dtype=float32)

In [43]:
score=tf.nn.softmax(predictions[0])
score

<tf.Tensor: shape=(5,), dtype=float32, numpy=
array([0.01462571, 0.03866338, 0.8249737 , 0.00408054, 0.11765676],
      dtype=float32)>

In [45]:
np.argmax(score) # Predictions

2

In [46]:
y_test[0] # Actual  

1

In [47]:
data_augmentation = keras.Sequential(
  [
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
    layers.RandomZoom(0.1),
  ]
)

In [49]:
num_classes=5
cnn_data_augmented= Sequential([
                 data_augmentation,
                 layers.Conv2D(16,3,padding='same', activation='relu'),
                 layers.MaxPooling2D(),
                 layers.Conv2D(32,3, padding='same', activation='relu'),
                 layers.MaxPooling2D(),
                 layers.Conv2D(64,3,padding='same',activation='relu'),
                 layers.MaxPooling2D(),
                 layers.Dropout(0.2),

                 layers.Flatten(),
                 layers.Dense(128, activation='relu'),
                 layers.Dense(num_classes)
])

cnn_data_augmented.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
cnn_data_augmented.fit(X_train_scaled,y_train,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f381003c950>

In [50]:
cnn_data_augmented.evaluate(X_test_scaled,y_test)



[0.7797755002975464, 0.7247956395149231]