# Section 5.2
## Handle the dataset
The full dataset is stored in the project root directory under `datasets`, and a small subset is copied to this chapter's own `datasets` directory. The subset is split into 1000 training, 500 validation and 500 test samples per class, each in its own folder.

NOTE: add commands that use the Kaggle API to pull the data.

In [13]:
import os, shutil

def create_dir_if_not_existing(dir_name):
    """Create ``dir_name`` unless it is already present.

    Missing parent directories are created as well, so the call also works
    when the enclosing folder has not been created yet.

    Args:
        dir_name: Path of the directory to create.

    Raises:
        FileExistsError: If ``dir_name`` exists but is not a directory.
    """
    try:
        os.makedirs(dir_name)
    except FileExistsError:
        # Only an existing *directory* is safe to reuse. A regular file at this
        # path would make every later copy into it fail, so surface it now.
        if not os.path.isdir(dir_name):
            raise
        print(f'Directory:"{dir_name}" already exists, continuing.')
    
# Source images and the root of the small working copy.
original_dataset_dir = '../datasets/dogs-vs-cats/train'
base_dir = './datasets/cats_and_dogs_small'

# One folder per split ...
train_dir = os.path.join(base_dir, 'train')
validation_dir = os.path.join(base_dir, 'validation')
test_dir = os.path.join(base_dir, 'test')

# ... and inside each split one folder per class.
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')
validation_cats_dir = os.path.join(validation_dir, 'cats')
validation_dogs_dir = os.path.join(validation_dir, 'dogs')
test_cats_dir = os.path.join(test_dir, 'cats')
test_dogs_dir = os.path.join(test_dir, 'dogs')

# Parents are listed before their children so every folder's parent exists
# by the time the folder itself is created.
for directory in (
    base_dir,
    train_dir,
    validation_dir,
    test_dir,
    train_cats_dir,
    train_dogs_dir,
    validation_cats_dir,
    validation_dogs_dir,
    test_cats_dir,
    test_dogs_dir,
):
    create_dir_if_not_existing(directory)

# Split into train/validation/test sets of 1000/500/500 images per class.
def copy_datset(fnames,src_dir,dst_dir):
    """Copy every file named in ``fnames`` from ``src_dir`` into ``dst_dir``.

    Each file keeps its name in the destination folder.

    Args:
        fnames: Iterable of file names (not paths) to copy.
        src_dir: Directory the files are read from.
        dst_dir: Directory the files are written to.
    """
    for name in fnames:
        shutil.copyfile(os.path.join(src_dir, name), os.path.join(dst_dir, name))
        
# (image index range, cat destination, dog destination) for each split.
# The ranges do not overlap, so no image lands in more than one split.
split_plan = [
    (range(1000), train_cats_dir, train_dogs_dir),
    (range(1000, 1500), validation_cats_dir, validation_dogs_dir),
    (range(1500, 2000), test_cats_dir, test_dogs_dir),
]

for indices, cats_dst, dogs_dst in split_plan:
    copy_datset([f'cat.{i}.jpg' for i in indices], original_dataset_dir, cats_dst)
    copy_datset([f'dog.{i}.jpg' for i in indices], original_dataset_dir, dogs_dst)

Directory:"./datasets/cats_and_dogs_small" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\train" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\validation" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\test" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\train\cats" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\train\dogs" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\validation\cats" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\validation\dogs" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\test\cats" already exists, continuing.
Directory:"./datasets/cats_and_dogs_small\test\dogs" already exists, continuing.


In [15]:
# Report how many files each split/class folder contains.
image_dirs = [
    ("Train cat images:", train_cats_dir),
    ("Train dog images:", train_dogs_dir),
    ("Validation cat images:", validation_cats_dir),
    ("Validation dog images:", validation_dogs_dir),
    ("Test cat images:", test_cats_dir),
    ("Test dog images:", test_dogs_dir),
]
for heading, folder in image_dirs:
    print(f"{heading}{len(os.listdir(folder))}")

Train cat images:1000
Train dog images:1000
Validation cat images:500
Validation dog images:500
Test cat images:500
Test dog images:500


NOTE: since the dataset contains equal numbers of cat and dog images, accuracy is a valid measure of success (this is a balanced binary classification problem).

## Preprocess data


In [None]:
from keras.preprocessing.image import ImageDataGenerator
# Both generators apply the same 1/255 rescale factor to the images they load.
train_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Settings shared by both directory iterators: 150x150 target size,
# batches of 20, and binary labels.
flow_options = dict(target_size=(150, 150), batch_size=20, class_mode='binary')
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)
test_generator = test_datagen.flow_from_directory(test_dir, **flow_options)

## Create network

In [21]:
from keras import layers, models, optimizers

# Small convnet for binary classification: four Conv2D + MaxPooling2D stages
# followed by a dense classifier that ends in a single sigmoid unit.
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', input_shape=(150, 150, 3)))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(64, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Conv2D(128, (3, 3), activation='relu'))
model.add(layers.MaxPooling2D((2, 2)))
model.add(layers.Flatten())
model.add(layers.Dense(512, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))
model.summary()

# Use the documented `RMSprop` class: the lowercase `optimizers.rmsprop` alias
# is not available in every Keras release. `lr` is kept as the argument name
# to match the Keras version this notebook was written against.
model.compile(loss='binary_crossentropy', optimizer=optimizers.RMSprop(lr=1e-4), metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_11 (Conv2D)           (None, 148, 148, 32)      896       
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 74, 74, 32)        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 72, 72, 64)        18496     
_________________________________________________________________
max_pooling2d_12 (MaxPooling (None, 36, 36, 64)        0         
_________________________________________________________________
conv2d_13 (Conv2D)           (None, 34, 34, 128)       73856     
_________________________________________________________________
max_pooling2d_13 (MaxPooling (None, 17, 17, 128)       0         
_________________________________________________________________
conv2d_14 (Conv2D)           (None, 15, 15, 128)       147584    
__________