# EDA Notebook

This notebook is used for exploratory data analysis

In [1]:
#imports 
import os, shutil
import pandas as pd
from sklearn.metrics import accuracy_score, recall_score
import time
import matplotlib.pyplot as plt
import scipy
import numpy as np
from PIL import Image
from scipy import ndimage
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from keras import models
from keras import layers

In [2]:
train_images_dir = 'C:/Users/User/Documents/Flatiron/Capstone/train_data/images'
test_images_dir = 'C:/Users/User/Documents/Flatiron/Capstone/test_data/images'

#train_images_dir_0 = 'C:/Users/User/Documents/Flatiron/Capstone/train_data/images/0'
#train_images_dir_1 = 'C:/Users/User/Documents/Flatiron/Capstone/train_data/images/1'

#### Prepping data for modeling

#### Image Generator Object

#### Training and Validation set creation

In [3]:
global_target_size = (128,128)
global_batch_size = 32

In [4]:
# Batch size here means how many 
#of the images do we want our model to be trained on
train_val_generator = ImageDataGenerator(rescale=1./255,
                                         validation_split = .10)
train_data = train_val_generator.flow_from_directory(train_images_dir,  
                                                     target_size = global_target_size,
                                                     subset='training',
                                                     batch_size=global_batch_size,
                                                     class_mode='binary')

Found 3156 images belonging to 2 classes.


In [5]:
val_data = train_val_generator.flow_from_directory(train_images_dir,
       subset = 'validation',  
       target_size = global_target_size,
       batch_size=global_batch_size,
       class_mode='binary')

Found 350 images belonging to 2 classes.


### Test Set Creation

In [6]:
test_generator = ImageDataGenerator(rescale=1./255)
test_data = test_generator.flow_from_directory(
        test_images_dir,  
        target_size = global_target_size,
        batch_size=global_batch_size,
        class_mode='binary',
        shuffle= False)

Found 809 images belonging to 2 classes.


### Model Iterations

###### Here I am creating a dictionary to store all of my scores

In [7]:
score_dictionary = {}

#### CNN 1

Description: Running with default parameters

In [8]:
model1 = models.Sequential()
model1.add(layers.Conv2D(filters = 32,
                         kernel_size=(3,3),
                         activation='relu',
                         input_shape = (global_target_size[0],global_target_size[0], 3)
                        ))
model1.add(layers.MaxPooling2D(pool_size = (2, 2)))

model1.add(layers.Conv2D(32, (4, 4), activation='relu'))
model1.add(layers.MaxPooling2D((2, 2)))

model1.add(layers.Conv2D(64, (3, 3), activation='relu'))
model1.add(layers.MaxPooling2D((2, 2)))

model1.add(layers.Flatten())
model1.add(layers.Dense(64, activation='relu'))
model1.add(layers.Dense(1, activation='sigmoid'))

model1.compile(loss='binary_crossentropy',
              optimizer = "adam",
              metrics=['acc'])

In [9]:
history1 = model1.fit(train_data,
                    epochs=15,
                    validation_data = val_data
                   )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [10]:
model1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 126, 126, 32)      896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 63, 63, 32)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 60, 60, 32)        16416     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 30, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 14, 14, 64)        0         
_________________________________________________________________
flatten (Flatten)            (None, 12544)             0

In [11]:
results_train_1 = model1.evaluate(train_data)
results_train_1



[0.023903056979179382, 0.9911280274391174]

In [12]:
results_val_1 = model1.evaluate(val_data)
results_val_1



[1.036498785018921, 0.7914285659790039]

In [13]:
score_dictionary['model 1'] =  results_val_1

CNN 1 Train: loss: 4.6420e-05 - acc: 1.0000
CNN 1 Validation: loss: 5.2256 - acc: 0.5000
Highly overfit

### CNN 2

my goal with this next model is to reduce overfitting - I will attempt to do that by adding a dropout later

In [14]:
model2 = models.Sequential()
model2.add(layers.Conv2D(32, (3, 3), activation='relu',
                         input_shape = (global_target_size[0],global_target_size[0], 3)
                        ))
model2.add(layers.MaxPooling2D((2, 2)))

model2.add(layers.Conv2D(32, (4, 4), activation='relu'))
model2.add(layers.MaxPooling2D((2, 2)))

model2.add(layers.Conv2D(64, (3, 3), activation='relu'))
model2.add(layers.MaxPooling2D((2, 2)))

model2.add(layers.Flatten())
model2.add(layers.Dense(64, activation='relu'))
model2.add(layers.Dropout(0.5))
model2.add(layers.Dense(1, activation='sigmoid'))

model2.compile(loss='binary_crossentropy',
              optimizer = "adam",
              metrics=['acc'])

In [15]:
history2 = model2.fit(train_data,
                    epochs=15,
                    #batch_size = 5,
                    validation_data = val_data
                   )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [16]:
results_train_2 = model2.evaluate(train_data)
results_train_2



[0.027039606124162674, 0.9923954606056213]

In [17]:
results_val_2 = model2.evaluate(val_data)
results_val_2



[0.8709557056427002, 0.7857142686843872]

In [18]:
score_dictionary['model 2'] =  results_val_2

### CNN 3

the last model reduced overfitting a little bit, but now enough as I would like, I am going to add another dropout layer to hopefully reduce the overfitting more 

In [19]:
model3 = models.Sequential()
model3.add(layers.Conv2D(32, (3, 3), activation='relu',
                         input_shape = (global_target_size[0],global_target_size[0], 3)
                        ))
model3.add(layers.MaxPooling2D((2, 2)))

model3.add(layers.Conv2D(32, (4, 4), activation='relu'))
model3.add(layers.MaxPooling2D((2, 2)))

model3.add(layers.Conv2D(64, (3, 3), activation='relu'))
model3.add(layers.MaxPooling2D((2, 2)))

model3.add(layers.Flatten())
model3.add(layers.Dense(64, activation='relu'))
model3.add(layers.Dropout(0.25))
model3.add(layers.Dense(64, activation='relu'))
model3.add(layers.Dropout(0.25))
model3.add(layers.Dense(1, activation='sigmoid'))

model3.compile(loss='binary_crossentropy',
              optimizer = "adam",
              metrics=['acc'])

In [20]:
history3 = model3.fit(train_data,
                    epochs=15,
                    #batch_size = 5,
                    validation_data = val_data
                   )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [21]:
results_train_3 = model3.evaluate(train_data)
results_train_3



[0.020531611517071724, 0.9942965507507324]

In [22]:
results_val_3 = model3.evaluate(val_data)
results_val_3



[1.4455512762069702, 0.6971428394317627]

In [23]:
score_dictionary['model 3'] =  results_val_3

### CNN 4

This is just for exploratory analysis - to see what does what 

In [24]:
# importing schocastic gradient descent and creating my own learning rate... using that as optimier
#from keras.optimizers import SGD
#opt = SGD(lr=0.1)

In [25]:
#import tensorflow as tf

In [26]:
model4 = models.Sequential()
model4.add(layers.Conv2D(filters = 32,
                         kernel_size=(3,3),
                         activation='relu',
                         input_shape = (global_target_size[0],global_target_size[0], 3)
                        ))
model4.add(layers.MaxPooling2D(pool_size = (2, 2)))

model4.add(layers.Conv2D(32, (4, 4), activation='relu'))
model4.add(layers.MaxPooling2D((2, 2)))

model4.add(layers.Conv2D(64, (3, 3), activation='relu'))
model4.add(layers.MaxPooling2D((2, 2)))

model4.add(layers.Conv2D(64, (3, 3), activation='relu'))
model4.add(layers.MaxPooling2D((2, 2)))

model4.add(layers.Flatten())
model4.add(layers.Dense(64, activation='relu'))
model4.add(layers.Dense(1, activation='sigmoid'))

model4.compile(loss='binary_crossentropy',
              optimizer = 'adam',
              metrics=['acc'])

In [27]:
history4 = model4.fit(train_data,
                    epochs=15,
                    validation_data = val_data
                   )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [28]:
model4.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_9 (Conv2D)            (None, 126, 126, 32)      896       
_________________________________________________________________
max_pooling2d_9 (MaxPooling2 (None, 63, 63, 32)        0         
_________________________________________________________________
conv2d_10 (Conv2D)           (None, 60, 60, 32)        16416     
_________________________________________________________________
max_pooling2d_10 (MaxPooling (None, 30, 30, 32)        0         
_________________________________________________________________
conv2d_11 (Conv2D)           (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d_11 (MaxPooling (None, 14, 14, 64)        0         
_________________________________________________________________
conv2d_12 (Conv2D)           (None, 12, 12, 64)       

In [29]:
results_train_4 = model4.evaluate(train_data)
results_train_4



[0.01875309832394123, 0.9933460354804993]

In [30]:
results_val_4 = model4.evaluate(val_data)
results_val_4



[1.13747239112854, 0.7571428418159485]

In [31]:
score_dictionary['model 4'] =  results_val_4

### CNN 5

###### For this CNN, I am trying something different. Since my model is severely overfitting, I looked up how to reduce that and all of the articles I read said to add more data. I don't have more data readily available, so I am going to use the ImageDataGenerator to create more augmented images in hopes that this will reduce overfitting

In [32]:
# Batch size here means how many 
#of the images do we want our model to be trained on

# here is where I am experimenting with augmenting the data to yield reduced overfitting
train_val_generator2 = ImageDataGenerator(rescale=1./255,
                                          validation_split = .20,
                                          rotation_range=15,
                                          width_shift_range=0.1,
                                          height_shift_range=0.1,
                                          shear_range=0.01,
                                          zoom_range=[0.9, 1.25],
                                          horizontal_flip=True,
                                          vertical_flip=False,
                                          fill_mode='reflect',
                                          data_format='channels_last',
                                          brightness_range=[0.5, 1.5]
                                         )

train_data2 = train_val_generator2.flow_from_directory(train_images_dir,  
                                                     target_size = global_target_size,
                                                     subset='training',
                                                     batch_size=global_batch_size,
                                                     class_mode='binary')

Found 2805 images belonging to 2 classes.


In [33]:
val_data2 = train_val_generator2.flow_from_directory(train_images_dir,
       subset = 'validation',  
       target_size = global_target_size,
       batch_size=global_batch_size,
       class_mode='binary')

Found 701 images belonging to 2 classes.


In [34]:
#train_images2, train_labels2 = next(train_data2)

In [35]:
#val_images2, val_labels2 = next(val_data2)

In [36]:
# reshaping the y values to be modeled
#y_train2 = np.asarray(train_labels2).astype('float32').reshape((-1,1))
#y_val2 = np.asarray(val_labels2).astype('float32').reshape((-1,1))

In [37]:
model3 = models.Sequential()
model3.add(layers.Conv2D(32, (3, 3), activation='relu',
                         input_shape = (global_target_size[0],global_target_size[0], 3)
                        ))
model3.add(layers.MaxPooling2D((2, 2)))

model3.add(layers.Conv2D(32, (4, 4), activation='relu'))
model3.add(layers.MaxPooling2D((2, 2)))

model3.add(layers.Conv2D(64, (3, 3), activation='relu'))
model3.add(layers.MaxPooling2D((2, 2)))

model3.add(layers.Flatten())
model3.add(layers.Dense(64, activation='relu'))
model3.add(layers.Dropout(0.5))
model3.add(layers.Dense(64, activation='relu'))
model3.add(layers.Dropout(0.5))
model3.add(layers.Dense(1, activation='sigmoid'))

model3.compile(loss='binary_crossentropy',
              optimizer = "adam",
              metrics=['acc'])

In [38]:
history3 = model3.fit(train_data2,
                    #y_train2,
                    epochs=15,
                    #use_multiprocessing=True,
                      batch_size =32,
                    validation_data = val_data2
                   )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [39]:
score_dictionary['model 5'] =  results_val_5

NameError: name 'results_val_5' is not defined

In [40]:
score_dictionary

{'model 1': [1.036498785018921, 0.7914285659790039],
 'model 2': [0.8709557056427002, 0.7857142686843872],
 'model 3': [1.4455512762069702, 0.6971428394317627],
 'model 4': [1.13747239112854, 0.7571428418159485]}