# EDA Notebook

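This notebook is used for exploratory data analysis: it loads the image data, builds training, validation and test batches, and fits a baseline fully connected network and a first CNN on them.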

In [1]:
#imports
import os
import shutil
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from keras import layers, models
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img
from PIL import Image
from scipy import ndimage
from sklearn.metrics import accuracy_score, recall_score

In [2]:
# Root folders of the image data. Each folder holds one sub-folder per class
# (the training set uses `images/0` and `images/1`).
# Set CAPSTONE_TRAIN_IMAGES_DIR / CAPSTONE_TEST_IMAGES_DIR to run the notebook
# on another machine; the original local paths remain the defaults.
train_images_dir = os.environ.get(
    'CAPSTONE_TRAIN_IMAGES_DIR',
    'C:/Users/User/Documents/Flatiron/Capstone/train_data/images')
test_images_dir = os.environ.get(
    'CAPSTONE_TEST_IMAGES_DIR',
    'C:/Users/User/Documents/Flatiron/Capstone/test_data/images')

#### Prepping data for modeling

#### Image Generator Object

#### Training and Validation set creation

In [3]:
# Multiply every pixel value by 1/255 and reserve 30% of the training folder
# as a validation subset.
train_val_generator = ImageDataGenerator(rescale=1./255,
                                     validation_split = .30)
# batch_size is the number of images the iterator yields per batch
# (i.e. per call to next()), not the total number of training images.
# The fixed seed makes the shuffled batch order repeatable between runs.
train_data = train_val_generator.flow_from_directory(
        train_images_dir,
        target_size = (128,128),
        subset='training',
        batch_size=32,
        class_mode='binary',
        seed=42)

Found 3474 images belonging to 2 classes.


In [4]:
# Validation iterator over the 30% hold-out of the training folder, built from
# the same generator (and therefore the same rescaling) as the training data.
# The fixed seed makes the shuffled batch order repeatable between runs.
val_data = train_val_generator.flow_from_directory(train_images_dir,
       subset = 'validation',
       target_size = (128,128),
       batch_size=32,
       class_mode='binary',
       seed=42)

Found 1488 images belonging to 2 classes.


In [5]:
# Draw a single batch (images and labels) from the training iterator.
# NOTE(review): only this one batch is kept, so every model below is trained
# on 32 of the 3474 training images - confirm that this is intended.
train_images, train_labels = next(train_data)

In [6]:
# Draw a single batch (images and labels) from the validation iterator.
# NOTE(review): only this one batch is kept, so validation metrics below are
# computed on 32 of the 1488 validation images - confirm that this is intended.
val_images, val_labels = next(val_data)

In [7]:
# Sanity check: one batch of 32 RGB images at the 128x128 target size.
train_images.shape

(32, 128, 128, 3)

In [8]:
# Sanity check: one binary label per image in the training batch.
train_labels.shape

(32,)

In [9]:
# Sanity check: the validation batch has the same layout as the training batch.
val_images.shape

(32, 128, 128, 3)

In [10]:
# Sanity check: one binary label per image in the validation batch.
val_labels.shape

(32,)

In [11]:
# Turn the 1-D label vectors into float32 column vectors of shape (n, 1),
# the target layout used when fitting the models below.
y_train = np.array(train_labels, dtype='float32').reshape(-1, 1)
y_val = np.array(val_labels, dtype='float32').reshape(-1, 1)

### Test Set Creation

In [12]:
# Test images get the same 1/255 rescaling as the training data, with no
# validation split. Shuffling is disabled so the iteration order is fixed.
test_generator = ImageDataGenerator(rescale=1. / 255)
test_flow_options = dict(
    target_size=(128, 128),
    batch_size=32,
    class_mode='binary',
    shuffle=False,
)
test_data = test_generator.flow_from_directory(test_images_dir, **test_flow_options)

Found 857 images belonging to 2 classes.


In [13]:
# Draw the first batch (images and labels) from the test iterator.
# NOTE(review): only one batch of 32 is kept out of the 857 test images, and
# because the iterator was created with shuffle=False it is always the same
# leading files - possibly all from a single class; confirm before using it
# to score a model.
test_images, test_labels = next(test_data)

In [14]:
# Test labels as a float32 column vector of shape (n, 1), matching y_train / y_val.
y_test = np.array(test_labels, dtype='float32').reshape(-1, 1)


#### Flattening the images for the fully connected (dense) baseline network

In [15]:
# Flatten each training image into one row vector: (n, 128, 128, 3) -> (n, 49152).
train_img_nn = train_images.reshape(len(train_images), -1)

In [16]:
# Flatten each validation image into one row vector, same layout as train_img_nn.
val_img_nn = val_images.reshape(len(val_images), -1)

In [17]:
# Column-vector labels for the dense network. These are computed from the same
# label arrays, in the same way, as y_train / y_val above.
y_train_nn = np.array(train_labels, dtype='float32').reshape(-1, 1)
y_val_nn = np.array(val_labels, dtype='float32').reshape(-1, 1)


In [18]:
# Confirm that the flattened inputs and the column-vector labels line up row for row.
for checked_array in (train_img_nn, y_train_nn, val_img_nn, y_val_nn):
    print(checked_array.shape)

(32, 49152)
(32, 1)
(32, 49152)
(32, 1)


### Baseline

In [19]:
# Build a baseline fully connected model: three hidden ReLU layers
# (20, 7 and 5 units) followed by a single sigmoid unit for the binary label.
# The input width is read from the flattened images so it follows target_size
# instead of hard-coding 128 * 128 * 3 = 49152.
model = models.Sequential()
model.add(layers.Dense(20, activation='relu', input_shape=(train_img_nn.shape[1],)))
model.add(layers.Dense(7, activation='relu'))
model.add(layers.Dense(5, activation='relu'))
model.add(layers.Dense(1, activation='sigmoid'))

In [20]:
# Configure the baseline: SGD optimizer, binary cross-entropy loss, accuracy metric.
model.compile(loss='binary_crossentropy', optimizer='sgd', metrics=['accuracy'])

# Fit for 15 epochs, scoring the validation batch at the end of every epoch.
baseline_validation = (val_img_nn, y_val_nn)
histoire = model.fit(train_img_nn, y_train_nn, epochs=15, validation_data=baseline_validation)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [21]:
# Score the fitted baseline on the validation batch.
results_val = model.evaluate(x=val_img_nn, y=y_val_nn)



In [22]:
# Baseline validation results: the loss followed by the compiled accuracy metric.
results_val

[0.748617947101593, 0.59375]

In [23]:
# Score the fitted baseline on the batch it was trained on, for comparison
# with the validation results.
results_train = model.evaluate(x=train_img_nn, y=y_train_nn)
results_train



[0.40654969215393066, 0.75]

### Model Iterations

#### CNN 1

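Description: A first CNN (three Conv2D + MaxPooling2D stages followed by a 64-unit dense layer and a sigmoid output), trained with the SGD optimizer at its default settings.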

In [24]:
# CNN 1: three convolution + max-pooling stages, then a dense head ending in
# a single sigmoid unit for the binary label.
# No input shape is declared on the first layer.
# NOTE: this rebinds `model`, replacing the baseline network defined earlier.
model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(32, (4, 4), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Same optimizer and loss as the baseline; the accuracy metric is registered
# under the key 'acc' here.
model.compile(optimizer="sgd", loss='binary_crossentropy', metrics=['acc'])

In [25]:
# Train the CNN for 15 epochs on the in-memory image batch, validating on the
# held-out batch after every epoch.
# `use_multiprocessing` is intentionally not passed: Keras only applies it to
# generator / Sequence inputs, so it had no effect on NumPy arrays, and newer
# Keras releases no longer accept it as an argument to fit().
history = model.fit(train_images,
                    y_train,
                    epochs=15,
                    validation_data = (val_images,y_val)
                   )

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [26]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the fitted CNN.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (32, 126, 126, 32)        896       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (32, 63, 63, 32)          0         
_________________________________________________________________
conv2d_1 (Conv2D)            (32, 60, 60, 32)          16416     
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (32, 30, 30, 32)          0         
_________________________________________________________________
conv2d_2 (Conv2D)            (32, 28, 28, 64)          18496     
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (32, 14, 14, 64)          0         
_________________________________________________________________
flatten (Flatten)            (32, 12544)              

In [27]:
# Score CNN 1 on the batch it was trained on.
results_train_1 = model.evaluate(x=train_images, y=y_train)
results_train_1



[0.5862481594085693, 0.65625]

In [28]:
# Score CNN 1 on the validation batch.
results_val_1 = model.evaluate(x=val_images, y=y_val)
results_val_1



[0.6568219661712646, 0.6875]