# Import Data

In [8]:
import os
import tensorflow as tf
from IPython.display import display
from PIL import Image

base_dir = 'chest_xray'

# Directory to our training data
train_folder = os.path.join(base_dir, 'train')

# Directory to our validation data
val_folder = os.path.join(base_dir, 'val')

# Directory to our training data
test_folder = os.path.join(base_dir, 'test')

TypeError: __init__() got an unexpected keyword argument 'serialized_options'

# Data Exploration

In [None]:
# List folders and number of files
print("Directory, Number of files")
for root, subdirs, files in os.walk(base_dir):
    print(root, len(files))

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Parameters for our graph; we'll output images in a 4x4 configuration
nrows = 4
ncols = 4

In [None]:
## Path to normal and pneumonia
train_normal_dir= "chest_xray/train/normal"
train_pneumonia_dir= "chest_xray/train/pneumonia"
train_normal_fnames = os.listdir(train_normal_dir)
train_pneumonia_fnames = os.listdir(train_pneumonia_dir)

# Set up matplotlib fig, and size it to fit 4x4 pics
fig = plt.gcf()
fig.set_size_inches(ncols, nrows)

pic_index = 8
next_normal_pix = [os.path.join(train_normal_dir, fname) 
                for fname in train_normal_fnames[pic_index-8:pic_index]]
next_pneumonia_pix = [os.path.join(train_pneumonia_dir, fname) 
                for fname in train_pneumonia_fnames[pic_index-8:pic_index]]

for i, img_path in enumerate(next_normal_pix+next_pneumonia_pix):
    # Set up subplot; subplot indices start at 1
    sp = plt.subplot(nrows, ncols, i + 1)
    sp.axis('Off') # Don't show axes (or gridlines)
    img = mpimg.imread(img_path)
    plt.imshow(img)

plt.show()

Images are in RGB

In [None]:
import pandas as pd

# An empty list. We will insert the data into this list in (img_path, label) format
train_data = []

# Go through all the normal cases. The label for these cases will be 0
for img in train_normal_fnames:
    train_data.append((img,0))

# Go through all the pneumonia cases. The label for these cases will be 1
for img in train_pneumonia_fnames:
    train_data.append((img, 1))
    
# Get a pandas dataframe from the data we have in our list 
train_data = pd.DataFrame(train_data, columns=['image', 'label'],index=None)

# Shuffle the data 
train_data = train_data.sample(frac=1.).reset_index(drop=True)

# How the dataframe looks like?
train_data.head()

In [None]:
import seaborn as sns

# Get the counts for each class
cases_count = train_data['label'].value_counts()
print(cases_count)

# Plot the results 
plt.figure(figsize=(10,8))
sns.barplot(x=cases_count.index, y= cases_count.values)
plt.title('Number of cases', fontsize=14)
plt.xlabel('Case type', fontsize=12)
plt.ylabel('Count', fontsize=12)
plt.xticks(range(len(cases_count.index)), ['Normal (0)', 'Pneumonia (1)'])
plt.show()

# Data Augmentation

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Batch size
bs = 10

# All images will be resized to this value
image_size = (32, 32)

# All images will be rescaled by 1./255 
train_datagen = ImageDataGenerator(rescale=1./255)
val_datagen = ImageDataGenerator(rescale=1./255)
test_datagen = ImageDataGenerator(rescale=1./255)

# Flow training images in batches of 10 using train_datagen generator
print("Preparing generator for train dataset")
train_generator = train_datagen.flow_from_directory(
    directory= train_folder, # This is the source directory for training images 
    target_size=image_size, # All images will be resized to value set in image_size
    batch_size=bs,
    class_mode='categorical')

# Flow validation images in batches of 10 using val_datagen generator
print("Preparing generator for validation dataset")
val_generator = val_datagen.flow_from_directory(
    directory= val_folder, 
    target_size=image_size,
    batch_size=bs,
    class_mode='categorical')

# Flow test images in batches of 10 using test_datagen generator
# Added shuffle=False to keep data in same order as labels
print("Preparing generator for test dataset")
test_generator = test_datagen.flow_from_directory(
    directory=test_folder,
    target_size=image_size,
    batch_size=bs)

# Model Layers/Parameters

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Flatten, Dense

# Here we specify the input shape of our data 
# This should match the size of images ('image_size') along with the number of channels (RGB = 3)
input_shape = (32, 32, 3)

# Define the number of classes
num_classes = 2

# Initialise the model
model = Sequential()

# First convolution extracts 16 filters that are of kernel size 3x3 
model.add(Conv2D(16, (3,3), 
                 padding='same', 
                 strides=2, 
                 input_shape=input_shape,
                 activation='relu'))

# Convolution is followed by max-pooling layer with a 2x2 window
model.add(MaxPooling2D(pool_size=(2, 2)))

# Second convolution extracts 32 filters that are of kernel size 3x3 
model.add(Conv2D(32, (3,3), 
                 padding='same', 
                 strides=2,
                 activation='relu'))

# Convolution is followed by max-pooling layer with a 2x2 window
model.add(MaxPooling2D(pool_size=(2, 2)))

# Flatten 2-dim matrix to 1-d vector so we can pass them through the fully connected layer (dense layer)
model.add(Flatten())

# Create a fully connected layer with ReLU activation and 128 hidden units
model.add(Dense(128, activation='relu'))

# Create an output layer with the number of classes and activate using softmax
model.add(Dense(num_classes, activation='softmax'))

In [None]:
model.summary()

In [None]:
from keras import optimizers

model.compile(loss='categorical_crossentropy', #multiclass problem with singular mututally exclusive labels (normal or pneumonia)
              optimizer=optimizers.Adam(lr=0.001),
              metrics=['accuracy'])

In [None]:
from keras.callbacks import ModelCheckpoint

bestValidationCheckpointer = ModelCheckpoint('train_model.hdf5', monitor='val_acc', save_best_only=True, verbose=1)

In [10]:
#Train the model with the full dataset
history = model.fit_generator(
        train_generator, # train generator has 5216 train images
        steps_per_epoch=train_generator.samples // bs + 1,
        epochs=30,
        validation_data=test_generator, # validation generator has 627 validation images
        validation_steps=test_generator.samples // bs + 1,
        callbacks=[bestValidationCheckpointer]
)

Instructions for updating:
Use tf.cast instead.
Epoch 1/30

Epoch 00001: val_acc improved from -inf to 0.76236, saving model to train_model.hdf5
Epoch 2/30

Epoch 00002: val_acc improved from 0.76236 to 0.76555, saving model to train_model.hdf5
Epoch 3/30

Epoch 00003: val_acc improved from 0.76555 to 0.78150, saving model to train_model.hdf5
Epoch 4/30

Epoch 00004: val_acc did not improve from 0.78150
Epoch 5/30

Epoch 00005: val_acc did not improve from 0.78150
Epoch 6/30

Epoch 00006: val_acc did not improve from 0.78150
Epoch 7/30

Epoch 00007: val_acc improved from 0.78150 to 0.78628, saving model to train_model.hdf5
Epoch 8/30

Epoch 00008: val_acc improved from 0.78628 to 0.79107, saving model to train_model.hdf5
Epoch 9/30

Epoch 00009: val_acc did not improve from 0.79107
Epoch 10/30

Epoch 00010: val_acc did not improve from 0.79107
Epoch 11/30

Epoch 00011: val_acc improved from 0.79107 to 0.79904, saving model to train_model.hdf5
Epoch 12/30

Epoch 00012: val_acc did not i

# Results and Analysis

In [16]:
import matplotlib.pyplot as plt

plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

AttributeError: 'Sequential' object has no attribute 'history'

There is increasing accuracy, but loss is also increasing. Overfitting is definitely a problem.

In [14]:
from keras.models import load_model

model_path = 'train_model.hdf5'
model = load_model( model_path )

Instructions for updating:
Use tf.cast instead.


In [13]:
val_generator.reset()

scores = model.evaluate_generator(train_generator, steps=train_generator.samples // val_generator.batch_size + 1, verbose=1)
print('Train loss:', scores[0])
print('Train accuracy:', scores[1])

scores = model.evaluate_generator(val_generator, steps=val_generator.samples // val_generator.batch_size + 1, verbose=1)
print('Val loss:', scores[0])
print('Val accuracy:', scores[1])

scores = model.evaluate_generator(test_generator, steps=test_generator.samples // val_generator.batch_size + 1, verbose=1) #627 images
print('Test loss:', scores[0])
print('Test accuracy:', scores[1])

Train loss: 0.0669769941605235
Train accuracy: 0.9729677866801528
Val loss: 0.09867863345425576
Val accuracy: 1.0
Test loss: 0.9784127618772894
Test accuracy: 0.8165869147203375


With 81.66% accuracy on the test set, the model is fairly effective, but can be improved.

# New Model For Higher Accuracy

In [9]:
import torch
import torchvision.models as models
resnet18 = models.resnet18(pretrained=True)
alexnet = models.alexnet(pretrained=True)
squeezenet = models.squeezenet1_0(pretrained=True)
vgg16 = models.vgg16(pretrained=True)
densenet = models.densenet161(pretrained=True)
inception = models.inception_v3(pretrained=True)
googlenet = models.googlenet(pretrained=True)
shufflenet = models.shufflenet_v2_x1_0(pretrained=True)
mobilenet = models.mobilenet_v2(pretrained=True)
resnext50_32x4d = models.resnext50_32x4d(pretrained=True)

ModuleNotFoundError: No module named 'torch'