In [217]:
import pandas as pd
import numpy as np
import os

import pickle
from datetime import datetime


# Visualizations
import seaborn as sns
import matplotlib.pyplot as pd


# preprocessing
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.image import rgb_to_grayscale


# Reshaping 
from tensorflow import reshape
from tensorflow.image import resize_with_pad


# Modelling 
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam




### Load Train Data From the Current Directory

In [127]:
# Get current working directory
PATH = os.getcwd() + '/../../src/data/chest_xray/'


# importing training normal data
norm_train_path = PATH+'train/NORMAL/'
norm_train_batch = os.listdir(norm_train_path) 

norm_train = []
norm_errors = []
for image_name in (norm_train_batch[:10]): 
    img_path = norm_train_path + image_name 
    try:
        x = image.load_img(img_path) 
        # preprocessing if required 
        norm_train.append(x) 
    except:
        norm_errors.append(image_name)
    
    
# importing training pnuemonia data
pnue_train_path = PATH + 'train/PNEUMONIA/'
pnue_train_batch = os.listdir(pnue_train_path)

pnue_train = []
pnue_errors = []
for image_name in pnue_train_batch[:10]:
    img_path = pnue_train_path + image_name
    try:
        x = image.load_img(img_path)
        pnue_train.append(x)
    except:
        pnue_errors.append(image_name)
    


# Define an import function

In [138]:
def import_image(PATH, image_name):
    """
    PATH --> str: Relative path to image directoy
    image_name --> str: Name of the image to load
    
    Returns:
    PIL image
    
    """
    
    # create path to file
    img_path = PATH + "/" + image_name
    
    # load file and return pil
    return image.load_img(img_path) 

def grayscale_and_resize(PIL, shape=(256,256), padding=False):
    """
    This is the preprocessing function that will take the raw jpeg, gray scale it, resize it and 
    turn it into an array
    
    
    PIL --> PIL object
    shape --> tuple: size of the final array
    padding --> bool: if True, will use tf.resize_with_pad
    """
    if padding:
        gray_image = rgb_to_grayscale(PIL)
        resized_image_arr = resize_with_pad(gray_image, target_height=shape[0], target_width=shape[1])
    else:
        resized_image_arr = img_to_array(PIL.convert(mode = 'L').resize(shape))
    
    return resized_image_arr


def import_image_to_array(RELPATH,
         dir_names = ['train', 'test', 'val'],
         sub_dir_names = ['NORMAL', 'PNEUMONIA'],
         padding=False,
         shape=(256,256), test=False):
    """
    This function loads all train, test and validation data into a dictionary of images
    =====================================================================================
    RELPATH --> str: The relative path to the cwd to the directory containing image directories
    eg '../../src/data/chest_xray'
    =====================================================================================
    dir_names --> list, str: The names of the subdirectories containing the images
    eg ['train', 'test', 'val'] <-- default
    =====================================================================================
    sub_dir_names --> list -> str: names of the subdirectory containg postivie and negative cases
    
    =====================================================================================
    padding  --> bool: Whether you want the reshaping to be padded to or not
    
    =====================================================================================
    shape --> tuple-> int: The final shape of the tensor array
    
    returns
    
    dict --> str:list -> tuple -> (tf.array, bool)
    A dictionary where the keys are the dir_names and the values are lists containing tuple where 
    the first index is the tf.array and the second is a boolian, True if class is pnuemonia, false otherwise.
    """
    # test relative path works!! 
    PATH = os.getcwd() + RELPATH
    
    try:
        os.listdir(PATH)
        print("You're relative directory is good, proceeding to import files...", end="\n\n")
    except Exception as e:
        print(str(e))
        print(f"Your relative path directory is not pointing to the correct location. Double check your input \n")
        print("Terminating Program", end='\n')
        print("=======================================================================================")
        return False
    
    
    # instantiate a dict object and populate the keys
    image_dict = {}
    for name in dir_names:
        image_dict[name] = []
        
        print(f"Loading images from {name}", end='\n')
        
        
        # For each subdirectory, get all of the images and append to dictionary
        for sub_dir in sub_dir_names:
            subPATH = PATH + name + "/" + sub_dir
            # list of all image names in the subdirectory
            image_batch = os.listdir(subPATH)
            
            for image in image_batch:
                # import the image in pil format
                pil = import_image(subPATH, image)
                # gray scale and reshape the image turning it into an array
                gray_resized_pil = grayscale_and_resize(pil, shape=shape, padding=padding)
                
                # center the pixels
                centered_array = gray_resized_pil/255
                
                # append to the image_dict
                flag = 1
                if sub_dir == 'NORMAL':
                    flag = 0
                
                image_dict[name].append((centered_array, flag))
                
            
                # if this is just a test case, break out of this loop so we get one from each class
                if test == True:
                    break
            
            print(f"Finished loading images from {sub_dir}", end="\n")

        print()
    
    return image_dict               

In [142]:
images = import_image_to_array('/../../src/data/chest_xray/',test=False)

You're relative directory is good, proceeding to import files...

Loading images from train
Finished loading images from NORMAL
Finished loading images from PNEUMONIA

Loading images from test
Finished loading images from NORMAL
Finished loading images from PNEUMONIA

Loading images from val
Finished loading images from NORMAL
Finished loading images from PNEUMONIA



Check for class imbalance in the data. 

In [160]:
# calculate inverse 
pnue_frequency = sum(y_train)/len(y_train)
inv_pnue_frequency = 1/pnue_frequency

normal_frequency = (len(y_train)-sum(y_train))/len(y_train)
inv_normal_frequency = 1/normal_frequency

weights = {
    0: inv_normal_frequency,
    1: inv_pnue_frequency
}

# Train on a model

In [161]:
y_train = np.array([i[1] for i in images['train']])
X_train = np.array([i[0] for i in images['train']])  
X_test = np.array([i[0] for i in images['test']])
y_test = np.array([i[1] for i in images['test']])
X_val = np.array([i[0] for i in images['val']])
y_val = np.array([i[1] for i in images['val']])

In [196]:
cnn = models.Sequential()
adam = Adam()
# Input layer conv
cnn.add(layers.Conv2D(64, (3, 3), activation='relu', input_shape=(256, 256,  1)))
cnn.add(layers.MaxPooling2D((2, 2)))

# First hidden layer conv
cnn.add(layers.Conv2D(32, (3, 3), activation='relu'))
cnn.add(layers.MaxPooling2D((2, 2)))
cnn.add(layers.Flatten())

# Added first dense layer
cnn.add(layers.Dense(32, activation='relu'))
cnn.add(layers.Dropout(.2))

# Add Second Layer
cnn.add(layers.Dense(16, activation='relu'))
cnn.add(layers.Dropout(.1))

cnn.add(layers.Dense(1, activation='sigmoid'))
cnn.compile(loss='binary_crossentropy',
              optimizer= adam,
              metrics=['acc'])

In [207]:
cnn1 = cnn.fit(X_train, y_train,
               epochs=5,
               batch_size=50,
               validation_data = (X_test, y_test), 
               class_weight=weights,
               verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


Evaluate the model. The validation accuracy with the test data is low, but let's look at the metrics

In [240]:
sum(y_val)

8

In [237]:
validation = cnn.evaluate(X_val, y_val)



In [239]:
0.5**16

1.52587890625e-05

In [229]:
today = str(datetime.today()).split()[0]
directory = "../../models/"
model_id = "tim-1"
file = directory+today+model_number+".HDF5"
cnn.save(file)

INFO:tensorflow:Assets written to: ../../models/2020-12-011.HDF5/assets


# Baseline Model 