## Skeleton Code

The code below provides a skeleton for the model building & training component of your project. You can add/remove/build on code however you see fit, this is meant as a starting point.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from glob import glob
%matplotlib inline
import matplotlib.pyplot as plt

##Import any other stats/DL/ML packages you may need here. E.g. Keras, scikit-learn, etc.

## Do some early processing of your metadata for easier model training:

In [2]:
## Below is some helper code to read all of your full image filepaths into a dataframe for easier manipulation
## Load the NIH data to all_xray_df
all_xray_df = pd.read_csv('/data/Data_Entry_2017.csv')
all_image_paths = {os.path.basename(x): x for x in 
                   glob(os.path.join('/data','images*', '*', '*.png'))}
print('Scans found:', len(all_image_paths), ', Total Headers', all_xray_df.shape[0])
all_xray_df['path'] = all_xray_df['Image Index'].map(all_image_paths.get)
all_xray_df.sample(10)

Scans found: 112120 , Total Headers 112120


Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11,path
95011,00024730_000.png,No Finding,0,24730,26,M,PA,2500,2048,0.168,0.168,,/data/images_011/images/00024730_000.png
94395,00024173_000.png,No Finding,0,24173,31,M,PA,2530,2725,0.143,0.143,,/data/images_010/images/00024173_000.png
75586,00018548_000.png,No Finding,0,18548,45,M,PA,2992,2991,0.143,0.143,,/data/images_009/images/00018548_000.png
22152,00005872_000.png,Mass,0,5872,30,M,PA,2048,2500,0.171,0.171,,/data/images_003/images/00005872_000.png
109413,00029763_012.png,Infiltration,12,29763,33,M,PA,2021,2021,0.194311,0.194311,,/data/images_012/images/00029763_012.png
83763,00020576_001.png,No Finding,1,20576,58,M,PA,2992,2991,0.143,0.143,,/data/images_009/images/00020576_001.png
59219,00014651_002.png,Atelectasis|Consolidation|Effusion,2,14651,66,F,PA,2738,2991,0.143,0.143,,/data/images_007/images/00014651_002.png
76601,00018829_007.png,No Finding,7,18829,55,M,AP,2500,2048,0.168,0.168,,/data/images_009/images/00018829_007.png
31522,00008235_000.png,No Finding,0,8235,48,F,PA,2048,2500,0.168,0.168,,/data/images_004/images/00008235_000.png
5691,00001534_008.png,No Finding,8,1534,69,F,PA,2662,2577,0.143,0.143,,/data/images_002/images/00001534_008.png


In [4]:
## Here you may want to create some extra columns in your table with binary indicators of certain diseases 
## rather than working directly with the 'Finding Labels' column

# Todo
df_lab = all_xray_df['Finding Labels'].str.get_dummies().add_prefix('lab_')
all_xray_df = pd.concat([all_xray_df, df_lab], axis=1)
all_xray_df.head()
    



Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,lab_Emphysema,lab_Fibrosis,lab_Hernia,lab_Infiltration,lab_Mass,lab_No Finding,lab_Nodule,lab_Pleural_Thickening,lab_Pneumonia,lab_Pneumothorax
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,1,0,0,0,0,0,0,0,0,0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,...,0,0,0,0,0,1,0,0,0,0
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,...,0,0,1,0,0,0,0,0,0,0


In [4]:
## Here we can create a new column called 'pneumonia_class' that will allow us to look at 
## images with or without pneumonia for binary classification
all_xray_df.sample(10)
# Todo
all_xray_df = all_xray_df.rename({'lab_Pneumonia': 'pneumonia_class'}, axis=1)
pneumcount = len(all_xray_df[(all_xray_df.pneumonia_class == 1)])
print(pneumcount)

1431


## Create your training and testing data:

In [5]:
from sklearn.model_selection import train_test_split
def create_splits(vargs):
    
    ## Either build your own or use a built-in library to split your original dataframe into two sets 
    ## that can be used for training and testing your model
    ## It's important to consider here how balanced or imbalanced you want each of those sets to be
    ## for the presence of pneumonia 
    
    # Todo
    #There are 1431 rows where the pneumonia label is positive. For this reason, we are going to evenly balance it out with a random
    #sample of 1431 rows without the pneumonia label to avoid the accuracy paradox without resampling the data.
    pneum = vargs[(all_xray_df.pneumonia_class == 1)]
    nopneum = vargs[(all_xray_df.pneumonia_class == 0)].head(1431)
    concat = pd.concat([pneum, nopneum])
    train_data, val_data = train_test_split(concat, test_size=0.2)
    
    return train_data, val_data

train, test = create_splits(all_xray_df)

# Now we can begin our model-building & training

#### First suggestion: perform some image augmentation on your data

In [11]:
from numpy import expand_dims
from keras.preprocessing.image import load_img
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import ImageDataGenerator

def my_image_augmentation():
    
    ## recommendation here to implement a package like Keras' ImageDataGenerator
    ## with some of the built-in augmentations 
    
    ## keep an eye out for types of augmentation that are or are not appropriate for medical imaging data
    ## Also keep in mind what sort of augmentation is or is not appropriate for testing vs validation data
    
    ## STAND-OUT SUGGESTION: implement some of your own custom augmentation that's *not*
    ## built into something like a Keras package
    
    # Todo
    my_idg = ImageDataGenerator(featurewise_center=True,
    featurewise_std_normalization=True,
    rotation_range=10,
    width_shift_range=0,
    height_shift_range=0,
    vertical_flip=False,)
    
    
    return my_idg

datagen = my_image_augmentation()

def make_train_gen(vargs):
    
    ## Create the actual generators using the output of my_image_augmentation for your training data
    ## Suggestion here to use the flow_from_dataframe library, e.g.:
     # Todo
    train_gen=datagen.flow_from_dataframe(
    dataframe=vargs,
    directory="./train/",
    x_col="path",
    y_col="pneumonia_class",
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="binary",
    target_size=(32,32))
    return train_gen


def make_val_gen(vargs):
    
    # Todo
    val_gen=datagen.flow_from_dataframe(
    dataframe=vargs,
    directory="./test/",
    x_col="path",
    y_col=None,
    batch_size=32,
    seed=42,
    shuffle=True,
    class_mode="binary",
    target_size=(32,32))
    return val_gen

traingen = make_train_gen(train)
testgen = make_test_gen(test)

TypeError: If class_mode="binary", y_col="pneumonia_class" column values must be strings.

In [None]:
## May want to pull a single large batch of random validation data for testing after each epoch:
valX, valY = val_gen.next()

In [None]:
## May want to look at some examples of our augmented training data. 
## This is helpful for understanding the extent to which data is being manipulated prior to training, 
## and can be compared with how the raw data look prior to augmentation

t_x, t_y = next(train_gen)
fig, m_axs = plt.subplots(4, 4, figsize = (16, 16))
for (c_x, c_y, c_ax) in zip(t_x, t_y, m_axs.flatten()):
    c_ax.imshow(c_x[:,:,0], cmap = 'bone')
    if c_y == 1: 
        c_ax.set_title('Pneumonia')
    else:
        c_ax.set_title('No Pneumonia')
    c_ax.axis('off')

## Build your model: 

Recommendation here to use a pre-trained network downloaded from Keras for fine-tuning

In [None]:
def load_pretrained_model(vargs):
    
    # model = VGG16(include_top=True, weights='imagenet')
    # transfer_layer = model.get_layer(lay_of_interest)
    # vgg_model = Model(inputs = model.input, outputs = transfer_layer.output)
    
    # Todo
    
    return vgg_model


In [None]:
def build_my_model(vargs):
    
    # my_model = Sequential()
    # ....add your pre-trained model, and then whatever additional layers you think you might
    # want for fine-tuning (Flatteen, Dense, Dropout, etc.)
    
    # if you want to compile your model within this function, consider which layers of your pre-trained model, 
    # you want to freeze before you compile 
    
    # also make sure you set your optimizer, loss function, and metrics to monitor
    
    # Todo
    
    return my_model



## STAND-OUT Suggestion: choose another output layer besides just the last classification layer of your modele
## to output class activation maps to aid in clinical interpretation of your model's results

In [None]:
## Below is some helper code that will allow you to add checkpoints to your model,
## This will save the 'best' version of your model by comparing it to previous epochs of training

## Note that you need to choose which metric to monitor for your model's 'best' performance if using this code. 
## The 'patience' parameter is set to 10, meaning that your model will train for ten epochs without seeing
## improvement before quitting

# Todo

# weight_path="{}_my_model.best.hdf5".format('xray_class')

# checkpoint = ModelCheckpoint(weight_path, 
#                              monitor= CHOOSE_METRIC_TO_MONITOR_FOR_PERFORMANCE, 
#                              verbose=1, 
#                              save_best_only=True, 
#                              mode= CHOOSE_MIN_OR_MAX_FOR_YOUR_METRIC, 
#                              save_weights_only = True)

# early = EarlyStopping(monitor= SAME_AS_METRIC_CHOSEN_ABOVE, 
#                       mode= CHOOSE_MIN_OR_MAX_FOR_YOUR_METRIC, 
#                       patience=10)

# callbacks_list = [checkpoint, early]

### Start training! 

In [None]:
## train your model

# Todo

# history = my_model.fit_generator(train_gen, 
#                           validation_data = (valX, valY), 
#                           epochs = , 
#                           callbacks = callbacks_list)

##### After training for some time, look at the performance of your model by plotting some performance statistics:

Note, these figures will come in handy for your FDA documentation later in the project

In [None]:
## After training, make some predictions to assess your model's overall performance
## Note that detecting pneumonia is hard even for trained expert radiologists, 
## so there is no need to make the model perfect.
my_model.load_weights(weight_path)
pred_Y = new_model.predict(valX, batch_size = 32, verbose = True)

In [None]:
def plot_auc(t_y, p_y):
    
    ## Hint: can use scikit-learn's built in functions here like roc_curve
    
    # Todo
    
    return

## what other performance statistics do you want to include here besides AUC? 


# def ... 
# Todo

# def ...
# Todo
    
#Also consider plotting the history of your model training:

def plot_history(history):
    
    # Todo
    return

In [None]:
## plot figures

# Todo

Once you feel you are done training, you'll need to decide the proper classification threshold that optimizes your model's performance for a given metric (e.g. accuracy, F1, precision, etc.  You decide) 

In [None]:
## Find the threshold that optimize your model's performance,
## and use that threshold to make binary classification. Make sure you take all your metrics into consideration.

# Todo

In [None]:
## Let's look at some examples of true vs. predicted with our best model: 

# Todo

# fig, m_axs = plt.subplots(10, 10, figsize = (16, 16))
# i = 0
# for (c_x, c_y, c_ax) in zip(valX[0:100], testY[0:100], m_axs.flatten()):
#     c_ax.imshow(c_x[:,:,0], cmap = 'bone')
#     if c_y == 1: 
#         if pred_Y[i] > YOUR_THRESHOLD:
#             c_ax.set_title('1, 1')
#         else:
#             c_ax.set_title('1, 0')
#     else:
#         if pred_Y[i] > YOUR_THRESHOLD: 
#             c_ax.set_title('0, 1')
#         else:
#             c_ax.set_title('0, 0')
#     c_ax.axis('off')
#     i=i+1

In [None]:
## Just save model architecture to a .json:

model_json = my_model.to_json()
with open("my_model.json", "w") as json_file:
    json_file.write(model_json)