In [None]:
# Mount Google Drive (Colab only) and make the OCT2017 dataset folder the
# working directory, so the relative paths used later in the notebook
# (CSV caches, logs, model checkpoint) resolve against it.
import os
from google.colab import drive

DATASET_ROOT = "/content/drive/MyDrive/FourthBrain/Capstone Samsung OCT/Data/OCT2017"

drive.mount('/content/drive')
os.chdir(DATASET_ROOT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

In [None]:
# Absolute paths of the three dataset splits under the current working
# directory. The trailing "/" is intentional: class sub-directory names are
# appended to these strings by plain concatenation.
train_dir = f"{os.getcwd()}/train/"
valid_dir = f"{os.getcwd()}/val/"
test_dir = f"{os.getcwd()}/test/"

In [None]:
# Class sub-directory names as they appear on disk, and the matching
# lower-case names used for the one-hot label columns of the DataFrames.
classes = ['NORMAL', 'CNV', 'DME', 'DRUSEN']
cols = list(map(str.lower, classes))
dirs = [train_dir, valid_dir, test_dir]
# Class index -> label name, in the same order as `cols`.
label = dict(enumerate(cols))

# True: re-read the directory structure and rewrite the CSV files.
# False: use the CSV files already saved by an earlier run.
REGEN = False

In [None]:
def create_df (path, classes=classes):
  """Build a one-hot labelled table of the image files found under ``path``.

  Args:
    path: Directory holding one sub-directory per class. It is joined to the
      sub-directory name by plain string concatenation, so it must end with
      a path separator (e.g. ``".../train/"``).
    classes: Sub-directory names to scan. ``'NORMAL'``, ``'CNV'`` and
      ``'DME'`` set the column of the same (lower-case) name; any other
      name is labelled as ``'drusen'``.

  Returns:
    A DataFrame with the columns ``filename, normal, cnv, dme, drusen`` and
    one row per listed file. Exactly one label column is 1 in each row.
    The frame is empty (but keeps its columns) when no files are found.

  Raises:
    OSError: if ``path + sub_dir`` cannot be listed.
  """
  label_cols = ['normal', 'cnv', 'dme', 'drusen']
  # Exact directory name -> label column; anything else falls back to drusen.
  column_for = {'NORMAL': 'normal', 'CNV': 'cnv', 'DME': 'dme'}

  rows = []
  for sub_dir in classes:
    hot_col = column_for.get(sub_dir, 'drusen')
    one_hot = {col: int(col == hot_col) for col in label_cols}
    for f in os.listdir(path + sub_dir):
      rows.append({'filename': path + sub_dir + "/" + f, **one_hot})

  # Build the frame once from the collected records. Growing it row by row
  # with DataFrame.append copied the whole frame on every file (quadratic in
  # the number of images) and DataFrame.append no longer exists in pandas 2.
  return pd.DataFrame(rows, columns=['filename'] + label_cols)

In [None]:
# Build the per-split DataFrames of image filenames and one-hot labels.
# They are used primarily so the files can be sub-sampled more easily for the
# different training strategies.
if (REGEN):
  # Re-scan the directory tree and refresh the cached CSV files.
  create_df(train_dir).to_csv("train_data.csv")
  create_df(valid_dir).to_csv("valid_data.csv")
  create_df(test_dir).to_csv("test_data.csv")

# Always load the frames from the CSV files so that they have the same layout
# whether or not they were just regenerated. to_csv() writes the index as an
# extra leading column, which read_csv() returns as a regular column; keeping
# the freshly built frames in memory (as was done before when REGEN was True)
# produced frames with one column fewer, which shifts any positional column
# indexing done later in the notebook.
train_df = pd.read_csv("train_data.csv")
valid_df = pd.read_csv("valid_data.csv")
test_df = pd.read_csv("test_data.csv")

In [None]:
# Report (rows, columns) for each split.
for title, frame in (("Training Data: ", train_df),
                     ("Validation Data: ", valid_df),
                     ("Test Data: ", test_df)):
  print (title, frame.shape)

Training Data:  (83484, 6)
Validation Data:  (32, 6)
Test Data:  (968, 6)


In [None]:
# Number of training samples (truncated to an integer) at each sub-sampling
# percentage considered.
n_train = train_df.shape[0]
print ("Trainig Data percentages:")
for tag, fraction in ((" 1%", .01), (" 5%", .05), ("10%", .1), ("25%", .25),
                      ("75%", .75), ("90%", .9), ("98%", .98)):
  print (tag + " ==> ", int(fraction * n_train))

Trainig Data percentages:
 1% ==>  834
 5% ==>  4174
10% ==>  8348
25% ==>  20871
75% ==>  62613
90% ==>  75135
98% ==>  81814


In [None]:
# Draw a random 1% subset of the training rows; the fixed random_state makes
# the draw repeatable. The bare expression below displays the subset's shape.
sample = train_df.sample(frac=0.01, random_state=10, axis=0)
sample.shape

(835, 6)

In [None]:
# Per-class weights to feed into the neural network during training.
def get_classweight(df):
  """Return ``{class_index: weight}`` for the four one-hot label columns.

  Each weight is ``(1 / count) * (total / 4)``, where ``count`` is the sum of
  the class column and ``total`` is the number of rows in ``df``. Indices
  0..3 correspond to the columns normal, cnv, dme, drusen, in that order.
  """
  quarter_total = df.shape[0] / 4
  ordered_cols = ('normal', 'cnv', 'dme', 'drusen')
  return {idx: (1 / df[col].sum()) * quarter_total
          for idx, col in enumerate(ordered_cols)}

In [None]:
# Class weights computed on the 1% training sample (the data actually used for
# fitting); the bare expression below displays the resulting dictionary.
class_weight = get_classweight(sample)
class_weight

{0: 0.8186274509803921,
 1: 0.5537135278514589,
 2: 2.0072115384615388,
 3: 2.1085858585858586}

In [None]:
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.applications as app
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Training-time augmentation: random rotation, shifts, shear, zoom and flips,
# with pixel values rescaled to [0, 1].
# NOTE(review): a list passed as width/height_shift_range makes Keras pick one
# of the listed values rather than sample a range, so every image appears to be
# shifted by exactly 10%; pass the scalar 0.1 if a uniform +/-10% range was
# intended — confirm.
train_image_datagen = ImageDataGenerator(rotation_range=90, width_shift_range=[-.1,.1], height_shift_range=[-.1,.1],
                                         shear_range=0.25, zoom_range=0.3, horizontal_flip=True,
                                         vertical_flip=True, rescale = 1./255.)

# Validation images are only rescaled. They previously went through the
# augmenting generator above, so the validation loss (which drives the
# best-model checkpoint) was measured on randomly distorted images.
valid_image_datagen = ImageDataGenerator(rescale = 1./255.)

# The images come from the DataFrames: `filename` holds the full path of each
# file and the one-hot columns in `cols` are passed through as raw labels.
train_imgs = train_image_datagen.flow_from_dataframe(sample, directory=None, x_col='filename', y_col=cols, validate_filenames=False,
                                        class_mode="raw", target_size=(224,224), batch_size=32, seed=10)
valid_imgs = valid_image_datagen.flow_from_dataframe(valid_df, directory=None, x_col='filename', y_col=cols, validate_filenames=False,
                                        class_mode="raw", target_size=(224,224), batch_size=16, seed=10)

Found 835 non-validated image filenames.
Found 32 non-validated image filenames.


In [None]:
pre_process = keras.applications.densenet.preprocess_input
base_model = app.densenet.DenseNet121 (include_top=False, weights="imagenet", input_shape=(224,224,3), pooling=None)
# Frozen feature extractor: only the layers added below are trained.
base_model.trainable = False


def _conv_pair(features):
  """Two 3x3, 512-filter Conv+BatchNorm stages, concatenated.

  Returns the second stage's output concatenated with the first stage's
  output along the channel axis.
  """
  first = keras.layers.Conv2D(512, 3, padding="same", activation="relu")(features)
  first = keras.layers.BatchNormalization()(first)
  second = keras.layers.Conv2D(512, 3, padding="same", activation="relu")(first)
  second = keras.layers.BatchNormalization()(second)
  return keras.layers.Concatenate()([second, first])


# An explicit Input layer feeding the DenseNet pre-processing is required;
# building the model without it produced errors (author's note, per the
# DenseNet documentation). Named `inputs` so the builtin `input` is not shadowed.
inputs = keras.layers.Input(shape=(224,224,3))

# The image generators already rescale pixels to [0, 1] (rescale=1/255), while
# densenet.preprocess_input expects raw [0, 255] values and divides by 255
# itself before normalising each channel. Undo the generator's rescale here so
# the images are not divided by 255 twice.
x = pre_process(inputs * 255.)

# DenseNet121 convolutional base
x = base_model(x)

# Layers added on top of DenseNet: three conv pairs, the first two followed by
# 2x2 max-pooling and the last by global average pooling.
x = _conv_pair(x)
x = keras.layers.MaxPooling2D(2)(x)
x = _conv_pair(x)
x = keras.layers.MaxPooling2D(2)(x)
x = _conv_pair(x)
x = keras.layers.GlobalAveragePooling2D()(x)
# Output layer for the 4 classes we are dealing with
output = keras.layers.Dense(4, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=output)

model.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/densenet/densenet121_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
tf.math.truediv (TFOpLambda)    (None, 224, 224, 3)  0           input_2[0][0]                    
__________________________________________________________________________________________________
tf.nn.bias_add (TFOpLambda)     (None, 224, 224, 3)  0           tf.math.truediv[0][0]            
__________________________________________________________________________________________________
tf.math.truediv_1 (TFOpLambda)  (None, 224, 2

In [None]:
# Configure training: Adam with its default settings, categorical
# cross-entropy over the 4-way softmax output, accuracy as the reported metric.
# An earlier variant that built the optimizer with an explicit learning rate,
#   keras.optimizers.Adam(learning_rate=1e-3)
# did not work (it caused "I/O Error 5"), hence the default-constructed optimizer.
model.compile(
    loss='categorical_crossentropy',
    optimizer=keras.optimizers.Adam(),
    metrics='accuracy',
)

In [None]:
import datetime
%load_ext tensorboard
%reload_ext tensorboard

In [None]:
# Make sure the TensorBoard log directory exists. makedirs() also creates the
# parent "logs" directory (os.mkdir() raised FileNotFoundError when it was
# missing), and exist_ok=True keeps the cell safe to re-run.
os.makedirs('./logs/fit/', exist_ok=True)

# One time-stamped sub-directory per run.
log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# Checkpoint that saves the best model so that we can reload it once training is complete
checkpoint_cb = keras.callbacks.ModelCheckpoint("oct_densenet.h5", save_best_only=True)
# Early-stop callback to avoid overfitting: stops when the model has not
# improved for 10 consecutive epochs (patience=10) and restores the best weights
earlystop_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)
# Tensorboard callback to view the training & validation curves
tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [None]:
# Train on the augmented 1% sample with class weighting. Only the checkpoint
# callback is active; earlystop_cb and tensorboard_cb are currently not passed.
history = model.fit(
    train_imgs,
    validation_data=valid_imgs,
    epochs=100,
    steps_per_epoch=15,
    class_weight=class_weight,
    callbacks=[checkpoint_cb],
    verbose=1,
)

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50


In [None]:
# Evaluate on a repeatable (fixed random_state) 50% sample of the test set.
# Test images are only rescaled to [0, 1]; no augmentation.
sample_test = test_df.sample(frac=0.5, random_state=10, axis=0)
test_image_datagen = ImageDataGenerator( rescale = 1./255.)

# shuffle=False is essential: flow_from_dataframe shuffles by default, so the
# rows returned by model.predict() were in a random order that did not match
# the rows of sample_test. That made the classification report compare
# predictions against the wrong labels (chance-level scores), while
# model.evaluate(), which reads labels from the generator itself, was unaffected.
test_imgs = test_image_datagen.flow_from_dataframe(sample_test, directory=None, x_col='filename', y_col=cols, validate_filenames=False,
                                        class_mode="raw", target_size=(224,224), batch_size=32, shuffle=False)

Found 484 non-validated image filenames.


In [None]:
# Reload the weights from the file written by the ModelCheckpoint callback
# (save_best_only=True) and evaluate on the test generator; the displayed
# result is [loss, accuracy] as configured in model.compile().
model.load_weights("oct_densenet.h5")
model.evaluate(test_imgs)



[1.2407846450805664, 0.46900826692581177]

In [None]:
# Class-probability predictions for the test sample: one row per image, with
# the 4 softmax outputs of the model.
# NOTE(review): rows follow the order in which test_imgs yields images; they
# only line up with the rows of sample_test if the generator does not
# shuffle — confirm.
results = model.predict(test_imgs)


In [None]:
# Predicted class index = arg-max over the 4 softmax outputs.
choice = np.argmax(results, axis=1)
# True class index from the one-hot label columns, selected by name.
# Positional slicing (iloc[:, 2:]) only picked the right columns when the
# frame carried the extra index column added by the CSV round-trip; on a
# frame without it, it would silently drop the 'normal' column and shift
# every label.
true_label = np.argmax(sample_test[cols].to_numpy(), axis=1)

In [None]:
from sklearn.metrics import classification_report, plot_confusion_matrix, confusion_matrix

# Per-class precision / recall / F1, with the class names taken from `label`
# in index order.
print(classification_report(true_label, choice, target_names=list(label.values())))

              precision    recall  f1-score   support

      normal       0.27      0.39      0.32       126
         cnv       0.26      0.48      0.33       123
         dme       0.27      0.05      0.09       128
      drusen       0.23      0.10      0.14       107

    accuracy                           0.26       484
   macro avg       0.26      0.26      0.22       484
weighted avg       0.26      0.26      0.22       484

