In [1]:
pip install -U tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.13.0-cp37-cp37m-manylinux2010_x86_64.whl (679 kB)
[?25l[K     |▌                               | 10 kB 27.1 MB/s eta 0:00:01[K     |█                               | 20 kB 31.9 MB/s eta 0:00:01[K     |█▌                              | 30 kB 28.9 MB/s eta 0:00:01[K     |██                              | 40 kB 21.5 MB/s eta 0:00:01[K     |██▍                             | 51 kB 18.3 MB/s eta 0:00:01[K     |███                             | 61 kB 12.8 MB/s eta 0:00:01[K     |███▍                            | 71 kB 13.8 MB/s eta 0:00:01[K     |███▉                            | 81 kB 15.1 MB/s eta 0:00:01[K     |████▍                           | 92 kB 15.1 MB/s eta 0:00:01[K     |████▉                           | 102 kB 14.7 MB/s eta 0:00:01[K     |█████▎                          | 112 kB 14.7 MB/s eta 0:00:01[K     |█████▉                          | 122 kB 14.7 MB/s eta 0:00:01[K     |██████▎         

In [2]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir ("/content/drive/MyDrive/FourthBrain/Capstone Samsung OCT/Data/OCT2017")

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np

In [4]:
train_dir = os.getcwd() + "/train/"
valid_dir = os.getcwd() + "/val/"
test_dir = os.getcwd() + "/test/"

In [5]:
classes = ['NORMAL', 'CNV', 'DME', 'DRUSEN']
cols = [x.lower() for x in classes]
dirs = [train_dir, valid_dir, test_dir]
label = {0: 'normal', 1: 'cnv', 2: 'dme', 3: 'drusen'}

REGEN = False # if we should read the directory structre, if False then use the CSV files already saved

In [6]:
def create_df (path, classes=classes):
  df = pd.DataFrame(columns=['filename', 'normal', 'cnv', 'dme', 'drusen'])
  for sub_dir in classes:
    condition = {'normal': 0, 'cnv': 0, 'dme':0, 'drusen': 0}
    files = os.listdir(path + sub_dir)
    if (sub_dir== 'NORMAL'):
      condition['normal'] = 1
    elif (sub_dir == 'CNV'):
      condition['cnv'] = 1
    elif (sub_dir == 'DME'):
      condition['dme'] = 1
    else:
      condition['drusen']= 1
    for f in files:
      df = df.append({'filename': path +  sub_dir  + "/" + f, 
                      'normal': condition['normal'], 
                      'cnv': condition['cnv'],
                      'dme': condition['dme'],
                      'drusen': condition['drusen']}, ignore_index=True)
  return df

In [7]:
# Generting the DataFrames of the filenames
# this is primarily used so we can sub-sample files easier for the different training strategies
if (REGEN):
  train_df = create_df(train_dir)
  valid_df = create_df(valid_dir)
  test_df = create_df(test_dir)
  train_df.to_csv("train_data.csv")
  valid_df.to_csv("valid_data.csv")
  test_df.to_csv("test_data.csv")
else:
  train_df = pd.read_csv("train_data.csv")
  valid_df = pd.read_csv("valid_data.csv")
  test_df = pd.read_csv("test_data.csv")

In [8]:
print ("Training Data: ", train_df.shape)
print ("Validation Data: ", valid_df.shape)
print ("Test Data: ", test_df.shape)

Training Data:  (83484, 6)
Validation Data:  (32, 6)
Test Data:  (968, 6)


In [9]:
# Printing out the # of samples for each subsample percentage 
print ("Trainig Data percentages:")
print (" 1% ==> ", int(.01 * train_df.shape[0]))
print (" 5% ==> ", int(.05 * train_df.shape[0]))
print ("10% ==> ", int(.1  * train_df.shape[0] ))
print ("25% ==> ", int(.25 * train_df.shape[0]))
print ("75% ==> ", int(.75 * train_df.shape[0]))
print ("90% ==> ", int(.9  * train_df.shape[0]))
print ("98% ==> ", int(.98 * train_df.shape[0]))

Trainig Data percentages:
 1% ==>  834
 5% ==>  4174
10% ==>  8348
25% ==>  20871
75% ==>  62613
90% ==>  75135
98% ==>  81814


In [10]:
# Sampling 1% of the data
sample = train_df.sample(frac=0.01, random_state=10, axis=0)
sample.shape

(835, 6)

In [11]:
# determine class weights to feed into neural network during training
def get_classweight(df):
  total = df.shape[0]
  num_norm = df['normal'].sum()
  num_cnv = df['cnv'].sum()
  num_dme = df['dme'].sum()
  num_drusen = df['drusen'].sum()
  norm_weight = (1/num_norm) * (total/4)
  norm_cnv = (1/num_cnv) * (total/4)
  norm_dme = (1/num_dme) * (total/4)
  norm_drusen = (1/num_drusen) * (total/4)
  class_weight = {0 : norm_weight, 1: norm_cnv,
                  2 : norm_dme, 3: norm_drusen}
  return class_weight

In [12]:
class_weight = get_classweight(sample)
class_weight

{0: 0.8186274509803921,
 1: 0.5537135278514589,
 2: 2.0072115384615388,
 3: 2.1085858585858586}

In [13]:
os.getcwd()

'/content/drive/My Drive/FourthBrain/Capstone Samsung OCT/Data/OCT2017'

In [30]:
import tensorflow as tf
from tensorflow import keras
import tensorflow.keras.applications as app
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import tensorflow_hub as hub
from data_util import *
import tensorflow_addons as tfa
import datetime

tf.config.run_functions_eagerly(True)

In [16]:
gs_path = "gs://simclr-checkpoints/simclrv2/finetuned_100pct/r50_1x_sk0/hub/"

input = keras.layers.Input(shape=(224,224,3))
simclr_layer = hub.KerasLayer(gs_path, trainable=False)
output = keras.layers.Dense(4, activation="softmax")

model = keras.Sequential([input, 
                          simclr_layer,
                          output])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
keras_layer (KerasLayer)     (None, 2048)              34277672  
_________________________________________________________________
dense (Dense)                (None, 4)                 8196      
Total params: 34,285,868
Trainable params: 8,196
Non-trainable params: 34,277,672
_________________________________________________________________


In [17]:
train_image_datagen = ImageDataGenerator(rotation_range=90, width_shift_range=[-.1,.1], height_shift_range=[-.1,.1],
                                         shear_range=0.25, zoom_range=0.3, horizontal_flip=True,
                                         vertical_flip=True, rescale = 1./255.)

# Setting the imgages to come from the dataframe where we specify the filenames and columns to use for "labels"
train_imgs = train_image_datagen.flow_from_dataframe(sample, directory=None, x_col='filename', y_col=cols, validate_filenames=False,
                                        class_mode="raw", target_size=(224,224), batch_size=32, seed=10)
valid_imgs = train_image_datagen.flow_from_dataframe(valid_df, directory=None, x_col='filename', y_col=cols, validate_filenames=False,
                                        class_mode="raw", target_size=(224,224), batch_size=16, seed=10)

Found 835 non-validated image filenames.
Found 32 non-validated image filenames.


In [24]:
model.compile(optimizer=keras.optimizers.Adam(learning_rate=1e-3), loss='categorical_crossentropy', 
              metrics=[tfa.metrics.F1Score(4,"micro")])
model.run_eagerly=True

In [25]:
# Creating a checkpoint to save the best model so that we can reload it once training is complete
checkpoint_cb = keras.callbacks.ModelCheckpoint("oct_simclrtrain.h5", save_best_only=True)
# Adding an an early stop callback to avoid overfitting in case the model is not improving after 5 consescutive epochs
earlystop_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [32]:
def get_features(dataset, max_iter=100):
  tf.executing_eagerly()
  out_feature, out_label = list(), list()
  count = 0
  for img, lbl in dataset:
    out_feature.append(simclr_layer(img).numpy())
    out_label.append(lbl.numpy())
    if count % 100 == 0: print(f'count = {count}')
    if count > max_iter:
      break
    count += 1
  out_label = np.hstack(out_label)
  out_feature   = np.vstack(out_feature)
  return out_feature, out_label

In [33]:
valid_feature, valid_label = get_features(valid_imgs)

AttributeError: ignored

In [31]:
tf.executing_eagerly()

False