## **Setup**

In [None]:
import os
import shutil
import glob
import pickle
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import datetime
import pytz
from zipfile import ZipFile
from tempfile import TemporaryDirectory
from PIL import Image

import tensorflow as tf
from tensorflow import keras

from sklearn.metrics import f1_score, accuracy_score
from skimage import filters

# Load the TensorBoard notebook extension (provides the %tensorboard magic used in the Training section)
%load_ext tensorboard

# plot options: every figure in this notebook uses the 'fivethirtyeight' style sheet;
# the font-size override below is intentionally left disabled
# plt.rcParams.update({'font.size': 11})
plt.style.use('fivethirtyeight')

In [None]:
# Mount Google Drive inside the Colab VM so the competition archives can be read.
# This step is interactive: drive.mount asks for an authorization code.
from google.colab import drive
# mount_path is reused by later cells to build the path to the zip archives
mount_path = '/content/gdrive/'
drive.mount(mount_path)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive/


In [None]:
# Set and test path to competition data files
# competition_path is relative to mount_path; both are reused by the unzip cell further down
competition_path = 'My Drive/AI For Good - AI Blitz 3/Snake/Data/'
# NOTE: `filename` is reused as the loop variable in the unzip cell, so it does not keep this value
filename = 'train.zip'
# fail fast if the Drive is not mounted or the archive is missing
assert(os.path.exists(f'{mount_path}/{competition_path}/{filename}'))
print('Drive mounted correctly and data accessible')

Drive mounted correctly and data accessible


# **Functions**



### **Data loading**

In [None]:
# Merge-copy helper: Colab's Python predates shutil.copytree(dirs_exist_ok=True), so
# copying into a directory that already exists has to be done by hand.
# Adapted from https://stackoverflow.com/a/12514470/5991868
def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the contents of ``src`` into ``dst``, merging with what is already there.

    Args:
        src: Directory whose entries are copied.
        dst: Destination directory. Created (including parents) if it does not exist.
        symlinks: Forwarded to ``shutil.copytree`` for sub-directories that do not
            yet exist under ``dst``.
        ignore: Optional callable ``ignore(directory, names)`` with the same contract
            as in ``shutil.copytree``; the names it returns are skipped.

    Files are copied with ``shutil.copy2`` and overwrite same-named files in ``dst``.
    """
    os.makedirs(dst, exist_ok=True)

    names = os.listdir(src)
    # honour `ignore` for the entries of this directory as well, not only for nested ones
    skipped = set(ignore(src, names)) if ignore is not None else set()

    for item in names:
        if item in skipped:
            continue
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            if os.path.isdir(d):
                # shutil.copytree refuses to write into an existing directory, so merge recursively
                copytree(s, d, symlinks, ignore)
            else:
                shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)

# Unzip and combine images
Combine the train and validation sets provided by the competition organizers into a single pool of images; we will create our own train/validation splits from it.


In [None]:
# Build the pooled image directory once; this can take a bit (maybe 30s).
# /content/All_Data/ holds every image, one sub-directory per class, and is the
# source from which the training and validation splits are drawn later.
if not os.path.exists('/content/All_Data/'):
  for new_dir in ['/content/All_Data/','/content/All_Data/non_venomous/','/content/All_Data/venomous/']:
    os.mkdir(new_dir)

  for filename in ['train','val']:
    archive_path = f'{mount_path}/{competition_path}/{filename}.zip'
    # unzip into a throw-away directory that is removed when the with-block exits
    with TemporaryDirectory() as tmpdirname:
      with ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall(tmpdirname)
      # merge both classes of this archive into the pooled All_Data directory
      for venom in ['non_venomous','venomous']:
        copytree(f'{tmpdirname}/content/data/{filename}/{venom}/',f'/content/All_Data/{venom}/')

In [None]:
def cv_splits(home_dir,num_folds,val_pct,truly_random):
  """Split the images under ``home_dir`` into per-class train/validation folds.

  Args:
    home_dir: Directory containing ``non_venomous/`` and ``venomous/`` sub-directories
      of ``*.jpg`` files. A trailing separator is optional.
    num_folds: Number of folds to generate.
    val_pct: Fraction of each class used for validation in every fold. Must not
      exceed ``1/num_folds``.
    truly_random: If True the shuffle is seeded from system entropy; otherwise a
      fixed seed (42) is used so repeated calls give the same split.

  Returns:
    dict ``cvdict[fold_num]['train' | 'val'][class_name]`` -> numpy array of image paths,
    e.g. ``cvdict[0]['train']['venomous']``.
  """
  assert val_pct<=(1/num_folds), 'val_pct must not exceed 1/num_folds, otherwise validation folds overlap'

  cvdict={key:{'train':{},'val':{}} for key in range(num_folds)}

  # None -> entropy-based seeding, 42 -> deterministic seeding
  rng = np.random.RandomState(None if truly_random else 42)

  for venom in ['non_venomous','venomous']:
    # glob returns files in arbitrary filesystem order; sort first so that the
    # fixed seed really yields the same split on every machine / run
    imgs=np.array(sorted(glob.glob(os.path.join(home_dir,venom,'*.jpg'))))
    # shuffle the image array. NOTE all the randomness in the train-val split comes from this shuffle
    rng.shuffle(imgs)

    # size of the fold
    foldsz=len(imgs)//num_folds
    # how many validation images
    numval=int(np.floor(val_pct*len(imgs)))

    for cvfold in range(num_folds):
      start=cvfold*foldsz
      valimgs=imgs[start:start+numval]
      cvdict[cvfold]['val'][venom]=valimgs
      # everything that is not validation for this fold is training data (returned sorted)
      cvdict[cvfold]['train'][venom]=np.setdiff1d(imgs,valimgs)

  return cvdict

In [None]:
def train_val_dirs(home_dir,cvdict,cur_fold):
  """Materialise one cross-validation fold on disk.

  Recreates ``<home_dir>/train`` and ``<home_dir>/val`` from scratch (any existing
  directory of that name is deleted first) and copies the images listed in
  ``cvdict[cur_fold]`` into per-class sub-directories. Source files are copied,
  not moved, so the originals stay in place. Returns None.
  """
  for trainval in ['train','val']:
    split_dir=os.path.join(home_dir,trainval+os.sep)

    # start from an empty split directory
    if os.path.exists(split_dir):
      shutil.rmtree(split_dir)
    os.mkdir(split_dir)

    for venom in ['non_venomous','venomous']:
      class_dir=os.path.join(home_dir,trainval,venom+os.sep)
      os.mkdir(class_dir)
      for curfile in cvdict[cur_fold][trainval][venom]:
        target=class_dir+os.path.basename(curfile)
        shutil.copy2(curfile,target)

In [None]:
# 3 folds, 15% of each class held out for validation per fold, deterministic seed (truly_random=False)
cvdict=cv_splits('/content/All_Data/',3,0.15,False)

In [None]:
# Copy the images of fold 0 into /content/train and /content/val (both are recreated from scratch)
train_val_dirs('/content/',cvdict,0)

In [None]:
# Report how many images of each class ended up in the train and validation directories
for split_title, split_dir in [('Training Images','train'),('Validation Images','val')]:
  print(split_title)
  for snek_label, class_dir in [('Danger snek','venomous'),('Safe snek','non_venomous')]:
    n_images = len(glob.glob(f'/content/{split_dir}/{class_dir}/*.jpg'))
    print(f'   {snek_label} : '+str(n_images))

Training Images
   Danger snek : 22544
   Safe snek : 28193
Validation Images
   Danger snek : 3978
   Safe snek : 4975


## Functions for evaluation and visualization of model results

In [None]:
def plot_training_history(history):
  """Plot ROC AUC and loss per epoch, training vs. validation, side by side.

  ``history`` is the object returned by ``model.fit``; its ``history`` dict must
  contain 'loss', 'val_loss' and one training plus one validation key whose
  name contains 'auc'. An IndexError is raised when no such AUC key exists.
  """
  # keras may append a counter to metric names (e.g. 'auc_1'), so look the AUC keys up by substring
  metric_names=pd.Series(list(history.history.keys()))
  auc_names=metric_names[metric_names.str.contains('auc')]
  is_val=auc_names.str.contains('val')
  val_key=auc_names[is_val].values[0]
  auc_key=auc_names[~is_val].values[0]

  # (subplot position, training key, validation key, title, y-axis label)
  panels=[
    (1,auc_key,val_key,'Training ROC AUC','ROC AUC'),
    (2,'loss','val_loss','Training Loss','Loss'),
  ]

  plt.figure(figsize=(10, 5))
  for position,train_series,val_series,title,ylabel in panels:
    plt.subplot(1,2,position)
    plt.plot(history.history[train_series])
    plt.plot(history.history[val_series])
    plt.title(title)
    plt.ylabel(ylabel)
    plt.xlabel('Epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.grid(True)
    plt.tight_layout()

  plt.show()

# Model setup

In [None]:
# Open one training image only to read its dimensions; infoimg.size is used below for
# the network input shape and the generator target size.
# NOTE(review): PIL's Image.size is (width, height) — see the notes where it is consumed.
infoimg=Image.open(glob.glob('/content/train/non_venomous/*.jpg')[0])

In [None]:
# ImageNet-pretrained ResNet50 without its classification head, used as feature extractor.
# NOTE(review): Keras expects input_shape as (height, width, channels) while PIL's size is
# (width, height). This only matches for square images; for non-square images it must be
# changed together with target_size in dataflow_kwargs — confirm the image dimensions.
base_model = keras.applications.resnet50.ResNet50(weights='imagenet', include_top=False, input_shape=infoimg.size+(3,))

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# New classification head: global average pooling over the ResNet feature maps,
# followed by a single sigmoid unit for the binary decision.
avg = keras.layers.GlobalAveragePooling2D()(base_model.output)
output = keras.layers.Dense(1, activation="sigmoid")(avg)
model = keras.Model(inputs=base_model.input, outputs=output)

In [None]:
# freeze the weights of the pre-trained layers
for layer in base_model.layers:
  layer.trainable = False

In [None]:
# Optimiser, loss and training hyper-parameters for fitting the new classification head.
init_lr=1e-2
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated and
# rejected by newer Keras releases
optimizer = keras.optimizers.Nadam(learning_rate=init_lr)
loss_function = keras.losses.BinaryCrossentropy()

# NOTE: only accuracy is tracked here, so the resulting history has no 'auc' keys and
# cannot be passed to plot_training_history as-is
model.compile(loss=loss_function, optimizer=optimizer, metrics=['acc'])

# upper bound on epochs (early stopping may end training sooner) and generator batch size
max_epochs=100
batch_size = 128

In [None]:
# early stopping callback
# patience is number of epochs without improvement
# Stop after 5 epochs without improvement and roll the model back to its best weights.
# No `monitor` is given, so the Keras default applies (val_loss per the Keras docs).
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5,restore_best_weights=True)

## Data generator

In [None]:
def add_gaussian_noise(img):
  """Blend ``img`` with standard-normal noise of randomly chosen strength.

  The noise amplitude is drawn uniformly between 0.1 and 1 on every call; the noisy
  sum is divided by ``1 + amplitude``. Uses numpy's global random state.
  Returns a float16 array with the same shape as ``img``.
  """
  # bounds of the random noise amplitude
  amp_min, amp_max = 0.1, 1

  # draw order (noise field first, then amplitude) is kept so that seeded runs are unchanged
  noise = np.random.randn(*img.shape).astype(np.float16)
  amplitude = amp_min + np.random.rand()*(amp_max-amp_min)

  noisy = img + noise*amplitude
  return (noisy/(1+amplitude)).astype(np.float16)

In [None]:
# Shared ImageDataGenerator settings: emit float16 batches with pixel values scaled by 1/255
datagen_kwargs = dict(dtype=np.float16,rescale=1./255,)
# Shared flow_from_directory settings: binary labels, batches of `batch_size` images.
# NOTE(review): Keras expects target_size as (height, width) while PIL's size is (width, height);
# consistent with the model's input_shape above, but only correct for square images — confirm.
dataflow_kwargs = dict(target_size=infoimg.size, batch_size=batch_size,class_mode='binary',
                   interpolation="bilinear")

In [None]:
# Training-time augmentation: random rotations, shifts, zooms and flips on top of the shared rescaling
train_datagen = keras.preprocessing.image.ImageDataGenerator( **datagen_kwargs,
    rotation_range=45,
    width_shift_range=0.2,
    height_shift_range=0.2, 
    zoom_range=.2,
    horizontal_flip=True,
    vertical_flip=True)
    # disabled experiment: a preprocessing function named diff_of_gaussians
    # (NOTE(review): not defined in the visible part of this notebook)
    # ,
    # preprocessing_function=diff_of_gaussians)

# Validation images are only rescaled, never augmented
val_datagen=keras.preprocessing.image.ImageDataGenerator(**datagen_kwargs)

In [None]:
# Stream augmented training batches from the per-class folders created by train_val_dirs
train_generator = train_datagen.flow_from_directory(
    '/content/train/',
    **dataflow_kwargs)

# Stream validation batches (rescaled only).
# NOTE(review): `shuffle` is left at its default for the validation generator — confirm that is intended
val_generator = val_datagen.flow_from_directory(
    '/content/val/',
    **dataflow_kwargs)

Found 50737 images belonging to 2 classes.
Found 8953 images belonging to 2 classes.


# Training

In [None]:
# set up Tensorboard
logdir = os.path.join("logs", datetime.datetime.now().strftime("%Y%m%d-%H%M%S"))
%tensorboard --logdir logs

In [None]:
# tensorboard 
logdir = os.path.join("logs", datetime.datetime.now(pytz.timezone('US/Eastern')).strftime("%y%m%d_%H%M"))
tb_cb = keras.callbacks.TensorBoard(logdir)

steps_per_epoch = train_generator.samples // train_generator.batch_size
val_steps = val_generator.samples // val_generator.batch_size
# train it!
history = model.fit(train_generator, epochs = max_epochs, steps_per_epoch=steps_per_epoch,
                    validation_data = val_generator, validation_steps = val_steps,
                    callbacks=[early_stopping_cb,tb_cb])
    
    # train_generator, batch_size = batch_size, epochs = max_epochs, 
    #                 validation_data = validation_generator,
    #                 callbacks=[early_stopping_cb,tb_cb])

# plot_training_history(history)

Epoch 1/100
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
 54/396 [===>..........................] - ETA: 9:08 - loss: 0.6897 - acc: 0.5615

In [None]:
# Record the Keras version used for this run
keras.__version__

In [None]:
# Persist the grid-search results table.
# NOTE(review): results_df is not created anywhere above this cell in the visible notebook; on a
# fresh kernel this raises NameError — presumably left over from another notebook, confirm.
results_df.to_pickle('200819_denoising_gridsearch.pkl')

In [None]:
# Download the pickled results from the Colab VM to the local machine
from google.colab import files
files.download('200819_denoising_gridsearch.pkl') 

In [None]:
# Display the results table (see the note above: results_df must already exist in the kernel)
results_df

In [None]:
# Reload the saved grid-search results; this is the first cell in the visible notebook that defines results_df.
# Only unpickle files you created yourself — pickle can execute arbitrary code on load.
results_df=pd.read_pickle('/content/200819_denoising_gridsearch.pkl')

In [None]:
# Allow up to 100 rows to be rendered so the whole results table is visible
pd.set_option('display.max_rows',100)

In [None]:
# Display the per-fold results with the enlarged row limit
results_df

In [None]:
# average results over each cv fold
results_df=results_df.groupby(np.arange(len(results_df))//3).mean()

In [None]:
# Shape and content of the fold-averaged results table
print(results_df.shape)
results_df

In [None]:
# One figure per metric: metric value against low_sigma, one marker series per high_sigma setting
for metric_col, metric_label in [('auc','AUC'),('auc_std','std(AUC)'),('loss_std','std(loss)')]:
  plt.figure()
  for sigma in [3,5,8,11]:
    sigma_rows = results_df['high_sigma']==sigma
    plt.plot(results_df.loc[sigma_rows,'low_sigma'],results_df.loc[sigma_rows,metric_col],'o',label=str(sigma))
  plt.legend()
  plt.ylabel(metric_label)
  plt.xlabel('low sigma')
  plt.show()

# Train single model

In [None]:
# Train one model on a single track-based split and report its ROC AUC.
# NOTE(review): trackbased_cv, trackbased_ttsplit, procdata, make_model, datagen and stats are not
# defined in the visible part of this notebook — presumably carried over from another notebook;
# confirm they exist before running this cell.
cv_folds=3

# NOTE(review): low_sigma / high_sigma are assigned but not passed to any call visible in this cell
low_sigma=0.75
high_sigma=5

val_tracks=trackbased_cv(procdata,cv_folds)

# use the entry at index 2 of val_tracks as the validation set
X_train, X_test, y_train, y_test = trackbased_ttsplit(procdata,val_tracks[2])

# NOTE: this rebinds `model`, replacing the ResNet50-based model built above, and reuses the
# optimizer and loss_function objects created for that earlier model
model = make_model()
model.compile(loss=loss_function, optimizer=optimizer, metrics=keras.metrics.AUC())

# NOTE(review): batch_size is given both to datagen.flow and to fit — Keras documents that
# batch_size should not be passed to fit when the input is a generator; confirm and drop one
history=model.fit(datagen.flow(X_train, y_train, batch_size=batch_size), batch_size = batch_size, epochs = max_epochs, 
    validation_data = (X_test, y_test),
    callbacks=[early_stopping_cb],
    verbose=1)


plot_training_history(history)

# Plot ROC curve and show ROC-AUC results of the training and validation sets. 
pred = [model.predict(X_train), model.predict(X_test)]
actual = [y_train, y_test]
stats(pred, actual)