# Imports & Config

In [1]:
import os, shutil, argparse, random, pickle, cv2, fnmatch, PIL, math, signal
from tqdm import tqdm_notebook as tqdm
import tensorflow as tf
import tensorflow.keras as k
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.preprocessing import image
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import auc
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import datetime

%matplotlib inline

print('[INFO] Imports done.  Using Tensorflow version: {}, and Keras version: {}'.format(tf.__version__, k.__version__))

[INFO] Imports done.  Using Tensorflow version: 1.13.1, and Keras version: 2.2.4-tf


In [3]:
# Central configuration dictionary read by every later cell.
config={}

# Free-text tag appended to the name of this run's output folder
config['note'] = '-AllData'

# Per-run output folder: <cwd>/marks-runs/<yymmddHHMM>, <note>
config['new_directory'] = os.path.join(os.getcwd()
                                       , 'marks-runs'
                                       , '{}, {}'.format(datetime.datetime.now().strftime("%y%m%d%H%M"), config['note']))

# The timestamp has minute resolution, so a re-run within the same minute maps
# to the same folder; wipe it so the run starts from an empty directory.
if os.path.exists(config['new_directory']):
    shutil.rmtree(config['new_directory'])

# makedirs (rather than mkdir) also creates the parent 'marks-runs' folder,
# which does not exist yet on a fresh checkout.
os.makedirs(config['new_directory'])



# When True, the data-prep cells delete and rebuild the train/test/validation tree
config['do_dataprep'] = True

# Raw images (one sub-folder per label) and the root of the derived binary split
config['source'] = os.path.join(os.getcwd(), 'source_data')
config['home_dir'] = os.path.join(os.getcwd(), 'data_binary_split')

config['train_dir'] = os.path.join(config['home_dir'], 'train')
config['test_dir'] = os.path.join(config['home_dir'], 'test')
config['val_dir'] = os.path.join(config['home_dir'], 'validation')

# One sub-folder per class inside each split
config['train_pain'] = os.path.join(config['train_dir'], 'pain')
config['train_nopain'] = os.path.join(config['train_dir'], 'nopain')
config['test_pain'] = os.path.join(config['test_dir'], 'pain')
config['test_nopain'] = os.path.join(config['test_dir'], 'nopain')
config['val_pain'] = os.path.join(config['val_dir'], 'pain')
config['val_nopain'] = os.path.join(config['val_dir'], 'nopain')

# Fractions of each source folder that go to validation and test; the rest is training
config['val_split'] = 0.1
config['test_split'] = 0.1

# Passed to flow_from_directory and used as the model input shape.
# NOTE(review): Keras interprets target_size as (height, width) - confirm the
# frames really are 320 high by 240 wide and not the other way round.
config['target_size'] = (320, 240)

config['train_batch'] = 100
config['test_batch'] = 100
config['val_batch'] = 100

config['epochs'] = 100

# Optional cap on the number of files kept per class directory (see "Limiting data")
config['set_limit'] = False
config['train_limit'] = 6600
config['test_val_limit'] = 830

# Data Prep

## Split Helper Function

In [4]:
def _copy_files(filenames, source_dir, target_dir):
    """Copy every name in `filenames` from `source_dir` into `target_dir` with a progress bar."""
    for filename in tqdm(filenames):
        shutil.copyfile(os.path.join(source_dir, filename), os.path.join(target_dir, filename))


def split(SOURCE, TRAINING, TESTING, VALIDATION):
    """Randomly distribute the files of one source folder over three target folders.

    The files directly inside SOURCE are shuffled and then cut into three
    consecutive slices: the first goes to TRAINING, the second to TESTING and
    whatever remains to VALIDATION. Slice sizes come from config['val_split']
    and config['test_split']; both products are truncated to integers, so the
    validation slice absorbs the rounding remainder.

    Files are copied (not moved), and a file that already exists in a target
    folder under the same name is overwritten.

    Args:
        SOURCE: directory whose files are to be split.
        TRAINING: existing directory receiving the training share.
        TESTING: existing directory receiving the test share.
        VALIDATION: existing directory receiving the validation share.
    """
    print('[INFO] In Splitting')

    # Only regular files can be copied; a stray sub-directory would make copyfile fail.
    files = [
        entry for entry in os.listdir(SOURCE)
        if os.path.isfile(os.path.join(SOURCE, entry))
    ]

    training_length = int(len(files) * (1-config['val_split']-config['test_split']))
    testing_length = int(len(files) * config['test_split'])

    # random.sample over the full length returns a shuffled copy of the list
    shuffled_set = random.sample(files, len(files))

    training_set = shuffled_set[0:training_length]
    testing_set = shuffled_set[training_length:training_length+testing_length]
    validation_set = shuffled_set[training_length+testing_length:]

    _copy_files(training_set, SOURCE, TRAINING)
    _copy_files(testing_set, SOURCE, TESTING)
    _copy_files(validation_set, SOURCE, VALIDATION)

## Creating Directories

Strategy: treat the images in the `0.0` folder as the *no pain* class and the images in all other folders as the *pain* class.

In [5]:
# It seems I had these mixed up; the correct mapping is:
# folder 0.0 is no pain
# every other folder is pain

In [6]:
# Create the new directories

# Parents are listed before their children so that a plain os.mkdir works in order.
new_dirs = [
    config['home_dir']
    , config['train_dir'], config['test_dir'], config['val_dir']
    , config['train_pain'], config['train_nopain']
    , config['test_pain'], config['test_nopain']
    , config['val_pain'], config['val_nopain']
           ]

if config['do_dataprep']: 
    # Rebuild the whole split tree from scratch so no files from an earlier split remain
    if os.path.exists(config['home_dir']):
        shutil.rmtree(config['home_dir'])
        
    for value in new_dirs: 
        if not os.path.exists(value):
            os.mkdir(value)
    
    # Source folder '0.0' holds the no-pain images
    nopain_data_location = os.path.join(config['source'], '0.0')

    # Counts before the split; these must read the nopain directories they are labelled with
    print('[INFO] Files in nopain origin dir: \t{}'.format(len(os.listdir(nopain_data_location))))
    print('[INFO] Files in nopain train dir: \t{}'.format(len(os.listdir(config['train_nopain']))))
    print('[INFO] Files in nopain test dir: \t\t{}'.format(len(os.listdir(config['test_nopain']))))
    print('[INFO] Files in nopain val dir: \t\t{}'.format(len(os.listdir(config['val_nopain']))))
    
    split(
        SOURCE = nopain_data_location
        , TRAINING = config['train_nopain'] 
        , TESTING = config['test_nopain'] 
        , VALIDATION = config['val_nopain'] 
    )

    print('[INFO] Files in nopain origin dir: \t{}'.format(len(os.listdir(nopain_data_location))))
    print('[INFO] Files in nopain train dir: \t{}'.format(len(os.listdir(config['train_nopain']))))
    print('[INFO] Files in nopain test dir: \t\t{}'.format(len(os.listdir(config['test_nopain']))))
    print('[INFO] Files in nopain val dir: \t\t{}'.format(len(os.listdir(config['val_nopain']))))
    
    ###########################################################################################
    
    print('[INFO] Files in pain train dir: \t{}'.format(len(os.listdir(config['train_pain']))))
    print('[INFO] Files in pain test dir: \t{}'.format(len(os.listdir(config['test_pain']))))
    print('[INFO] Files in pain val dir: \t{}'.format(len(os.listdir(config['val_pain']))))
    
    # Every source sub-folder other than '0.0' is pooled into the single 'pain' class.
    # Non-directory entries (e.g. hidden metadata files) are skipped because split()
    # can only list a directory.
    pain_folders = [
        folder for folder in os.listdir(config['source'])
        if folder != '0.0' and os.path.isdir(os.path.join(config['source'], folder))
    ]
    
    # Each folder is split on its own, so every folder contributes to all three sets
    for folder in pain_folders:
        split(
            SOURCE = os.path.join(config['source'], folder)
            , TRAINING = config['train_pain'] 
            , TESTING = config['test_pain'] 
            , VALIDATION = config['val_pain'] 
        )

    print('[INFO] Files in pain train dir: \t{}'.format(len(os.listdir(config['train_pain']))))
    print('[INFO] Files in pain test dir: \t\t{}'.format(len(os.listdir(config['test_pain']))))
    print('[INFO] Files in pain val dir: \t\t{}'.format(len(os.listdir(config['val_pain']))))

[INFO] Files in nopain origin dir: 	40029
[INFO] Files in nopain train dir: 	0
[INFO] Files in nopain test dir: 		0
[INFO] Files in nopain val dir: 		0
[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=32023), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4002), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4004), HTML(value='')))


[INFO] Files in nopain origin dir: 	40029
[INFO] Files in nopain train dir: 	32023
[INFO] Files in nopain test dir: 		4002
[INFO] Files in nopain val dir: 		4004
[INFO] Files in pain train dir: 	0
[INFO] Files in pain test dir: 	0
[INFO] Files in pain val dir: 	0
[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=42), HTML(value='')))




HBox(children=(IntProgress(value=0, max=5), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=641), HTML(value='')))




HBox(children=(IntProgress(value=0, max=80), HTML(value='')))




HBox(children=(IntProgress(value=0, max=81), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=193), HTML(value='')))




HBox(children=(IntProgress(value=0, max=24), HTML(value='')))




HBox(children=(IntProgress(value=0, max=25), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=63), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=38), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=53), HTML(value='')))




HBox(children=(IntProgress(value=0, max=6), HTML(value='')))




HBox(children=(IntProgress(value=0, max=8), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=17), HTML(value='')))




HBox(children=(IntProgress(value=0, max=2), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




HBox(children=(IntProgress(value=0, max=4), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=216), HTML(value='')))




HBox(children=(IntProgress(value=0, max=27), HTML(value='')))




HBox(children=(IntProgress(value=0, max=27), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=60), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7), HTML(value='')))




HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=2327), HTML(value='')))




HBox(children=(IntProgress(value=0, max=290), HTML(value='')))




HBox(children=(IntProgress(value=0, max=292), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=1129), HTML(value='')))




HBox(children=(IntProgress(value=0, max=141), HTML(value='')))




HBox(children=(IntProgress(value=0, max=142), HTML(value='')))


[INFO] In Splitting


HBox(children=(IntProgress(value=0, max=1880), HTML(value='')))




HBox(children=(IntProgress(value=0, max=235), HTML(value='')))




HBox(children=(IntProgress(value=0, max=236), HTML(value='')))


[INFO] Files in pain train dir: 	6688
[INFO] Files in pain test dir: 		831
[INFO] Files in pain val dir: 		850


## Limiting data

In [9]:
# Report how many files each class directory holds in every split.
# Message templates are kept verbatim (including their tab padding) so the
# printed output stays exactly as before.
split_report = [
    ('[INFO] Files in nopain train dir: \t{}', 'train_nopain')
    , ('[INFO] Files in nopain test dir: \t{}', 'test_nopain')
    , ('[INFO] Files in nopain val dir: \t{}', 'val_nopain')
    , ('[INFO] Files in pain train dir: \t{}', 'train_pain')
    , ('[INFO] Files in pain test dir: \t\t{}', 'test_pain')
    , ('[INFO] Files in pain val dir: \t\t{}', 'val_pain')
]

for message, config_key in split_report:
    file_count = len(os.listdir(config[config_key]))
    print(message.format(file_count))

[INFO] Files in nopain train dir: 	32023
[INFO] Files in nopain test dir: 	4002
[INFO] Files in nopain val dir: 	4004
[INFO] Files in pain train dir: 	6688
[INFO] Files in pain test dir: 		831
[INFO] Files in pain val dir: 		850


In [14]:
# Class directories of the training split (capped at config['train_limit'])
train_limiting_dirs = [
        config['train_pain'], config['train_nopain']
    ]

# Class directories of the test and validation splits (capped at config['test_val_limit'])
test_val_limiting_dirs = [
        config['test_pain'], config['test_nopain']
        , config['val_pain'], config['val_nopain']
    ]

if config['set_limit']:
    all_limiting_dirs = train_limiting_dirs + test_val_limiting_dirs

    print('Before')
    for directory in all_limiting_dirs:
        print('{}: {}'.format(directory, len(os.listdir(directory))))

    # Keep the first <limit> entries of each directory listing and delete the rest.
    # Which files survive depends on the order os.listdir returns them in.
    limit_groups = (
        (train_limiting_dirs, 'train_limit')
        , (test_val_limiting_dirs, 'test_val_limit')
    )
    for directories, limit_key in limit_groups:
        for directory in directories:
            for position, filename in enumerate(os.listdir(directory), start=1):
                if position > config[limit_key]:
                    os.remove(os.path.join(directory, filename))

    print('\nAfter')
    for directory in all_limiting_dirs:
        print('{}: {}'.format(directory, len(os.listdir(directory))))

#  Modelling

## Image Generators

In [17]:
# Count every file below the training directory (both class sub-folders) and
# derive how many batches are needed to see each of them once per epoch.
config['train_count'] = sum(len(filenames) for _, _, filenames in os.walk(config['train_dir']))
config['train_steps'] = math.ceil(config['train_count'] / config['train_batch'])

# The only preprocessing is multiplying pixel values by 1/255; no augmentation.
train_datagen = ImageDataGenerator(rescale=1.0/255.0)

# Streams (images, 0/1 labels) batches, taking the class from the sub-folder name.
train_generator = train_datagen.flow_from_directory(
    config['train_dir'],
    target_size=config['target_size'],
    batch_size=config['train_batch'],
    class_mode='binary',
)

Found 38711 images belonging to 2 classes.


In [18]:
# Count every file below the validation directory and derive the number of
# batches needed to cover them once.
config['val_count'] = sum(len(filenames) for _, _, filenames in os.walk(config['val_dir']))
config['val_steps'] = math.ceil(config['val_count'] / config['val_batch'])

# Same preprocessing as for training: pixel values multiplied by 1/255.
val_datagen = ImageDataGenerator(rescale=1.0/255.0)

# Validation images are read from config['val_dir'], resized to
# config['target_size'] and labelled 0/1 to match the binary_crossentropy loss.
val_generator = val_datagen.flow_from_directory(
    config['val_dir'],
    target_size=config['target_size'],
    batch_size=config['val_batch'],
    class_mode='binary',
)

Found 4854 images belonging to 2 classes.


In [19]:
# Count every file below the test directory and derive the number of batches
# needed to cover them exactly once.
config['test_count'] = sum([len(files) for r, d, files in os.walk(config['test_dir'])])
config['test_steps'] = math.ceil(config['test_count']/config['test_batch'])

# Same preprocessing as for training: pixel values multiplied by 1/255.
test_datagen = ImageDataGenerator(rescale=1.0/255.0)

test_generator = test_datagen.flow_from_directory(
    config['test_dir']  # Source directory holding the held-out test images
    , target_size = config['target_size']  # Every image is resized to config['target_size']
    , batch_size = config['test_batch']
    , class_mode = 'binary' # Since we use binary_crossentropy loss, we need binary labels
    , shuffle = False # Keep directory order so predictions line up with test_generator.classes
)

Found 4833 images belonging to 2 classes.


## Helper Functions

In [16]:
class myCallback(tf.keras.callbacks.Callback):
    """Stop training early when training accuracy is either very high or very low.

    At the end of every epoch the 'binary_accuracy' entry of the logs is checked:
    above 0.99 the run is considered done, below 0.4 it is considered a failure.
    In both cases `self.model.stop_training` is set so training ends after the
    current epoch.
    """

    def on_epoch_end(self, epoch, logs=None):
        """Inspect this epoch's training accuracy and request a stop if it is out of range.

        Args:
            epoch: index of the epoch that just finished (unused).
            logs: metrics dictionary supplied by Keras; may be None or lack
                'binary_accuracy', in which case nothing happens.
        """
        # Avoid a shared mutable default and a None-vs-float comparison
        # (a TypeError on Python 3) when the metric is not reported.
        accuracy = (logs or {}).get('binary_accuracy')
        if accuracy is None:
            return

        if accuracy > 0.99:
            print('\nReached 99% accuracy which is satisfactory so stopped training!')
            self.model.stop_training = True
        if accuracy < 0.4:
            print('\nUnable to go over 40% accuracy, so cancelling run!')
            self.model.stop_training = True

# Single shared instance handed to fit_generator
markscallbacks = myCallback()

In [21]:
def create_and_compile_model(): 
    """Build and compile the binary image classifier.

    The network is five Conv2D(3x3, relu) + MaxPooling2D(2x2) stages with
    16, 32, 64, 128 and 256 filters, followed by Flatten, Dense(1024, relu),
    Dense(128, relu) and a single sigmoid output unit. Only the first
    convolution uses 'same' padding. The input shape is config['target_size']
    with 3 channels appended.

    Returns:
        The Sequential model, compiled with a default Adam optimizer,
        binary_crossentropy loss and the binary_accuracy metric.
    """
    layers = k.layers
    input_shape = tuple(config['target_size']) + (3,)

    model = k.models.Sequential()

    # First stage carries the input shape and is the only one with 'same' padding
    model.add(layers.Conv2D(16, (3,3), activation='relu', input_shape=input_shape, padding='same', name='mh-conv-1'))
    model.add(layers.MaxPooling2D(2, 2, name='mh-maxpool-1'))

    # Remaining stages double the filter count each time
    later_stages = (
        (32, 'mh-conv-2', 'mh-maxpool-2'),
        (64, 'mh-conv-3', 'mh-maxpool-3'),
        (128, 'mh-conv-4', 'mh-maxpool-4'),
        (256, 'mh-conv-5', 'mh-maxpool-5'),
    )
    for filters, conv_name, pool_name in later_stages:
        model.add(layers.Conv2D(filters, (3,3), activation='relu', name=conv_name))
        model.add(layers.MaxPooling2D(2, 2, name=pool_name))

    # Classifier head; Dropout(0.2) after each hidden dense layer is currently disabled
    model.add(layers.Flatten(name='mh-flatten-1'))
    model.add(layers.Dense(1024, activation='relu', name='mh-dense-1'))
    model.add(layers.Dense(128, activation='relu', name='mh-dense-2'))
    model.add(layers.Dense(1, activation='sigmoid', name='mh-dense-output'))

    model.compile(
        optimizer=Adam(),
        loss='binary_crossentropy',
        metrics=['binary_accuracy'],
    )

    return model

## Training

In [22]:
# Build a fresh compiled network and print its layer-by-layer summary.
model = create_and_compile_model()

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
mh-conv-1 (Conv2D)           (None, 320, 240, 16)      448       
_________________________________________________________________
mh-maxpool-1 (MaxPooling2D)  (None, 160, 120, 16)      0         
_________________________________________________________________
mh-conv-2 (Conv2D)           (None, 158, 118, 32)      4640      
_________________________________________________________________
mh-maxpool-2 (MaxPooling2D)  (None, 79, 59, 32)        0         
_________________________________________________________________
mh-conv-3 (Conv2D)           (None, 77, 57, 64)        18496     
_________________________________________________________________
mh-maxpool-3 (MaxPooling2D)  (None, 38, 28, 64)        0         
_________________________________________________________________
mh-conv-4 (Conv2D)           (None, 36, 26, 128)       73856     
__________

In [None]:
# Best model (by validation accuracy) is written into this run's output folder
checkpoint_path = os.path.join(config['new_directory'], 'model.h5')

training_callbacks = [
    # Custom stop on very high / very low training accuracy
    markscallbacks,
    # Stop after 10 epochs without improvement in validation accuracy
    EarlyStopping(monitor='val_binary_accuracy', patience=10),
    # Overwrite the checkpoint only when validation accuracy improves
    ModelCheckpoint(filepath=checkpoint_path, monitor='val_binary_accuracy', save_best_only=True),
    # Multiply the learning rate by 0.1 after 5 epochs without val_loss improvement
    ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0),
]

history = model.fit_generator(
    train_generator,
    steps_per_epoch=config['train_steps'],
    epochs=config['epochs'],
    validation_data=val_generator,
    validation_steps=config['val_steps'],
    callbacks=training_callbacks,
)

print('\n[INFO] Training Complete')

In [None]:
# Collect predictions and ground-truth labels batch by batch so the two stay
# aligned even when the generator shuffles. test_generator.classes is always in
# directory order, which does not match a shuffled generator's output order.
test_generator.reset()  # start from the first batch, also when this cell is re-run

batch_predictions = []
batch_labels = []
for _ in range(config['test_steps']):
    batch_images, batch_targets = next(test_generator)
    batch_predictions.append(model.predict_on_batch(batch_images))
    batch_labels.append(batch_targets)

# predicted keeps the (n_samples, 1) shape predict_generator returned; labels is 1-D
predicted = np.concatenate(batch_predictions)
labels = np.concatenate(batch_labels)

In [None]:
# ROC curve and area under it for the test-set predictions
fpr_keras, tpr_keras, thresholds_keras = roc_curve(labels, predicted)
auc_keras = auc(fpr_keras, tpr_keras)

# One figure, two panels: ROC curve on the left, accuracy history on the right
fig, (roc_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(18, 6))

# Left panel: ticks and grid are switched off, the dashed diagonal marks chance level
roc_ax.set_xticks([])
roc_ax.set_yticks([])
roc_ax.grid(False)
roc_ax.plot([0, 1], [0, 1], 'k--')
roc_ax.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
roc_ax.set_xlabel('False positive rate')
roc_ax.set_ylabel('True positive rate')
roc_ax.set_title('ROC curve')
roc_ax.legend(loc='best')

# Per-epoch metrics recorded by fit_generator, with the epoch index as a column
hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch

# Right panel: training vs. validation accuracy over the epochs
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')
accuracy_ax.plot(hist['epoch'], hist['binary_accuracy'], label='Train Binary Accuracy')
accuracy_ax.plot(hist['epoch'], hist['val_binary_accuracy'], label = 'Val Binary Accuracy')
accuracy_ax.set_title('Accuracy vs Training')
accuracy_ax.legend(loc='best')

# Persist both the figure and the raw history next to the model checkpoint
fig.savefig(os.path.join(config['new_directory'], 'result plots.png'))

hist.to_csv(os.path.join(config['new_directory'], 'run_results.csv'))

In [None]:
# Marker in the output showing that every preceding cell ran to the end.
print('Completed running Notebook')

In [None]:
# Send SIGKILL to this process's own PID, i.e. terminate the notebook kernel
# immediately; nothing after this cell can run.
# NOTE(review): presumably done to release the GPU/host memory held by
# TensorFlow - confirm. signal.SIGKILL is not available on Windows.
os.kill(os.getpid(), signal.SIGKILL)