# Imports & Config

In [3]:
import os, shutil, argparse, random, pickle, cv2, fnmatch, PIL, math, signal
from tqdm import tqdm_notebook as tqdm
import tensorflow as tf
import tensorflow.keras as k
import numpy as np
import pandas as pd
from sklearn.utils import shuffle
from tensorflow.keras.optimizers import RMSprop, Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.preprocessing import image
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.metrics import auc
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import datetime

%matplotlib inline

print('[INFO] Imports done.  Using Tensorflow version: {}, and Keras version: {}'.format(tf.__version__, k.__version__))

[INFO] Imports done.  Using Tensorflow version: 1.13.1, and Keras version: 2.2.4-tf


In [4]:
config={}

config['note'] = '-CatAllData'

config['new_directory'] = ''

config['do_dataprep'] = False

config['source'] = os.path.join(os.getcwd(), 'source_data')
config['home_dir'] = os.path.join(os.getcwd(), 'data_cat_split')

config['train_dir'] = os.path.join(config['home_dir'], 'train')
config['test_dir'] = os.path.join(config['home_dir'], 'test')
config['val_dir'] = os.path.join(config['home_dir'], 'validation')

config['cat_list'] = ['{}.0'.format(i) for i in range(0, 16)]

config['val_split'] = 0.1
config['test_split'] = 0.1

config['target_size'] = (320, 240)

config['train_batch'] = 100
config['test_batch'] = 100
config['val_batch'] = 100

config['epochs'] = 1

config['set_limit'] = False
config['train_limit'] = 100
config['test_val_limit'] = 100

In [5]:
def working_dir(): 
    config['new_directory'] = os.path.join(os.getcwd()
                                       , 'marks-runs'
                                       , '{}, {}'.format(datetime.datetime.now().strftime("%y%m%d%H%M"), config['note']))

    if os.path.exists(config['new_directory']):
            shutil.rmtree(config['new_directory'])

    os.mkdir(config['new_directory'])

# Data Prep

## Split Helper Function

In [6]:
def split(SOURCE, TRAINING, TESTING, VALIDATION):
    
    print('[INFO] Splitting: {}'.format(SOURCE))

    files = []
    
    for file in os.listdir(SOURCE): 
        files.append(file)

    training_length = int(len(files) * (1-config['val_split']-config['test_split']))
    testing_length = int(len(files) * config['test_split'])
    validation_length = int(len(files) * config['val_split'])

    shuffled_set = random.sample(files, len(files))

    training_set = shuffled_set[0:training_length]
    testing_set = shuffled_set[training_length:training_length+testing_length]
    validation_set = shuffled_set[training_length+testing_length:]

    for filename in tqdm(training_set):
        this_file = os.path.join(SOURCE, filename)
        destination = os.path.join(TRAINING, filename)
        shutil.copyfile(this_file, destination)

    for filename in tqdm(testing_set):
        this_file = os.path.join(SOURCE, filename)
        destination = os.path.join(TESTING, filename)
        shutil.copyfile(this_file, destination)

    for filename in tqdm(validation_set):
        this_file = os.path.join(SOURCE, filename)
        destination = os.path.join(VALIDATION, filename)
        shutil.copyfile(this_file, destination)

In [7]:
def create_clean_dirs(): 
    new_dirs = [config['train_dir'], config['test_dir'], config['val_dir']]
    if os.path.exists(config['home_dir']):
        shutil.rmtree(config['home_dir'])
    os.mkdir(config['home_dir'])
        
    for value in new_dirs: 
        parent_folder = os.path.join(config['home_dir'], value)
        if not os.path.exists(parent_folder):
            os.mkdir(parent_folder)
            for cat in config['cat_list']: 
                temp = cat.split('.')[0]
                if int(temp) <= 9: 
                    temp2 = '0{}'.format(temp)
                else: 
                    temp2 = temp
                print('class{}'.format(temp2))
                child_folder = os.path.join(parent_folder, temp2)
                if not os.path.exists(child_folder):
                    os.mkdir(child_folder)

In [8]:
create_clean_dirs()

class00
class01
class02
class03
class04
class05
class06
class07
class08
class09
class10
class11
class12
class13
class14
class15
class00
class01
class02
class03
class04
class05
class06
class07
class08
class09
class10
class11
class12
class13
class14
class15
class00
class01
class02
class03
class04
class05
class06
class07
class08
class09
class10
class11
class12
class13
class14
class15


## Creating Directories

In [9]:
def check_dir_size(): 
    # create_clean_dirs()
    new_dirs = [config['train_dir'], config['test_dir'], config['val_dir']]
    for directory in new_dirs: 
        for sub_dir in os.listdir(directory): 
            file_counter = sum([len(files) for r, d, files in os.walk(os.path.join(config['train_dir'], directory, sub_dir))])
            print('{}/{}: has {} files'.format(directory, sub_dir, file_counter))

In [10]:
if config['do_dataprep']: 
    
    # Part 1: Make Directories
    
    create_clean_dirs()
    check_dir_size()
    
    # Part 2: Loop on all directories and call the split function
    
    for cat in config['cat_list']:
        split(
            SOURCE = os.path.join(config['source'], cat)
            , TRAINING = os.path.join(config['train_dir'], cat) 
            , TESTING = os.path.join(config['test_dir'], cat) 
            , VALIDATION = os.path.join(config['val_dir'], cat) 
        )
        
    check_dir_size()

## Limiting data

In [11]:
train_limiting_dirs = [
        config['train_pain'], config['train_nopain']
    ]

test_val_limiting_dirs = [
        config['test_pain'], config['test_nopain']
        , config['val_pain'], config['val_nopain']
    ]

if config['set_limit']: 
    print('Before')
    for d in train_limiting_dirs: 
        print('{}: {}'.format(d, len(os.listdir(d))))
    for d in test_val_limiting_dirs: 
        print('{}: {}'.format(d, len(os.listdir(d))))
        
    for d in train_limiting_dirs: 
        counter = 1
        for filename in os.listdir(d): 
            if counter > config['train_limit']:
                os.remove(os.path.join(d, filename))
            counter = counter +1
    
    for d in test_val_limiting_dirs: 
        counter = 1
        for filename in os.listdir(d): 
            if counter > config['test_val_limit']:
                os.remove(os.path.join(d, filename))
            counter = counter +1

    print('\nAfter')
    for d in train_limiting_dirs: 
        print('{}: {}'.format(d, len(os.listdir(d))))
    for d in test_val_limiting_dirs: 
        print('{}: {}'.format(d, len(os.listdir(d))))

KeyError: 'train_pain'

#  Modelling

## Image Generators

In [13]:
config['train_count'] = sum([len(files) for r, d, files in os.walk(config['train_dir'])])
config['train_steps'] = math.ceil(config['train_count']/config['train_batch'])

train_datagen = ImageDataGenerator(
    rescale=1.0/255.0
#     , rotation_range=40
#     , width_shift_range=0.2
#     , height_shift_range=0.2
#     , shear_range=0.2
#     , zoom_range=0.2
#     , horizontal_flip=True
#     , fill_mode='nearest'
)

train_generator = train_datagen.flow_from_directory(
    config['train_dir']
    , target_size = config['target_size'] 
    , batch_size = config['train_batch']
    , class_mode = 'categorical'
)

Found 0 images belonging to 16 classes.


In [38]:
config['val_count'] = sum([len(files) for r, d, files in os.walk(config['val_dir'])])
config['val_steps'] = math.ceil(config['val_count']/config['val_batch'])

val_datagen = ImageDataGenerator(rescale=1.0/255.0)

val_generator = val_datagen.flow_from_directory(
    config['val_dir']  # This is the source directory for training images
    , target_size = config['target_size']  # All images will be resized to 150x150 for compressing
    , batch_size = config['val_batch']
    , class_mode = 'categorical' # Since we use binary_crossentropy loss, we need binary labels
)

Found 4854 images belonging to 16 classes.


In [39]:
config['test_count'] = sum([len(files) for r, d, files in os.walk(config['test_dir'])])
config['test_steps'] = math.ceil(config['test_count']/config['test_batch'])

test_datagen = ImageDataGenerator(rescale=1.0/255.0)

test_generator = test_datagen.flow_from_directory(
    config['test_dir']  # This is the source directory for training images
    , target_size = config['target_size']  # All images will be resized to 150x150 for compressing
    , batch_size = config['test_batch']
    , class_mode = 'categorical' # Since we use binary_crossentropy loss, we need binary labels
)

Found 4833 images belonging to 16 classes.


## Helper Functions

In [40]:
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        if(logs.get('categorical_accuracy')>0.99):
            print('\nReached 99% accuracy which is satisfactory so stopped training!')
            self.model.stop_training = True
        if(logs.get('categorical_accuracy')<0.4):
            print('\nUnable to go over 40% accuracy, so cancelling run!')
            self.model.stop_training = True

markscallbacks = myCallback()

In [41]:
def create_and_compile_model(num_layers, window_size): 
    
    cnn_config = [k.layers.Conv2D(16, (window_size,window_size), activation='relu', input_shape=(*config['target_size'], 3), padding = 'same', name='mh-conv-1')
                  , k.layers.MaxPooling2D(2, 2, name='mh-maxpool-1')]
    
    for i in range(2,num_layers+1,1):
        cnn_config.append([k.layers.Conv2D(32, (window_size, window_size), activation='relu', name='mh-conv-2'),
                           k.layers.MaxPooling2D(2,2, name='mh-maxpool-2')])
        
            , k.layers.Conv2D(64, (3,3), activation='relu', name='mh-conv-3')
            , k.layers.MaxPooling2D(2,2, name='mh-maxpool-3')
            , k.layers.Conv2D(128, (3,3), activation='relu', name='mh-conv-4')
            , k.layers.MaxPooling2D(2,2, name='mh-maxpool-4')
            , k.layers.Conv2D(256, (3,3), activation='relu', name='mh-conv-5')
            , k.layers.MaxPooling2D(2,2, name='mh-maxpool-5')
            , k.layers.Flatten(name='mh-flatten-1')
            , k.layers.Dropout(0.5)
            , k.layers.Dense(1024, activation='relu', name='mh-dense-1')
            , k.layers.Dropout(0.2)
            , k.layers.Dense(128, activation='relu', name='mh-dense-2')
            , k.layers.Dropout(0.2)
            , k.layers.Dense(16, activation='softmax', name='mh-dense-output')
            ]
    
    
    model = k.models.Sequential(cnn_config)
    model.compile(
        optimizer=Adam()
        , loss='categorical_crossentropy'
        , metrics=['categorical_accuracy']
    )
    
    return model

## Training

In [42]:
for num_layers in range(2,5,1):
    for w_size in range(3,20,1):
        model = create_and_compile_model(num_layers,w_size)
        model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
mh-conv-1 (Conv2D)           (None, 320, 240, 16)      448       
_________________________________________________________________
mh-maxpool-1 (MaxPooling2D)  (None, 160, 120, 16)      0         
_________________________________________________________________
mh-conv-2 (Conv2D)           (None, 158, 118, 32)      4640      
_________________________________________________________________
mh-maxpool-2 (MaxPooling2D)  (None, 79, 59, 32)        0         
_________________________________________________________________
mh-conv-3 (Conv2D)           (None, 77, 57, 64)        18496     
_________________________________________________________________
mh-maxpool-3 (MaxPooling2D)  (None, 38, 28, 64)        0         
_________________________________________________________________
mh-conv-4 (Conv2D)           (None, 36, 26, 128)       73856     
__________

In [43]:
working_dir()

history = model.fit_generator(
    train_generator
    , steps_per_epoch = config['train_steps']
    , epochs = config['epochs']
    , validation_data = val_generator
    , validation_steps = config['val_steps']
    , callbacks=[
        markscallbacks
        , EarlyStopping(monitor='val_categorical_accuracy', patience=10)
        , ModelCheckpoint(filepath=os.path.join(config['new_directory'], 'model.h5'), monitor='val_categorical_accuracy', save_best_only=True)
        , ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=5, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)
    ]
)

print('\n[INFO] Training Complete')


[INFO] Training Complete


In [44]:
predicted = model.predict_generator(test_generator, steps=config['test_steps'])
labels = test_generator.classes

In [50]:
import numpy as np
from scipy import interp
import matplotlib.pyplot as plt
from itertools import cycle
from sklearn.metrics import roc_curve, auc

# Number of classes
n_classes = 16

# Plot linewidth.
lw = 2

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()

In [51]:
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(labels[:, i], predicted[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

IndexError: too many indices for array

In [61]:
labels[-100:]

array([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
       11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
       11, 11, 11, 11, 11, 11, 11, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12,
       13, 13, 13, 13, 13, 14, 14, 14, 14, 14, 14, 14, 15, 15, 15],
      dtype=int32)

In [46]:
# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(labels.ravel(), predicted.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Compute macro-average ROC curve and ROC area

# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at this points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves
plt.figure(1)
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)

plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'
               ''.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)

colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'
             ''.format(i, roc_auc[i]))

plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()

IndexError: too many indices for array

In [None]:
fpr_keras, tpr_keras, thresholds_keras = roc_curve(labels, predicted)

auc_keras = auc(fpr_keras, tpr_keras)

plt.figure(figsize=(18,6))

plt.subplot(1, 2 ,1)
plt.xticks([])
plt.yticks([])
plt.grid(False)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr_keras, tpr_keras, label='Keras (area = {:.3f})'.format(auc_keras))
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
plt.title('ROC curve')
plt.legend(loc='best')
# plt.show()

hist = pd.DataFrame(history.history)
hist['epoch'] = history.epoch
plt.subplot(1, 2, 2)
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.plot(hist['epoch'], hist['binary_accuracy'], label='Train Binary Accuracy')
plt.plot(hist['epoch'], hist['val_binary_accuracy'], label = 'Val Binary Accuracy')
plt.title('Accuracy vs Training')
plt.legend(loc='best')

# plt.show()

plt.savefig(os.path.join(config['new_directory'], 'result plots.png'))

hist.to_csv(os.path.join(config['new_directory'], 'run_results.csv'))

In [None]:
print('Completed running Notebook')

In [None]:
os.kill(os.getpid(), signal.SIGKILL)