In [None]:
import glob
import pandas as pd
import numpy as np
import imageio
from skimage.transform import resize
from scipy import misc

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix

In [None]:
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import image
from keras.models import Sequential
from keras.layers import Activation, Dropout, Flatten, Dense, Conv2D, MaxPooling2D
from keras.callbacks import ReduceLROnPlateau 
from keras.utils import np_utils, normalize

In [None]:
import pickle

In [None]:
# Directory that holds the metadata CSV.
PATH_DATA = './Data/'
# Paths of every JPEG under ./Images. glob returns matches in arbitrary,
# filesystem-dependent order, so sort them to keep reruns reproducible.
PATH_IMAGES = sorted(glob.glob('./Images/*.jpg'))

In [None]:
# Load the metadata table (HAM10000_metadata.csv) from the data directory.
df_meta = pd.read_csv(PATH_DATA + 'HAM10000_metadata.csv')

## I. EDA

In [None]:
# Number of rows and columns in the metadata table.
df_meta.shape

In [None]:
# Display one randomly drawn row to get a feel for the available columns.
df_meta.sample()

In [None]:
# Frequency of each value in 'dx' (the column used as the classification target further down).
df_meta['dx'].value_counts()

In [None]:
# Frequency of each value in the 'dx_type' column.
df_meta['dx_type'].value_counts()

In [None]:
# Frequency of each distinct value in the 'age' column.
df_meta['age'].value_counts()

In [None]:
# Frequency of each value in the 'sex' column.
df_meta['sex'].value_counts()

In [None]:
# Frequency of each value in the 'localization' column.
df_meta['localization'].value_counts()

## Experimentation
### Method 1 results in a dead kernel

In [None]:
# Method 1: Load all Images and Scale by 255
# images = []
# for path_image in PATH_IMAGES:
#     image = misc.imread(path_image)
#     image = image / 255
#     images.append(image)

### Method 2 can be completed with approx 3GB memory consumption

In [None]:
# Method 2: Load all Images and Resize to 50%, no scaling
# images = []
# for path_image in PATH_IMAGES:
#     image = misc.imread(path_image)
#     image = misc.imresize(image, size=(300, 225), interp='nearest')
#     images.append(image)
#     print('Completed processing {}'.format(path_image))

### Method 3 can be completed with approx 5GB memory consumption

In [None]:
# Method 3: Load all Images and Resize to 70%, no scaling
# images = []
# for path_image in PATH_IMAGES:
#     image = misc.imread(path_image)
#     image = misc.imresize(image, size=(420, 315), interp='nearest')
#     images.append(image)
#     print('Completed processing {}'.format(path_image))

In [None]:
# images = np.asarray(images)

In [None]:
# np.asarray([images.shape[1], images.shape[2], images.shape[3]])
# array([420, 315,   3])

### Method 4: scaling the pixel values uses up the entire 16GB of RAM plus swap

In [None]:
# Method 4: Load all Images and Resize to 50%, with scaling
# images = []
# for path_image in PATH_IMAGES:
#     image = misc.imread(path_image)
#     image = misc.imresize(image, size=(300, 225), interp='nearest')
#     image = image / 255
#     images.append(image)

## Data Visualisation

In [None]:
# This is working, redo this later
# figure = plt.figure()
# count = 0
# for index, row in df_combined.iterrows():
#     if count < 10:
#         plt.imshow(row['image'])
#         plt.axis('off')
#         count += 1
#     else:
#         break

## Modeling - CNN1

In [None]:
# Prepare Train and Test Data
# Index the metadata by image_id so the label index doubles as the image file name.
# Guarded so re-running this cell does not fail once 'image_id' is already the index.
if df_meta.index.name != 'image_id':
    df_meta = df_meta.set_index('image_id')
y = df_meta['dx']
# Stratify on the label so both splits keep the class proportions of the full
# dataset; a plain random split can under-represent the rare classes.
y_train, y_test = train_test_split(y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Original Size: 600*450 (width * height)
# Method: Load all images, resize to 30% and normalize

# load_img expects target_size as (height, width), so the 30% size of a
# 600-wide, 450-high image is (135, 180); (180, 135) would squash the
# landscape images into a portrait frame.
TARGET_SIZE = (135, 180)


def load_images(image_ids, target_size=TARGET_SIZE):
    """Load, resize and normalize the images named by ``image_ids``.

    Parameters
    ----------
    image_ids : sequence of str
        File stems; each image is read from './Images/<image_id>.jpg'.
    target_size : tuple of int
        Output size as (height, width).

    Returns
    -------
    numpy.ndarray
        float32 array of shape (len(image_ids), height, width, 3), in the
        same order as ``image_ids``.
    """
    # Preallocate in float32: avoids holding a Python list plus a stacked copy,
    # and halves the footprint compared with normalize()'s float64 output on
    # integer input. (skimage.transform.resize was tried earlier but used too
    # much memory.)
    batch = np.empty((len(image_ids),) + tuple(target_size) + (3,), dtype=np.float32)
    for position, image_id in enumerate(image_ids):
        # Keras' loader replaces scipy.misc.imread/imresize, which were removed
        # from SciPy; nearest-neighbour resampling matches the former call.
        picture = image.load_img(
            './Images/{}.jpg'.format(image_id),
            target_size=target_size,
            interpolation='nearest',
        )
        # normalize() with its defaults (Keras: L2 norm along the last axis).
        batch[position] = normalize(np.asarray(picture, dtype=np.float32))
        print('Completed processing {}.jpg'.format(image_id))
    return batch


x_train = load_images(y_train.index.values)
x_test = load_images(y_test.index.values)

In [None]:
# Snapshot the per-class counts of the test labels while y_test is still a pandas Series.
target_names = y_test.value_counts()

# From here on only the raw label arrays are needed, not the pandas index.
y_train, y_test = y_train.values, y_test.values

In [None]:
def one_hot_encode_object_array(arr, classes=None):
    """One-hot encode an array of labels (e.g. strings).

    Parameters
    ----------
    arr : array-like
        Labels to encode.
    classes : array-like, optional
        Complete set of labels that defines the output columns (duplicates
        are ignored, columns follow sorted order). Pass the same value when
        encoding several splits so that their columns line up even if a
        split is missing a class. When omitted, the distinct values found
        in ``arr`` are used.

    Returns
    -------
    numpy.ndarray
        float32 matrix of shape (len(arr), number of classes) with a single
        1 per row.

    Raises
    ------
    ValueError
        If ``classes`` is given and ``arr`` contains a label outside it.
    """
    labels = np.asarray(arr).ravel()
    if classes is None:
        uniques, ids = np.unique(labels, return_inverse=True)
        ids = np.asarray(ids).ravel()
    else:
        uniques = np.unique(np.asarray(classes))
        column_of = {label: column for column, label in enumerate(uniques.tolist())}
        try:
            ids = np.array([column_of[label] for label in labels.tolist()], dtype=int)
        except KeyError as missing:
            raise ValueError('label {} is not in classes'.format(missing))
    # Row i of the identity matrix is the one-hot vector of class i.
    return np.eye(len(uniques), dtype=np.float32)[ids]

# Replace both label arrays with their one-hot matrices.
# NOTE(review): each split is encoded from its own distinct labels, so the
# columns only line up if both splits contain the same set of classes.
y_train, y_test = map(one_hot_encode_object_array, (y_train, y_test))

In [None]:
def cnn(size, n_layers, n_classes=7):
    """Build, compile and summarize a small convolutional classifier.

    The network is ``n_layers`` blocks of Conv2D -> ReLU -> 2x2 max pooling ->
    Dropout(0.25), followed by one more 2x2 max pooling, Dropout(0.40), a
    flattened 256-unit ReLU layer with Dropout(0.50), and a softmax output.

    Parameters
    ----------
    size : sequence of int
        Shape of one input image as (height, width, channels).
    n_layers : int
        Number of convolutional blocks; must be between 1 and the number of
        available filter sizes (6 with the current constants).
    n_classes : int, optional
        Number of output classes (default 7).

    Returns
    -------
    model
        Sequential model compiled with categorical cross-entropy, the Adam
        optimizer and the accuracy metric.

    Raises
    ------
    ValueError
        If ``n_layers`` is outside the supported range.
    """
    # Define hyperparameters
    MIN_NEURONS = 64
    MAX_NEURONS = 256  # tuning note from the author: try 256, 512
    KERNEL = (3, 3)

    # Filter count of each convolutional layer: start at MIN_NEURONS and grow
    # in steps of MIN_NEURONS // 2, stopping before MAX_NEURONS (64, 96, ..., 224).
    neurons = np.arange(MIN_NEURONS, MAX_NEURONS, MIN_NEURONS // 2).astype(np.int32)

    # Fail early with a clear message rather than with an IndexError halfway
    # through building the model.
    if not 1 <= n_layers <= len(neurons):
        raise ValueError(
            'n_layers must be between 1 and {}, got {}'.format(len(neurons), n_layers)
        )

    # Plain Python ints keep Keras independent of the NumPy scalar types in `size`.
    input_shape = (int(size[0]), int(size[1]), int(size[2]))

    # Define a model
    model = Sequential()

    # Add convolutional blocks; only the first layer declares the input shape.
    for i in range(n_layers):
        filters = int(neurons[i])
        if i == 0:
            model.add(Conv2D(filters, KERNEL, padding='same', input_shape=input_shape))
        else:
            model.add(Conv2D(filters, KERNEL, padding='same'))

        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(0.25))

    # One additional pooling step before flattening
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.40))
    model.add(Flatten())

    # Fully connected layer
    model.add(Dense(MAX_NEURONS))
    model.add(Activation('relu'))
    model.add(Dropout(0.50))

    # Add output layer: one softmax unit per class
    model.add(Dense(n_classes))
    model.add(Activation('softmax'))

    # Compile the model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Print a summary of the model
    model.summary()

    return model

In [None]:
# Build the network for the shape of a single training image.
N_LAYERS = 4
# x_train is indexed (sample, height, width, channels); drop the sample axis.
height, width, channels = x_train.shape[1], x_train.shape[2], x_train.shape[3]
image_size = np.asarray([height, width, channels])
model = cnn(size=image_size, n_layers=N_LAYERS)

In [None]:
# Training hyperparameters
EPOCHS = 64
BATCH_SIZE = 16

# Halve the learning rate when training stops improving for 3 epochs.
# Monitor 'loss' rather than 'acc': the accuracy key is logged as 'acc' or
# 'accuracy' depending on the Keras version, and when the monitored key is
# missing the callback only warns and never reduces the rate. 'loss' is
# always logged, and mode='auto' minimizes it.
reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, verbose=1, mode='auto', min_delta=0.0001, cooldown=0, min_lr=0)

In [None]:
# Fit the network on the training split and keep fit()'s return value for later inspection.
model_history = model.fit(
    x_train,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    verbose=1,
    callbacks=[reduce_lr],
)

In [None]:
# Make a prediction on the test set
y_prob = model.predict(x_test)
# Convert the softmax probabilities to one-hot predictions by picking the most
# likely class. Rounding instead would leave an all-zero row whenever no class
# reaches 0.5, and such rows are later read as class 0 by argmax.
y_pred = np.eye(y_prob.shape[1], dtype=y_prob.dtype)[y_prob.argmax(axis=1)]
# Keep the predictions available under this name as well.
test_predictions = y_pred

In [None]:
# Score the trained model on both splits; each evaluate() call yields the
# loss followed by the accuracy metric the model was compiled with.
(train_loss, train_acc), (test_loss, test_acc) = (
    model.evaluate(features, targets, verbose=1, batch_size=BATCH_SIZE)
    for features, targets in ((x_train, y_train), (x_test, y_test))
)

In [None]:
# Per-class sample counts of the test split, captured before the labels were
# converted to arrays; compare them with the classification report below.
target_names

In [None]:
# Confusion matrix over class indices (rows: true class, columns: predicted class).
# Uses y_pred, the prediction array produced by the prediction cell.
confusion_matrix(y_test.argmax(axis=1), y_pred.argmax(axis=1))

In [None]:
# Check the Classification Report
# Compare class indices (argmax of the one-hot rows) so the report is a plain
# multi-class report, consistent with the confusion matrix.
# The one-hot columns follow the sorted distinct labels, so the sorted 'dx'
# values name them -- assumes the encoded split contained every class.
class_names = np.unique(df_meta['dx'].values)
print(classification_report(
    y_test.argmax(axis=1),
    y_pred.argmax(axis=1),
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

## Analysis...
- There is a severe class imbalance, which causes most samples from the other classes to be misclassified as `nv`
- Next step: re-run after merging every non-`nv` diagnosis (`dx`) into a single `others` class, then compare the results
- Update the CNN function by adding dropout layers

In [None]:
# Save the Model
# Use Keras' own serializer instead of pickle: pickling a Keras model is not
# supported on older Keras releases and is fragile across versions, whereas
# model.save() stores architecture, weights and optimizer state in one HDF5
# file that keras.models.load_model() can restore.
model.save('model-eda-preliminary-cnn5.h5')