In [None]:
# import statements
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import skimage.io
import skimage.color as color
import pandas as pd
import glob
import json
from PIL import Image
np.random.seed(123)
import os

from sklearn.preprocessing import label_binarize
from sklearn.metrics import confusion_matrix
import itertools

import keras
from keras.utils.np_utils import to_categorical # used for converting labels to one-hot-encoding
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D

from keras import backend as K
import itertools
from keras.layers.normalization import BatchNormalization
from keras.utils.np_utils import to_categorical # convert to one-hot-encoding

from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau
from sklearn.model_selection import train_test_split

%matplotlib inline
%config InlineBackend.figure_formats = {'png', 'retina'}

# JB's favorite Seaborn settings for notebooks
rc = {'lines.linewidth': 2, 
      'axes.labelsize': 18, 
      'axes.titlesize': 18, 
      'axes.facecolor': 'DFDFE5'}
sns.set_context('notebook', rc=rc)
sns.set_style('darkgrid', rc=rc)

# Loading the JPEGs and Segmentations

In [None]:
# Collect file paths (not pixel data) for the downloaded images and their
# segmentation PNGs.

# Source directories
im_dir = './ISIC-Archive-Downloader/Data/Images'
im_dir_seg = './ISIC-Archive-Downloader/Data/Segmentation'

# Glob patterns: every .jpeg in the image folder, every .png in the segmentation folder
im_glob = os.path.join(im_dir, '*.jpeg')
im_glob_seg = os.path.join(im_dir_seg, '*.png')

# Matching paths: im_lst for the images, im_lst_seg for the segmentations
im_lst = glob.glob(im_glob)
im_lst_seg = glob.glob(im_glob_seg)

# Processing the Metadata

In [None]:
# Collect the paths of all description (JSON metadata) files stored in
# "Data/Descriptions" into a list called json_lst.

json_dir = './ISIC-Archive-Downloader/Data/Descriptions'

json_glob = os.path.join(json_dir,'*')
# glob.glob returns paths in arbitrary, filesystem-dependent order. Sorting
# makes the row order of the metadata frame -- and therefore the seeded
# train/test split built from it -- reproducible across machines and runs.
json_lst = sorted(glob.glob(json_glob))

In [None]:
# Preview only the first few paths; displaying the whole list floods the
# notebook output with one line per description file.
json_lst[:10]

## Creating a DataFrame of the Metadata

In [None]:
# Build the metadata frame: one row per description file, holding the
# 'clinical' metadata plus the image name and the path of the matching JPEG.
#
# Records are collected in a list and the DataFrame is created once.
# Growing a frame with DataFrame.append inside the loop copies the whole
# frame on every iteration (quadratic) and DataFrame.append no longer exists
# in pandas >= 2.0. Building from a list also gives the rows a unique
# 0..n-1 index instead of every row being labelled 0.
records = []
for file in json_lst:
    with open(file, 'r') as fp:
        obj = json.load(fp)
    # Copy so the parsed JSON object is not mutated while adding columns
    meta_dict = dict(obj['meta']['clinical'])
    meta_dict['name'] = obj['name']
    meta_dict['path'] = im_dir + '/{}.jpeg'.format(meta_dict['name'])
    records.append(meta_dict)

df_final = pd.DataFrame(records)

df_final[0:10]

In [None]:
# Size check of the metadata frame: (number of rows, number of columns)
df_final.shape

In [None]:
# Processing the dataframe WARNING: LONG TIME TO RUN

# Turning age to floats (unparseable values become NaN) and replacing NaN's
# with the mean age. The filled column is assigned back explicitly:
# calling fillna(inplace=True) on df_final['age_approx'] is chained
# assignment, which may operate on a temporary copy and leave df_final
# unchanged (and is deprecated in recent pandas).
age_numeric = pd.to_numeric(df_final['age_approx'], errors='coerce')
df_final['age_approx'] = age_numeric.fillna(age_numeric.mean())

# Turning benign_malignant into 0's and 1's and storing it in a column called
# "result": 1 where the label is exactly "malignant", 0 for anything else.
df_final['result'] = (df_final.benign_malignant == "malignant").astype('int')



In [None]:
# Open every file listed in df_final['path'], resize it to 100x100 and keep
# the pixel data as a NumPy array (one array per row of the frame).
images = df_final['path'].map(
    lambda image_path: np.asarray(Image.open(image_path).resize((100, 100)))
)
images

In [None]:
# Separate the label from the remaining metadata (needed to build the
# train/test data):
#   features -> df_final without the 'benign_malignant' and 'result' columns
#   target   -> the 'result' column (y)

features = df_final.drop(columns=['benign_malignant', 'result'])
target = df_final['result']


# Creating Training and Test Data
The data are split 80:20 into training and test sets; 10% of the training portion is later held out for validation.

In [None]:
# Hold out 20% of the images (with their labels) as the test set; the fixed
# random_state keeps the split repeatable for a given input order.
x_train_o, x_test_o, y_train_o, y_test_o = train_test_split(
    images,
    target,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Taking the image data: stack the per-image arrays of each split into one array
x_train = np.asarray(x_train_o.tolist())
x_test = np.asarray(x_test_o.tolist())

# Normalization statistics come from the training split only
x_train_mean = np.mean(x_train)
x_train_std = np.std(x_train)

# Test-split statistics are still computed so the names stay available for
# inspection, but they are deliberately NOT used for scaling (see below).
x_test_mean = np.mean(x_test)
x_test_std = np.std(x_test)

# Both splits must go through the same transform. Scaling the test data with
# its own mean/std would feed the model inputs on a different scale than the
# one it was trained on and leaks test-set statistics into preprocessing.
x_train = (x_train - x_train_mean)/x_train_std
x_test = (x_test - x_train_mean)/x_train_std

# Creating one hot encoded y_train and y_test (two classes)
y_train = to_categorical(y_train_o, num_classes = 2)
y_test = to_categorical(y_test_o, num_classes = 2)


In [None]:
# Carve a 10% validation set out of the training data (fixed seed).
# Note: this overwrites x_train / y_train, so re-running the cell splits the
# already-reduced training set again.
x_train, x_validate, y_train, y_validate = train_test_split(
    x_train, y_train, test_size=0.1, random_state=2
)

# Give every split the (n_samples, 100, 100, 3) layout used as the model input
image_shape = (100, 100, 3)
x_train = x_train.reshape((x_train.shape[0],) + image_shape)
x_test = x_test.reshape((x_test.shape[0],) + image_shape)
x_validate = x_validate.reshape((x_validate.shape[0],) + image_shape)

# CNN Model

In [None]:
# Input is a 100x100 image with 3 channels; the network predicts 2 classes
input_shape = (100, 100, 3)
num_classes = 2

model = Sequential()

# Block 1: dilated 3x3 convolution, then a stride-2 3x3 convolution,
# followed by max-pooling and dropout.
model.add(Conv2D(32, kernel_size=(3, 3),activation='relu',padding = 'Same',input_shape=input_shape, dilation_rate=2))
# Fixed: the keyword was misspelled 'stries', which makes Conv2D raise a
# TypeError for an unexpected keyword argument.
model.add(Conv2D(32,kernel_size=(3, 3), activation='relu',padding = 'Same',strides=2))
model.add(MaxPool2D(pool_size = (2, 2)))
model.add(Dropout(0.25))

# Block 2: two 64-filter 3x3 convolutions, max-pooling and heavier dropout
model.add(Conv2D(64, (3, 3), activation='relu',padding = 'Same'))
model.add(Conv2D(64, (3, 3), activation='relu',padding = 'Same'))
model.add(MaxPool2D(pool_size=(2, 2)))
model.add(Dropout(0.40))

# Classifier head: flatten, 128-unit dense layer, dropout, softmax over classes
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

In [None]:
# Adam optimizer with every hyper-parameter spelled out explicitly
optimizer = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0.0,
    amsgrad=False,
)

# Compile for one-hot encoded labels: categorical cross-entropy loss,
# accuracy reported as the metric
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Learning-rate annealer: multiply the learning rate by 0.5 whenever
# 'val_acc' has not improved for 3 epochs, never going below 1e-5.
# NOTE(review): newer Keras releases log this metric as 'val_accuracy' --
# confirm the key against the installed version.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1,
)

In [None]:

# Train for 10 epochs in mini-batches of 32 samples; loss and accuracy on
# (x_test, y_test) are reported after every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

In [None]:
# Inspect the dimensions of the training array
x_train.shape

In [None]:
# Fit the model
epochs = 50
batch_size = 10

# fit_generator expects a batch generator as its first argument. Passing the
# raw arrays positionally binds y_train to steps_per_epoch and fails with
# "got multiple values for argument 'steps_per_epoch'". A default
# ImageDataGenerator applies no augmentation; its flow() simply yields
# shuffled (x, y) batches of batch_size samples.
train_batches = ImageDataGenerator().flow(x_train, y_train, batch_size=batch_size)

history = model.fit_generator(train_batches,
                              epochs = epochs, validation_data = (x_validate,y_validate),
                              # at least one step, even if there are fewer samples than batch_size
                              verbose = 1, steps_per_epoch=max(1, x_train.shape[0] // batch_size)
                              , callbacks=[learning_rate_reduction])

In [None]:
# Collect file paths (not pixel data) for the downloaded images and their
# segmentation PNGs.

# Source directories
im_dir = './ISIC-Archive-Downloader/Data/Images'
im_dir_seg = './ISIC-Archive-Downloader/Data/Segmentation'

# Glob patterns: every .jpeg in the image folder, every .png in the segmentation folder
im_glob = os.path.join(im_dir, '*.jpeg')
im_glob_seg = os.path.join(im_dir_seg, '*.png')

# Matching paths: im_lst for the images, im_lst_seg for the segmentations
im_lst = glob.glob(im_glob)
im_lst_seg = glob.glob(im_glob_seg)

# Processing the Metadata

In [None]:
# Collect the paths of all description (JSON metadata) files stored in
# "Data/Descriptions" into a list called json_lst.

json_dir = './ISIC-Archive-Downloader/Data/Descriptions'

json_glob = os.path.join(json_dir,'*')
# glob.glob returns paths in arbitrary, filesystem-dependent order. Sorting
# makes the row order of the metadata frame -- and therefore the seeded
# train/test split built from it -- reproducible across machines and runs.
json_lst = sorted(glob.glob(json_glob))

In [None]:
# Preview only the first few paths; displaying the whole list floods the
# notebook output with one line per description file.
json_lst[:10]

## Creating a DataFrame of the Metadata

In [None]:
# Build the metadata frame: one row per description file, holding the
# 'clinical' metadata plus the image name and the path of the matching JPEG.
#
# Records are collected in a list and the DataFrame is created once.
# Growing a frame with DataFrame.append inside the loop copies the whole
# frame on every iteration (quadratic) and DataFrame.append no longer exists
# in pandas >= 2.0. Building from a list also gives the rows a unique
# 0..n-1 index instead of every row being labelled 0.
records = []
for file in json_lst:
    with open(file, 'r') as fp:
        obj = json.load(fp)
    # Copy so the parsed JSON object is not mutated while adding columns
    meta_dict = dict(obj['meta']['clinical'])
    meta_dict['name'] = obj['name']
    meta_dict['path'] = im_dir + '/{}.jpeg'.format(meta_dict['name'])
    records.append(meta_dict)

df_final = pd.DataFrame(records)

df_final[0:10]

In [None]:
# Processing the dataframe WARNING: LONG TIME TO RUN

# Turning age to floats (unparseable values become NaN) and replacing NaN's
# with the mean age. The filled column is assigned back explicitly:
# calling fillna(inplace=True) on df_final['age_approx'] is chained
# assignment, which may operate on a temporary copy and leave df_final
# unchanged (and is deprecated in recent pandas).
age_numeric = pd.to_numeric(df_final['age_approx'], errors='coerce')
df_final['age_approx'] = age_numeric.fillna(age_numeric.mean())

# Turning benign_malignant into 0's and 1's and storing it in a column called
# "result": 1 where the label is exactly "malignant", 0 for anything else.
df_final['result'] = (df_final.benign_malignant == "malignant").astype('int')



In [None]:
# Open every file listed in df_final['path'], resize it to 100x100 and keep
# the pixel data as a NumPy array (one array per row of the frame).
images = df_final['path'].map(
    lambda image_path: np.asarray(Image.open(image_path).resize((100, 100)))
)
images

In [None]:
# Separate the label from the remaining metadata (needed to build the
# train/test data):
#   features -> df_final without the 'benign_malignant' and 'result' columns
#   target   -> the 'result' column (y)

features = df_final.drop(columns=['benign_malignant', 'result'])
target = df_final['result']


# Creating Training and Test Data
The data are split 80:20 into training and test sets; 10% of the training portion is later held out for validation.

In [None]:
# Hold out 20% of the images (with their labels) as the test set; the fixed
# random_state keeps the split repeatable for a given input order.
x_train_o, x_test_o, y_train_o, y_test_o = train_test_split(
    images,
    target,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Taking the image data: stack the per-image arrays of each split into one array
x_train = np.asarray(x_train_o.tolist())
x_test = np.asarray(x_test_o.tolist())

# Normalization statistics come from the training split only
x_train_mean = np.mean(x_train)
x_train_std = np.std(x_train)

# Test-split statistics are still computed so the names stay available for
# inspection, but they are deliberately NOT used for scaling (see below).
x_test_mean = np.mean(x_test)
x_test_std = np.std(x_test)

# Both splits must go through the same transform. Scaling the test data with
# its own mean/std would feed the model inputs on a different scale than the
# one it was trained on and leaks test-set statistics into preprocessing.
x_train = (x_train - x_train_mean)/x_train_std
x_test = (x_test - x_train_mean)/x_train_std

# Creating one hot encoded y_train and y_test (two classes)
y_train = to_categorical(y_train_o, num_classes = 2)
y_test = to_categorical(y_test_o, num_classes = 2)


In [None]:
# Carve a 10% validation set out of the training data (fixed seed).
# Note: this overwrites x_train / y_train, so re-running the cell splits the
# already-reduced training set again.
x_train, x_validate, y_train, y_validate = train_test_split(
    x_train, y_train, test_size=0.1, random_state=2
)

# Give every split the (n_samples, 100, 100, 3) layout used as the model input
image_shape = (100, 100, 3)
x_train = x_train.reshape((x_train.shape[0],) + image_shape)
x_test = x_test.reshape((x_test.shape[0],) + image_shape)
x_validate = x_validate.reshape((x_validate.shape[0],) + image_shape)

# CNN Model

In [None]:
# Input is a 100x100 image with 3 channels; the network predicts 2 classes
input_shape = (100, 100, 3)
num_classes = 2

# Same layer stack as before, declared as a list instead of repeated add() calls
model = Sequential([
    # Block 1: dilated 3x3 convolution, stride-2 3x3 convolution, pooling, dropout
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='Same',
           input_shape=input_shape, dilation_rate=2),
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='Same', strides=2),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Block 2: two 64-filter 3x3 convolutions, pooling, heavier dropout
    Conv2D(64, (3, 3), activation='relu', padding='Same'),
    Conv2D(64, (3, 3), activation='relu', padding='Same'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.40),

    # Classifier head: flatten, 128-unit dense layer, dropout, softmax over classes
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.summary()

In [None]:
# Adam optimizer with every hyper-parameter spelled out explicitly
optimizer = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0.0,
    amsgrad=False,
)

# Compile for one-hot encoded labels: categorical cross-entropy loss,
# accuracy reported as the metric
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Learning-rate annealer: multiply the learning rate by 0.5 whenever
# 'val_acc' has not improved for 3 epochs, never going below 1e-5.
# NOTE(review): newer Keras releases log this metric as 'val_accuracy' --
# confirm the key against the installed version.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1,
)

In [None]:

# Train for 10 epochs in mini-batches of 32 samples; loss and accuracy on
# (x_test, y_test) are reported after every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)

In [None]:
# Inspect the dimensions of the training array
x_train.shape

In [None]:
# Fit the model
epochs = 50
batch_size = 10

# fit_generator expects a batch generator as its first argument. Passing the
# raw arrays positionally binds y_train to steps_per_epoch and fails with
# "got multiple values for argument 'steps_per_epoch'". A default
# ImageDataGenerator applies no augmentation; its flow() simply yields
# shuffled (x, y) batches of batch_size samples.
train_batches = ImageDataGenerator().flow(x_train, y_train, batch_size=batch_size)

history = model.fit_generator(train_batches,
                              epochs = epochs, validation_data = (x_validate,y_validate),
                              # at least one step, even if there are fewer samples than batch_size
                              verbose = 1, steps_per_epoch=max(1, x_train.shape[0] // batch_size)
                              , callbacks=[learning_rate_reduction])

# Fixing Dataset

In [None]:
# Collect the paths of the images in "Data/sm100x100images" (im_lst) and of
# the description files in "Data/Descriptions" (json_lst).
#
# glob.glob returns paths in arbitrary, filesystem-dependent order. Both
# lists are sorted so that the metadata frame built from json_lst -- and the
# seeded train/test split derived from it -- is reproducible.

im_dir = './Downloader/Data/sm100x100images'

im_glob = os.path.join(im_dir, '*.jpeg')
im_lst = sorted(glob.glob(im_glob))

json_dir = './Downloader/Data/Descriptions'

json_glob = os.path.join(json_dir,'*')
json_lst = sorted(glob.glob(json_glob))

In [None]:
# Build the metadata frame: one row per description file, holding the
# 'clinical' metadata plus the image name and the path of the matching JPEG.
#
# Records are collected in a list and the DataFrame is created once.
# Growing a frame with DataFrame.append inside the loop copies the whole
# frame on every iteration (quadratic) and DataFrame.append no longer exists
# in pandas >= 2.0. Building from a list also gives the rows a unique
# 0..n-1 index instead of every row being labelled 0.
records = []
for file in json_lst:
    with open(file, 'r') as fp:
        obj = json.load(fp)
    # Copy so the parsed JSON object is not mutated while adding columns
    meta_dict = dict(obj['meta']['clinical'])
    meta_dict['name'] = obj['name']
    meta_dict['path'] = im_dir + '/{}.jpeg'.format(meta_dict['name'])
    records.append(meta_dict)

df_final = pd.DataFrame(records)

df_final[0:10]

In [None]:
# Balance the classes: keep every malignant record and (at most) the same
# number of benign records.
#
# Fixes over the previous version:
#   * pd.values_count does not exist (AttributeError); the malignant count is
#     taken directly from the filtered frame instead.
#   * DataFrame.sort was removed from pandas; benign rows are now selected
#     explicitly by label rather than by relying on 'benign' sorting first.
df_malignant = df_final[df_final.benign_malignant == 'malignant']

# Number of benign rows to keep == number of malignant rows
splice_benign = len(df_malignant)
df_benign = df_final[df_final.benign_malignant == 'benign'].head(splice_benign)

# Load the pixel data of each selected image as a NumPy array (no resizing here)
images_benign = df_benign['path'].map(lambda x: np.asarray(Image.open(x)))
images_malignant = df_malignant['path'].map(lambda x: np.asarray(Image.open(x)))

In [None]:
# Stack the benign images followed by the malignant images into one Series.
# '+' on two Series performs index-aligned element-wise addition (it would
# add pixel arrays together), so pd.concat is required here.
images = pd.concat([images_benign, images_malignant], ignore_index=True)

# Labels in the same order as `images`, sized from each group's own length.
# Encoding matches the 'result' column used earlier in this notebook:
# 0 = benign, 1 = malignant.
target = np.concatenate([np.zeros(len(images_benign)), np.ones(len(images_malignant))])

In [None]:
# Hold out 20% of the images (with their labels) as the test set; the fixed
# random_state keeps the split repeatable for a given input order.
x_train_o, x_test_o, y_train_o, y_test_o = train_test_split(
    images,
    target,
    test_size=0.2,
    random_state=1234,
)

In [None]:
# Taking the image data: stack the per-image arrays of each split into one array
x_train = np.asarray(x_train_o.tolist())
x_test = np.asarray(x_test_o.tolist())

# Normalization statistics come from the training split only
x_train_mean = np.mean(x_train)
x_train_std = np.std(x_train)

# Test-split statistics are still computed so the names stay available for
# inspection, but they are deliberately NOT used for scaling (see below).
x_test_mean = np.mean(x_test)
x_test_std = np.std(x_test)

# Both splits must go through the same transform. Scaling the test data with
# its own mean/std would feed the model inputs on a different scale than the
# one it was trained on and leaks test-set statistics into preprocessing.
x_train = (x_train - x_train_mean)/x_train_std
x_test = (x_test - x_train_mean)/x_train_std

# Creating one hot encoded y_train and y_test (two classes)
y_train = to_categorical(y_train_o, num_classes = 2)
y_test = to_categorical(y_test_o, num_classes = 2)


In [None]:
# Carve a 10% validation set out of the training data (fixed seed).
# Note: this overwrites x_train / y_train, so re-running the cell splits the
# already-reduced training set again.
x_train, x_validate, y_train, y_validate = train_test_split(
    x_train, y_train, test_size=0.1, random_state=2
)

# Give every split the (n_samples, 100, 100, 3) layout used as the model input
image_shape = (100, 100, 3)
x_train = x_train.reshape((x_train.shape[0],) + image_shape)
x_test = x_test.reshape((x_test.shape[0],) + image_shape)
x_validate = x_validate.reshape((x_validate.shape[0],) + image_shape)

# CNN Model

In [None]:
# Input is a 100x100 image with 3 channels; the network predicts 2 classes
input_shape = (100, 100, 3)
num_classes = 2

# Same layer stack as before, declared as a list instead of repeated add() calls
model = Sequential([
    # Block 1: dilated 3x3 convolution, stride-2 3x3 convolution, pooling, dropout
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='Same',
           input_shape=input_shape, dilation_rate=2),
    Conv2D(32, kernel_size=(3, 3), activation='relu', padding='Same', strides=2),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.25),

    # Block 2: two 64-filter 3x3 convolutions, pooling, heavier dropout
    Conv2D(64, (3, 3), activation='relu', padding='Same'),
    Conv2D(64, (3, 3), activation='relu', padding='Same'),
    MaxPool2D(pool_size=(2, 2)),
    Dropout(0.40),

    # Classifier head: flatten, 128-unit dense layer, dropout, softmax over classes
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(num_classes, activation='softmax'),
])
model.summary()

In [None]:
# Adam optimizer with every hyper-parameter spelled out explicitly
optimizer = Adam(
    lr=0.001,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=None,
    decay=0.0,
    amsgrad=False,
)

# Compile for one-hot encoded labels: categorical cross-entropy loss,
# accuracy reported as the metric
model.compile(
    loss="categorical_crossentropy",
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# Learning-rate annealer: multiply the learning rate by 0.5 whenever
# 'val_acc' has not improved for 3 epochs, never going below 1e-5.
# NOTE(review): newer Keras releases log this metric as 'val_accuracy' --
# confirm the key against the installed version.
learning_rate_reduction = ReduceLROnPlateau(
    monitor='val_acc',
    factor=0.5,
    patience=3,
    min_lr=1e-5,
    verbose=1,
)

In [None]:

# Train for 10 epochs in mini-batches of 32 samples; loss and accuracy on
# (x_test, y_test) are reported after every epoch.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(x_test, y_test),
)