In [1]:
# Mounting google drive
from google.colab import drive
drive.mount('/content/gdrive')
# Displaying content of the drive
!ls /content/gdrive/'My Drive'/skin-cancer-mnist-ham10000

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive
excercise.ipynb		    HAM10000_images_part_2	imageArray.npy
HAM10000_images_part_1	    HAM10000_images_part_2.zip	mnist_cnn.ipynb
HAM10000_images_part_1.zip  HAM10000_metadata.csv


In [None]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.image import image, ImageDataGenerator, array_to_img, img_to_array, load_img
from PIL import Image
import os
import glob

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score

# Keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Reshape, Conv2D, MaxPooling2D, BatchNormalization
from keras import regularizers
from keras.layers.advanced_activations import LeakyReLU
from keras.optimizers import Adam


In [None]:
# Read the HAM10000 metadata table from the mounted Drive folder.
data = pd.read_csv(
    '/content/gdrive/My Drive/skin-cancer-mnist-ham10000/HAM10000_metadata.csv'
)
# Preview the column names together with the first rows.
data.head()

In [None]:
# Bar chart of how often each value of the "dx" (diagnosis) column occurs.
dx_counts = data["dx"].value_counts()
fig, ax1 = plt.subplots(nrows=1, ncols=1, figsize=(10, 5))
dx_counts.plot(ax=ax1, kind="bar")
# Keep the category names upright so they stay readable.
plt.xticks(rotation='horizontal')
plt.ylabel('Frequency')
plt.xlabel('Diagnosis')

In [None]:
# Bar chart of how often each value of the "localization" column occurs.
localization_counts = data["localization"].value_counts()
localization_counts.plot(kind="bar")
plt.ylabel('Frequency')
plt.xlabel('Localization')

In [None]:
# Bar chart of how often each value of the "sex" column occurs.
sex_counts = data["sex"].value_counts()
sex_counts.plot(kind="bar")
# Keep the category names upright so they stay readable.
plt.xticks(rotation='horizontal')
plt.ylabel('Frequency')
plt.xlabel('Gender')

In [None]:
# Build an image_id -> file path lookup, attach the paths to the metadata table and
# load every image as a small array in data['images'].
folders = glob.glob('/content/gdrive/My Drive/skin-cancer-mnist-ham10000/*/*.jpg')

img_rows = 28
img_cols = 28
channels = 3
# Size of the noise vector, used as input to the Generator.
# (Not referenced by any other cell visible in this notebook.)
z_dim = 100

# Target size handed to PIL's resize(), which takes (width, height).
# input_dims = (int(600/8), int(450/8))
input_dims = (28, 28)
# Shape of one loaded image array. NumPy stores an image as (height, width, channels),
# i.e. the reverse of PIL's (width, height) order, so the two sizes are swapped here.
# For the square 28x28 setting this is (28, 28, 3) either way; for a non-square
# input_dims a plain `input_dims + (3,)` would not match the loaded arrays.
img_shape = (input_dims[1], input_dims[0], channels)

# The image id is the file name without its directory and extension.
img_loc = list(folders)
img_id = [os.path.splitext(os.path.basename(name))[0] for name in img_loc]
image_path = dict(zip(img_id, img_loc))

# Import the image path into the dataframe.
data["path"] = data["image_id"].map(image_path)

# Fail early with a readable message when a metadata row has no matching file;
# otherwise Image.open would be called on NaN and raise an unrelated-looking error.
missing_ids = data.loc[data["path"].isna(), "image_id"]
if len(missing_ids) > 0:
    raise FileNotFoundError(
        "No .jpg file found for {} image_id value(s), e.g. {}".format(
            len(missing_ids), missing_ids.head(5).tolist()))


def _load_resized(path):
    """Open the image at ``path``, resize it to ``input_dims`` and return it as an array."""
    # The context manager releases the file handle as soon as the pixels are copied.
    with Image.open(path) as img:
        return np.asarray(img.resize(input_dims))


# Load the images in the form of arrays, resized to input_dims.
data['images'] = data['path'].map(_load_resized)

In [None]:
# Show one image of every diagnosis class before and after resizing to input_dims,
# to judge whether the chosen downsampling is appropriate.
cell_type = data.groupby(["dx"]).head(1)
img_ids = list(cell_type["image_id"])
dx_type = list(cell_type["dx"])
# image_id -> diagnosis for the selected example images
examples = dict(zip(img_ids, dx_type))
folders = glob.glob('/content/gdrive/My Drive/skin-cancer-mnist-ham10000/*/*.jpg')
#input_dims = (120,98)
for name in folders:
    # splitext drops only the extension. str.strip(".jpg") would instead strip any
    # leading/trailing '.', 'j', 'p' or 'g' characters and can mangle the id.
    file_name = os.path.splitext(os.path.basename(name))[0]
    if file_name not in examples:
        continue
    t = examples[file_name]
    img3 = image.load_img(name)
    # Image.LANCZOS is the same filter as the Image.ANTIALIAS alias, which newer
    # Pillow releases no longer provide.
    img_resized = img3.resize(input_dims, Image.LANCZOS)

    plt.figure()
    plt.title(str(t) + " original")
    plt.imshow(img3)
    plt.figure()
    # Report the real target size instead of a fixed percentage.
    plt.title("{} resized to {}x{}".format(t, input_dims[0], input_dims[1]))
    plt.imshow(img_resized)

In [None]:
# Labels preprocessing of the categorical data
import numpy as np
from sklearn.preprocessing import LabelEncoder 

def humanIndex(columnLabel, n):
    """Print the original category that a fitted encoder maps to integer code ``n``.

    columnLabel -- fitted encoder object exposing ``inverse_transform``
                   (the LabelEncoder instances created in this cell).
    n           -- integer code to decode.

    The decoded value is printed (as the one-element result of
    ``inverse_transform([n])``); nothing is returned.
    """
    decoded = columnLabel.inverse_transform([n])
    print(decoded)

def _encode_label_column(frame, column):
    """Replace the categories in ``frame[column]`` with integer codes and report the mapping.

    A fresh LabelEncoder is fitted on the column, the column is overwritten in
    place with the encoded values, and the list of codes plus the matching class
    names are printed.

    Returns ``(encoder, codes)`` where ``codes`` is ``[0, 1, ..., n_classes - 1]``.
    """
    encoder = LabelEncoder()
    encoder.fit(frame[column])
    frame[column] = encoder.transform(frame[column])
    # Every fitted class occurs in the column, so the codes run from 0 to n_classes - 1.
    codes = list(range(len(encoder.classes_)))
    print('')
    print('Column "{}" labels:'.format(column))
    print('')
    print(codes)
    print(list(encoder.classes_))
    return encoder, codes


# NOTE: the columns are overwritten in place, so running this cell a second time
# would fit the encoders on the integer codes and lose the original class names.
dx, dx_i = _encode_label_column(data, 'dx')
dx_type, dx_type_i = _encode_label_column(data, 'dx_type')
sex, sex_i = _encode_label_column(data, 'sex')
localization, localization_i = _encode_label_column(data, 'localization')
print('')

# Example: show which diagnosis name corresponds to code 5.
humanIndex(dx, 5)

In [None]:
# Sorting data into Training (60%), Test (30%), and Validation (10%) sets
SEED = 42  # fixed so that the shuffle and both splits are the same on every run

data = data.sample(frac=1, random_state=SEED)
# img_shape is the per-image array shape defined together with input_dims in the
# image-loading cell; the reshape makes the expected (n, rows, cols, channels) layout explicit.
X = np.asarray(data['images'].tolist()).reshape((data.shape[0],) + img_shape)
y = data["dx"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=SEED)
# 0.143 of the remaining 70% is roughly 10% of the full data set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.143, random_state=SEED)

# Domain Randomization using mini-batch generator: random rotations, shifts,
# shear and zoom applied to the batches it yields.
domain_randomization = ImageDataGenerator(rotation_range=8,
                                          width_shift_range=0.08,
                                          shear_range=0.3,
                                          height_shift_range=0.08,
                                          zoom_range=0.08)

# convert class vectors to binary class matrices
y_train = keras.utils.to_categorical(y_train, num_classes=7)
y_val = keras.utils.to_categorical(y_val, num_classes=7)
y_test = keras.utils.to_categorical(y_test, num_classes=7)

# Scale pixel values from 0..255 to 0..1 as float32.
X_train = X_train.astype('float32') / 255
X_val = X_val.astype('float32') / 255
X_test = X_test.astype('float32') / 255

print('X_train shape:', X_train.shape)
print('X_val shape:', X_val.shape)
print('X_test shape:', X_test.shape)
print(X_train.shape[0], 'train samples')
print(X_val.shape[0], 'validation samples')
print(X_test.shape[0], 'test samples')
print(y_val.shape[0], 'validation samples')

In [None]:
# Task 2: Building CNN architecture
batch_size = 128
epochs = 50
steps = 100          # mini-batches drawn from the augmenting generator per epoch
num_classes = 7
learn_rate = 0.0001

## CNN
model0 = Sequential()

# Layer 1: 32 filters of 3x3; 'same' padding keeps the spatial size before pooling.
# img_shape is the per-image array shape defined together with input_dims.
model0.add(Conv2D(32,
                  kernel_size=(3, 3),
                  strides=1,
                  padding='same',
                  dilation_rate=1,
                  activation='relu',
                  input_shape=img_shape))
model0.add(BatchNormalization())
model0.add(MaxPooling2D(pool_size=(2, 2)))

# Layer 2: 64 filters of 3x3
model0.add(Conv2D(64, (3, 3), activation='relu'))
model0.add(BatchNormalization())
model0.add(MaxPooling2D(pool_size=(2, 2)))

model0.add(Dropout(0.25))

# Classifier head
model0.add(Flatten())
model0.add(Dense(128, activation='relu'))
model0.add(Dropout(0.5))
model0.add(Dense(num_classes, activation='softmax'))  # output layer is dense layer of 7 nodes as there are 7 classes

model0.compile(loss=keras.losses.categorical_crossentropy,
               optimizer=keras.optimizers.Adam(lr=learn_rate),
               metrics=['accuracy'])
# summary() prints the table itself and returns None, so it is not wrapped in print().
model0.summary()

# Train on augmented mini-batches from the domain_randomization generator
# created in the data-splitting cell.
# NOTE(review): validation_data is the test split, so the val_* history curves and
# the final score below are computed on the same data, while X_val / y_val from the
# split cell are not used anywhere in this cell. Confirm whether (X_val, y_val)
# was intended here.
history = model0.fit_generator(domain_randomization.flow(X_train, y_train, batch_size=batch_size),
                               steps_per_epoch=steps,
                               epochs=epochs,
                               verbose=1,
                               validation_data=(X_test, y_test))

# evaluate() returns [loss, accuracy] for the metrics configured in compile().
score = model0.evaluate(X_test, y_test, verbose=0)
print(history.history.keys())
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
# Run the trained model on the test images; each output row holds the softmax scores.
y_pred = model0.predict(X_test)
# Reduce the one-hot targets and the predicted scores to integer class indices.
y_t1, y_p1 = (np.argmax(scores, axis=1) for scores in (y_test, y_pred))

# Balanced accuracy of the test-set predictions
balanced_accuracy_score(y_t1, y_p1)

In [None]:
# summarize history for accuracy
# Depending on the Keras release, metrics=['accuracy'] is recorded under 'acc' or
# 'accuracy' (with the matching 'val_' prefix), so use whichever key is present
# instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in history.history else 'accuracy'
plt.plot(history.history[acc_key])
plt.plot(history.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch: the training loss followed by the loss on the validation_data
# that was passed to the fit call.
for series in ('loss', 'val_loss'):
    plt.plot(history.history[series])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()