<a href="https://colab.research.google.com/github/alehb80/skin-lesion-classification/blob/master/skin_lesion_master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SKIN LESION CLASSIFICATION

## 1. Introduction

### Set up the Kaggle API

In [0]:
!pip install -q kaggle

In [0]:
# Configure the Kaggle API credentials without storing secrets in the notebook.
# The username and key are taken from the KAGGLE_USERNAME / KAGGLE_KEY
# environment variables when they are set, otherwise they are prompted for.
import json
import os
from getpass import getpass

kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)  # safe to re-run, unlike a bare `mkdir`
kaggle_json = os.path.join(kaggle_dir, 'kaggle.json')

api_token = {
    "username": os.environ.get("KAGGLE_USERNAME") or input("Kaggle username: "),
    "key": os.environ.get("KAGGLE_KEY") or getpass("Kaggle API key: "),
}

with open(kaggle_json, 'w') as file:
    json.dump(api_token, file)

# Restrict the credentials file to owner read/write (same as `chmod 600`).
os.chmod(kaggle_json, 0o600)

In [0]:
!pip install --upgrade --force-reinstall --no-deps kaggle

### Search the dataset on Kaggle

In [0]:
!kaggle datasets list -s skin

### Download the dataset

In [0]:
# Copy the dataset locally
!kaggle datasets download --force -d kmader/skin-cancer-mnist-ham10000

In [0]:
!unzip skin-cancer-mnist-ham10000.zip

### Import all libraries

In [0]:
import tensorflow as tf
print(tf.__version__)

In [0]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
%matplotlib inline
import matplotlib.pyplot as plt
from glob import glob
import seaborn as sns
from PIL import Image
from sklearn.model_selection import train_test_split
import keras
from keras.utils.np_utils import to_categorical
# from keras.models import Sequential
# from keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPool2D
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D, Flatten, Dropout, MaxPooling2D

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import ImageDataGenerator
from keras.optimizers import Adam
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ReduceLROnPlateau

## 2. Data Analysis And Preprocessing

### Build Dataframe

In [0]:
# Index every JPEG located one directory level below the base folder by its
# file name without extension, then define readable names for the lesion codes.
base_skin_dir = "./"
imageid_path_dict = {}
for jpg_path in glob(os.path.join(base_skin_dir, '*', '*.jpg')):
    image_id, _ext = os.path.splitext(os.path.basename(jpg_path))
    imageid_path_dict[image_id] = jpg_path

lesion_type_dict = dict(
    nv='Melanocytic nevi',
    mel='Melanoma',
    bkl='Benign keratosis-like lesions ',
    bcc='Basal cell carcinoma',
    akiec='Actinic keratoses',
    vasc='Vascular lesions',
    df='Dermatofibroma',
)

In [0]:
import os

# Read the metadata table and append three derived columns:
#   path          - file path looked up from the image id (None when not found)
#   cell_type     - readable lesion name looked up from the 'dx' code
#   cell_type_idx - integer category code of 'cell_type'
metadata_csv = os.path.join(base_skin_dir, 'HAM10000_metadata.csv')
tile_df = pd.read_csv(metadata_csv).assign(
    path=lambda df: df['image_id'].map(imageid_path_dict.get),
    cell_type=lambda df: df['dx'].map(lesion_type_dict.get),
    cell_type_idx=lambda df: pd.Categorical(df['cell_type']).codes,
)
tile_df.sample(6)

In [0]:
# Histogram of the 'age' column; the title is set on the axes returned by hist().
age_ax = tile_df["age"].hist()
age_ax.set_title("Distribution of Age", fontdict={"fontsize" : 20});

In [0]:
# Histogram of the 'sex' column; the title is set on the axes returned by hist().
sex_ax = tile_df["sex"].hist()
sex_ax.set_title("Male or Female", fontdict={"fontsize" : 20});

In [0]:
# Histogram of the 'localization' column; the title is set on the returned axes.
localization_ax = tile_df["localization"].hist()
localization_ax.set_title("Where are skin lesions?", fontdict={"fontsize" : 20});


In [0]:
# Histogram of the 'cell_type' column (number of rows per lesion type).
tile_df["cell_type"].hist()
plt.title("Distribution of Cell Types", fontdict={"fontsize" : 20});

### Cleaning Data

In [0]:
# Count the missing values in each column
tile_df.isna().sum()

In [0]:
# calculate the mean value of the "age" feature
print(tile_df['age'].mean())

In [0]:
# Fill the missing ages with the column mean. The result is assigned back to the
# column instead of calling fillna(inplace=True) on the selected Series: that
# form acts on an intermediate object and is not guaranteed to update tile_df
# under pandas copy-on-write.
tile_df['age'] = tile_df['age'].fillna(tile_df['age'].mean())

In [0]:
# Count the missing values in each column again
tile_df.isna().sum()

### View Image Samples

### Loading and resizing images

In [0]:
# Show a few sample images for every lesion type, one row of axes per type.
# Images are opened straight from the 'path' column, so this cell does not rely
# on an 'image' column having been added to tile_df beforehand.
n_samples = 3
n_types = tile_df['cell_type'].nunique()
fig, m_axs = plt.subplots(n_types, n_samples, figsize = (4*n_samples, 3*n_types), squeeze=False)
for n_axs, (type_name, type_rows) in zip(m_axs,
                                         tile_df.sort_values(['cell_type']).groupby('cell_type')):
    n_axs[0].set_title(type_name)
    # Fixed random_state so the same samples are drawn on every run
    for c_ax, (_, c_row) in zip(n_axs, type_rows.sample(n_samples, random_state=1234).iterrows()):
        with Image.open(c_row['path']) as sample_img:
            c_ax.imshow(sample_img)
        c_ax.axis('off')
fig.savefig('category_samples.png', dpi=300)

In [0]:
# Open every image, resize it to 100x75 (width x height) and keep the pixel
# array in a new 'image' column.
def _load_resized(image_path):
    """Open the image at image_path and return it resized to 100x75 as an array."""
    return np.asarray(Image.open(image_path).resize((100,75)))

tile_df['image'] = tile_df['path'].map(_load_resized)

In [0]:
# Check the image size distribution: count how many images have each array shape
tile_df['image'].map(lambda x: x.shape).value_counts()

## 3. Build The Dataset

### Split data into train and test with 80:20 ratio

In [0]:
# The label is the integer class index; every other column stays in the features.
target = tile_df['cell_type_idx']
features = tile_df.drop(columns=['cell_type_idx'])

# Hold out 20% of the rows as the test set (fixed seed for repeatable splits)
x_train_o, x_test_o, y_train_o, y_test_o = train_test_split(
    features,
    target,
    test_size=0.20,
    random_state=1234,
)

### Normalization

In [0]:
# Standardize the pixel values: subtract the mean and divide by the standard
# deviation. Both statistics come from the TRAINING images only and are reused
# for the test images, so both sets go through the same transformation and no
# information from the test set is used for preprocessing.
x_train = np.asarray(x_train_o['image'].tolist())
x_test = np.asarray(x_test_o['image'].tolist())

x_train_mean = np.mean(x_train)
x_train_std = np.std(x_train)

# Raw test-set statistics, kept for inspection only (not used for scaling)
x_test_mean = np.mean(x_test)
x_test_std = np.std(x_test)

x_train = (x_train - x_train_mean)/x_train_std
x_test = (x_test - x_train_mean)/x_train_std

### Label Encoding

In [0]:
# Labels are 7 different classes of cell types, indexed from 0 to 6.
# Encode these labels as one-hot vectors.
y_train = to_categorical(y_train_o, num_classes = 7)
y_test = to_categorical(y_test_o, num_classes = 7)

### Training and Validation Split

In [0]:
# validate:train >> 20:80 % (test_size = 0.2 holds out 20% of the training data)
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size = 0.2, random_state = 2)

In [0]:
# Give each image array the explicit (samples, 75, 100, 3) layout.
image_shape = (75, 100, 3)
x_train = x_train.reshape((x_train.shape[0],) + image_shape)
x_test = x_test.reshape((x_test.shape[0],) + image_shape)
x_validate = x_validate.reshape((x_validate.shape[0],) + image_shape)

## 4. Model Building

### Define the deep convolutional network architecture

In [0]:
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import Adam

# Small CNN: three Conv2D + MaxPooling2D stages followed by a dense classifier.

# Our input feature map is 75x100x3: 75x100 for the image pixels, and 3 for
# the three color channels: R, G, and B
img_input = layers.Input(shape=(75, 100, 3))

# First convolution extracts 16 filters that are 3x3
# Convolution is followed by max-pooling layer with a 2x2 window
x = layers.Conv2D(16, 3, activation='relu', padding='same')(img_input)
x = layers.MaxPooling2D(2)(x)

# Second convolution extracts 32 filters that are 3x3
# Convolution is followed by max-pooling layer with a 2x2 window
x = layers.Conv2D(32, 3, activation='relu', padding='same')(x)
x = layers.MaxPooling2D(2)(x)

# Third convolution extracts 64 filters that are 3x3
# Convolution is followed by max-pooling layer with a 2x2 window
x = layers.Conv2D(64, 3, activation='relu', padding='same')(x)
x = layers.MaxPooling2D(2)(x)

# Flatten feature map to a 1-dim tensor
x = layers.Flatten()(x)

# Create a fully connected layer with ReLU activation and 512 hidden units
x = layers.Dense(512, activation='relu')(x)

# Add a dropout rate of 0.5
x = layers.Dropout(0.5)(x)

# Create output layer with 7 nodes (one per lesion class) and softmax activation
output = layers.Dense(7, activation='softmax')(x)

# Configure and compile the model
model = Model(img_input, output)

# `learning_rate` replaces the deprecated `lr` alias. The former `epsilon=None`
# and `decay=0.0` arguments only restated the optimizer defaults and are dropped.
optimizer = Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, amsgrad=False)

# The targets are one-hot encoded, hence categorical cross-entropy
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

In [0]:
#model.compile(optimizer='adam',
             # loss='binary_crossentropy',
             #metrics=['accuracy'])

In [0]:

#model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[accuracy])

In [0]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau
def get_call_back():
  """Return the list of Keras callbacks to pass to ``model.fit``.

  The list holds a single ReduceLROnPlateau callback that monitors
  'val_accuracy' and multiplies the learning rate by 0.5 after 3 epochs
  without improvement, with a lower bound of 1e-5.

  Returns:
      list: ``[reduce_lr]``.
  """
  reduce_lr = ReduceLROnPlateau(monitor='val_accuracy',
                                patience=3,
                                verbose=1,
                                factor=0.5,
                                min_lr=0.00001)
  return [reduce_lr]

#reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                 #factor=0.2,
                                #patience=5,
                                 #  min_lr=0.001)


In [0]:
model.summary()

### Data augmentation to reduce overfitting

In [0]:
# Reduce overfitting with data augmentation: small random rotations, zooms and
# horizontal/vertical shifts. Normalization options and flips stay disabled.
datagen = ImageDataGenerator(
        featurewise_center=False,  # do not set input mean to 0 over the dataset
        samplewise_center=False,  # do not set each sample mean to 0
        featurewise_std_normalization=False,  # do not divide inputs by std of the dataset
        samplewise_std_normalization=False,  # do not divide each input by its std
        zca_whitening=False,  # do not apply ZCA whitening
        rotation_range=10,  # randomly rotate images by up to 10 degrees
        zoom_range = 0.1, # randomly zoom images by up to 10%
        width_shift_range=0.1,  # randomly shift images horizontally (fraction of total width)
        height_shift_range=0.1,  # randomly shift images vertically (fraction of total height)
        horizontal_flip=False,  # no random horizontal flips
        vertical_flip=False  # no random vertical flips
        )
# datagen.fit(x_train) is intentionally not called: fit() only computes the
# statistics needed by featurewise_center, featurewise_std_normalization and
# zca_whitening, all of which are disabled above.

## 5. Train Model

### Training

In [0]:
# Train on augmented batches produced by the generator; validation uses the
# held-out validation arrays.
epochs = 30
batch_size = 160

train_batches = datagen.flow(x_train, y_train, batch_size=batch_size)
train_steps = x_train.shape[0] // batch_size
val_steps = x_validate.shape[0] // batch_size

history = model.fit(
    train_batches,
    steps_per_epoch=train_steps,
    epochs=epochs,
    validation_data=(x_validate, y_validate),
    validation_steps=val_steps,
    callbacks=get_call_back(),
)

## 6. Evaluate Model

### Accuracy

In [0]:
# Evaluate on the test set first, then on the validation set, report both
# results and save the trained model to disk.
loss, accuracy = model.evaluate(x_test, y_test, verbose=1)
loss_v, accuracy_v = model.evaluate(x_validate, y_validate, verbose=1)

report_lines = (
    ("Validation: accuracy = %f  ;  loss_v = %f", accuracy_v, loss_v),
    ("Test: accuracy = %f  ;  loss = %f", accuracy, loss),
)
for template, acc_value, loss_value in report_lines:
    print(template % (acc_value, loss_value))

model.save("model.h5")

### Visualize Learning

In [0]:
def plot_model_history(model_history):
    """Plot training and validation accuracy and loss per epoch, side by side.

    Args:
        model_history: object whose ``history`` attribute is a dict with the
            keys 'accuracy', 'val_accuracy', 'loss' and 'val_loss', each
            mapping to one value per epoch (e.g. the result of ``model.fit``).
    """
    fig, axs = plt.subplots(1,2,figsize=(15,5))
    # (history key, panel title, y-axis label) for the left and right panels
    panels = (
        ('accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'Model Loss', 'Loss'),
    )
    for ax, (metric, title, ylabel) in zip(axs, panels):
        train_values = model_history.history[metric]
        val_values = model_history.history['val_' + metric]
        ax.plot(range(1, len(train_values) + 1), train_values)
        ax.plot(range(1, len(val_values) + 1), val_values)
        ax.set_title(title)
        ax.set_ylabel(ylabel)
        ax.set_xlabel('Epoch')
        # Roughly ten integer ticks. The step belongs to np.arange; the second
        # positional argument of set_xticks is not a step.
        n_epochs = len(train_values)
        tick_step = max(1, n_epochs // 10)
        ax.set_xticks(np.arange(1, n_epochs + 1, tick_step))
        ax.legend(['train', 'val'], loc='best')
    plt.show()

In [0]:
plot_model_history(history)