# MOLE DETECTION

### Data Analyses

#### Import Necessary Libraries

In [42]:
import os
import numpy as np
import pandas as pd

from PIL import Image
from glob import glob
import itertools
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Flatten, BatchNormalization, Dropout, Dense, MaxPool2D
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

#### Define default variables

In [43]:
DATA_DIR = os.path.join('.', 'data')
IMAGE_DIR = os.path.join('.', 'data\\HAM10000_images\\')


lesion_type_dict = {
    'nv': 'Melanocytic nevi (nv)',
    'mel': 'Melanoma (mel)',
    'bkl': 'Benign keratosis-like lesions (bkl)',
    'bcc': 'Basal cell carcinoma (bcc)',
    'akiec': 'Actinic keratoses (akiec)',
    'vasc': 'Vascular lesions (vasc)',
    'df': 'Dermatofibroma (df)'
}
label_mapping = {
    0: 'nv',
    1: 'mel',
    2: 'bkl',
    3: 'bcc',
    4: 'akiec',
    5: 'vasc',
    6: 'df'
}
labels = {
    0: 'Melanocytic nevi (nv)',
    1: 'Melanoma (mel)',
    2: 'Benign keratosis-like lesions (bkl)',
    3: 'Basal cell carcinoma (bcc)',
    4: 'Actinic keratoses (akiec)',
    5: 'Vascular lesions (vasc)',
    6: 'Dermatofibroma (df)'
}
int_label_mapping = dict((value, key) for key, value in label_mapping.items())

#### Import the data 

In [44]:
data = pd.read_csv(os.path.join(DATA_DIR,'HAM10000_metadata.csv'))
data.head(5)


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [45]:
data.dtypes

lesion_id        object
image_id         object
dx               object
dx_type          object
age             float64
sex              object
localization     object
dtype: object

In [46]:
print(data.describe(exclude=[np.number]))
print("--------------------------------------------------------------------------------")
print(data.isnull().value_counts())

          lesion_id      image_id     dx dx_type    sex localization
count         10015         10015  10015   10015  10015        10015
unique         7470         10015      7       4      3           15
top     HAM_0003789  ISIC_0027419     nv   histo   male         back
freq              6             1   6705    5340   5406         2192
--------------------------------------------------------------------------------
lesion_id  image_id  dx     dx_type  age    sex    localization
False      False     False  False    False  False  False           9958
                                     True   False  False             57
dtype: int64


In [47]:
# We wont use age but in any case handle  null values 
data['age'].fillna(value=int(data['age'].mean()), inplace=True)
data['age'] = data['age'].astype('int32')

In [48]:
# Adding mole_type and path of images columns
data['mole_type'] = data['dx'].map(lesion_type_dict.get)
data['path'] = IMAGE_DIR + data['image_id'] + ".JPG"

In [49]:
# Adding image pixel data to dataframe
data['image_pixel'] = data['path'].map(lambda x: np.asarray(Image.open(x).resize((56,56))))

In [None]:
# Converting labels to numbers
data['label'] = data['dx'].map(int_label_mapping.get)

data = data.sort_values('label')
data = data.reset_index()

In [None]:
data.head(5)

In [None]:
fig, ax1 = plt.subplots(1, 1, figsize= (10, 5))
data['mole_type'].value_counts().plot(kind='bar', ax=ax1)

### Increase sample size of some groups

In [None]:
# make the same size samples 
data['label'].value_counts()

### Color channel analysis

How do the normalized color samples vary across different types of skin diseases?



In [None]:
rgb_info_df = data.apply(lambda x: pd.Series({'{}_mean'.format(k): v for k, v in 
                                  zip(['Red', 'Green', 'Blue'], 
                                      np.mean(x['image_pixel'], (0, 1)))}),1)
gray_col_vec = rgb_info_df.apply(lambda x: np.mean(x), 1)
for c_col in rgb_info_df.columns:
    rgb_info_df[c_col] = rgb_info_df[c_col]/gray_col_vec
rgb_info_df['Gray_mean'] = gray_col_vec
rgb_info_df.head(10)

In [None]:
for c_col in rgb_info_df.columns:
    data[c_col] = rgb_info_df[c_col].values

In [None]:
import seaborn as sns
sns.pairplot(data[['Red_mean', 'Green_mean', 'Blue_mean', 'Gray_mean', 'mole_type']], 
             hue='mole_type', plot_kws = {'alpha': 0.5})


Some types has a little samples, Create extra images for image generator

In [None]:
# 0	6705    1            20K			
# 1	1113	8	8904	11130
# 2	1099	8	8792	10990
# 3	514	    22	11308	12336
# 4	327	    34	11118	11772
# 5	142	    90	12780	13064
# 6	115	    104	11960	12190


_label = 0
groups = [data]
for i in [1,8,8,22,34,90,104]:
    index = data[data['label'] == _label].index.values
    df_l = data.iloc[int(min(index)):int(max(index)+1)]
    df_l = df_l.append([df_l]*i, ignore_index = True)
    groups.append(df_l)
    _label+=1

In [None]:
final_data = pd.concat(groups)

In [None]:
final_data['label'].value_counts()

In [None]:
final_data.tail(5)

Save both original and augemented data to csv for future use

In [None]:
data.to_csv("dataall.csv")
final_data.to_csv("final_aug.csv")

In [None]:
# ORIGINAL
# Convert image pixel columnm into numpy array
X_orig = data['image_pixel'].to_numpy()
X_orig = np.stack(X_orig, axis=0)
Y_orig = np.array(data.iloc[:, -1:])
print(X_orig.shape)
print(Y_orig.shape)

In [None]:
# AUGMENTED DATA
# Convert image pixel columnm into numpy array
X_aug = final_data['image_pixel'].to_numpy()
X_aug = np.stack(X_aug, axis=0)
Y_aug = np.array(final_data.iloc[:, -1:])
print(X_aug.shape)
print(Y_aug.shape)

In [None]:
def train_test(X, Y):
    # train test split
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42,shuffle=True)
    
    # train and test (found the numbers for parameters from some other repos)
    train_datagenerate = ImageDataGenerator(rescale = 1./255,
                                  rotation_range = 10,
                                  width_shift_range = 0.2,
                                  height_shift_range = 0.2,
                                  shear_range = 0.2,
                                  horizontal_flip = True,
                                  vertical_flip = True,
                                  fill_mode = 'nearest')
    
    train_datagenerate.fit(X_train)
    test_datagenerate = ImageDataGenerator(rescale = 1./255)
    test_datagenerate.fit(X_test)
    
    return X_train, X_test, Y_train, Y_test

In [39]:
def create_model():
    model = Sequential()
    
    model.add(Conv2D(16, kernel_size = (3,3), input_shape = (56, 56, 3), activation = 'relu', padding = 'same'))
    model.add(MaxPool2D(pool_size = (2,2)))

    model.add(Conv2D(32, kernel_size = (3,3), activation = 'relu', padding = 'same'))
    model.add(MaxPool2D(pool_size = (2,2), padding = 'same'))

    model.add(Conv2D(64, kernel_size = (3,3), activation = 'relu', padding = 'same'))
    model.add(MaxPool2D(pool_size = (2,2), padding = 'same'))
    model.add(Conv2D(128, kernel_size = (3,3), activation = 'relu', padding = 'same'))
    model.add(MaxPool2D(pool_size = (2,2), padding = 'same'))

    model.add(Flatten())
    model.add(Dense(128, activation = 'relu'))
    model.add(Dense(64, activation = 'relu'))
    model.add(Dense(32, activation='relu'))
    model.add(Dense(7, activation='softmax'))

    optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001)

    model.compile(loss = 'sparse_categorical_crossentropy',
                 optimizer = optimizer,
                  metrics = ['accuracy'])
    print(model.summary())
    return model

In [None]:
def train_model(model, X_tr, Y_tr, EPOCHS=25):
    early_stop = EarlyStopping(monitor='val_loss', patience=8, verbose=1,mode='auto')
                               
    
    learning_rate = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, verbose=1, mode='auto')
    
    history = model.fit(X_tr, Y_tr, validation_split=0.2, batch_size = 64, epochs = EPOCHS,callbacks = [learning_rate, early_stop])
    return history

In [None]:
def test_model(model, X_test, Y_test):
    model_acc = model.evaluate(X_test, Y_test, verbose=0)[1]
    print("Test Accuracy: {:.3f}%".format(model_acc * 100))
    y_true = np.array(Y_test)
    y_pred = model.predict(X_test)
    y_pred = np.array(list(map(lambda x: np.argmax(x), y_pred)))
    clr = classification_report(y_true, y_pred, target_names=label_mapping.values())
    print(clr)
    cm=confusion_matrix(y_true, y_pred)
    plot_confusion_matrix(cm,label_mapping,"44")
    print(cm)

    plt.show() 

In [None]:
def plot_confusion_matrix(cm, classes, name,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Greens):
    
    plt.figure(figsize=(8,6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(name)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

   
    plt.tight_layout()
    plt.ylabel('True Labels')
    plt.xlabel('Predicted Labels')
    fig = plt
    fig.savefig('_confusion_matrix.png', dpi=300)

### Create train and test datas for original and augumented datasets

In [None]:
# For Original Dataset
X_train_orig, X_test_orig, Y_train_orig, Y_test_orig = train_test(X_orig, Y_orig)
#Augemented dataset
X_train_aug, X_test_aug, Y_train_aug, Y_test_aug = train_test(X_aug, Y_aug)
print(X_train_orig.shape)
print(X_train_aug.shape)


In [None]:
X_train_orig

### Create the Model

In [40]:
model = create_model()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_21 (Conv2D)          (None, 56, 56, 16)        448       
                                                                 
 conv2d_22 (Conv2D)          (None, 56, 56, 32)        4640      
                                                                 
 max_pooling2d_20 (MaxPoolin  (None, 28, 28, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_23 (Conv2D)          (None, 28, 28, 32)        9248      
                                                                 
 max_pooling2d_21 (MaxPoolin  (None, 14, 14, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_24 (Conv2D)          (None, 14, 14, 64)       

In [41]:
model_history = train_model(model, X_train_aug, Y_train_aug, 50)

Epoch 1/50
132/916 [===>..........................] - ETA: 15s - loss: nan - accuracy: 0.0000e+00

KeyboardInterrupt: 

##### Loss and accuracy curves for training and test sets.

In [None]:
g, ax = plt.subplots(2,1)
ax[0].plot(model_history.history['loss'], color='g', label="Training loss")
ax[0].plot(model_history.history['val_loss'], color='r', label="Validation loss",axes =ax[0])
legend = ax[0].legend(loc='best', shadow=True)

ax[1].plot(model_history.history['accuracy'], color='g', label="Training accuracy")
ax[1].plot(model_history.history['val_accuracy'], color='r',label="Validation accuracy")
legend = ax[1].legend(loc='best', shadow=True)

### Saving the model for deployment 

In [None]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
model.save_weights("model.h5")

### Test Original and Augumented datasets

In [None]:
test_model(model, X_test_orig, Y_test_orig)

In [None]:
test_model(model, X_test_aug, Y_test_aug)

### Prediction for only one image

In [None]:
def predict_one_image(image_nr):

    predict_only_one = data.iloc[image_nr]
    print("Imid :" ,predict_only_one.image_id)
    print("Real :" ,predict_only_one.mole_type)
    
    new_one = predict_only_one.image_pixel.reshape((1,28,28,3))
    preds=model.predict(new_one)
    pred_probabilty = " % {:.2f}".format(np.amax(preds)*100) 
    max_index_col = np.argmax(preds)
    result=labels.get(max_index_col) + pred_probabilty
    print("Pred :", result)

    sample_data = predict_only_one['image_pixel']
    plt.figure(figsize=(10, 10))
    plt.imshow(sample_data)
    plt.axis("off")
    plt.show() 


In [None]:
predict_one_image(9372)