<a href="https://colab.research.google.com/github/adindaayundra/DeteksiKankerKulit-MBKM-Riset/blob/main/DeteksiKankerKulit_Cnn_MbkmRiset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Persiapan Data (Data Preparation)**

Import Libraries

In [None]:
import os
import pathlib
import shutil

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.layers import (Activation, BatchNormalization, Conv2D, Dense, Dropout,
                                     Flatten, GlobalAveragePooling2D, MaxPooling2D)
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:
# Mount Google Drive, where the skin-cancer dataset is stored

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# List the contents of the dataset folder on the mounted Drive
!ls "/content/gdrive/My Drive/project_mbkmriset/dataset"

In [None]:
# Dataset root on the mounted Drive (not referenced again by the visible cells)
path = "/content/gdrive/My Drive/project_mbkmriset/dataset"

In [None]:
# Load the dataset CSV into a DataFrame and preview the first rows.
# The "....." at the end of the path is a placeholder for the CSV file name.
df = pd.read_csv("/content/gdrive/My Drive/project_mbkmriset/dataset/.....")  # incomplete, please fill in
df.head()


* Identifikasi dan Menghapus Duplikat
 
  (Identify and Remove Duplicates)


In [None]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()


*   **Pembersihan Data**

    (Data Cleaning)


In [None]:
# Number of missing values in each column
df.isnull().sum()


*   Menangani Masalah 

    (Handling the Problem) :

 1. Detecting "Na" and "na" Values
 2. Fill na Values with Mean of "age" Column


1. Detecting "Na" and "na" Values

In [None]:
# Re-read the CSV so that these extra tokens are also parsed as missing values.
# This rebuilds df from the file, so duplicate rows are dropped again here;
# without that, the earlier de-duplication of df is silently discarded.
missing_value = ['N/a', 'na', ' ', np.nan]
df = pd.read_csv("/content/gdrive/My Drive/project_mbkmriset/dataset/.....", na_values=missing_value).drop_duplicates()

In [None]:
# Missing values per column after re-reading with the custom NA tokens
df.isnull().sum()

In [None]:
# Per column: True if it contains at least one missing value
df.isnull().any()

In [None]:
# Heatmap of the null mask (rows = records, columns = fields); row tick labels are hidden
sns.heatmap(df.isnull(), yticklabels=False)

2. Fill na Values with Mean of "age" Column

In [None]:
# Impute missing ages with the column mean.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# df['age'] selection is chained assignment, which pandas deprecates and which
# does not update df once copy-on-write is enabled.
df['age'] = df['age'].fillna(df['age'].mean())

In [None]:
# Per-column missing counts after filling 'age'
df.isnull().sum()

Exploratory Data Analysis

In [None]:
# Data type of each column
df.dtypes

In [None]:
# Column names, non-null counts, dtypes and memory usage
df.info()

In [None]:
# Number of records per diagnosis class.
# The column is passed as x= because the first positional parameter of
# countplot is `data`, not the variable to count.
sns.countplot(x=df['dx'])

In [None]:
# Record count per diagnosis code, ordered by code.
# The index of `dx` is reused later to name the per-class folders.
dx = df['dx'].value_counts().sort_index()
print(dx)

In [None]:
# Number of records per body localization.
# The column is passed as x= because the first positional parameter of
# countplot is `data`, not the variable to count.
plt.figure(figsize = (10,8))
sns.countplot(x=df['localization'])
plt.xticks(rotation = 90)  # category names are long; rotate so they do not overlap

In [None]:
# Number of records per 'dx_type' value.
# The column is passed as x= because the first positional parameter of
# countplot is `data`, not the variable to count.
sns.countplot(x=df['dx_type'])

In [None]:
# Record count per 'dx_type' value, ordered by value
dx_type = df['dx_type'].value_counts().sort_index()
print(dx_type)

In [None]:
# Number of records per 'sex' value.
# The column is passed as x= because the first positional parameter of
# countplot is `data`, not the variable to count.
sns.countplot(x=df['sex'])

In [None]:
# Record count per 'sex' value, ordered by value
sex = df['sex'].value_counts().sort_index()
print(sex)

In [None]:
# Number of records per distinct age value.
# The column is passed as x= because the first positional parameter of
# countplot is `data`, not the variable to count.
plt.figure(figsize = (12,6))
sns.countplot(x=df['age'])
plt.xticks(rotation = 90)  # rotate the tick labels so neighbouring ages stay readable

In [None]:
# Record count per distinct age value, ordered by age
age = df['age'].value_counts().sort_index()
print(age)


*   Record the Label Names



In [None]:
# Human-readable class names, listed in class-index order, i.e. alphabetical by
# folder code: akiec, bcc, bkl, df, mel, nv, vasc. 'Melanoma' (mel) therefore
# comes before 'Melanocytic Nevi' (nv); the previous order had these two
# swapped, which would mislabel any output looked up by class index.
labels = ['Actinic Keratoses',      # 0: akiec
          'Basal Cell Carcinoma',   # 1: bcc
          'Benign Keratosis',       # 2: bkl
          'Dermatofibroma',         # 3: df
          'Melanoma',               # 4: mel
          'Melanocytic Nevi',       # 5: nv
          'Vascular Skin Lesions']  # 6: vasc

num_classes = len(labels)
print(num_classes)


*   Create Local Data Directory



In [None]:
# Local dataset root and its train/val/test sub-folders.
data_dir = 'data'
train_dir = os.path.join(data_dir, 'train')
val_dir = os.path.join(data_dir, 'val')
test_dir = os.path.join(data_dir, 'test')

# makedirs(..., exist_ok=True) creates missing parents and tolerates existing
# folders, so the cell can be re-run; os.mkdir raised FileExistsError on a
# second execution.
for split_dir in (train_dir, val_dir, test_dir):
    os.makedirs(split_dir, exist_ok=True)



*   Create Directory for Each Category in Train/Validation/Test Directory



In [None]:
# One sub-folder per diagnosis code inside each of train/val/test.
# exist_ok=True keeps the cell re-runnable (os.mkdir failed on existing folders).
categories = dx.index.values
for category in categories:
    for split_dir in (train_dir, val_dir, test_dir):
        os.makedirs(os.path.join(split_dir, category), exist_ok=True)


*   Split into Train, Validation, and Test Sets

In [None]:
from sklearn.model_selection import train_test_split

# 80 % train, then the remaining 20 % is halved into validation and test.
# Both splits are stratified on the diagnosis column so every class keeps the
# same proportion in all three sets. Without stratifying the second split, the
# rarest classes could end up entirely in validation or entirely in test.
df_train, df_tmp = train_test_split(df, test_size = 0.2, random_state = 101, stratify = df['dx'])
df_val, df_test = train_test_split(df_tmp, test_size = 0.5, random_state = 101, stratify = df_tmp['dx'])
print(df_train.shape)
print(df_val.shape)
print(df_test.shape)

In [None]:
# Re-index each split by image_id so a row can be looked up by image name
# (copy_files reads the label with df.loc[<image_id>, 'dx']).
# NOTE: re-running this cell fails, because 'image_id' is no longer a column afterwards.
df_train = df_train.set_index('image_id')
df_val   = df_val.set_index('image_id')
df_test  = df_test.set_index('image_id')

In [None]:
# Drive folders that hold the two parts of the image set.
# TODO: complete both paths. Each path is written once here and shared by the
# directory listing and by the copy, so the two can no longer drift apart.
part_1_dir = '/content/gdrive/MyDrive/.....'  # part 1, to be completed
part_2_dir = '/content/gdrive/MyDrive/.....'  # part 2, to be completed

folder_1 = os.listdir(part_1_dir)
folder_2 = os.listdir(part_2_dir)

def copy_files(df, data_dir):
  """Copy every image listed in `df` into its class sub-folder of `data_dir`.

  Args:
    df: DataFrame indexed by image id (file name without '.jpg') with a 'dx'
      column holding the class folder name.
    data_dir: destination root; it must already contain one sub-folder per class.

  Images found in neither source folder are skipped silently.
  """
  # Sets give constant-time membership tests; the listings are plain lists.
  sources = ((set(folder_1), part_1_dir), (set(folder_2), part_2_dir))

  for file in df.index.values:
    fname = file + '.jpg'
    label = df.loc[file, 'dx']
    dst = os.path.join(data_dir, label, fname)

    for names, src_dir in sources:
      if fname in names:
        shutil.copyfile(os.path.join(src_dir, fname), dst)

Alternative 1

In [None]:
# Populate the local train/val/test folders with the images of each split
copy_files(df_train, train_dir)
copy_files(df_val, val_dir)
copy_files(df_test, test_dir)

In [None]:
# Pack the contents of /content/data into a zip archive named 'data'.
# NOTE(review): the call below goes through the shutil module, so this direct import is unused.
from shutil import make_archive
shutil.make_archive('data', 'zip', root_dir='/content/data')

In [None]:
# Download the archive from the Colab runtime to the local machine
from google.colab import files
files.download('/content/data.zip')

Alternative 2 (after Alternative 1)
- Once data.zip has been downloaded, the data can be accessed from Google Drive.

In [None]:
# Point the split directories at the copies stored on Google Drive.
# The "....." in each path is a placeholder that must be filled in.
train_dir = pathlib.Path("/content/gdrive/My Drive/.....") # to be completed
val_dir = pathlib.Path("/content/gdrive/My Drive/.....") # to be completed
test_dir = pathlib.Path("/content/gdrive/My Drive/.....") # to be completed

In [None]:
# Records per diagnosis class in the training split
df_train['dx'].value_counts()

In [None]:
# Records per diagnosis class in the validation split
df_val['dx'].value_counts()

In [None]:
# Records per diagnosis class in the test split
# (the method is value_counts; `value_count` raised AttributeError)
df_test['dx'].value_counts()

In [None]:
# Number of files in each class folder of the local training set
for category in ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'):
    print(len(os.listdir('/content/data/train/' + category)))

In [None]:
# Number of files in each class folder of the local validation set.
# (This cell was an exact copy of the training-set count above it.)
for category in ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'):
    print(len(os.listdir('/content/data/val/' + category)))

In [None]:
# Number of files in each class folder of the local test set.
# (This cell was an exact copy of the training-set count two cells above.)
for category in ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'):
    print(len(os.listdir('/content/data/test/' + category)))

In [None]:
# Number of files in each class folder of the training set stored on Drive.
# The second class code is 'bcc'; the previous 'kcc' matched none of the class
# codes used elsewhere in the notebook.
for category in ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'):
    print(len(os.listdir('/content/gdrive/My Drive/project_mbkmriset/dataset/train_1/' + category)))

In [None]:
# Number of files in each class folder of the validation set stored on Drive.
# The second class code is 'bcc'; the previous 'kcc' matched none of the class
# codes used elsewhere in the notebook.
for category in ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'):
    print(len(os.listdir('/content/gdrive/My Drive/project_mbkmriset/dataset/val_1/' + category)))

In [None]:
# Number of files in each class folder of the test set stored on Drive.
# The second class code is 'bcc'; the previous 'kcc' matched none of the class
# codes used elsewhere in the notebook.
for category in ('akiec', 'bcc', 'bkl', 'df', 'mel', 'nv', 'vasc'):
    print(len(os.listdir('/content/gdrive/My Drive/project_mbkmriset/dataset/test_1/' + category)))


*   Data Augmentation and Flow Data



In [None]:
# Training-time preprocessing: scale pixels to [0, 1] and apply random flips,
# rotations, shifts, shear and zoom.
augmentation_options = dict(
    rescale = 1./255,
    horizontal_flip = True,
    vertical_flip = True,
    rotation_range = 20,
    width_shift_range = 0.2,
    height_shift_range = 0.2,
    shear_range = 0.2,
    zoom_range = 0.2,
)
train_datagen = ImageDataGenerator(**augmentation_options)

# Shuffled batches of 20 RGB images at 224x224 with one-hot labels,
# read from the class sub-folders of the local training directory.
train_generator = train_datagen.flow_from_directory(
    '/content/data/train',
    target_size = (224, 224),
    batch_size = 20,
    color_mode = 'rgb',
    shuffle = True,
    seed = 42,
    class_mode = 'categorical',
)

In [None]:
# Validation images are only rescaled. Random augmentation here would make the
# validation metrics change between epochs for reasons unrelated to the model.
validation_datagen = ImageDataGenerator(rescale = 1./255)

# Built from validation_datagen itself: the previous call went through
# train_datagen, which left validation_datagen unused.
# shuffle=False keeps the batches in a fixed order.
validation_generator = validation_datagen.flow_from_directory('/content/data/val',
                                                              target_size = (224, 224),
                                                              batch_size = 20,
                                                              color_mode = 'rgb',
                                                              shuffle = False,
                                                              seed = 42,
                                                              class_mode = 'categorical')

In [None]:
# Test images are only rescaled, never augmented.
test_datagen = ImageDataGenerator(rescale = 1./255)

# Built from test_datagen: the previous call went through train_datagen, so
# every test image was randomly flipped, rotated, shifted, sheared and zoomed
# before evaluation. shuffle=False keeps predictions aligned with file order.
test_generator = test_datagen.flow_from_directory('/content/data/test',
                                                  target_size = (224, 224),
                                                  batch_size = 20,
                                                  color_mode = 'rgb',
                                                  shuffle = False,
                                                  seed = 42,
                                                  class_mode = 'categorical')


*   Visualize the Data



In [None]:
# Pull a single batch from the training generator and report its array shapes
image_batch, labels_batch = next(iter(train_generator))
print(image_batch.shape)
print(labels_batch.shape)

# Build Model #

Manually Define CNN

In [None]:
# Model input: 224x224 images with 3 channels, matching target_size and
# color_mode='rgb' of the data generators
input_shape = (224, 224, 3)
# Size of the softmax output layer (one unit per class)
num_classes = 7

In [None]:
# CNN defined from scratch: three convolutional stages with increasing filter
# counts (32 -> 64 -> 128), each followed by max-pooling and dropout, then a
# dense classifier head.
model = Sequential([
    # Stage 1. Only the first layer declares the input shape.
    Conv2D(32, 3, padding = 'same', activation = 'relu', input_shape = input_shape),
    Conv2D(32, 3, padding = 'same', activation = 'relu'),
    MaxPooling2D(),
    Dropout(0.25),

    # Stage 2. The stray input_shape argument was removed: it only has meaning
    # on the first layer of a Sequential model.
    Conv2D(64, 3, padding = 'same', activation = 'relu'),
    Conv2D(64, 3, padding = 'same', activation = 'relu'),
    MaxPooling2D(),
    Dropout(0.4),

    # Stage 3
    Conv2D(128, 3, padding = 'same', activation = 'relu'),
    MaxPooling2D(),
    Dropout(0.5),

    # Classifier head
    Flatten(),
    Dense(128, activation = 'relu'),
    Dropout(0.55),

    # One softmax unit per class; num_classes replaces the hard-coded 7
    Dense(num_classes, activation = 'softmax'),
])

print('Compiling Model..............')
model.compile(optimizer='adam',
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

In [None]:
# Layer-by-layer overview of the manually defined CNN (output shapes and parameter counts)
model.summary()


*   **Build Base Model**



In [None]:
# MobileNetV2 with ImageNet weights and without its classification top, used as
# a feature extractor. The `classes` argument was dropped: it only configures
# the top classifier, which include_top=False leaves out, so passing it here
# had no effect and was misleading.
base_model = MobileNetV2(input_shape = input_shape,
                         weights = 'imagenet',
                         include_top = False)

In [None]:
# Freeze every layer of the pre-trained base so its weights are not updated during training
for layer in base_model.layers:
    layer.trainable = False



*   Add Top Layers Network



In [None]:
# Classification head stacked on the MobileNetV2 feature maps.
# The intermediate tensor is called `x` so that `model` only ever names the
# finished Model object.
x = base_model.output
x = Dropout(0.5)(x)
x = GlobalAveragePooling2D()(x)
x = Dense(256, activation="relu")(x)
x = Dense(128, activation="relu")(x)
x = BatchNormalization()(x)

# Spelling fixed: the layer output was stored as `predicitions` but read back
# as `predictions`, which raised NameError.
predictions = Dense(num_classes, activation='softmax')(x)

model = Model(inputs = base_model.input, outputs = predictions)

# Keep the pre-trained base frozen; only the new head is trained
base_model.trainable = False

In [None]:
# Layer-by-layer overview of the MobileNetV2-based model
model.summary()



*   Compile Model



In [None]:
print('Compiling Model..........')

# Compile the model: Adam optimizer, categorical cross-entropy loss (the
# generators emit one-hot labels via class_mode='categorical'), accuracy metric
model.compile(optimizer = 'adam',
              loss = "categorical_crossentropy",
              metrics = ["accuracy"])


*   Model Visualization

In [None]:
# Draw the model graph to model_visualization.png, with tensor shapes and layer
# names shown and nested models expanded; the bare last line displays the result.
model_visual = tf.keras.utils.plot_model(model,
                                         to_file = 'model_visualization.png',
                                         show_shapes = True,
                                         show_layer_names = True,
                                         rankdir = 'TB',
                                         expand_nested = True,
                                         dpi = 55)
model_visual


*   Training the Model

In [None]:
import math

def create_class_weight(labels_dict, mu=0.15):
    """Compute log-smoothed class weights from per-class sample counts.

    Each weight is ln(mu * total / count), floored at 1.0, so rare classes get
    a weight above 1 and frequent classes keep a weight of exactly 1.

    Args:
        labels_dict: mapping {class index: number of samples in that class}.
        mu: smoothing parameter to tune; larger values raise every weight.

    Returns:
        dict mapping each key of `labels_dict` to its float weight.

    Raises:
        ZeroDivisionError: if any class has a count of 0.
    """
    total = sum(labels_dict.values())
    return {key: max(math.log(mu * total / float(count)), 1.0)
            for key, count in labels_dict.items()}

# Example counts per class index (0 = akiec ... 6 = vasc).
# This example used to sit after the `return` inside the function, where it
# could never run, and its first key was missing (`{: 327, ...}`), a syntax error.
labels_dict = {0: 327, 1: 514, 2: 1099, 3: 115, 4: 1113, 5: 6705, 6: 142}

create_class_weight(labels_dict)

In [None]:
# Class index -> folder name (the order flow_from_directory assigns)
#0 : akiec
#1 : bcc
#2 : bkl
#3 : df
#4 : mel
#5 : nv
#6 : vasc

# Compute the weights from the actual number of training images per class
# instead of typing them in by hand (the `........` placeholders were a syntax
# error). train_generator.classes holds one class index per training image.
train_counts = np.bincount(train_generator.classes, minlength = num_classes)
class_weight = create_class_weight({idx: int(count) for idx, count in enumerate(train_counts)})
class_weight

In [None]:
# Train with the per-class weights so that rare classes contribute more to the loss.
# - `epochs = 100` replaces `epoch 100`, which was a syntax error.
# - Model.fit accepts generators directly; fit_generator is deprecated and
#   absent from recent Keras releases.
history = model.fit(train_generator,
                    class_weight = class_weight,
                    steps_per_epoch = len(train_generator),
                    epochs = 100,
                    validation_data = validation_generator,
                    validation_steps = len(validation_generator),
                    verbose = 1)

Plot the Model Training

In [None]:
#------------------------------------------------
# Evaluating acc and loss for model
#------------------------------------------------
# Plot training and validation accuracy per epoch
#------------------------------------------------

plt.plot(history.history['accuracy'], label = 'Training Accuracy' )
plt.plot(history.history['val_accuracy'], label = 'Validation Accuracy' )
plt.title('Training and validation accuracy')  # title typo fixed ('Tranining')
plt.xlabel('Number of Epoch')
plt.ylabel('Value')
plt.legend(loc='lower right')
plt.show()  # call added: a bare `plt.show` only references the function

In [None]:
#------------------------------------------------
# Plot training and validation loss per epoch
#------------------------------------------------

