In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input mount, one full path per line.
for root, _, files in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in files):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pathlib
import tensorflow as tf
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import PIL
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
import matplotlib.pyplot as plt


In [None]:
# Install gdown non-interactively (-y); the following cells import it and call its CLI.
!conda install -y gdown

In [None]:
import gdown

# Drive file ID of the dataset archive (the same ID the gdown CLI cell uses).
file_id = '1KY1eqXDMu4zPgT8B6Vx7MkJCYWsTNRRk'

# A ".../file/d/<id>" link is the Drive viewer page, not the file itself, so
# build the direct-download "uc?id=" form that gdown.download expects.
url = f'https://drive.google.com/uc?id={file_id}'

# Save under the filename that the extraction cell opens.
output = 'CNN_assignment.zip'

gdown.download(url, output, quiet=False)

In [None]:
# Fetch the same Drive file by its ID using the gdown command-line tool.
# NOTE(review): newer gdown releases are reported to deprecate the `--id` flag — confirm against the installed version.
!gdown --id 1KY1eqXDMu4zPgT8B6Vx7MkJCYWsTNRRk

In [None]:
# Show what is in the current working directory (to confirm the download landed).
import os
os.listdir('.')

In [None]:
# Extract the downloaded dataset archive into the current working directory.
import zipfile

# The context manager closes the archive handle even if extraction fails,
# instead of leaving the file open for the rest of the kernel session.
with zipfile.ZipFile('CNN_assignment.zip') as archive:
    archive.extractall()

In [None]:
# Roots of the extracted training and test image folders, as Path objects.
train, test = map(
    pathlib.Path,
    (
        "Skin cancer ISIC The International Skin Imaging Collaboration/Train/",
        "Skin cancer ISIC The International Skin Imaging Collaboration/Test/",
    ),
)

In [None]:
# Count the .jpg files sitting exactly one folder below each split root.
image_count_train = sum(1 for _ in train.glob('*/*.jpg'))
image_count_test = sum(1 for _ in test.glob('*/*.jpg'))
print('Images in training dataset: ', image_count_train)
print('Images in testing dataset: ', image_count_test)

In [None]:
# Loader settings: images per batch, and the square size images are loaded at.
batch_size = 32
img_height = img_width = 180

### Creating train and validation dataset

In [None]:
# Training subset of the Train folder: the larger share of a seeded 80/20 split.
# The same seed is used for the validation subset so the two do not overlap.
train_ds = tf.keras.utils.image_dataset_from_directory(
    train,
    validation_split=0.2,
    subset='training',
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

In [None]:
# Validation subset of the Train folder: the held-out 20% of the seeded split.
val_ds = tf.keras.utils.image_dataset_from_directory(
    train,
    validation_split=0.2,
    subset='validation',
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

In [None]:
# Keep the class-name list from the loader; later cells index it with the
# integer labels of each batch to recover the class name.
class_names = train_ds.class_names
print(class_names)

### Visualizing one instance of all the nine classes present in the dataset

In [None]:
# Collect one sample image per class for the preview grid.
# The accumulators are created once, outside the batch loop, so classes found in
# earlier batches are kept. (Creating them inside the loop discarded everything
# except the classes that happened to appear in the last batch.)
unique_li = []       # class names collected so far, in discovery order
unique_images = []   # (class name, image tensor) pairs, parallel to unique_li
for images, labels in train_ds:
    # Use the actual batch length rather than a fixed 32: a batch may be shorter.
    for i in range(labels.shape[0]):
        name = class_names[labels[i]]
        if name not in unique_li:
            unique_li.append(name)
            unique_images.append((name, images[i]))
    # Stop reading batches as soon as every class has an example.
    if len(unique_li) == len(class_names):
        break

In [None]:
# Display the class names for which a sample image was collected.
unique_li

In [None]:
# Show the collected sample images in a 3x3 grid, titled with their class name.
plt.figure(figsize=(10, 10))
# Iterate over the samples that actually exist (at most nine fit the grid)
# instead of a fixed range(9), which raises IndexError when fewer were collected.
for i, (sample_label, sample_image) in enumerate(unique_images[:9]):
    ax = plt.subplot(3, 3, i + 1)
    plt.imshow(sample_image.numpy().astype("uint8"))
    plt.title(sample_label)
    plt.axis("off")

In [None]:
# Let tf.data choose the prefetch buffer size.
AUTOTUNE = tf.data.experimental.AUTOTUNE

# Training pipeline: cache -> shuffle (buffer of 1000) -> prefetch.
train_ds = train_ds.cache()
train_ds = train_ds.shuffle(1000)
train_ds = train_ds.prefetch(buffer_size=AUTOTUNE)

# Validation pipeline: cache -> prefetch (no shuffling).
val_ds = val_ds.cache()
val_ds = val_ds.prefetch(buffer_size=AUTOTUNE)

### Model 1 - using dropout

In [None]:
num_class = 9
img_size = 180

# Three feature-extraction stages that differ only in filter count:
# 3x3 same-padded ReLU convolution -> max pooling -> dropout (rate 0.25).
conv_stages = []
for n_filters in (16, 32, 64):
    conv_stages.extend([
        layers.Conv2D(n_filters, 3, padding='same', activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(0.25),
    ])

model = Sequential([
    # Multiply pixel values by 1/255.
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    # Resizing the image to 180x180
    layers.experimental.preprocessing.Resizing(img_size, img_size),
    *conv_stages,
    # Classifier head: flatten, 128-unit ReLU layer, softmax over num_class outputs.
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_class, activation='softmax'),
])

In [None]:
# The model's last layer uses a softmax activation, so its outputs are
# probabilities, not logits. The loss must therefore be built with
# from_logits=False; from_logits=True would treat those probabilities as raw logits.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'])

In [None]:
# Train the first model for 20 epochs, evaluating on val_ds alongside training.
epochs = 20
history = model.fit(train_ds, epochs=epochs, validation_data=val_ds)

In [None]:
# Print the layer-by-layer summary of the first model.
model.summary()

In [None]:
# Learning curves for the run stored in `history`: accuracy on the left, loss on the right.
acc, val_acc = history.history['accuracy'], history.history['val_accuracy']
loss, val_loss = history.history['loss'], history.history['val_loss']

epochs_range = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Start the model-comparison table with the last-epoch metrics of the first model.
model1_row = {
    "Type": "Model1 using Dropout",
    "Training Accuracy": acc[-1],
    "Validation Accuracy": val_acc[-1],
    "Epochs": epochs,
}
accuracy_results_df = pd.DataFrame([model1_row])
accuracy_results_df

**Observations on model 1:**
- The final training accuracy is about 70% and the validation accuracy is about 52%. Because of this large gap between training and validation accuracy, the model will not perform well on new data.
- The plot shows that as the number of epochs increases, the training accuracy keeps rising while the validation accuracy falls. This is a clear case of overfitting.
- We can also see that the training loss decreases while the validation loss increases with the number of epochs.

### Data augmentation

In [None]:
# Augmentation pipeline with a single step: random rotation with factor 0.2.
data_augmentation = tf.keras.Sequential()
data_augmentation.add(layers.experimental.preprocessing.RandomRotation(0.2))

In [None]:
import random

# Pick one of the collected sample images at random.
# random.randint is inclusive at both ends, so randint(0, 9) can return 9 and
# index past the end of a list holding one image per class (nine classes).
# randrange(len(...)) always yields a valid index.
sample_idx = random.randrange(len(unique_images))
image = tf.expand_dims(unique_images[sample_idx][1], 0)  # add a leading batch axis
plt.imshow(image[0].numpy().astype("uint8"))
plt.title("Original Image")
plt.axis("off")

In [None]:
# Run the augmentation pipeline 16 times on the same image and tile the results
# in a 4x4 grid. The extra call that used to precede the loop was dropped: its
# result was overwritten on the first iteration and never displayed.
plt.figure(figsize=(10, 10))
for i in range(16):
    augmented_image = data_augmentation(image)
    ax = plt.subplot(4, 4, i + 1)
    plt.imshow(augmented_image[0].numpy().astype("uint8"))
    plt.axis("off")

- We can see that random rotations are applied to the image by the data augmentation layer (a factor of 0.2 allows rotations of up to ±20% of a full turn, i.e. ±72°).


In [None]:
num_class = 9
img_size = 180

# Three feature-extraction stages that differ only in filter count:
# 3x3 same-padded ReLU convolution -> max pooling -> dropout (rate 0.25).
conv_stages = []
for n_filters in (16, 32, 64):
    conv_stages.extend([
        layers.Conv2D(n_filters, 3, padding='same', activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(0.25),
    ])

model_data = Sequential([
    # Multiply pixel values by 1/255.
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    # Resizing the image to 180x180
    layers.experimental.preprocessing.Resizing(img_size, img_size),
    # Random-rotation augmentation pipeline defined in an earlier cell.
    data_augmentation,
    *conv_stages,
    # Classifier head: flatten, 128-unit ReLU layer, softmax over num_class outputs.
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_class, activation='softmax'),
])

In [None]:
# model_data ends in a softmax layer, so it outputs probabilities rather than
# logits; the loss is therefore configured with from_logits=False.
model_data.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'],
              )
model_data.summary()

In [None]:
epochs = 20
# Fit the model for 20 epochs. train_ds is a tf.data.Dataset that was already
# batched (32 images per batch) when it was created, so batch_size is not passed
# here: Keras documents that it must not be specified for dataset inputs.
history_data = model_data.fit(
  train_ds,
  validation_data=val_ds,
  epochs=epochs
)

In [None]:
# Learning curves for the run stored in `history_data`: accuracy left, loss right.
run_metrics = history_data.history
acc, val_acc = run_metrics['accuracy'], run_metrics['val_accuracy']
loss, val_loss = run_metrics['loss'], run_metrics['val_loss']

epochs_range = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Add the second model's last-epoch metrics to the comparison table.
# DataFrame.append was removed in pandas 2.0, so the new row is added with
# pd.concat; ignore_index=True renumbers the rows 0..n-1 instead of repeating 0.
model2_row = pd.DataFrame([{"Type":"Model2 using Data Augmentation","Training Accuracy":acc[-1],"Validation Accuracy":val_acc[-1],"Epochs":epochs}])
accuracy_results_df = pd.concat([accuracy_results_df, model2_row], ignore_index=True)
accuracy_results_df

**Observations on model 2 - model using data augmentation**
- The training and validation accuracy are now close to each other, so the overfitting issue has been resolved.
- However, the accuracy is only around 50%, which is very low.

### Checking distribution of classes in the training dataset

In [None]:

# Tally how many training images carry each class label.
classes_dict = {}
for _, labels_batch in train_ds:
    # Each label is an integer position in class_names.
    for label_id in labels_batch.numpy():
        label_name = class_names[label_id]
        classes_dict[label_name] = classes_dict.get(label_name, 0) + 1

In [None]:
# Display the per-class image counts gathered from the training dataset.
classes_dict

In [None]:
# class distribution in Dataframe
df_distr = pd.DataFrame({"Class Name":classes_dict.keys(), "Samples":classes_dict.values()})
df_distr.sort_values('Samples',ascending=False)

### Visualizing class distribution

In [None]:
# Bar chart of the number of training samples per class.
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 10))
sns.barplot(data=df_distr, x='Class Name', y='Samples', palette='pastel', ax=ax)
# Class names are long, so turn the tick labels vertical.
plt.xticks(rotation=90)
plt.show()

**Observations on Class Imbalance :**
- Which class has the least number of samples?
> seborrheic keratosis

- Which classes dominate the data in terms of the proportionate number of samples?
> pigmented benign keratosis

### Rectifying the class imbalance 

In [None]:
!pip install Augmentor

In [None]:
import Augmentor

path_to_training_dataset = "Skin cancer ISIC The International Skin Imaging Collaboration/Train/"

# For every class folder: build an Augmentor pipeline that rotates an image by up
# to 10 degrees either way with probability 0.7, then draw 500 samples from it.
# Adding 500 samples per class makes sure that none of the classes are sparse.
# Later cells look for the generated files under each class's `output/` subfolder.
for class_name in class_names:
    pipeline = Augmentor.Pipeline(path_to_training_dataset + class_name)
    pipeline.rotate(probability=0.7, max_left_rotation=10, max_right_rotation=10)
    pipeline.sample(500)

In [None]:
# Number of .jpg files found in the per-class `output/` subfolders.
image_count_train = sum(1 for _ in train.glob('*/output/*.jpg'))
print(image_count_train)

### Let's see the distribution of augmented data after adding new images to the original training data.

In [None]:
# Paths of all .jpg files inside the per-class `output/` subfolders.
path_list = list(train.glob(os.path.join('*', 'output', '*.jpg')))

In [None]:
# Class name for each of those files: the folder two levels above the image
# (<class>/output/<image>.jpg).
lesion_list_new = [
    img_path.parent.parent.name
    for img_path in train.glob(os.path.join('*', 'output', '*.jpg'))
]

In [None]:
# Map each image path to its class name, pairing the two lists position by position.
dataframe_dict_new = {img_path: lesion for img_path, lesion in zip(path_list, lesion_list_new)}

In [None]:
# Two-column table of the mapped images: file path and class label.
df_augmentator = pd.DataFrame({
    'Path': list(dataframe_dict_new.keys()),
    'Label': list(dataframe_dict_new.values()),
})
df_augmentator

### Train the model on the data created using Augmentor

In [None]:
# Loader settings for the datasets built in the next cells: batch of 32, 180x180 images.
batch_size = 32
img_height = img_width = 180

In [None]:
data_dir_train = "Skin cancer ISIC The International Skin Imaging Collaboration/Train/"

# Training subset (seeded 80/20 split) loaded from the Train directory.
train_ds_augmentor = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir_train,
    validation_split=0.2,
    subset="training",
    seed=123,
    batch_size=batch_size,
    image_size=(img_height, img_width),
)

In [None]:
# Validation subset: the held-out 20% of the same seeded split of data_dir_train.
val_ds_augmentor = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir_train,
    validation_split=0.2,
    subset="validation",
    seed=123,
    batch_size=batch_size,
    image_size=(img_height, img_width),
)

In [None]:
num_class = 9
img_size = 180

# Three feature-extraction stages that differ only in filter count:
# 3x3 same-padded ReLU convolution -> max pooling -> dropout (rate 0.2).
conv_stages = []
for n_filters in (16, 32, 64):
    conv_stages.extend([
        layers.Conv2D(n_filters, 3, padding='same', activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(0.2),
    ])

model_augmentor_bn = Sequential([
    # Multiply pixel values by 1/255.
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    # Random-rotation augmentation pipeline defined in an earlier cell.
    data_augmentation,
    *conv_stages,
    # Classifier head: flatten, 128-unit ReLU layer, batch normalization,
    # then softmax over num_class outputs.
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.BatchNormalization(),
    layers.Dense(num_class, activation='softmax'),
])

In [None]:
# model_augmentor_bn ends in a softmax layer, so it outputs probabilities rather
# than logits; the loss is therefore configured with from_logits=False.
model_augmentor_bn.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'],
              )
model_augmentor_bn.summary()

In [None]:
tf.keras.backend.clear_session()

epochs = 30
# Fit the model for 30 epochs. train_ds_augmentor is a tf.data.Dataset that was
# already batched (32 images per batch) when it was created, so batch_size is not
# passed here: Keras documents that it must not be specified for dataset inputs.
history_augmentor_bn = model_augmentor_bn.fit(
  train_ds_augmentor,
  validation_data=val_ds_augmentor,
  epochs=epochs
)

In [None]:
# Learning curves for the run stored in `history_augmentor_bn`: accuracy left, loss right.
run_metrics = history_augmentor_bn.history
acc, val_acc = run_metrics['accuracy'], run_metrics['val_accuracy']
loss, val_loss = run_metrics['loss'], run_metrics['val_loss']

epochs_range = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Add the third model's last-epoch metrics to the comparison table.
# DataFrame.append was removed in pandas 2.0, so the new row is added with
# pd.concat; ignore_index=True renumbers the rows 0..n-1 instead of repeating 0.
model3_row = pd.DataFrame([{"Type":"Model3 using Balanced data with Normalization","Training Accuracy":acc[-1],"Validation Accuracy":val_acc[-1],"Epochs":epochs}])
accuracy_results_df = pd.concat([accuracy_results_df, model3_row], ignore_index=True)
accuracy_results_df

**Observations on model 3 - model using rectified data with normalization**
- The final training accuracy is about 79% and the validation accuracy is about 75%. Accuracy has risen without causing overfitting.
- The graph also shows large swings in validation accuracy from epoch to epoch.

In [None]:
num_class = 9
img_size = 180

# Three feature-extraction stages that differ only in filter count:
# 3x3 same-padded ReLU convolution -> max pooling -> dropout (rate 0.25).
# This variant contains no BatchNormalization layers.
conv_stages = []
for n_filters in (16, 32, 64):
    conv_stages.extend([
        layers.Conv2D(n_filters, 3, padding='same', activation='relu'),
        layers.MaxPooling2D(),
        layers.Dropout(0.25),
    ])

model_augmentor = Sequential([
    # Multiply pixel values by 1/255.
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    # Random-rotation augmentation pipeline defined in an earlier cell.
    data_augmentation,
    *conv_stages,
    # Classifier head: flatten, 128-unit ReLU layer, softmax over num_class outputs.
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_class, activation='softmax'),
])

In [None]:
# model_augmentor ends in a softmax layer, so it outputs probabilities rather
# than logits; the loss is therefore configured with from_logits=False.
model_augmentor.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False),
              metrics=['accuracy'],
              )
model_augmentor.summary()

In [None]:
tf.keras.backend.clear_session()

epochs = 30
# Fit the model for 30 epochs (the earlier comment said 20, but epochs is 30).
# train_ds_augmentor is a tf.data.Dataset that was already batched (32 images per
# batch) when it was created, so batch_size is not passed here: Keras documents
# that it must not be specified for dataset inputs.
history_augmentor = model_augmentor.fit(
  train_ds_augmentor,
  validation_data=val_ds_augmentor,
  epochs=epochs
)

In [None]:
# Learning curves for the run stored in `history_augmentor`: accuracy left, loss right.
run_metrics = history_augmentor.history
acc, val_acc = run_metrics['accuracy'], run_metrics['val_accuracy']
loss, val_loss = run_metrics['loss'], run_metrics['val_loss']

epochs_range = range(epochs)

fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc.plot(epochs_range, acc, label='Training Accuracy')
ax_acc.plot(epochs_range, val_acc, label='Validation Accuracy')
ax_acc.legend(loc='lower right')
ax_acc.set_title('Training and Validation Accuracy')

ax_loss.plot(epochs_range, loss, label='Training Loss')
ax_loss.plot(epochs_range, val_loss, label='Validation Loss')
ax_loss.legend(loc='upper right')
ax_loss.set_title('Training and Validation Loss')

plt.show()

In [None]:
# Add the fourth model's last-epoch metrics to the comparison table.
# DataFrame.append was removed in pandas 2.0, so the new row is added with
# pd.concat; ignore_index=True renumbers the rows 0..n-1 instead of repeating 0.
model4_row = pd.DataFrame([{"Type":"Model4 using Balanced data and without Normalization","Training Accuracy":acc[-1],"Validation Accuracy":val_acc[-1],"Epochs":epochs}])
accuracy_results_df = pd.concat([accuracy_results_df, model4_row], ignore_index=True)
accuracy_results_df

**Observations on model 4 - model using rectified data without normalization**
- The final training accuracy is about 76% and the validation accuracy is about 73%.
- The graph also shows no large swings in validation accuracy.