<a href="https://colab.research.google.com/github/boolean0-0/Melanoma_Detection/blob/main/Melanoma_Detection_ConvNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Robert Lincoln Jones
# A system to detect skin cancer from images
# Started 10/18/2024

In [None]:
import kagglehub
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2


In [None]:
import pandas as pd
skin_csv = pd.read_csv(path + '/HAM10000_metadata.csv')

In [None]:
print(skin_csv.head())

     lesion_id      image_id   dx dx_type   age   sex localization
0  HAM_0000118  ISIC_0027419  bkl   histo  80.0  male        scalp
1  HAM_0000118  ISIC_0025030  bkl   histo  80.0  male        scalp
2  HAM_0002730  ISIC_0026769  bkl   histo  80.0  male        scalp
3  HAM_0002730  ISIC_0025661  bkl   histo  80.0  male        scalp
4  HAM_0001466  ISIC_0031633  bkl   histo  75.0  male          ear


In [None]:
print(skin_csv['dx'].value_counts())

dx
nv       6705
mel      1113
bkl      1099
bcc       514
akiec     327
vasc      142
df        115
Name: count, dtype: int64


# Combining the folders and making a dataset

In [None]:
# The HAM-10000 dataset is divided into two folders
# Combine the two folders into one
# Just for simplification purposes

import os
import shutil

print("Path to dataset files:", path)
skin_csv = pd.read_csv("/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_metadata.csv")


source_folder1 = '/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_images_part_1'
source_folder2 = '/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_images_part_2'
destination_folder = '/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_images_combined'

try:
  shutil.rmtree(destination_folder)
except:
  print("Creating a folder at ", destination_folder)
# Create the destination folder if it doesn't exist
os.makedirs(destination_folder, exist_ok=True)

# Function to copy files from source to destination
def copy_files(source, destination):
    for filename in os.listdir(source):
        source_file = os.path.join(source, filename)
        destination_file = os.path.join(destination, filename)
        if os.path.isfile(source_file):
            shutil.copy(source_file, destination_file)

# Copy files from the first source folder
copy_files(source_folder1, destination_folder)

# Copy files from the second source folder
copy_files(source_folder2, destination_folder)

print(f"Files combined into: {destination_folder}")

Path to dataset files: /root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2
Files combined into: /root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_images_combined


In [None]:

import numpy as np
from PIL import Image

X_train = []
y_train = []
X_test = []
y_test = []
X_val = []
y_val = []

In [None]:
images_dir = '/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_images_combined'

for im in os.listdir(images_dir):
    image_path = images_dir + '/' + im
    image = Image.open(image_path)
    image = image.resize((64, 64))
    image = np.array(image)
    X_train.append(image)
    filtered_df = skin_csv[skin_csv['image_id'] == im.split('.')[0]]
    y_train.append(filtered_df['dx'].values[0])

len(y_train)

10015

In [None]:
i = 0
for x in y_train:
    if x == 'akiec':
        y_train[i] = 0
    elif x == 'bcc':
        y_train[i] = 1
    elif x == 'bkl':
        y_train[i] = 2
    elif x == 'df':
        y_train[i] = 3
    elif x == 'mel':
        y_train[i] = 4
    elif x == 'nv':
        y_train[i] = 5
    elif x == 'vasc':
        y_train[i] = 6
    elif x == 'scc':
        y_train[i] = 7
    i +=1

In [None]:
from collections import Counter
print(Counter(y_train))

Counter({5: 6705, 4: 1113, 2: 1099, 1: 514, 0: 327, 6: 142, 3: 115})


In [None]:
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train, random_state=4761)

X_train = np.array(X_train)
y_train = np.array(y_train)

print(X_train.shape,
      y_train.shape,)

(10015, 64, 64, 3) (10015,)


In [None]:
# Split X_train and y_train into testing and validation sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=12)

# Preprocessing

In [46]:
from re import X
from sklearn.decomposition import PCA

X_flattened = X_train.reshape(X_train.shape[0], -1)
X_test_flattened = X_test.reshape(X_test.shape[0], -1)
X_val_flattened = X_val.reshape(X_val.shape[0], -1)

# Step 2: Apply PCA whitening
pca = PCA(whiten=True)
X_whitened = pca.fit_transform(X_flattened)
X_test_whitened = pca.transform(X_test_flattened)
X_val_whitened = pca.transform(X_val_flattened)

X_whitened_reshaped = X_whitened.reshape(X_train.shape[0], pca.n_components_)
X_test_whitened_reshaped = X_test_whitened.reshape(X_test.shape[0], pca.n_components_)
X_val_whitened_reshaped = X_val_whitened.reshape(X_val.shape[0], pca.n_components_)

print(X_whitened_reshaped.shape)


(7010, 7010)


# Training

In [47]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

sgd_clf = SGDClassifier(loss='modified_huber', max_iter=1000, random_state=42)
sgd_clf.fit(X_whitened_reshaped, y_train)

In [86]:
import pickle

with open('sgd_clf.pkl', 'wb') as file:
    pickle.dump(sgd_clf, file)

In [48]:
y_pred = sgd_clf.predict(X_test_whitened_reshaped)
y_pred_val = sgd_clf.predict(X_val_whitened_reshaped)

print(classification_report(y_test, y_pred))
print(classification_report(y_val, y_pred_val))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       111
           1       1.00      1.00      1.00       188
           2       0.38      0.99      0.55       382
           3       0.30      1.00      0.46        38
           4       1.00      0.99      0.99       386
           5       1.00      0.71      0.83      2355
           6       1.00      1.00      1.00        45

    accuracy                           0.80      3505
   macro avg       0.81      0.96      0.83      3505
weighted avg       0.92      0.80      0.83      3505

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       117
           1       1.00      0.99      1.00       186
           2       0.40      0.99      0.57       410
           3       0.36      1.00      0.53        42
           4       1.00      0.99      0.99       351
           5       1.00      0.71      0.83      2354
           6       1.00 

In [80]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_val, y_pred_val))
print(classification_report(y_val, y_pred_val))

[[ 117    0    0    0    0    0    0]
 [   0  185    1    0    0    0    0]
 [   0    0  407    3    0    0    0]
 [   0    0    0   42    0    0    0]
 [   0    0    5    0  346    0    0]
 [   0    0  611   73    0 1670    0]
 [   0    0    0    0    0    0   45]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       117
           1       1.00      0.99      1.00       186
           2       0.40      0.99      0.57       410
           3       0.36      1.00      0.53        42
           4       1.00      0.99      0.99       351
           5       1.00      0.71      0.83      2354
           6       1.00      1.00      1.00        45

    accuracy                           0.80      3505
   macro avg       0.82      0.95      0.84      3505
weighted avg       0.92      0.80      0.83      3505



In [50]:
print('akiec 0', 'bcc 1', 'bkl 2', 'df 3', 'mel 4', 'nv 5', 'vasc 6', 'scc 7')

akiec 0 bcc 1 bkl 2 df 3 mel 4 nv 5 vasc 6 scc 7


# CONVNET

In [72]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

# Data Augmentation
datagen = ImageDataGenerator(
    rotation_range=20,         # Randomly rotate images by up to 20 degrees
    width_shift_range=0.2,     # Randomly shift images horizontally
    height_shift_range=0.2,    # Randomly shift images vertically
    shear_range=0.2,           # Apply shear transformations
    zoom_range=0.2,            # Randomly zoom in/out on images
    horizontal_flip=True,      # Randomly flip images horizontally
    fill_mode='nearest'        # Fill empty pixels after transformations
)

# CNN Model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')  # Assuming 10 classes for classification
])


# CNN Model
model = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')  # Assuming 10 classes for classification
])

model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Assuming labels are integer encoded
              metrics=['accuracy'])


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [73]:
# Fit the model using the augmented data generator
# Assuming X_train and y_train are your training images and labels
from tensorflow.keras.callbacks import ReduceLROnPlateau

datagen.fit(X_train)  # Fit the data generator to your training images

lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)
model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=50, callbacks=[lr_scheduler], batch_sizes = 32)

Epoch 1/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 50ms/step - accuracy: 0.6234 - loss: 10.8379 - val_accuracy: 0.6910 - val_loss: 0.9436 - learning_rate: 0.0010
Epoch 2/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.6693 - loss: 1.0156 - val_accuracy: 0.6767 - val_loss: 1.3525 - learning_rate: 0.0010
Epoch 3/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.6696 - loss: 1.0446 - val_accuracy: 0.6822 - val_loss: 0.9000 - learning_rate: 0.0010
Epoch 4/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 45ms/step - accuracy: 0.6910 - loss: 0.8934 - val_accuracy: 0.6956 - val_loss: 0.8349 - learning_rate: 0.0010
Epoch 5/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 46ms/step - accuracy: 0.6886 - loss: 0.8671 - val_accuracy: 0.7121 - val_loss: 0.8007 - learning_rate: 0.0010
Epoch 6/50
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x79b8c1dcfac0>

In [87]:
with open('cnn_model_batch_def.pkl', 'wb') as file:
    pickle.dump(model, file)

In [79]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Assume y_test contains the true labels, and y_pred contains the predicted labels
# y_pred is the output from the model's prediction (e.g., a CNN model)

y_pred = model.predict(X_val)
y_pred = np.argmax(y_pred, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Calculate precision
precision = precision_score(y_val, y_pred, average='weighted')
print(f"Precision: {precision:.2f}")

# Calculate recall
recall = recall_score(y_val, y_pred, average='weighted')
print(f"Recall: {recall:.2f}")

# Calculate F1 score
f1 = f1_score(y_val, y_pred, average='weighted')
print(f"F1 Score: {f1:.2f}")

print(classification_report(y_val, y_pred))
print(confusion_matrix(y_val, y_pred))

[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
Accuracy: 0.98
Precision: 0.98
Recall: 0.98
F1 Score: 0.98
              precision    recall  f1-score   support

           0       0.99      0.92      0.96       117
           1       0.94      0.90      0.92       186
           2       0.93      0.98      0.95       410
           3       1.00      0.79      0.88        42
           4       1.00      1.00      1.00       351
           5       0.99      1.00      0.99      2354
           6       1.00      0.98      0.99        45

    accuracy                           0.98      3505
   macro avg       0.98      0.94      0.96      3505
weighted avg       0.98      0.98      0.98      3505

[[ 108    3    3    0    0    3    0]
 [   1  167   16    0    0    2    0]
 [   0    2  403    0    0    5    0]
 [   0    1    5   33    0    3    0]
 [   0    0    0    0  350    1    0]
 [   0    4    7    0    0 2343    0]
 [   0    0    0    0    0    1   44]]


In [63]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Initialize Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Fit on feature vectors (e.g., after PCA)
rf.fit(X_whitened_reshaped, y_train)

# Predict on the test set
y_pred = rf.predict(X_test_whitened_reshaped)

# Evaluate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Accuracy: {accuracy}")


Random Forest Accuracy: 1.0


In [88]:
with open('rf_model.pkl', 'wb') as file:
    pickle.dump(rf, file)

In [66]:
y_pred_val_f = rf.predict(X_val_whitened_reshaped)
print(classification_report(y_val, y_pred_val_f))
print(confusion_matrix(y_val, y_pred_val_f))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       117
           1       1.00      1.00      1.00       186
           2       1.00      1.00      1.00       410
           3       1.00      1.00      1.00        42
           4       1.00      1.00      1.00       351
           5       1.00      1.00      1.00      2354
           6       1.00      1.00      1.00        45

    accuracy                           1.00      3505
   macro avg       1.00      1.00      1.00      3505
weighted avg       1.00      1.00      1.00      3505

[[ 117    0    0    0    0    0    0]
 [   0  186    0    0    0    0    0]
 [   0    0  410    0    0    0    0]
 [   0    0    0   42    0    0    0]
 [   0    0    0    0  351    0    0]
 [   0    0    0    0    0 2354    0]
 [   0    0    0    0    0    0   45]]


In [81]:

# Data Augmentation
datagen = ImageDataGenerator(
    rotation_range=20,         # Randomly rotate images by up to 20 degrees
    width_shift_range=0.2,     # Randomly shift images horizontally
    height_shift_range=0.2,    # Randomly shift images vertically
    shear_range=0.2,           # Apply shear transformations
    zoom_range=0.2,            # Randomly zoom in/out on images
    horizontal_flip=True,      # Randomly flip images horizontally
    fill_mode='nearest'        # Fill empty pixels after transformations
)

# CNN Model
model_1 = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')  # Assuming 10 classes for classification
])


# CNN Model
model_1 = Sequential([
    Conv2D(32, (3, 3), activation='relu', input_shape=(64, 64, 3)),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')  # Assuming 10 classes for classification
])

model_1.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',  # Assuming labels are integer encoded
              metrics=['accuracy'])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [83]:
# Fit the model using the augmented data generator
# Assuming X_train and y_train are your training images and labels
from tensorflow.keras.callbacks import ReduceLROnPlateau

datagen.fit(X_train)  # Fit the data generator to your training images

lr_scheduler = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3)
model_1.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=15, callbacks=[lr_scheduler], batch_size = 32)

Epoch 1/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 44ms/step - accuracy: 0.5964 - loss: 21.1138 - val_accuracy: 0.7161 - val_loss: 0.8457 - learning_rate: 0.0010
Epoch 2/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 44ms/step - accuracy: 0.7055 - loss: 0.8714 - val_accuracy: 0.7155 - val_loss: 0.7984 - learning_rate: 0.0010
Epoch 3/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 42ms/step - accuracy: 0.7023 - loss: 0.8770 - val_accuracy: 0.7472 - val_loss: 0.7224 - learning_rate: 0.0010
Epoch 4/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 42ms/step - accuracy: 0.7319 - loss: 0.7547 - val_accuracy: 0.7544 - val_loss: 0.6843 - learning_rate: 0.0010
Epoch 5/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 42ms/step - accuracy: 0.7560 - loss: 0.6607 - val_accuracy: 0.7963 - val_loss: 0.5770 - learning_rate: 0.0010
Epoch 6/15
[1m220/220[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m

<keras.src.callbacks.history.History at 0x79b8a31cc430>

In [85]:
y_pred_val_cnn_1 = model_1.predict(X_val)
y_pred_val_cnn_1 = np.argmax(y_pred_val_cnn_1, axis=1)
print(classification_report(y_val, y_pred_val_cnn_1))
print(confusion_matrix(y_val, y_pred_val_cnn_1))

[1m110/110[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step
              precision    recall  f1-score   support

           0       0.98      0.89      0.93       117
           1       0.94      0.85      0.90       186
           2       0.86      0.95      0.90       410
           3       0.92      0.81      0.86        42
           4       0.97      0.86      0.91       351
           5       0.97      0.98      0.97      2354
           6       0.85      0.91      0.88        45

    accuracy                           0.95      3505
   macro avg       0.93      0.89      0.91      3505
weighted avg       0.95      0.95      0.95      3505

[[ 104    3    8    0    0    2    0]
 [   0  159    8    0    2   14    3]
 [   2    1  389    1    0   17    0]
 [   0    1    4   34    0    3    0]
 [   0    1    7    0  301   42    0]
 [   0    4   36    2    6 2302    4]
 [   0    0    0    0    0    4   41]]


In [89]:
with open('cnn_model_batch_32.pkl', 'wb') as file:
    pickle.dump(model_1, file)