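This Jupyter notebook runs the cancer detection pipeline: it loads the HAM10000 images and metadata, extracts histogram and texture features, and trains and evaluates classifiers that separate cancerous from non-cancerous images.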

# Data preprocessing

In [None]:
import pandas as pd
import methods
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, f1_score, recall_score, classification_report
from skimage.feature import graycomatrix, graycoprops
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from skimage import data
from skimage.feature import blob_dog, blob_log, blob_doh
from skimage.color import rgb2gray
import math
from skimage import data, feature, color, io
from skimage.draw import disk


%load_ext autoreload
%autoreload 2

# Local folders containing the images.
# HAM10000_images_part_1_2 contains all 10k images, while JS_Selection contains only ~200 images.

original_folder_path =  './dataverse_files/HAM10000_images_part_1'
#original_folder_path =  './dataverse_files/HAM10000_images_part_1_2'  # folder containing all 10k images
#original_folder_path =  './dataverse_files/JS_Selection'
processed_folder_path =  './preprocessed_images'

# exist_ok=True keeps the cell safe to re-run and avoids the check-then-create race.
os.makedirs(processed_folder_path, exist_ok=True)

In [None]:
# Create the ground-truth dataframe from the metadata CSV.
df = pd.read_csv("./dataverse_files/HAM10000_metadata.csv")

# Diagnosis codes (column 'dx') grouped into the binary target.
cancerous = ["akiec", "bcc", "mel"]
non_cancerous = ["bkl", "df", "nv", "vasc"]

# Fail loudly on a diagnosis code that is in neither list; otherwise it would
# silently end up labelled as non-cancerous.
unknown_dx = set(df["dx"].unique()) - set(cancerous) - set(non_cancerous)
if unknown_dx:
    raise ValueError(f"Unmapped diagnosis codes in metadata: {sorted(map(str, unknown_dx))}")

# 'cancer' is True exactly where 'dx' is one of the cancerous codes, False otherwise.
df["cancer"] = df["dx"].isin(cancerous)

## Test Single Image

In [None]:
# Pick one file from the image folder. The listing is sorted because os.listdir
# returns entries in arbitrary order, so the same file is chosen on every run.
image_path = os.path.join(original_folder_path, sorted(os.listdir(original_folder_path))[4])
image = cv2.imread(image_path)
# Convert the color from BGR (the order cv2.imread returns) to RGB
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

# Convert to grayscale. `image` is already RGB at this point, so the RGB
# conversion code is required; COLOR_BGR2GRAY would swap the red/blue weights.
gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

blob = methods.detect_significant_blob(image, plot_image=True, plot_chosen_transformation=False)

#methods.calculate_glcm_features_for_blob(gray_image, blob)
#plt.imshow(image)

In [None]:
# Parameter lists handed to the Gabor helper
gabor_frequencies = [0.05, 0.1, 0.15, 0.2, 0.25]
gabor_thetas = [0, np.pi/4, np.pi/2]  # 0 and multiples of pi/4
gabor_sigmas = [1, 2, 3]

# Run the Gabor helper on the single test image and inspect the shape of its result
gabor_parameters = (gabor_frequencies, gabor_thetas, gabor_sigmas)
feature_results = methods.apply_gabor_filters_and_extract_features(image, *gabor_parameters)
np.shape(feature_results)

In [None]:
# Histogram of the single test image, requested in HSV color space.
# NOTE(review): `image` is RGB here, while the feature-extraction loop passes the
# BGR image to create_histogram -- confirm which channel order the helper expects.
hist = methods.create_histogram(image, color_space="HSV")

fig, ax = plt.subplots()
# The line colors only tell the three curves apart. The labels assume the helper
# returns the channels in H, S, V order -- TODO confirm against methods.py.
for channel_idx, (line_color, channel_name) in enumerate(zip("rgb", "HSV")):
    ax.plot(hist[channel_idx], line_color, label=channel_name)
ax.set_xlabel("Bin")
ax.set_ylabel("Counts (a.u.)")
ax.set_title("HSV histogram of the test image")
ax.legend()

## Feature Extraction (all images in folder)

In [None]:
# Per-image feature containers; entry i of every populated list belongs to the same image.
histograms_rgb = []
histograms_hsv = []
graycomatrix_features = []
gabor_features = []
Y = []
skipped_images = []  # (file name, reason) for every image file that could not be used

IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')

# Build the image_id -> label lookup once instead of scanning df for every image.
# drop_duplicates keeps the first row per image_id, matching the former `.values[0]`.
cancer_by_image_id = df.drop_duplicates('image_id').set_index('image_id')['cancer'].to_dict()

# os.listdir returns entries in arbitrary order; sorting keeps the sample order
# (and therefore the seeded train/test split) reproducible across runs.
for image_name in tqdm(sorted(os.listdir(original_folder_path))):
    if not image_name.lower().endswith(IMAGE_EXTENSIONS):
        continue

    # splitext removes only the extension, so ids that contain dots stay intact.
    image_id = os.path.splitext(image_name)[0]
    if image_id not in cancer_by_image_id:
        skipped_images.append((image_name, "no metadata row"))
        continue
    cancer = cancer_by_image_id[image_id]

    # Read the image using OpenCV (BGR channel order); imread returns None on failure.
    image_path = os.path.join(original_folder_path, image_name)
    image = cv2.imread(image_path)
    if image is None:
        skipped_images.append((image_name, "unreadable image"))
        continue

    #
    # Feature Extraction
    #
    # NOTE(review): the helpers below receive the BGR image, whereas the
    # single-image cells pass an RGB image -- confirm which order methods.* expects.

    # Histograms
    hist_rgb = methods.create_histogram(image, color_space="RGB")
    hist_hsv = methods.create_histogram(image, color_space="HSV")
    histograms_rgb.append(hist_rgb)
    histograms_hsv.append(hist_hsv)

    # Structure: GLCM Matrix
    graycomatrix_features.append(methods.calculate_glcm_features(image))

    # Structure: GLCM Matrix (With blob detection --> very slow)
    # NOTE: if re-enabled, gray_image must be computed from the current image first.
    #blob = methods.detect_significant_blob(image, plot_image=False)
    #graycomatrix_features.append(methods.calculate_glcm_features_for_blob(gray_image, blob))

    # Gabor filters (also slow)
    #gabor_result = methods.apply_gabor_filters_and_extract_features(image, gabor_frequencies, gabor_thetas, gabor_sigmas)
    #gabor_features.append(gabor_result)

    Y.append(cancer)

histograms_rgb = np.array(histograms_rgb)
histograms_hsv = np.array(histograms_hsv)
Y = np.array(Y)

# Make dropped samples visible instead of losing them silently.
if skipped_images:
    print(f"Skipped {len(skipped_images)} image(s), first entries: {skipped_images[:10]}")

## Playground Model Training

In [None]:
# Collapse every image's histogram stack into a single feature row per sample
n_samples = len(Y)
features_rgb = histograms_rgb.reshape(n_samples, -1)  # flattened RGB histograms
features_hsv = histograms_hsv.reshape(n_samples, -1)  # flattened HSV histograms

## Generate the Feature Vector

In [None]:
# Feature matrix: RGB histogram columns followed by HSV histogram columns
# (open question: do the features need z-normalization first?)
color_feature_blocks = [features_rgb, features_hsv]
X_features = np.concatenate(color_feature_blocks, axis=1)

# Optional alternatives, kept for experimentation:
# - add the structure (GLCM) features
#X_features = np.concatenate((X_features, graycomatrix_features), axis=1)
#X_features = graycomatrix_features
#X_features = np.nan_to_num(gabor_features)

# - add the Gabor features as additional columns
#X_features = np.concatenate((X_features, np.nan_to_num(gabor_features)), axis=1) # add additional features

X_features.shape

In [None]:
# A float n_components asks PCA to keep enough components to explain that
# share of the variance (here 95%).
variance_to_keep = 0.95
pca = PCA(n_components=variance_to_keep)

# NOTE(review): PCA is fitted on all samples, i.e. before the train/test split
# further down, so the test samples influence the projection.
X_pca = pca.fit_transform(X_features)

# Shape of the data after dimensionality reduction
print(X_pca.shape)


In [None]:
# Scatter plot of the first two principal components (columns 0 and 1 of X_pca),
# drawn one class at a time so each class gets its own color and legend entry.
plt.figure(figsize=(10, 7))
class_styles = (
    (0, 'blue', 'Non-Cancerous'),
    (1, 'red', 'Cancerous'),
)
for label_value, point_color, legend_name in class_styles:
    class_mask = Y == label_value
    plt.scatter(X_pca[class_mask, 0], X_pca[class_mask, 1], c=point_color, label=legend_name)

# Axis labels, title and legend
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Image Data')
plt.legend()


In [None]:
# Split the data into training (70%) and testing (30%) sets.
# stratify=Y keeps the cancerous / non-cancerous ratio the same in both splits,
# so the evaluation is not skewed by an unlucky draw on the imbalanced target.
# random_state fixes the shuffle so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X_pca, Y, test_size=0.3, random_state=42, stratify=Y)
#X_train, X_test, Y_train, Y_test = train_test_split(X_features, Y, test_size=0.3, random_state=42, stratify=Y)

In [None]:
# Logistic regression model, fitted on the training split.
# fit() returns the estimator itself, so construction and training are chained.
logreg = LogisticRegression(max_iter=10000).fit(X_train, Y_train)

# Predicted labels for the held-out test split
Y_pred = logreg.predict(X_test)

In [None]:
# SVM model
# Scale features: the scaler is fitted on the training split only and then
# applied to both splits. The scaled data gets its own names so X_train / X_test
# stay untouched and re-running other model cells still sees the original split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM classifier (polynomial kernel, class weights balanced)
clf = svm.SVC(kernel='poly', C=1.0, gamma=0.5, class_weight="balanced")

# Train the classifier
clf.fit(X_train_scaled, Y_train)

# Make predictions (overwrites Y_pred from the previous model cell; the
# evaluation cells below report whichever model ran last)
Y_pred = clf.predict(X_test_scaled)


In [None]:
# Take the class labels from the data itself, so this cell works for whichever
# model produced Y_pred and does not require the logistic-regression cell to have run.
class_labels = np.unique(np.concatenate([Y_test, Y_pred]))
conf_matrix = confusion_matrix(Y_test, Y_pred, labels=class_labels)

# Initialize the ConfusionMatrixDisplay object with the confusion matrix;
# display_labels uses the same label order as the matrix rows/columns.
cmd = ConfusionMatrixDisplay(conf_matrix, display_labels=class_labels)

# Plot the confusion matrix
cmd.plot(cmap=plt.cm.Blues)  # You can choose other color maps like 'viridis', 'plasma', etc.
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Overall accuracy, plus precision / recall / F1 for the positive class (label 1)
accuracy = accuracy_score(Y_test, Y_pred)
positive_class_scorers = (precision_score, recall_score, f1_score)
precision, recall, f1 = [
    scorer(Y_test, Y_pred, pos_label=1) for scorer in positive_class_scorers
]

# Report each metric on its own line
metric_report = (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
)
for metric_name, metric_value in metric_report:
    print(f"{metric_name}: {metric_value}")

In [None]:
# Squeeze only singleton axes other than the leading sample axis. A bare
# .squeeze() would also drop the sample axis when a class has exactly one
# image, and the mean over axis 0 would then average over the wrong axis.
singleton_axes = tuple(
    axis for axis in range(1, histograms_rgb.ndim) if histograms_rgb.shape[axis] == 1
)
histograms_rgb_cancer = np.squeeze(histograms_rgb[Y==1], axis=singleton_axes)
histograms_rgb_benign = np.squeeze(histograms_rgb[Y==0], axis=singleton_axes)

# Average histogram per class (mean over the sample axis)
histograms_rgb_benign_avg = np.mean(histograms_rgb_benign, axis=0)
histograms_rgb_cancer_avg = np.mean(histograms_rgb_cancer, axis=0)

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14,5))

# One panel per channel. Both classes are drawn from the SAME channel index;
# the green panel previously showed the benign curve of channel 2.
# Assumes create_histogram returns the channels in R, G, B order -- TODO confirm.
channel_panels = [(0, "r", "Red"), (1, "g", "Green"), (2, "b", "Blue")]
for ax, (channel_idx, line_color, channel_name) in zip(axes, channel_panels):
    ax.plot(histograms_rgb_cancer_avg[channel_idx], line_color, label="cancer")
    ax.plot(histograms_rgb_benign_avg[channel_idx], f":{line_color}", label="benign")
    ax.set_xlabel(f"{channel_name} Intensity (a.u.)")
    ax.set_ylabel("Counts (a.u.)")
    ax.legend()

fig.tight_layout()
# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/rgb_hist.png", dpi=300, transparent=False)