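This Jupyter notebook runs the skin-cancer detection pipeline on the HAM10000 images: data preprocessing, feature extraction, and training and evaluation of the classifiers (logistic regression and SVM).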

# Data preprocessing

In [None]:
import pandas as pd
from data_augmentation import *
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, precision_score, f1_score, recall_score
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from os.path import join
import subprocess

%load_ext autoreload
%autoreload 2

In [None]:
# Determine the root of the git repository so that every path below is
# independent of the directory the notebook was started from.
# check_output raises CalledProcessError when git exits with an error (e.g. the
# notebook is run outside a checkout) instead of silently producing an empty
# repo_dir, which would turn all paths below into paths relative to the CWD.
repo_dir = subprocess.check_output(
    ['git', 'rev-parse', '--show-toplevel']
).decode('utf-8').rstrip()

# Local folders containing your images.
# HAM10000_images_part_1_2 contains all 10k images, while JS_Selection just contains ~200 images.
# Keep exactly one of the following assignments active to choose the input set:

#original_folder_path =  os.path.join(repo_dir, 'dataverse_files/HAM10000_images_part_1')
#original_folder_path =  os.path.join(repo_dir, 'dataverse_files/HAM10000_images_part_1_2')
original_folder_path = os.path.join(repo_dir, 'dataverse_files/JS_Selection')

# Output folder for the preprocessed images; created up front if it is missing.
processed_folder_path = os.path.join(repo_dir, 'preprocessed_images')
os.makedirs(processed_folder_path, exist_ok=True)

In [None]:
# Create the ground-truth dataframe: the HAM10000 metadata table plus a boolean
# `cancer` column derived from the diagnosis code in the `dx` column.
df = pd.read_csv(os.path.join(repo_dir, 'dataverse_files/', 'HAM10000_metadata.csv'))

cancerous = ["akiec", "bcc", "mel"]
non_cancerous = ["bkl", "df", "nv", "vasc"]

# Every diagnosis code must appear in one of the two lists. Without this check
# an unexpected code would silently be labelled as non-cancerous below.
unknown_dx = set(df["dx"].unique()) - set(cancerous) - set(non_cancerous)
assert not unknown_dx, f"unclassified diagnosis codes in metadata: {sorted(unknown_dx, key=str)}"

# True where `dx` is one of the cancerous codes, False for every other row.
df["cancer"] = df["dx"].isin(cancerous)
df

## Test Single Image

In [None]:
# os.listdir returns directory entries in arbitrary order, so sort them first:
# index 2 then refers to the same file on every run and on every machine.
image_files = sorted(os.listdir(original_folder_path))
image_path = os.path.join(original_folder_path, image_files[2])

# NOTE(review): BGR2RGB=True presumably makes the loader return the image in RGB
# channel order (the cells below treat it as RGB) - confirm in methods.load_image.
image = methods.load_image(image_path, BGR2RGB=True)

### Blob Detection

In [None]:
# Replace the test image with the result of crop_rotate.
# NOTE(review): the second argument (90) is presumably a rotation angle in
# degrees - confirm against crop_rotate's signature.
image = crop_rotate(image, 90)

# Grayscale version of the RGB image. It is only consumed by the optional GLCM
# call that is commented out at the bottom of this cell.
gray_image = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY)

# Run the blob detection on the colour image and ask it to plot the image,
# but not the chosen transformation.
blob = methods.detect_significant_blob(
    image,
    plot_image=True,
    plot_chosen_transformation=False,
)

# Optional follow-up steps, disabled by default:
#methods.calculate_glcm_features_for_blob(gray_image, blob)
#plt.imshow(image)

In [None]:
# Histogram of the test image in HSV colour space.
hist = methods.create_histogram(image, color_space="HSV")

# Draw the first three entries of `hist` as red, green and blue lines. The
# colours are only line styles here; the histogram itself was requested in HSV.
for channel_index, line_format in enumerate(("r", "g", "b")):
    plt.plot(hist[channel_index], line_format)

## Feature Extraction (all images in folder)

In [None]:
# Settings handed to split_data_and_oversample.
# NOTE(review): TRAIN_SIZE is presumably the fraction of images that goes into
# the training split - confirm in split_data_and_oversample.
TRAIN_SIZE = 0.8
# If set to False the minority class will not be oversampled.
OVERSAMPLE = True

try:
    split_data_and_oversample(
        original_folder_path,
        processed_folder_path,
        df,
        TRAIN_SIZE,
        oversample=OVERSAMPLE,
    )
except OSError as err:
    # Best effort: report the problem and the suggested remedy, then carry on.
    print("delete the Folder preprocessed_images and try again", err, sep="\n")
    

In [None]:
# Process training and testing images
histograms_rgb_train, histograms_hsv_train, graycomatrix_features_train, Y_train = methods.extract_individual_features(df, join(processed_folder_path, "train"))
histograms_rgb_test, histograms_hsv_test, graycomatrix_features_test, Y_test = methods.extract_individual_features(df, join(processed_folder_path, "test"))


## Generate the Feature Vector

In [None]:
# Combine the individual feature groups into one feature matrix per split.
# Both lists must name the same groups in the same order (RGB histograms, HSV
# histograms, co-occurrence features); otherwise the columns of X_train and
# X_test describe different things. The train list previously passed the HSV
# histograms twice and never used the RGB histograms.
X_train, X_test = methods.generate_feature_vector(
    [histograms_rgb_train, histograms_hsv_train, graycomatrix_features_train],
    [histograms_rgb_test, histograms_hsv_test, graycomatrix_features_test],
)

# Show the shapes of the feature matrices and label vectors as a quick check.
np.shape(X_train), np.shape(X_test), np.shape(Y_train), np.shape(Y_test)

## Playground Model Training

In [None]:
# Fraction of the total variance the PCA has to retain. With a float in (0, 1)
# scikit-learn keeps as many components as are needed to reach that fraction.
PCA_VARIANCE_TO_KEEP = 0.999995
pca = PCA(n_components=PCA_VARIANCE_TO_KEEP)

# Fit the PCA on the training features and project them in one step.
X_train_pca = pca.fit_transform(X_train)

# Check the new shape of the data (samples x retained components).
print(X_train_pca.shape)


In [None]:
# Scatter plot of the first two PCA components of the training data:
# X_train_pca[:, 0] is the first component, X_train_pca[:, 1] the second.
plt.figure(figsize=(10, 7))

# One scatter call per class: label value, marker colour and legend entry.
for class_value, colour, class_name in (
    (0, 'blue', 'Non-Cancerous'),
    (1, 'red', 'Cancerous'),
):
    in_class = Y_train == class_value
    plt.scatter(
        X_train_pca[in_class, 0],
        X_train_pca[in_class, 1],
        c=colour,
        label=class_name,
        alpha=0.1,
    )

# Axis labels, title and legend
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.title('PCA of Image Data')
plt.legend()


In [None]:
# Logistic regression model.
# Create the estimator and fit it on the training data in one expression
# (fit returns the estimator itself).
logreg = LogisticRegression(max_iter=10000).fit(X_train, Y_train)

# Predicted labels for the test set; Y_pred is what the evaluation cells read.
Y_pred = logreg.predict(X_test)

In [None]:
# SVM model
# Standardise the features: the scaler is fitted on the training split only and
# then applied to both splits. The scaled copies get their own names so that
# X_train / X_test are not overwritten. That keeps this cell idempotent and keeps
# other cells that read X_train / X_test independent of whether it has run.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create an SVM classifier (RBF kernel; the polynomial kernel is kept as an
# alternative to try).
#clf = svm.SVC(kernel='poly', C=1.0, gamma=0.5, class_weight="balanced")
clf = svm.SVC(kernel='rbf', C=1.0, gamma=0.5, class_weight="balanced")

# Train the classifier on the scaled training features
clf.fit(X_train_scaled, Y_train)

# Predicted labels for the scaled test set; Y_pred is what the evaluation cells read.
Y_pred = clf.predict(X_test_scaled)


In [None]:
# Derive the class labels from the evaluated data itself rather than from one
# particular model: Y_pred comes from whichever classifier was run last, and the
# labels shown on the axes must match the rows/columns of the matrix exactly.
class_labels = np.unique(np.concatenate([Y_test, Y_pred]))

conf_matrix = confusion_matrix(Y_test, Y_pred, labels=class_labels)

# Initialize the ConfusionMatrixDisplay object with the confusion matrix
cmd = ConfusionMatrixDisplay(conf_matrix, display_labels=class_labels)

# Plot the confusion matrix
cmd.plot(cmap=plt.cm.Blues)  # You can choose other color maps like 'viridis', 'plasma', etc.
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Score the current predictions against the test labels. Precision, recall and
# F1 are computed for the positive class (label 1).
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, pos_label=1)
recall = recall_score(Y_test, Y_pred, pos_label=1)
f1 = f1_score(Y_test, Y_pred, pos_label=1)

# Print one "<name>: <value>" line per metric.
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{metric_name}: {metric_value}")

In [None]:
# Split the training RGB histograms by label (1 = cancer, 0 = benign) and drop
# all length-1 axes.
# NOTE(review): squeeze() would also drop the sample axis if a class had exactly
# one sample, which would change what axis 0 means below - verify this cannot happen.
is_cancer = Y_train == 1
is_benign = Y_train == 0
histograms_rgb_cancer = histograms_rgb_train[is_cancer].squeeze()
histograms_rgb_benign = histograms_rgb_train[is_benign].squeeze()

# Average over axis 0 to get one mean histogram set per class.
histograms_rgb_benign_avg = np.mean(histograms_rgb_benign, axis=0)
histograms_rgb_cancer_avg = np.mean(histograms_rgb_cancer, axis=0)

In [None]:
# One panel per colour channel, comparing the class-average histograms:
# solid line = cancer, dotted line = benign.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(14, 5))

# (channel index in the averaged histograms, axis-label name, matplotlib colour code)
channels = ((0, "Red", "r"), (1, "Green", "g"), (2, "Blue", "b"))

for ax, (channel_index, channel_name, colour) in zip(axes, channels):
    # Both curves of a panel must use the same channel index. The green panel
    # previously drew the benign curve from index 2 (blue) instead of index 1.
    ax.plot(histograms_rgb_cancer_avg[channel_index], colour, label="cancer")
    ax.plot(histograms_rgb_benign_avg[channel_index], ":" + colour, label="benign")
    ax.set_xlabel(f"{channel_name} Intensity (a.u.)")
    ax.set_ylabel("Counts (a.u.)")
    ax.legend()

fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("figures", exist_ok=True)
fig.savefig("figures/rgb_hist.png", dpi=300, transparent=False)