## Dependencies

In [None]:
# Dependencies to Visualize the model
%matplotlib inline
from IPython.display import Image, SVG
import matplotlib.pyplot as plt
import numpy as np
np.random.seed(0)

In [None]:
# Filepaths, pandas, numpy, Tensorflow, and scikit-image
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import skimage as sk


In [None]:
# Sklearn scaling
from sklearn.preprocessing import MinMaxScaler

### Keras Specific Dependencies

In [None]:
# Keras
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Dense
from tensorflow.keras.datasets import mnist

## Loading and Preprocessing our Data

### Load the Skin Cancer Image Dataset and Metadata

In [None]:
from PIL import Image  # NOTE: rebinds `Image`, shadowing the IPython.display.Image imported earlier in this notebook

# Root folder that is searched (including its subfolders) for the skin-cancer image files
folder_path = "Resources/Skin Cancer/Split"

# Load images
def load_images_from_folder(folder_path, image_size=(224, 224)):
    """
    Load every JPEG image found under a folder (including all subfolders) and resize it.

    Files are selected by extension (".jpg" / ".jpeg", compared case-insensitively).
    A missing or empty folder yields an empty list.

    Args:
    - folder_path (str): The path to the folder containing the subfolders.
    - image_size (tuple, optional): The desired size of the images, passed to PIL's
      Image.resize. Defaults to (224, 224).

    Returns:
    - images (list): A list of tuples, where each tuple contains a filename (without
      its directory) and its corresponding, resized image.

    Raises:
    - Whatever Image.open raises when a selected file cannot be read as an image.
    """
    jpeg_extensions = (".jpg", ".jpeg")
    images = []
    # Iterate over all subfolders and their contents
    for root, _dirs, files in os.walk(folder_path):
        for file in files:
            # Lower-case the name so files such as "PHOTO.JPG" are not silently skipped
            if not file.lower().endswith(jpeg_extensions):
                continue
            file_path = os.path.join(root, file)
            # Image.open reads lazily and holds the file open; the context manager
            # releases the handle once the independent resized copy exists.
            with Image.open(file_path) as img:
                resized = img.resize(image_size)
            images.append((file, resized))  # Storing filename along with the image
    return images

# Load and resize every image under folder_path; `images` is a list of (filename, image) tuples
images = load_images_from_folder(folder_path)

# Print how many images were loaded so the count can be compared with the expected 10,015
print("Number of images loaded:", len(images))

# Preview the first image, but only when at least one image was loaded
if images:
    # Unpack the filename and image of the first entry in the list
    first_filename, first_image = images[0]
    
    # Display the first image with its filename as the title
    plt.imshow(first_image)
    plt.title("First Image: " + first_filename)  # Add title with filename
    plt.axis('off')  # Hide the axes; they carry no information for a photo
    plt.show()

In [None]:
# Path to the HAM10000 metadata CSV file
csv_file = "Resources/HAM10000_metadata.csv"

# Load metadata csv
def load_metadata_from_csv(csv_file):
    """
    Read the metadata CSV file into a pandas DataFrame.

    Args:
    - csv_file (str): Location of the CSV file that holds the metadata.

    Returns:
    - metadata (DataFrame): The contents of the CSV file as parsed by pandas.
    """
    # pandas does all the parsing; the result is handed back untouched
    return pd.read_csv(csv_file)

# Read the metadata CSV into the `metadata` DataFrame
metadata = load_metadata_from_csv(csv_file)

# Preview the first rows to confirm the metadata loaded correctly
metadata.head()

In [None]:
# Match images with metadata based on filenames
def match_images_with_metadata(images, metadata):
    """
    Match images with metadata based on filenames.

    An image matches when its filename, with the extension removed, equals a value
    in the metadata's 'image_id' column. Images without a match are dropped.

    Args:
    - images (list): A list of tuples containing (filename, image).
    - metadata (DataFrame): A DataFrame containing metadata; must have an 'image_id' column.

    Returns:
    - matched_data (list): A list of tuples containing (filename, image, matched metadata rows).
      The third element is a DataFrame holding every metadata row whose 'image_id'
      matches, in the original row order and with the original index labels.

    Raises:
    - KeyError: If metadata has no 'image_id' column.
    """
    # Build the image_id -> row-position lookup once, instead of comparing the whole
    # 'image_id' column against every single filename.
    positions_by_id = metadata.groupby('image_id', sort=False).indices

    matched_data = []
    for filename, img in images:
        # Extract the filename without extension
        filename_without_ext = os.path.splitext(filename)[0]
        positions = positions_by_id.get(filename_without_ext)
        # If a match is found, store the image and metadata together
        if positions is not None:
            # iloc with positions keeps the matched rows' original index labels
            matched_data.append((filename, img, metadata.iloc[positions]))
    return matched_data

# Pair every loaded image with its metadata; images without a matching 'image_id' are dropped
matched_data = match_images_with_metadata(images, metadata)

# Print how many images were matched so the count can be compared with the expected 10,015
print("Number of matched images and metadata:", len(matched_data))

In [None]:
# Create pandas dataframe of metadata matched with corresponding image
def create_metadata_image_dataframe(matched_data):
    """
    Build one DataFrame that holds each image next to its metadata.

    Args:
    - matched_data (list): A list of tuples containing (filename, image, metadata_row).

    Returns:
    - df (DataFrame): One row per tuple in matched_data, made of the first record of
      metadata_row plus an 'Image' column holding the image. The filename element of
      each tuple is not stored.
    """
    # Turn the first record of each metadata row into a dict and attach the image to it
    records = [
        {**metadata_row.to_dict(orient='records')[0], 'Image': img}
        for _filename, img, metadata_row in matched_data
    ]
    return pd.DataFrame(records)

# Combine the matched metadata and images into a single DataFrame (images live in the 'Image' column)
metadata_image_df = create_metadata_image_dataframe(matched_data)

# Preview the first rows of the combined DataFrame
metadata_image_df.head()

## Image Color Analysis

### HSV Color Analysis

In [None]:
from skimage import color

def calculate_hsv_histogram(image):
    """
    Compute one histogram per HSV component of an image.

    The image is converted to a NumPy array, transformed to HSV with
    skimage's rgb2hsv, and each component is binned over the range 0 to 1.

    Parameters:
    - image: PIL.Image.Image object representing the input image.

    Returns:
    - hue_histogram: NumPy array containing the histogram of hue values (180 bins).
    - saturation_histogram: NumPy array containing the histogram of saturation values (256 bins).
    - value_histogram: NumPy array containing the histogram of value (brightness) values (256 bins).
    """
    # PIL image -> NumPy array -> HSV color space
    hsv_pixels = color.rgb2hsv(np.array(image))

    # Number of bins for the hue, saturation and value components, in channel order
    bin_counts = (180, 256, 256)

    # np.histogram returns (counts, bin_edges); only the counts are kept
    hue_histogram, saturation_histogram, value_histogram = (
        np.histogram(hsv_pixels[:, :, channel], bins=n_bins, range=(0, 1))[0]
        for channel, n_bins in enumerate(bin_counts)
    )

    return hue_histogram, saturation_histogram, value_histogram

# Compute the HSV histograms for every image in the 'Image' column and store
# them in three new columns, one per component (hue, saturation, value)
metadata_image_df[['hue_histogram', 'saturation_histogram', 'value_histogram']] = metadata_image_df['Image'].apply(calculate_hsv_histogram).apply(pd.Series)

# Display the updated DataFrame
metadata_image_df.head()

### RGB Color Analysis

In [None]:
def calculate_rgb_histogram(image):
    """
    Compute one 256-bin histogram per color channel of an image.

    Channels 0, 1 and 2 of the image array are treated as red, green and
    blue respectively, and each is binned over the range 0 to 255.

    Parameters:
    - image: PIL.Image.Image object representing the input image.

    Returns:
    - red_histogram: NumPy array containing the histogram of red values.
    - green_histogram: NumPy array containing the histogram of green values.
    - blue_histogram: NumPy array containing the histogram of blue values.
    """
    # PIL image -> NumPy array indexed as [row, column, channel]
    pixels = np.array(image)

    # np.histogram returns (counts, bin_edges); only the counts are kept
    red_histogram, green_histogram, blue_histogram = (
        np.histogram(pixels[:, :, channel], bins=256, range=(0, 255))[0]
        for channel in range(3)
    )

    return red_histogram, green_histogram, blue_histogram

# Compute the RGB histograms for every image in the 'Image' column and store
# them in three new columns, one per channel (red, green, blue)
metadata_image_df[['red_histogram', 'green_histogram', 'blue_histogram']] = metadata_image_df['Image'].apply(calculate_rgb_histogram).apply(pd.Series)

# Display the updated DataFrame
metadata_image_df.head()
