In [None]:
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import os
import requests
import pandas as pd
from skimage import morphology
from skimage.color import rgb2hsv
from scipy.spatial.distance import cdist
from scipy.stats.stats import mode
from PIL import Image
from collections import Counter
import cv2
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import average_precision_score
import seaborn as sns

import concurrent.futures

In [None]:
#step 1: import the data into the notebook
# All paths are relative to the notebook's working directory.
image_folder = '../data/example_image/'  # later cells join this with '<image_id>.jpg'
segmentation_folder = '../data/example_segmentation/'  # later cells join this with '<image_id>_segmentation.png'
ground_truth = '../data/example_ground_truth.csv'  # read into df_ground_truth
features = '../features/features.csv'  # read into df_size_features

In [None]:
# Bare file names found in the two example folders (no folder prefix).
image_files = os.listdir(image_folder)
segmentation_files = os.listdir(segmentation_folder)

# Label table and the provided feature table, loaded as DataFrames.
df_ground_truth = pd.read_csv(ground_truth)
df_size_features = pd.read_csv(features)

<h4>Adding the extra dataset</h4>
<p>The archives downloaded below must be manually unzipped and placed under the <code>../data</code> folder before the later cells are run.</p>

In [None]:
# Download the ISIC 2017 training images archive into the current working directory.
# The body is streamed to disk in chunks instead of being held in memory through
# r.content, the HTTP status is checked, and the download is skipped when the archive
# is already present so that re-running the notebook does not fetch it again.
url = 'https://isic-challenge-data.s3.amazonaws.com/2017/ISIC-2017_Training_Data.zip'
archive_name = 'ISIC-2017_Training_Data.zip'
if not os.path.exists(archive_name):
    partial_name = archive_name + '.part'
    with requests.get(url, allow_redirects=True, stream=True) as r:
        r.raise_for_status()  # do not save an HTTP error page as a .zip
        with open(partial_name, 'wb') as archive:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                archive.write(chunk)
    # Only a completed download receives the final name.
    os.replace(partial_name, archive_name)

In [None]:
# Download the ISIC 2017 training segmentation masks (Part 1 ground truth) into the
# current working directory. The archive is saved under the same name it has on the
# server: it holds the *training* masks, so it is no longer stored as
# 'ISIC-2017_Validation_Part1_GroundTruth.zip'.
# The body is streamed to disk in chunks, the HTTP status is checked, and the download
# is skipped when the archive is already present.
url = 'https://isic-challenge-data.s3.amazonaws.com/2017/ISIC-2017_Training_Part1_GroundTruth.zip'
archive_name = 'ISIC-2017_Training_Part1_GroundTruth.zip'
if not os.path.exists(archive_name):
    partial_name = archive_name + '.part'
    with requests.get(url, allow_redirects=True, stream=True) as r:
        r.raise_for_status()  # do not save an HTTP error page as a .zip
        with open(partial_name, 'wb') as archive:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                archive.write(chunk)
    # Only a completed download receives the final name.
    os.replace(partial_name, archive_name)

In [None]:
# Folders and tables of the extra ISIC 2017 training set.
extra_image_folder = '../data/ISIC-2017_Training_Data/'
extra_segmentation = '../data/ISIC-2017_Training_Part1_GroundTruth/'
extra_ground_truth = '../data/ISIC-2017_Training_Part3_GroundTruth.csv'
age_and_sex = '../features/ISIC-2017_Training_Data_metadata.csv'

extra_images_files = os.listdir(extra_image_folder)
extra_seg_files = os.listdir(extra_segmentation)
df_extra_gt = pd.read_csv(extra_ground_truth)
df_age_and_sex = pd.read_csv(age_and_sex)

# Index the labels by image id so the boolean masks below select ids directly.
df_extra_gt = df_extra_gt.set_index('image_id')

# One boolean mask per lesion type; "healthy" means neither label is set.
extra_keratosis_mask = df_extra_gt['seborrheic_keratosis'].eq(1.0)
extra_melanoma_mask = df_extra_gt['melanoma'].eq(1.0)
extra_healthy_mask = df_extra_gt['seborrheic_keratosis'].eq(0.0) & df_extra_gt['melanoma'].eq(0.0)

# Image ids per lesion type.
extra_healthy_im = df_extra_gt.index[extra_healthy_mask].tolist()
extra_melanoma_im = df_extra_gt.index[extra_melanoma_mask].tolist()
extra_keratosis_im = df_extra_gt.index[extra_keratosis_mask].tolist()

# Colour image path and segmentation mask path for every id, per lesion type.
ex_keratosis = [os.path.join(extra_image_folder, image_id + '.jpg') for image_id in extra_keratosis_im]
ex_keratosis_seg = [os.path.join(extra_segmentation, image_id + '_segmentation.png') for image_id in extra_keratosis_im]

ex_melanoma = [os.path.join(extra_image_folder, image_id + '.jpg') for image_id in extra_melanoma_im]
ex_melanoma_seg = [os.path.join(extra_segmentation, image_id + '_segmentation.png') for image_id in extra_melanoma_im]

ex_healthy = [os.path.join(extra_image_folder, image_id + '.jpg') for image_id in extra_healthy_im]
ex_healthy_seg = [os.path.join(extra_segmentation, image_id + '_segmentation.png') for image_id in extra_healthy_im]

# (image path, mask path) pairs, per lesion type.
ex_final_keratosis = list(zip(ex_keratosis, ex_keratosis_seg))
ex_final_melanoma = list(zip(ex_melanoma, ex_melanoma_seg))
ex_final_healthy = list(zip(ex_healthy, ex_healthy_seg))

<h1>Functions</h1>

In [None]:
def check_null_values(file, string):
    """Print whether a table contains at least one missing value.

    Parameters
    ----------
    file : object exposing ``isnull().values.any()`` (the notebook passes DataFrames).
    string : label inserted into the printed message.
    """
    has_nulls = file.isnull().values.any()
    template = (
        'There are null values in {} dataset'
        if has_nulls
        else 'There are no null values in {} dataset'
    )
    print(template.format(string))

In [None]:
def crop(image):
    """Crop a 3-D image array to the bounding box of its non-zero elements.

    Parameters
    ----------
    image : array indexed as (row, column, channel).

    Returns
    -------
    The slice of ``image`` spanning the first to the last row and column that hold
    a non-zero element, both ends included.

    Raises
    ------
    ValueError
        If ``image`` has no non-zero element, so no bounding box exists.
    """
    y_nonzero, x_nonzero, _ = np.nonzero(image)
    if y_nonzero.size == 0:
        raise ValueError("cannot crop an image without any non-zero pixel")
    top, bottom = np.min(y_nonzero), np.max(y_nonzero)
    left, right = np.min(x_nonzero), np.max(x_nonzero)
    # Slice ends are exclusive: without the + 1 the last non-zero row and column
    # would be cut off (and a single non-zero pixel would give an empty crop).
    return image[top:bottom + 1, left:right + 1]

In [None]:
def RGB2HEX(color):
    """Format the first three entries of ``color`` as a '#rrggbb' string.

    Each entry is truncated with ``int`` and written as lowercase hex, padded to at
    least two digits.
    """
    red = int(color[0])
    green = int(color[1])
    blue = int(color[2])
    return f"#{red:02x}{green:02x}{blue:02x}"

In [None]:
def color_reader(img,mk):
    """Count how often each distinct colour occurs inside a lesion.

    Parameters
    ----------
    img : path of the colour image, read with ``plt.imread``.
    mk : path of the segmentation mask, read with ``plt.imread``; every pixel where
        the mask equals 0 is blacked out before counting.

    Returns
    -------
    collections.Counter
        Maps '#rrggbb' strings to the number of pixels of that colour in the cropped
        lesion. Keys appear in order of first occurrence (row by row); the entry for
        black, '#000000', is removed.

    Raises
    ------
    TypeError
        If the image is not an 8-bit array (the only case this function supports).
    """
    im = plt.imread(img)
    mask = plt.imread(mk)

    # Black out the background, then crop to the lesion's extremities.
    lesion = im.copy()
    lesion[mask == 0] = 0
    lesion = crop(lesion)
    if lesion.dtype != np.uint8:
        raise TypeError("color_reader expects an 8-bit RGB image, got dtype {}".format(lesion.dtype))

    # Pack every pixel into one 0xRRGGBB integer so the colours can be counted in a
    # single vectorised pass instead of reading each pixel from Python.
    channels = lesion[..., :3].reshape(-1, 3).astype(np.uint32)
    codes = (channels[:, 0] << 16) | (channels[:, 1] << 8) | channels[:, 2]
    values, first_seen, counts = np.unique(codes, return_index=True, return_counts=True)

    # np.unique sorts by value; restore first-occurrence order for the Counter.
    order = np.argsort(first_seen)
    counter_colours = Counter({
        "#{:06x}".format(int(value)): int(count)
        for value, count in zip(values[order], counts[order])
    })

    # Black is the masked-out background. It may be absent when the lesion fills the
    # whole crop, so it is removed without raising KeyError.
    counter_colours.pop('#000000', None)
    return counter_colours

In [None]:
def hsv(img,seg):
    """Saturation spread of a lesion.

    Parameters
    ----------
    img : path of the colour image, read with ``plt.imread``.
    seg : path of the segmentation mask, read with ``plt.imread``; pixels where the
        mask equals 0 are blacked out.

    Returns
    -------
    list
        One element: maximum minus minimum of HSV channel 1 (saturation) over the
        pixels of the cropped lesion that are not pure black.
    """
    im = plt.imread(img)
    mask = plt.imread(seg)
    lesion = im.copy()
    lesion[mask==0] = 0
    lesion = crop(lesion)
    # Keep only pixels with at least one non-zero channel; the result is (n_pixels, 3).
    lesion_pixels = lesion[np.any(lesion != [0, 0, 0], axis=-1)]
    saturation = rgb2hsv(lesion_pixels)[:, 1]
    return [np.amax(saturation) - np.amin(saturation)]

In [None]:
def lightness(img,seg):
    """Lightness spread of a lesion.

    Parameters
    ----------
    img : path of the colour image, read with ``plt.imread``.
    seg : path of the segmentation mask, read with ``plt.imread``; pixels where the
        mask equals 0 are blacked out.

    Returns
    -------
    list
        One element: maximum minus minimum of HSV channel 2 (value) over the pixels
        of the cropped lesion that are not pure black.
    """
    im = plt.imread(img)
    mask = plt.imread(seg)
    lesion = im.copy()
    lesion[mask==0] = 0
    lesion = crop(lesion)
    # Keep only pixels with at least one non-zero channel; the result is (n_pixels, 3).
    lesion_pixels = lesion[np.any(lesion != [0, 0, 0], axis=-1)]
    value = rgb2hsv(lesion_pixels)[:, 2]
    return [np.amax(value) - np.amin(value)]



In [None]:
def asymmetry_level(im):
    """Asymmetry of a segmentation mask, as a percentage of the lesion area.

    The mask is cropped to the bounding box of its non-zero pixels, folded once along
    the vertical centre line and once along the horizontal centre line, and the pixels
    that do not overlap after each fold are counted. For an odd width or height the
    centre column or row takes part in neither half.

    Parameters
    ----------
    im : path of a 2-D segmentation mask, read with ``plt.imread``; lesion pixels
        are those equal to 1.

    Returns
    -------
    The floor of the mean of the two non-overlap counts, divided by the number of
    lesion pixels, times 100.

    Raises
    ------
    ValueError
        If the mask has no non-zero pixel.
    """
    mask = plt.imread(im)

    # Crop so that the borders of the shape are the borders of the image.
    rows, cols = np.nonzero(mask)
    if rows.size == 0:
        raise ValueError("segmentation mask contains no lesion pixels")
    # Slice ends are exclusive, hence the + 1 to keep the last lesion row and column.
    mask = mask[np.min(rows):np.max(rows) + 1, np.min(cols):np.max(cols) + 1]

    # Work on booleans: XOR is then well defined for any mask dtype, instead of
    # XOR-ing the raw bit patterns of the stored numbers.
    lesion = mask == 1
    height, width = lesion.shape
    half_width = width // 2
    half_height = height // 2

    # Left half against the mirrored right half (outermost columns first).
    left = lesion[:, :half_width]
    right_mirrored = lesion[:, width - half_width:][:, ::-1]
    # Top half against the mirrored bottom half (outermost rows first).
    top = lesion[:half_height, :]
    bottom_mirrored = lesion[height - half_height:, :][::-1, :]

    areaVertical = np.sum(left ^ right_mirrored)
    areaHorizontal = np.sum(top ^ bottom_mirrored)
    areaMean = (areaVertical + areaHorizontal) // 2

    # The asymmetry level is the share of non-overlapping pixels over the lesion area.
    return (areaMean / np.sum(lesion)) * 100

In [None]:
def perimeter_area(im):
    """Return ``[area, perimeter]`` of the segmentation mask stored at path ``im``.

    The area is the sum of the mask values. The perimeter is the sum of the mask
    minus its erosion by a radius-1 disk, i.e. the pixels a single erosion removes.
    """
    mask = plt.imread(im).copy()
    eroded = morphology.binary_erosion(mask, morphology.disk(1))
    border = mask - eroded
    return [np.sum(mask), np.sum(border)]

In [None]:
def make_knn_prediction(k,train,classes,test):
    """Fit a k-nearest-neighbours classifier and predict the classes of ``test``.

    ``classes`` is flattened with ``ravel`` before fitting, so a column vector of
    labels is accepted.
    """
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(train, classes.ravel())
    return model.predict(test)

In [None]:
def accuracy_test(k,train,classes,test,classes_test):
    """Plot the KNN test accuracy for k = 1 .. k-1 neighbours and return its mean.

    Parameters
    ----------
    k : exclusive upper bound of the neighbour counts that are tried.
    train, classes : training features and labels, passed to ``make_knn_prediction``.
    test, classes_test : test features and the labels they are scored against.

    Returns
    -------
    The mean of the accuracy scores over all tried neighbour counts.
    """
    k_values = list(range(1, k))
    # One fit and one score per neighbour count; earlier predictions are not re-scored.
    perform1 = [
        accuracy_score(classes_test, make_knn_prediction(i, train, classes, test))
        for i in k_values
    ]
    fig, axes = plt.subplots()
    # Plot against the actual neighbour count so the x axis matches its label.
    axes.plot(k_values, perform1)
    plt.title("Classification Accuracy of KNN for Different Values of k")
    plt.ylabel("Test Accuracy")
    plt.xlabel("Value of k");
    plt.grid(color='grey', linestyle='--', linewidth=0.5)
    return np.mean(perform1)



In [None]:
def scatter_data(x1, x2, y, ax=None):
    """Scatter ``x1`` against ``x2`` with one colour per distinct label in ``y``.

    Parameters
    ----------
    x1, x2 : arrays of feature values, indexable with a boolean mask.
    y : labels, one per point; any values ``np.unique`` can sort.
    ax : axes to draw on. When None, a new 8x8 figure with a grid is created.

    Returns
    -------
    The axes that were drawn on. Each class is labelled 'Class <label>' for a legend.
    """
    # class_index[j] is the position of y[j] within the sorted unique labels.
    class_labels, class_index = np.unique(np.ravel(y), return_inverse=True)
    if ax is None:
        fig = plt.figure(figsize=(8, 8))
        ax = fig.add_subplot(111)
        ax.grid()

    colors = cm.rainbow(np.linspace(0, 1, len(class_labels)))
    for i, c in enumerate(colors):
        # Select by position in class_labels, not by label value: the two only
        # coincide when the labels happen to be 0, 1, 2, ...
        members = class_index == i
        lbl = 'Class ' + str(class_labels[i])
        ax.scatter(x1[members], x2[members], color=c, label=lbl)

    return ax

<h1>Task 0: Explore the data</h1>
<h2>Data Checking and filtering</h2>

<h3>Checking csv files for missing values</h3>

In [None]:
# Report, for each of the two loaded tables, whether it contains any missing value.
check_null_values(df_ground_truth, 'ground_truth')
check_null_values(df_size_features, 'features')

In [None]:
# Report, first for the label table and then for the feature table, whether any row
# is an exact duplicate of an earlier one.
for frame in (df_ground_truth, df_size_features):
    print("Duplicated rows" if frame.duplicated().any() else 'No duplicate rows')


In [None]:
# Index both tables by their image id column.
# NOTE(review): set_index consumes the column, so running this cell a second time on
# the already re-indexed df_ground_truth looks like it would raise KeyError — confirm.
df_ground_truth = df_ground_truth.set_index('image_id')
df_features = df_size_features.set_index('id')

In [None]:
# Full paths of the example images and of their segmentation masks.
# Files are selected by extension instead of dropping whatever os.listdir happens to
# return first, and both listings are sorted so that position n of one list belongs
# to the same image id as position n of the other.
color_img_path_list = [
    os.path.join(image_folder, file_name)
    for file_name in sorted(image_files)
    if file_name.lower().endswith('.jpg')
]
segm_path_list = [
    os.path.join(segmentation_folder, file_name)
    for file_name in sorted(segmentation_files)
    if file_name.lower().endswith('.png')
]

# (image path, mask path) pairs.
both_images = list(zip(color_img_path_list, segm_path_list))

In [None]:
#split images by lesion type
# One boolean mask per lesion type; "healthy" means neither label is set.
keratosis_mask = df_ground_truth['seborrheic_keratosis'].eq(1.0)
melanoma_mask = df_ground_truth['melanoma'].eq(1.0)
healthy_mask = df_ground_truth['seborrheic_keratosis'].eq(0.0) & df_ground_truth['melanoma'].eq(0.0)

# Image ids per lesion type.
keratosis_images = df_ground_truth.index[keratosis_mask].tolist()
melanoma_images = df_ground_truth.index[melanoma_mask].tolist()
healthy_images = df_ground_truth.index[healthy_mask].tolist()

# Colour image path and segmentation mask path for every id, per lesion type.
keratosis = [os.path.join(image_folder, image_id + '.jpg') for image_id in keratosis_images]
keratosis_seg = [os.path.join(segmentation_folder, image_id + '_segmentation.png') for image_id in keratosis_images]

melanoma = [os.path.join(image_folder, image_id + '.jpg') for image_id in melanoma_images]
melanoma_seg = [os.path.join(segmentation_folder, image_id + '_segmentation.png') for image_id in melanoma_images]

healthy = [os.path.join(image_folder, image_id + '.jpg') for image_id in healthy_images]
healthy_seg = [os.path.join(segmentation_folder, image_id + '_segmentation.png') for image_id in healthy_images]

In [None]:
# Pair every colour image path with its segmentation mask path, per lesion type.
keratosis_final = list(zip(keratosis, keratosis_seg))
melanoma_final = list(zip(melanoma, melanoma_seg))
healthy_final = list(zip(healthy, healthy_seg))

In [None]:
# Array shape of every keratosis image of the example set; show the five tallest.
shape_list = [plt.imread(image_path).shape for image_path in keratosis]

df_img_shape = pd.DataFrame(shape_list, columns=['Height', 'Width', 'Channels'])
df_img_shape.sort_values(by='Height', ascending=False).head(5)

In [None]:
# Array shape of every keratosis image of the extra set; show the five tallest.
shape_list = [plt.imread(image_path).shape for image_path in ex_keratosis]

df_img_shape = pd.DataFrame(shape_list, columns=['Height', 'Width', 'Channels'])
df_img_shape.sort_values(by='Height', ascending=False).head(5)

<h1>Task 2: Feature Analysis</h1>

<h3>1.1 Color Analysis</h3>

In [None]:
# Load the second keratosis image and its mask; the next cells walk through the
# feature computations on this one sample.
im = plt.imread(keratosis[1])
mask = plt.imread(keratosis_seg[1])



In [None]:
# Display the segmentation mask of the sample image.
plt.imshow(mask)

In [None]:
# Area: sum of the mask values.
# Perimeter: sum of the mask minus its erosion by a radius-1 disk, i.e. the pixels
# that a single erosion removes.
area = np.sum(mask)
struct_el = morphology.disk(1)
mask_eroded = morphology.binary_erosion(mask, struct_el)
image_perimeter = mask - mask_eroded
perimeter = np.sum(image_perimeter)
plt.imshow(image_perimeter)
print('The area is: ', area, '\nThe perimeter is: ', perimeter)

In [None]:
# [area, perimeter] of every segmentation mask of the extra dataset, per lesion type.
ex_k_area_per = [perimeter_area(i) for i in ex_keratosis_seg]
ex_m_area_per = [perimeter_area(i) for i in ex_melanoma_seg]
ex_h_area_per = [perimeter_area(i) for i in ex_healthy_seg]

In [None]:
# Split the [area, perimeter] pairs into separate lists, per lesion type.
ex_k_area = [pair[0] for pair in ex_k_area_per]
ex_m_area = [pair[0] for pair in ex_m_area_per]
ex_h_area = [pair[0] for pair in ex_h_area_per]

ex_k_per = [pair[1] for pair in ex_k_area_per]
ex_m_per = [pair[1] for pair in ex_m_area_per]
ex_h_per = [pair[1] for pair in ex_h_area_per]

# Concatenate in keratosis, melanoma, healthy order.
ex_all_area = np.array([*ex_k_area, *ex_m_area, *ex_h_area])
ex_all_per = np.array([*ex_k_per, *ex_m_per, *ex_h_per])

In [None]:
# Black out every pixel outside the lesion, then crop to the lesion's bounding box.
im1 = im.copy()
im1[mask==0] = 0
new_arr_no_0 = im1[np.where(im1!=0)]  # NOTE(review): not referenced by any later cell shown here

im1 = crop(im1)
plt.imshow(im1)

In [None]:
# Build one (x, y) pair per pixel of the cropped image:
#  - np.where gives a (row, column, channel) triple for every element with im1 >= 0,
#  - flipping each triple turns it into (channel, column, row),
#  - the channel column is deleted, leaving (column, row) = (x, y),
#  - every third row is kept so each pixel appears once instead of once per channel
#    (assumes three channels — TODO confirm for all inputs).
xy_cords = np.flip(np.column_stack(np.where(im1 >= 0)), axis=1)
a_del = np.delete(xy_cords, 0, 1)
a_del = a_del[::3][:,[0,1]]

print(a_del)

In [None]:
# Read the RGB value of every pixel of the cropped sample, convert it to a hex code
# and count how often each code occurs.
image = Image.fromarray(im1)
rgb_image = image.convert('RGB')

rgb1 = [rgb_image.getpixel((int(x), int(y))) for x, y in a_del]

hex_codes = [RGB2HEX(i) for i in rgb1]
counted_colors = Counter(hex_codes)
# Drop the blacked-out background and show how many pixels it covered. The default
# keeps the cell from raising KeyError when the crop holds no black pixel.
counted_colors.pop('#000000', 0)

Getting the final colors for each category.
WARNING! This code block will take a while to run! I recommend putting your laptop in performance mode and letting it run for somewhere between 40 and 80 minutes.

In [None]:
# One colour Counter per image of the example set, per lesion type.
final_k = [color_reader(i[0], i[1]) for i in keratosis_final]
final_m = [color_reader(i[0], i[1]) for i in melanoma_final]
final_h = [color_reader(i[0], i[1]) for i in healthy_final]

In [None]:
# Number of distinct colours per image (the length of each Counter).
k_len = [len(i) for i in final_k]
h_len = [len(i) for i in final_h]
m_len = [len(i) for i in final_m]

In [None]:
# Preview the ten most frequent colours of the first keratosis image. Displaying
# final_k itself prints every colour of every image and floods the notebook output.
final_k[0].most_common(10) if final_k else final_k

In [None]:
# One colour Counter per image of the extra dataset, per lesion type.
extra_final_k = [color_reader(i[0], i[1]) for i in ex_final_keratosis]
extra_final_m = [color_reader(i[0], i[1]) for i in ex_final_melanoma]
extra_final_h = [color_reader(i[0], i[1]) for i in ex_final_healthy]

In [None]:
# Number of distinct colours per image of the extra dataset.
ex_k_len = [len(i) for i in extra_final_k]
ex_m_len = [len(i) for i in extra_final_m]
ex_h_len = [len(i) for i in extra_final_h]

<h3>Color Analysis: Saturation</h3>

In [None]:
# Boolean mask over the cropped sample: True where at least one channel is non-zero.
image_copy = im1.copy()
non_black_pixels_mask = np.any(im1 != [0, 0, 0], axis=-1)

In [None]:
# Convert the non-black pixels to HSV and take the extremes of channel 1 (saturation).
hsv_pic = rgb2hsv(image_copy[non_black_pixels_mask])
maximum = np.max(hsv_pic[:,1])
minimum = np.min(hsv_pic[:,1])

In [None]:
# Saturation spread (one-element list) of every image of the example set, per lesion type.
k_hsv = [hsv(i[0],i[1]) for i in keratosis_final]
m_hsv = [hsv(i[0],i[1]) for i in melanoma_final]
h_hsv = [hsv(i[0],i[1]) for i in healthy_final]

In [None]:
# Saturation spread (one-element list) of every image of the extra dataset, per lesion type.
ex_k_hsv = [hsv(i[0],i[1]) for i in ex_final_keratosis]
ex_m_hsv = [hsv(i[0],i[1]) for i in ex_final_melanoma]
ex_h_hsv = [hsv(i[0],i[1]) for i in ex_final_healthy]

<h3>Symmetry analysis</h3>

In [None]:
# Load the sample mask and crop it to the bounding box of its non-zero pixels.
im_sym = plt.imread(keratosis_seg[1])
y_nonzero, x_nonzero = np.nonzero(im_sym)
# Slice ends are exclusive, hence the + 1 to keep the last lesion row and column.
im_sym = im_sym[np.min(y_nonzero):np.max(y_nonzero) + 1, np.min(x_nonzero):np.max(x_nonzero) + 1]

In [None]:
# Display the cropped sample mask.
plt.imshow(im_sym)

In [None]:
# Column and row at which the cropped mask is cut in half (integer division).
height, width = im_sym.shape
width_slice = width//2
height_slice = height//2

In [None]:
# Show the two cut positions.
print(width_slice, height_slice)

In [None]:
# Halves on either side of the vertical cut ...
verticalLeft = im_sym[:,:width_slice]
verticalRight = im_sym[:, width_slice:]

# ... and on either side of the horizontal cut.
horizontalUp = im_sym[:height_slice, :]
horizontalDown = im_sym[height_slice:, :]

plt.imshow(verticalLeft)

In [None]:
# Mirror the right half left-to-right and the bottom half top-to-bottom so that each
# can be laid over its counterpart.
verticalRight = verticalRight[:, ::-1]
horizontalDown = horizontalDown[::-1, :]


In [None]:
# Trim the mirrored halves to the shape of their counterparts; they are one column
# or row larger when the cropped width or height is odd.
verticalRight = verticalRight[0:verticalLeft.shape[0], 0:verticalLeft.shape[1]]
horizontalDown = horizontalDown[0:horizontalUp.shape[0], 0:horizontalUp.shape[1]]

In [None]:
# XOR each half with its mirrored counterpart and show both results side by side.
img_bwxVertical = cv2.bitwise_xor(verticalLeft,verticalRight)
img_bwxHorizontal = cv2.bitwise_xor(horizontalUp,horizontalDown)
fig, axes = plt.subplots(1, 2, figsize =(10,10))
axes[0].imshow(img_bwxVertical, cmap='gray')
axes[1].imshow(img_bwxHorizontal, cmap='gray')

In [None]:
# Count the pixels equal to 1 in each XOR image and take the floored mean of the two counts.
areaVertical = np.sum(img_bwxVertical == 1)
areaHorizontal = np.sum(img_bwxHorizontal == 1)
areaMean = (areaVertical + areaHorizontal) // 2

The asymmetry level (AS) is calculated as a percentage of the non-zero pixels in the overlapped image over the lesion area

In [None]:
# Mean non-overlap count as a percentage of the pixels equal to 1 in the cropped mask.
print(areaMean / np.sum(im_sym == 1) *100)

Preparing k_sym, m_sym and h_sym for the prediction algorithm

k_sym - is the asymmetry level of every segmentation picture in the keratosis segmentation dataset

m_sym - is the asymmetry level of every segmentation picture in the melanoma segmentation dataset

h_sym - is the asymmetry level of every segmentation picture in the healthy segmentation dataset


In [None]:
# Asymmetry level of every segmentation mask of the example set, per lesion type.
k_sym = [asymmetry_level(i) for i in keratosis_seg]
m_sym = [asymmetry_level(i) for i in melanoma_seg]
h_sym = [asymmetry_level(i) for i in healthy_seg]

In [None]:
# Asymmetry level of every segmentation mask of the extra dataset, per lesion type.
ex_k_sym = [asymmetry_level(i) for i in ex_keratosis_seg]
ex_m_sym = [asymmetry_level(i) for i in ex_melanoma_seg]
ex_h_sym = [asymmetry_level(i) for i in ex_healthy_seg]

<h2>Adding every feature to a DataFrame</h2>

In [None]:
# Build the feature table of the original dataset: one row per image, in keratosis,
# melanoma, healthy order, and write it to ../features/final_features/.
os.makedirs("../features/final_features", exist_ok=True)  # to_csv does not create missing folders

# Number of colours and the full colour Counter per image.
all_len = np.array(k_len + m_len + h_len)
all_image = np.array(keratosis_images + melanoma_images + healthy_images)
all_color = np.array(final_k + final_m + final_h)
df_color_counter = pd.DataFrame({"image_id": all_image, "Nb_of_Colors": all_len, "Color_Counter":all_color})
df_color_counter.to_csv("../features/final_features/features_all_colors.csv", index=False)

# Saturation spread; every entry is a one-element list, hence the reshape.
all_hsv = np.array(k_hsv + m_hsv + h_hsv)
df_saturation = pd.DataFrame({"image_id": all_image, "Sat_Value": all_hsv.reshape(-1)})
df_features = pd.merge(df_color_counter, df_saturation, on="image_id", how="left")

# Asymmetry level.
all_asym = np.array(k_sym + m_sym + h_sym)
df_asym = pd.DataFrame({"image_id": all_image, "Asymmetry": all_asym})
df_features = df_features.merge(df_asym, on="image_id", how="left")

# Provided features and the ground-truth labels.
df_size_features = df_size_features.rename(columns={"id":"image_id"})
df_features = df_features.merge(df_size_features, on="image_id", how="left")
df_features = pd.merge(df_features, df_ground_truth, on="image_id", how="left")

# Encode the diagnosis as a number and as a name.
conditions = [
    (df_features["melanoma"] == 0) & (df_features["seborrheic_keratosis"] == 0),
    (df_features["melanoma"] == 1) & (df_features["seborrheic_keratosis"] == 0),
    (df_features["melanoma"] == 0) & (df_features["seborrheic_keratosis"] == 1),
]
values_nb = [0,1,2]
values_name = ["healthy", "melanoma", "keratosis"]
# Explicit defaults: a row matching no condition is marked -1 / "unknown" instead of
# silently becoming class 0 (healthy), and the string default keeps np.select from
# having to combine string choices with its integer default of 0.
df_features["Lesion_Type"] = np.select(conditions, values_nb, default=-1)
df_features["Lesion_Name"] = np.select(conditions, values_name, default="unknown")

# The per-image Counters are kept only in features_all_colors.csv.
df_features = df_features.drop(columns="Color_Counter")


df_features.to_csv("../features/final_features/features_original_image_dataset.csv", index=False)

In [None]:
# Build the feature table of the extra dataset: one row per image, in keratosis,
# melanoma, healthy order, and write it to ../features/extra_final_features/.
os.makedirs("../features/extra_final_features", exist_ok=True)  # to_csv does not create missing folders

# Number of colours and the full colour Counter per image.
ex_all_len = np.array(ex_k_len + ex_m_len + ex_h_len)
ex_all_image = np.array(extra_keratosis_im + extra_melanoma_im + extra_healthy_im)
ex_all_color = np.array(extra_final_k + extra_final_m + extra_final_h)
df_color_counter_extra = pd.DataFrame({"image_id": ex_all_image, "Nb_of_Colors": ex_all_len, "Color_Counter":ex_all_color})
df_color_counter_extra.to_csv("../features/extra_final_features/extra_features_all_colors.csv", index=False)

# Saturation spread; every entry is a one-element list, hence the reshape.
ex_all_hsv = np.array(ex_k_hsv + ex_m_hsv + ex_h_hsv)
df_saturation_extra = pd.DataFrame({"image_id": ex_all_image, "Sat_Value": ex_all_hsv.reshape(-1)})
df_features_extra = pd.merge(df_color_counter_extra, df_saturation_extra, on="image_id", how="left")

# Asymmetry level.
ex_all_asym = np.array(ex_k_sym + ex_m_sym + ex_h_sym)
df_asym_extra = pd.DataFrame({"image_id": ex_all_image, "Asymmetry": ex_all_asym})
df_features_extra = df_features_extra.merge(df_asym_extra, on="image_id", how="left")

# Area and perimeter computed from the masks, and the ground-truth labels.
df_size_features_extra = pd.DataFrame({"image_id": ex_all_image, "area":ex_all_area, "perimeter": ex_all_per })
df_features_extra = df_features_extra.merge(df_size_features_extra, on="image_id", how="left")
df_features_extra = pd.merge(df_features_extra, df_extra_gt, on="image_id", how="left")

# Encode the diagnosis as a number and as a name.
conditions = [
    (df_features_extra["melanoma"] == 0) & (df_features_extra["seborrheic_keratosis"] == 0),
    (df_features_extra["melanoma"] == 1) & (df_features_extra["seborrheic_keratosis"] == 0),
    (df_features_extra["melanoma"] == 0) & (df_features_extra["seborrheic_keratosis"] == 1),
]
values_nb = [0,1,2]
values_name = ["healthy", "melanoma", "keratosis"]
# Explicit defaults: a row matching no condition is marked -1 / "unknown" instead of
# silently becoming class 0 (healthy), and the string default keeps np.select from
# having to combine string choices with its integer default of 0.
df_features_extra["Lesion_Type"] = np.select(conditions, values_nb, default=-1)
df_features_extra["Lesion_Name"] = np.select(conditions, values_name, default="unknown")

# The per-image Counters are kept only in extra_features_all_colors.csv.
df_features_extra = df_features_extra.drop(columns="Color_Counter")


df_features_extra.to_csv("../features/extra_final_features/features_extra_image_dataset.csv", index=False)

<h3>Read the imported DataFrame </h3>
<p>The DataFrame generated in the cells above has also been placed on GitHub. To save a lot of time, skip the DataFrame generation and just read this file.</p>

<p>Files: <br> ../features/final_features/features_original_image_dataset.csv - contains all the features for the 150 images <br><br> ../features/final_features/features_all_colors.csv - contains all the colors and the total number of colors for each image
</p>

In [None]:
# Reload the feature table of the original dataset from the CSV written above.
df_features = pd.read_csv("../features/final_features/features_original_image_dataset.csv")

In [None]:
# Display the reloaded feature table.
df_features

In [None]:
# Reload the feature table of the extra dataset from the CSV written above.
df_features_extra = pd.read_csv("../features/extra_final_features/features_extra_image_dataset.csv")

<h3>Visualization</h3>

<h5>Visualization of the data of the original 150 photos</h5>

In [None]:
# Area against perimeter for the original dataset, one colour per Lesion_Type value.
features_area = np.array(df_features['area'])
features_perimeter = np.array(df_features['perimeter'])
features_lesion_type = np.array(df_features['Lesion_Type'])
axs = scatter_data(features_area, features_perimeter, features_lesion_type)
axs.set_title(" Scatter Plot: Area - Perimeter")
axs.set_xlabel('X1 = Area')
axs.set_ylabel('X2 = Perimeter')
axs.legend()

In [None]:
# Pair plot of area and perimeter, coloured by lesion name.
df_aux = df_features[["area", "perimeter", "Lesion_Name"]]
sns.pairplot(df_aux, height=4, hue='Lesion_Name')

In [None]:
# Saturation spread (x axis) against number of colours (y axis) for the original
# dataset, one colour per Lesion_Type value.
features_color = np.array(df_features['Nb_of_Colors'])
features_saturation = np.array(df_features['Sat_Value'])
features_lesion_type = np.array(df_features['Lesion_Type'])
axs = scatter_data(features_saturation,features_color,  features_lesion_type)
axs.set_title(" Scatter Plot: Nb. of Colors per pixel - Image Saturation")
axs.set_xlabel('X2 = Image Saturation')
axs.set_ylabel('X1 = Nb. of Colors')
axs.legend()

In [None]:
# Pair plot of saturation spread and number of colours, coloured by lesion name.
df_aux1 = df_features[["Sat_Value", "Nb_of_Colors",  "Lesion_Name"]]
sns.pairplot(df_aux1, height=4, hue='Lesion_Name')

<h5> Visualization of the extra data (2000 photos) </h5>

In [None]:
# Same pair plot for the extra dataset. Rows are sorted by Lesion_Type and that
# column is dropped again before plotting.
df_aux2 = df_features_extra[["Sat_Value", "Nb_of_Colors", "Lesion_Type",  "Lesion_Name"]]
sns.pairplot(df_aux2.sort_values(by=["Lesion_Type"], ascending=True).drop(columns=["Lesion_Type"]), height=4, hue='Lesion_Name')

<h3>Diagnosis prediction</h3>

In [None]:
# Earlier version, kept for reference: the labels were rebuilt from the per-class
# path lists instead of being read from the feature table.
# classifier_healthy = np.array([0 for i in range(len(ex_healthy))])
# classifier_melanoma = np.array([1 for i in range(len(ex_melanoma))])
# classifier_keratosis = np.array([2 for i in range(len(ex_keratosis))])



# clas = np.array([x for y in [classifier_keratosis, classifier_melanoma, classifier_healthy] for x in y])
# which flattens the three arrays in the same way as:
# for y in [classifier_keratosis, classifier_melanoma, classifier_healthy]:
#   for x in y:
#       yield x

# Class label of every image of the extra dataset, taken from the feature table.
clas = df_features_extra["Lesion_Type"].to_numpy()

<h4>1.Color diagnosis</h4>

In [None]:
#color_data = np.array(ex_k_len + ex_m_len + ex_h_len)
# Single feature: number of distinct colours per image.
color_data = df_features_extra["Nb_of_Colors"].to_numpy()

# test_size=0.6 puts 60 % of the rows in the test set; random_state fixes the shuffle.
color_train, color_test, clas_train, clas_test = train_test_split(color_data,clas,test_size=0.6, random_state=4)

# Turn the 1-D arrays into single-column 2-D arrays.
color_train = color_train.reshape(-1, 1)
clas_train = clas_train.reshape(-1, 1)
color_test = color_test.reshape(-1, 1)

In [None]:
# Predictions of a 7-neighbour classifier, followed by the true test labels.
print(make_knn_prediction(7, color_train, clas_train, color_test))
print(clas_test)

In [None]:
# Accuracy curve for 1 to 379 neighbours on the colour-count feature; the cell shows the mean accuracy.
accuracy_test(380,color_train,clas_train,color_test,clas_test)

<h4>2.Saturation</h4>

In [None]:
#sat_data = np.array([x for y in [ex_k_hsv, ex_m_hsv, ex_h_hsv] for x in y])
sat_data = df_features_extra["Sat_Value"].to_numpy()

sat_train, sat_test, clas_train, clas_test = train_test_split(sat_data, clas, test_size=0.6, random_state = 4)

sat_train = sat_train.reshape(-1, 1)
clas_train = clas_train.reshape(-1, 1)
sat_test = sat_test.reshape(-1, 1)

In [None]:
# Predictions of a 7-neighbour classifier, followed by the true test labels.
print(make_knn_prediction(7,sat_train,clas_train,sat_test))
print(clas_test)

In [None]:
# Accuracy curve for 1 to 149 neighbours on the saturation feature; the cell shows the mean accuracy.
accuracy_test(150,sat_train,clas_train,sat_test,clas_test)

<h4>3.Asymmetry</h4>

In [None]:
#asym_data = np.array([x for y in [ex_k_sym, ex_m_sym, ex_h_sym] for x in y])
asym_data = df_features_extra["Asymmetry"].to_numpy()

asym_train, asym_test, clas_train, clas_test = train_test_split(asym_data, clas, test_size=0.6, random_state=4)

asym_train = asym_train.reshape(-1, 1)
clas_train = clas_train.reshape(-1, 1)
asym_test = asym_test.reshape(-1, 1)

In [None]:
# Predictions of a 7-neighbour classifier, followed by the true test labels.
print(make_knn_prediction(7,asym_train,clas_train,asym_test))
print(clas_test)

In [None]:
# Accuracy curve for 1 to 799 neighbours on the asymmetry feature; the cell shows the mean accuracy.
accuracy_test(800,asym_train,clas_train,asym_test,clas_test)

Single image prediction: see more under the scripts folder

In [None]:
# Rows of the extra dataset whose Lesion_Type is not 2 (the keratosis code).
df_features_extra[df_features_extra["Lesion_Type"]!=2]

In [None]:
# Healthy-versus-melanoma problem on all five features: keratosis rows (Lesion_Type 2)
# are left out. Columns are selected by name instead of by position, so the selection
# does not depend on the column order of the CSV.
feature_columns = ["Nb_of_Colors", "Sat_Value", "Asymmetry", "area", "perimeter"]
non_keratosis = df_features_extra[df_features_extra["Lesion_Type"] != 2]

X = non_keratosis[feature_columns].values
y = non_keratosis["Lesion_Type"].values

# random_state makes the 80/20 split, and therefore every score below, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=4)

In [None]:
from sklearn.preprocessing import Normalizer

# Fit the transformer on the training features only, then apply it to both sets.
# NOTE(review): scikit-learn's Normalizer rescales each sample (row) to unit norm; it
# does not put the features on a common scale — confirm this is intended rather than a
# per-feature scaler.
scaler = Normalizer()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# 7-neighbour classifier trained on the transformed training features.
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(X_train, y_train)

In [None]:
# Predicted class of every row of the transformed test set.
y_pred = classifier.predict(X_test)

In [None]:
# Predict the class of one image together with the probability of that class.
result = {}
feature_columns = ["Nb_of_Colors", "Sat_Value", "Asymmetry", "area", "perimeter"]
image_row = df_features_extra.loc[df_features_extra["image_id"] == "ISIC_0012261", feature_columns]
if image_row.empty:
    raise ValueError("image ISIC_0012261 is not present in df_features_extra")

# The classifier was trained on features passed through `scaler`, so the features of
# the image must go through the same transform before they are classified.
image_features = scaler.transform(image_row.values)
img_pred = classifier.predict(image_features)
score = classifier.predict_proba(image_features)

# predict_proba has one column per entry of classes_; pick the predicted class's column.
model_classes = classifier.classes_
index = np.where(model_classes == img_pred[0])[0][0]
result["predicted"] = img_pred[0]
result["score"] = score[0][index]
result

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Per-class metrics and the confusion matrix of the test-set predictions.
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test,y_pred))

<h3> Open Question </h3>

<p>
Our open question concerns two candidate features: the saturation variation (the difference between the most saturated and the least saturated colours in the lesion) and the lightness variation (the difference between the brightest and the darkest colours in the lesion). The question is which of the two is the better feature for predicting the diagnosis of a skin lesion.
</p>

In [None]:
# Lightness spread (one-element list) of every image of the extra dataset, per lesion type.
ex_k_light = [lightness(i[0],i[1]) for i in ex_final_keratosis]
ex_m_light = [lightness(i[0],i[1]) for i in ex_final_melanoma]
ex_h_light = [lightness(i[0],i[1]) for i in ex_final_healthy]

In [None]:
# Concatenate in keratosis, melanoma, healthy order, the same order as ex_all_image.
light_data = np.array([x for y in [ex_k_light, ex_m_light, ex_h_light] for x in y])

# One row per image; flatten removes the one-element inner lists.
df_light_extra = pd.DataFrame({"image_id": ex_all_image, "Lightness": light_data.flatten()})
df_light_extra.to_csv("../features/extra_final_features/extra_lightness_data.csv", index=False)

In [None]:
# Reload the lightness table from the CSV written above and keep the feature as an array.
df_light_extra = pd.read_csv("../features/extra_final_features/extra_lightness_data.csv")

light_data = df_light_extra["Lightness"].to_numpy()

In [None]:
# Display the lightness values.
light_data

In [None]:
# Same test_size and random_state as the saturation split. This reassigns clas_train and clas_test.
light_train, light_test, clas_train, clas_test = train_test_split(light_data, clas, test_size=0.6, random_state=4)

In [None]:
# Turn the 1-D arrays into single-column 2-D arrays.
light_train = light_train.reshape(-1, 1)
clas_train = clas_train.reshape(-1, 1)
light_test = light_test.reshape(-1, 1)

In [None]:
# Predictions of a 7-neighbour classifier, followed by the true test labels.
print(make_knn_prediction(7, light_train, clas_train, light_test))
print(clas_test)

In [None]:
# Accuracy curve for 1 to 149 neighbours on the lightness feature; the cell shows the mean accuracy.
accuracy_test(150, light_train, clas_train, light_test, clas_test)

In [None]:
# Same curve for the saturation feature, for comparison.
# NOTE(review): clas_train / clas_test now come from the lightness split; it uses the
# same labels, test_size and random_state as the saturation split, so the rows should
# still line up with sat_train / sat_test — confirm.
accuracy_test(150,sat_train,clas_train,sat_test,clas_test)

Now we compare the mean accuracy scores of the two features

In [None]:
# Mean lightness accuracy minus mean saturation accuracy; a positive value favours lightness.
# Both calls refit every classifier and draw a new figure each.
print(accuracy_test(150, light_train, clas_train, light_test, clas_test) - accuracy_test(150,sat_train,clas_train,sat_test,clas_test))

From this analysis we can conclude (if there are no flaws in our implementation) that the lightness variation is slightly more accurate than the saturation variation. This conclusion, however, raises other questions:
Is the very small difference significant? Can it affect the results of the model in any way, considering the large number of input photos?

