# EDA

In [2]:
import os
import io
import cv2
from PIL import Image
import h5py
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn import preprocessing 

## 2) IMPORT DATA

### 2.1 - Declare file paths

In [3]:
# Directory locations. Each directory string ends with "/" so that file
# names can be appended to it directly.
projectDir = f"{os.getcwd()}/"
parentDir = f"{os.path.abspath(os.path.join(projectDir, os.pardir))}/"
# The dataset folder sits next to the project folder, inside the parent directory.
dataPath = f"{parentDir}isic-2024-challenge/"

# Metadata CSV (alternative file in the same folder: "sample-metadata.csv")
metaPath = f"{dataPath}train-metadata.csv"

# Image HDF5 archive (alternative file in the same folder: "sample-image.hdf5")
hdf5_file = f"{dataPath}train-image.hdf5"

### 2.2 - Load metadata from csv

In [4]:
# Columns kept for the EDA: image identifier, target label, patient
# attributes, and lesion size / color features.
metaColumns = [
    "isic_id",
    "target",
    "age_approx",
    "anatom_site_general",
    "sex",
    "clin_size_long_diam_mm",
    "tbp_lv_areaMM2",
    "tbp_lv_area_perim_ratio",
    "tbp_lv_eccentricity",
    "tbp_lv_minorAxisMM",
    "tbp_lv_color_std_mean",
    "tbp_lv_deltaLBnorm",
    "tbp_lv_radial_color_std_max",
]

# Import metadata, parsing only the columns we need rather than loading the
# whole CSV and discarding the rest. `usecols` does not preserve the order of
# the list, so re-select with it to keep the column order declared above.
metadata = pd.read_csv(metaPath, sep=",", usecols=metaColumns)[metaColumns]

# Report missing values per column.
# NOTE(review): these columns are expected to contain no NAs -- confirm from
# the printed counts before relying on that downstream.
print("-- X_meta NA counts --")
print(metadata.isna().sum())

FileNotFoundError: [Errno 2] No such file or directory: 'c:\\Users\\marti\\Documents\\Martial/isic-2024-challenge/train-metadata.csv'

## 3) PLOTS

### 3.1 - AGE

In [None]:
# Bucket ages into left-closed intervals so the labels match the bins:
#   [0, 40) -> '0-39', [40, 60) -> '40-59', [60, inf) -> '60+'
# With pd.cut's default (right=True) ages 40 and 60 would fall into the lower
# bucket and age 0 would become NaN; the open-ended last bin keeps '60+'
# from silently dropping ages of 100 or more.
bins = [0, 40, 60, np.inf]
labels = ['0-39', '40-59', '60+']
metadata['age_group'] = pd.cut(metadata['age_approx'], bins=bins, labels=labels, right=False)

# Proportion of each age group within each target class
age_group_freq = metadata.groupby('target')['age_group'].value_counts(normalize=True).unstack()

# Draw on an explicit Axes: DataFrame.plot opens its own figure when no `ax`
# is passed, which would leave a preceding plt.figure(...) blank and ignore
# its figsize.
fig, ax = plt.subplots(figsize=(12, 8))
age_group_freq.plot(kind='bar', stacked=True, color=['#1f77b4', '#ff7f0e', '#2ca02c'], ax=ax)

# Axis labels, title and grid
ax.set_xlabel('Target (0 = Benign, 1 = Malignant)', fontsize=12)
ax.set_ylabel('Proportion', fontsize=12)
ax.set_title("Distribution of Target Variable by Age Group (Approximate Age)", fontsize=16)
ax.grid(True)

# Display plot
plt.show()

- The older age groups ("40-59" and "60+") tend to have more cases. This could indicate that the frequency of cases increases with age.
- The "60+" age group makes a notable contribution to malignant cases, suggesting that this age group may be at higher risk.

### 3.2 - SEX

In [None]:
# Proportion of each sex within each target class
sex_group_freq = metadata.groupby('target')['sex'].value_counts(normalize=True).unstack()

# Draw on an explicit Axes: DataFrame.plot opens its own figure when no `ax`
# is passed, which would leave a preceding plt.figure(...) blank and ignore
# its figsize.
fig, ax = plt.subplots(figsize=(12, 8))
sex_group_freq.plot(kind='bar', stacked=True, color=['#1f77b4', '#ff7f0e'], ax=ax)

# Axis labels, title and grid
ax.set_xlabel('Target (0 = Benign, 1 = Malignant)', fontsize=12)
ax.set_ylabel('Proportion', fontsize=12)
ax.set_title("Distribution of Target Variable by sex", fontsize=16)
ax.grid(True)

# Display plot
plt.show()

Based on the plot, we can say that the difference in the distribution of benign and malignant cases between males and females is not extremely large. While there are more males in both benign and malignant categories, the difference is not substantial. Both sexes show a higher occurrence of benign cases, and the proportion of malignant cases seems similar for both.

### 3.3 - Anatomical Site

In [None]:
# Proportion of each anatomical site within each target class.
# Stored under its own name so the sex table from the previous section is
# not overwritten with site data.
site_group_freq = metadata.groupby('target')['anatom_site_general'].value_counts(normalize=True).unstack()

# Draw on an explicit Axes: DataFrame.plot opens its own figure when no `ax`
# is passed, which would leave a preceding plt.figure(...) blank and ignore
# its figsize.
fig, ax = plt.subplots(figsize=(12, 8))
site_group_freq.plot(
    kind='bar',
    stacked=True,
    color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'],
    ax=ax,
)

# Axis labels, title and grid
ax.set_xlabel('Target (0 = Benign, 1 = Malignant)', fontsize=12)
ax.set_ylabel('Proportion', fontsize=12)
ax.set_title("Distribution of Target Variable by Anatomical Site", fontsize=16)
ax.grid(True)

# Display plot
plt.show()


Posterior torso seems to have a relatively higher number of malignant cases, whereas lower extremity and anterior torso show a more balanced distribution between benign and malignant cases.
Head/neck stands out for having more malignant cases despite the lower overall count, which might suggest a need for focused attention in this region.
Overall, this distribution suggests certain anatomical sites, like the posterior torso and head/neck, may have higher malign