# GENERATION OF A SUBSET OF INPUT DATA

In [1]:
#Import libraries
import random
import csv
import os
import io
import cv2
from PIL import Image
import h5py
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline

## 1. Declare file paths

In [2]:
#INPUT FILES
#Directory holding the full data set (must end with a slash, the file names
#below are appended to it directly)
dataPath = "/Users/awieber/Documents/Data Projects/Deep Learning Cancerous Lesions/isic-2024-challenge/"
#Cleaned metadata table (csv)
metaPath = f"{dataPath}cleaned_metadata_VF.csv"
#HDF5 file containing the input images
hdf5_file_in = f"{dataPath}train-image.hdf5"

#OUTPUT FILES
#Directory for saved files (must end with a slash)
#NOTE(review): savePath is defined but the two output paths below are built
#from dataPath - confirm which directory is intended
savePath = "/Users/awieber/Documents/Data Projects/Deep Learning Cancerous Lesions/saves/"
#HDF5 file receiving the sampled images
hdf5_file_out = f"{dataPath}sample-image.hdf5"
#CSV file receiving the metadata of the sampled images
csv_file_out = f"{dataPath}sample-metadata.csv"

## 2. Hyperparameters

In [3]:
#Number of malignant and benign cases to draw for the subset.
#The "Max" notes are the class sizes reported for the full metadata file
#(not checked by this cell).
#NOTE(review): the two names are pluralised differently (n_samples_ vs
#n_sample_); the sampling cell reads them under exactly these names.
n_samples_malignant = 300 # Max = 393
n_sample_benign = 3000 # Max = 400666

## 3. Retrieve a list of all isic_ids for malignant and benign cases

In [4]:
#Import metadata (corresponds to the objects in the HDF5 file)
#Reads the comma-separated file at metaPath into a DataFrame; its "target"
#and "isic_id" columns are used in the cells that follow.
metadata = pd.read_csv(metaPath, sep=",")

In [None]:
#Split the metadata by class: target 1 goes to the malignant table,
#target 0 to the benign table
meta_malign = metadata.loc[metadata["target"].eq(1)]
meta_benign = metadata.loc[metadata["target"].eq(0)]

#Keep only the ids, renumbered from 0 so that they can be picked by
#integer index later on
isic_ids_malign = meta_malign["isic_id"].reset_index(drop=True)
isic_ids_benign = meta_benign["isic_id"].reset_index(drop=True)

#Report the size of each class
print("Number of malignant ids:", isic_ids_malign.size)
print("Number of benign ids:", isic_ids_benign.size)

In [None]:
#Disabled scratch version of the sampling logic: the triple-quoted string
#below is a plain string literal, none of its lines are executed.
#NOTE(review): range(len(isic_ids_malign)-1) leaves the last index out of the
#sampled population - keep that in mind if this snippet is ever re-enabled.
'''
n=66
random.seed(125)
subset_indices = range(len(isic_ids_malign)-1) #Ordered list of the indices in isic_ids
subset_indices = random.sample(subset_indices, n) #Sample of size n taken from the indices
subset = isic_ids_malign[subset_indices] #isic_ids associated to these indices
'''

## 4. Take a random sample of isic_ids from malignant and benign lists

In [7]:
#Generate subset of isic ids to export to hdf5 file
#Seed the random module with a fixed value so that the draws made with it
#in this cell are reproducible
random.seed(125)

#n = number of samples to take
def generate_isic_list(isic_ids, n=66):
    """Return n distinct entries of isic_ids, chosen at random, as a list.

    Parameters:
        isic_ids: pandas Series of ids. The sampled integers are used as
            index labels, so the Series must be indexed 0..len-1 (e.g. after
            reset_index(drop=True)).
        n: number of entries to draw without replacement (default 66).

    Returns:
        A plain Python list with the n selected entries, in draw order.

    Raises:
        ValueError: from random.sample when n is negative or larger than
            len(isic_ids).

    The draw uses the global state of the random module; call random.seed
    beforehand to make it reproducible.
    """
    #The population is every index 0..len-1, so the last entry is eligible
    #too and n may be as large as len(isic_ids)
    subset_indices = random.sample(range(len(isic_ids)), n)
    return isic_ids[subset_indices].tolist() #isic_ids associated to these indices


#Draw the malignant sample first, then the benign one (the order of the two
#calls determines which random numbers each draw receives)
isic_ids_mal_subset = generate_isic_list(isic_ids_malign, n=n_samples_malignant)
isic_ids_ben_subset = generate_isic_list(isic_ids_benign, n=n_sample_benign)
#Single list of sampled ids: malignant ids followed by benign ids
isic_ids_subset = [*isic_ids_mal_subset, *isic_ids_ben_subset]

## 5. Import the random samples from the hdf5 file

In [8]:
#Read the stored value of every sampled id from the initial HDF5 file and
#collect the results as (isic_id, data) tuples, in the order of isic_ids_subset
bytes_imgs = []
with h5py.File(hdf5_file_in, 'r') as h5file:
    bytes_imgs.extend(
        (sample_id, h5file[sample_id][()]) for sample_id in isic_ids_subset
    )

## 6. Export the sampled images to a new hdf5 file

In [9]:
#Export the images one by one: each (isic_id, data) tuple of bytes_imgs
#becomes one dataset named after its isic_id.
#Mode 'w' creates the output file, replacing an existing file of that name.
with h5py.File(hdf5_file_out, 'w') as h5file:
    #Unpack the tuple directly instead of binding a variable called "id",
    #which would hide the builtin id() for the rest of the session
    for sample_id, img in bytes_imgs:
        h5file.create_dataset(sample_id, data=img)

## 7. Export the associated metadata to a new csv file

In [10]:
#Select the metadata rows whose isic_id belongs to the sample
#(rows keep the order they have in metadata)
sampled_rows = metadata["isic_id"].isin(isic_ids_subset)
meta = metadata.loc[sampled_rows]

In [11]:
#Write the sampled metadata rows to csv_file_out; index=False leaves the
#DataFrame index out of the file
meta.to_csv(csv_file_out, index=False)

## 8. Confirm that data was properly copied

In [12]:
#Function to show image
def show_img(image):
    """Display an image with matplotlib, without grid lines.

    Parameters:
        image: image data in a form accepted by plt.imshow (the notebook
            passes a numpy array decoded from the stored image bytes).

    Returns:
        None. The figure is rendered through plt.show().
    """
    plt.imshow(image, interpolation=None)
    #grid(None) toggles the current grid state rather than switching it off,
    #so request "no grid" explicitly
    plt.grid(False)
    plt.show()

In [13]:
#Image chosen for comparison
#First id of the sampled list; the display cells look this id up in the
#input file and in the output file
isic_id = isic_ids_subset[0]

In [None]:
#Display of image using INPUT data file
#The file is opened in a with-block so that its handle is closed when the
#block ends instead of staying open for the rest of the session
with h5py.File(hdf5_file_in, 'r') as f:
    raw = f[isic_id][()] #stored value, read once and reused below
    #Decode the stored bytes into an array while the data is at hand
    image = np.array(
            Image.open(io.BytesIO(raw))
            )
show_img(image)
print("Native form:", raw)

In [None]:
#Display of image using OUTPUT data file
#The file is opened in a with-block so that its handle is closed when the
#block ends instead of staying open for the rest of the session
with h5py.File(hdf5_file_out, 'r') as f:
    raw = f[isic_id][()] #stored value, read once and reused below
    #Decode the stored bytes into an array while the data is at hand
    image = np.array(
            Image.open(io.BytesIO(raw))
            )
show_img(image)
print("Native form:", raw)