In [None]:
import glob
import os
import time
from random import shuffle

import cv2
import h5py
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from PIL import Image
from scipy import ndimage
#from dnn_app_utils_v3 import *

%matplotlib inline
plt.rcParams['figure.figsize'] = (5.0, 4.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

%load_ext autoreload
%autoreload 2

np.random.seed(1)

# This notebook creates an HDF5 file that contains the images together with a 0/1 label for each image
# (1 = malignant, 0 = anything else).
# The labels are derived from the metadata CSV, which is loaded and used to build the "labels" variable.

In [None]:
import tensorflow as tf

In [None]:
# Destination of the HDF5 file this notebook writes.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
hdf5_path = 'C:/Users/Jose/Desktop/ECE-6397 ML Project/Data/dataset.h5'

# When True, the image paths and labels are shuffled before the train/test split.
shuffle_data = True

In [None]:
# Glob pattern selecting every JPEG in the HAM10000 image folder.
# NOTE(review): absolute, machine-specific paths - adjust before running elsewhere.
images_path = 'C:/Users/Jose/Desktop/ECE-6397 ML Project/Data/ISIC-images/ISIC-images/HAM10000/*.jpg'

# Metadata table; later cells read its 'name' and
# 'meta.clinical.benign_malignant' columns to build the labels.
metadata = pd.read_csv(
    filepath_or_buffer='C:/Users/Jose/Desktop/ECE-6397 ML Project/Data/ISIC-images/ISIC-images/metadata.csv'
)

In [None]:
# Collect the image paths. glob returns matches in arbitrary, file-system
# dependent order, so sort them to get the same sequence on every run/machine.
addrs = sorted(glob.glob(images_path))
if not addrs:
    # An empty match almost certainly means a wrong path; stop here instead of
    # silently building an empty dataset further down.
    raise FileNotFoundError('No images matched: {}'.format(images_path))

# Quick look at the metadata and at one raw benign/malignant value.
print(metadata.head())
print(metadata['meta.clinical.benign_malignant'][4])


In [None]:
# Build three parallel lists with one entry per metadata row:
#   image_name        - value of the 'name' column
#   benign_mal_labels - raw value of 'meta.clinical.benign_malignant'
#   labels            - 1 when the raw value is exactly 'malignant', otherwise 0
name_column = metadata['name']
diagnosis_column = metadata['meta.clinical.benign_malignant']

image_name = [name_column[row] for row in metadata.index]
benign_mal_labels = [diagnosis_column[row] for row in metadata.index]
labels = [1 if 'malignant' == diagnosis else 0 for diagnosis in benign_mal_labels]

print(labels)



In [None]:
# Wrap the lists from the previous cell in DataFrames for inspection.
df_image_name = pd.DataFrame(data=image_name)
df_benign_mal_labels = pd.DataFrame(data=benign_mal_labels)
# 0/1 labels, indexed by image name.
df_labels = pd.DataFrame(data=labels, index=image_name)
print(df_labels.head())


In [None]:
# Attach to every image path the label of *that* image, then optionally shuffle.
#
# `addrs` is in glob (file-system) order while the metadata rows are in CSV
# order, so pairing the two positionally can give an image someone else's
# label, and zip() silently drops the surplus when the counts differ.
# Labels are therefore looked up by file name, straight from `metadata`
# (which no cell modifies), so this cell can be re-run safely.
# NOTE(review): assumes metadata['name'] holds the image file name, with or
# without its extension - confirm against the CSV.
label_by_name = {
    name: 1 if diagnosis == 'malignant' else 0
    for name, diagnosis in zip(metadata['name'],
                               metadata['meta.clinical.benign_malignant'])
}


def _label_for(addr):
    """Return the 0/1 label for the image at `addr`, or None if no metadata row matches."""
    file_name = os.path.basename(addr)
    stem = os.path.splitext(file_name)[0]
    if stem in label_by_name:
        return label_by_name[stem]
    return label_by_name.get(file_name)


# Sorting makes the result independent of glob order and of earlier runs of this cell.
addrs = sorted(addrs)
labels = [_label_for(addr) for addr in addrs]

unmatched = [addr for addr, label in zip(addrs, labels) if label is None]
if unmatched:
    raise ValueError('{} image(s) have no metadata row, e.g. {}'.format(len(unmatched), unmatched[0]))

if shuffle_data:
    # Locally seeded generator: the stdlib random.shuffle is not covered by
    # np.random.seed, which made the split differ on every run. Seed 1 mirrors
    # the np.random.seed(1) call in the setup cell.
    order = np.random.RandomState(1).permutation(len(addrs))
    # "addrs" then contains all the shuffled paths and "labels" the matching
    # shuffled labels (both as tuples).
    addrs = tuple(addrs[i] for i in order)
    labels = tuple(labels[i] for i in order)
  

In [None]:
# 80/20 split: the first 80% of the paths/labels form the training set,
# the remainder the test set.
addr_cut = int(0.8 * len(addrs))
label_cut = int(0.8 * len(labels))

train_addrs, test_addrs = addrs[:addr_cut], addrs[addr_cut:]
train_labels, test_labels = labels[:label_cut], labels[label_cut:]

In [None]:
# One 128x128, 3-channel image slot per path in each split.
n_train, n_test = len(train_addrs), len(test_addrs)
train_shape = (n_train, 128, 128, 3)
test_shape = (n_test, 128, 128, 3)

# Open the output file (mode 'w' overwrites an existing file); it stays open
# so that the following cell can write the images into it.
f = h5py.File(hdf5_path, mode='w')

# Image datasets are created here and written later; pixels are stored as uint8.
f.create_dataset("train_img", shape=train_shape, dtype=np.uint8)
f.create_dataset("test_img", shape=test_shape, dtype=np.uint8)

# Label datasets are created and written immediately.
train_label_ds = f.create_dataset("train_labels", shape=(n_train,), dtype=np.uint8)
train_label_ds[...] = train_labels

test_label_ds = f.create_dataset("test_labels", shape=(n_test,), dtype=np.uint8)
test_label_ds[...] = test_labels

In [None]:
# Write the resized images into the "train_img" / "test_img" datasets and
# close the HDF5 file.
#
# The resize target is read from each dataset's own shape, so storing a
# different image size only requires changing the 128s in train_shape /
# test_shape in the cell that creates the datasets.
def _store_images(dataset, paths, tag):
    """Load each image in `paths`, resize it and store it in `dataset` as RGB.

    dataset: HDF5 dataset laid out as (n_images, height, width, 3); paths[i]
             is written to dataset[i].
    paths:   sequence of image file paths.
    tag:     prefix for the progress messages ('Train' or 'Test').

    Raises IOError when OpenCV cannot read a file (cv2.imread returns None
    instead of raising on its own).
    """
    height, width = dataset.shape[1:3]
    total = len(paths)
    for i, addr in enumerate(paths):
        if i % 1000 == 0 and i > 1:
            print('{} data: {}/{}'.format(tag, i, total))

        img = cv2.imread(addr)
        if img is None:
            raise IOError('Could not read image: {}'.format(addr))
        # cv2.resize takes the target size as (width, height).
        img = cv2.resize(img, (width, height), interpolation=cv2.INTER_CUBIC)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # cv2 loads images as BGR, convert to RGB
        dataset[i, ...] = img


try:
    _store_images(f["train_img"], train_addrs, 'Train')
    _store_images(f["test_img"], test_addrs, 'Test')
finally:
    # Always release the file, even if an image fails, so it is not left
    # open/locked and the dataset-creation cell can be re-run.
    f.close()