Install new packages: Image from PIL

In [None]:
import pandas as pd
import os
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
import imageio
from PIL import Image

### Data retrieval and exploration

In [None]:
path = '~/code/cecyprice/skin_lesion_detection/dataset/'
metadata = pd.read_csv(path + 'HAM10000_metadata.csv')
dim1_L = pd.read_csv(path + 'hmnist_8_8_L.csv')
dim1_RGB = pd.read_csv(path + 'hmnist_8_8_RGB.csv')
dim2_L = pd.read_csv(path + 'hmnist_28_28_L.csv')
dim2_RGB = pd.read_csv(path + 'hmnist_28_28_RGB.csv')

In [None]:
metadata.head()

### Create dictionary of images and labels

In [None]:
base_skin_dir = os.path.join('..','dataset')
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}

In [5]:
lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

### Merge all datasets

In [6]:
skin_df = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))

In [7]:
skin_df['path'] = skin_df['image_id'].map(imageid_path_dict.get)
skin_df['cell_type'] = skin_df['dx'].map(lesion_type_dict.get) 
skin_df['cell_type_idx'] = pd.Categorical(skin_df['cell_type']).codes

In [8]:
skin_df['path'].dropna(inplace=True)

In [11]:
skin_df['images'] = skin_df['path'].map(lambda x: np.asarray(Image.open(x))).apply(lambda x : x.reshape(810000))

In [17]:
skin_df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,images
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,..\dataset\HAM10000_images_part_1\ISIC_0027419...,Benign keratosis-like lesions,2,"[188, 147, 191, 186, 148, 189, 187, 150, 191, ..."
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,..\dataset\HAM10000_images_part_1\ISIC_0025030...,Benign keratosis-like lesions,2,"[25, 15, 23, 25, 14, 22, 25, 14, 22, 25, 14, 2..."
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,..\dataset\HAM10000_images_part_1\ISIC_0026769...,Benign keratosis-like lesions,2,"[186, 128, 140, 188, 128, 136, 183, 126, 133, ..."
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,..\dataset\HAM10000_images_part_1\ISIC_0025661...,Benign keratosis-like lesions,2,"[24, 9, 16, 22, 11, 15, 23, 11, 15, 26, 11, 16..."
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,..\dataset\HAM10000_images_part_2\ISIC_0031633...,Benign keratosis-like lesions,2,"[122, 80, 102, 124, 82, 104, 127, 83, 106, 130..."


In [16]:
skin_df.images.apply(lambda x: x/255)

MemoryError: Unable to allocate 6.18 MiB for an array with shape (810000,) and data type float64

### Scaling

#### Create train and test split

In [None]:
X = skin_df.drop(columns=['cell_type_idx'],axis=1)
y = skin_df['cell_type_idx']

In [None]:
from sklearn.model_selection import train_test_split
X_train_tmp, X_test_tmp, y_train_tmp, y_test_tmp = train_test_split(X, y, test_size=0.2,random_state=7)

In [None]:
X_train_tmp.head()

In [None]:
np.asarray(X_train_tmp['image_100_75_reshaped'].tolist())

In [None]:
X_train = np.asarray(X_train_tmp['image_100_75_reshaped'].tolist())
X_test = np.asarray(X_test_tmp['image_100_75_reshaped'].tolist())

#### Zero-centering

In [None]:
X_train.shape

In [None]:
X_train_zero = X_train - X_train.mean(axis=0)

In [None]:
X_test_zero = X_test - X_test.mean(axis=0)

In [None]:
X_train_zero = (X_train_zero - X_train_zero.min()) / (X_train_zero.max() - X_train_zero.min())

In [None]:
X_test_zero = (X_test_zero - X_test_zero.min()) / (X_test_zero.max() - X_test_zero.min())

#### Normalization

In [None]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_train)
X_test_norm = scaler.fit_transform(X_test)

#### Standardization

In [None]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_test_std = scaler.fit_transform(X_test)

#### Hot-encoding

#### Split training and validation set

In [None]:
x_train, x_validate, y_train, y_validate = train_test_split(x_train, y_train, test_size = 0.1, random_state = 2)

In [None]:
x_train = x_train.reshape(x_train.shape[0], *(75, 100, 3))
x_test = x_test.reshape(x_test.shape[0], *(75, 100, 3))
x_validate = x_validate.reshape(x_validate.shape[0], *(75, 100, 3))