In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import os
from glob import glob
import matplotlib.pyplot as plt
import sklearn.neighbors
from imblearn.under_sampling import RandomUnderSampler
import imageio
from PIL import Image

In [2]:
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler

In [3]:
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import Dense, Input, Flatten, Dropout, Activation, MaxPooling2D, Conv2D, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import concatenate
import locale
import warnings

get data

In [4]:
base_skin_dir = os.path.join('..','dataset')
imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                 for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}

lesion_type_dict = {
'nv': 'Melanocytic nevi',
'mel': 'Melanoma',
'bkl': 'Benign keratosis-like lesions ',
'bcc': 'Basal cell carcinoma',
'akiec': 'Actinic keratoses',
'vasc': 'Vascular lesions',
'df': 'Dermatofibroma'
}

df = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))

df['path'] = df['image_id'].map(imageid_path_dict.get)
df['cell_type'] = df['dx'].map(lesion_type_dict.get)
df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes

df.dropna(inplace=True)

df['images'] = df['path'].map(lambda x: np.asarray(Image.open(x))).apply(lambda x : x.reshape(810000))
df['images_resized'] = df['path'].map(lambda x: np.asarray(Image.open(x).resize((100,75)))).apply(lambda x : x.reshape(22500))

## fill missing values with mean age
df['age'].fillna((df['age'].mean()), inplace = True)

## drop duplicates
df = df.drop_duplicates(subset=['lesion_id'], keep = 'first')

# convert categorical columns to numeric values
df.localization = df.localization.astype('category')
df.dx_type = df.dx_type.astype('category')
df.sex = df.sex.astype('category')

## isolate nv rows
data_nv = df[df['dx'] == 'nv']

# define scaling parameters
scaling = 1000 / data_nv.shape[0]

# stratified sampling
rus = RandomUnderSampler(sampling_strategy={'lower extremity' : int(1224*scaling),
                                            'trunk' : int(1153*scaling),
                                            'back' : int(1058*scaling),
                                            'abdomen' : int(719*scaling),
                                            'upper extremity' : int(504*scaling) ,
                                            'foot' : int(209*scaling),
                                            'unknown' : int(175*scaling),
                                            'chest' : int(112*scaling),
                                            'face' : int(61*scaling),
                                            'neck' : int(60*scaling),
                                            'genital' : int(43*scaling),
                                            'hand' : int(39*scaling),
                                            'scalp' : int(24*scaling),
                                            'ear' : int(19*scaling),
                                            'acral' : int(3*scaling)+1
                                           },
                           random_state=None,
                           replacement=False,
                        )

## fit strtaified sampling model
n_x, n_y = rus.fit_resample(data_nv, data_nv['localization'])

## delete nv rows from original dataset
no_nv_data = df[df.dx != 'nv']

df = pd.concat([n_x, no_nv_data], axis=0)

df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,path,cell_type,cell_type_idx,images,images_resized
0,HAM_0002054,ISIC_0031685,nv,follow_up,50.0,female,abdomen,..\dataset\HAM10000_images_part_2\ISIC_0031685...,Melanocytic nevi,4,"[253, 190, 198, 255, 189, 199, 254, 192, 197, ...","[254, 193, 198, 254, 194, 200, 254, 195, 203, ..."
1,HAM_0001690,ISIC_0032361,nv,follow_up,50.0,female,abdomen,..\dataset\HAM10000_images_part_2\ISIC_0032361...,Melanocytic nevi,4,"[248, 173, 180, 245, 174, 182, 247, 171, 181, ...","[247, 172, 182, 245, 171, 179, 246, 172, 181, ..."
2,HAM_0005332,ISIC_0027389,nv,follow_up,45.0,female,abdomen,..\dataset\HAM10000_images_part_1\ISIC_0027389...,Melanocytic nevi,4,"[201, 108, 116, 201, 103, 118, 208, 108, 118, ...","[204, 108, 117, 204, 108, 116, 204, 111, 116, ..."
3,HAM_0000078,ISIC_0030853,nv,follow_up,35.0,male,abdomen,..\dataset\HAM10000_images_part_2\ISIC_0030853...,Melanocytic nevi,4,"[222, 138, 162, 223, 139, 163, 225, 141, 165, ...","[222, 140, 162, 227, 143, 165, 229, 145, 168, ..."
4,HAM_0004764,ISIC_0025270,nv,follow_up,60.0,male,abdomen,..\dataset\HAM10000_images_part_1\ISIC_0025270...,Melanocytic nevi,4,"[218, 121, 140, 218, 120, 135, 220, 122, 137, ...","[218, 121, 134, 217, 118, 132, 218, 119, 131, ..."


In [3]:
df.shape

NameError: name 'df' is not defined

In [6]:
def get_data(random_state=1, **kwargs):
  '''
  Import and merge dataframes, pass n_rows arg to pd.read_csv to get a sample dataset
  '''

  base_skin_dir = os.path.join('..','dataset')
  imageid_path_dict = {os.path.splitext(os.path.basename(x))[0]: x
                     for x in glob(os.path.join(base_skin_dir, '*', '*.jpg'))}

  lesion_type_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions ',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
  }

  df = pd.read_csv(os.path.join(base_skin_dir, 'HAM10000_metadata.csv'))

  df['path'] = df['image_id'].map(imageid_path_dict.get)
  df['cell_type'] = df['dx'].map(lesion_type_dict.get)
  df['cell_type_idx'] = pd.Categorical(df['cell_type']).codes

  df.dropna(inplace=True)

  df['images'] = df['path'].map(lambda x: np.asarray(Image.open(x))).apply(lambda x : x.reshape(810000))
  df['images_resized'] = df['path'].map(lambda x: np.asarray(Image.open(x).resize((100,75)))).apply(lambda x : x.reshape(22500))

  return df.head(50)

clean data

In [7]:
def clean_df(df):
  ## fill missing values with mean age
  df['age'].fillna((df['age'].mean()), inplace = True)

  ## drop duplicates
  df = df.drop_duplicates(subset=['lesion_id'], keep = 'first')

  # convert categorical columns to numeric values
  df.localization = df.localization.astype('category')
  df.dx_type = df.dx_type.astype('category')
  df.sex = df.sex.astype('category')

  return df

balance nv

In [8]:
def balance_nv(df, under_sample_size):

        ## isolate nv rows
        data_nv = df[df['dx'] == 'nv']

        # define scaling parameters
        sample_size = under_sample_size
        scaling = under_sample_size / data_nv.shape[0]

        # stratified sampling
        rus = RandomUnderSampler(sampling_strategy={'lower extremity' : int(1224*scaling),
                                                    'trunk' : int(1153*scaling),
                                                    'back' : int(1058*scaling),
                                                    'abdomen' : int(719*scaling),
                                                    'upper extremity' : int(504*scaling) ,
                                                    'foot' : int(209*scaling),
                                                    'unknown' : int(175*scaling),
                                                    'chest' : int(112*scaling),
                                                    'face' : int(61*scaling),
                                                    'neck' : int(60*scaling),
                                                    'genital' : int(43*scaling),
                                                    'hand' : int(39*scaling),
                                                    'scalp' : int(24*scaling),
                                                    'ear' : int(19*scaling),
                                                    'acral' : int(3*scaling)+1
                                                   },
                                   random_state=None,
                                   replacement=False,
                                )

        ## fit strtaified sampling model
        n_x, n_y = rus.fit_resample(data_nv, data_nv['localization'])

        ## delete nv rows from original dataset
        no_nv_data = df[df.dx != 'nv']

        df = pd.concat([n_x, no_nv_data], axis=0)

        return df

data augmentation

In [9]:
def data_augmentation(df, image_size = 'resized'):

    df = df.reset_index(drop=True)
    ## Define random image modifications
    aug = ImageDataGenerator(
    rotation_range=30,
    zoom_range=0.15,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.15,
    horizontal_flip=True,
    fill_mode="nearest")

    if image_size == 'resized':
        target_images = 'images_resized'
        input_size = (75,100,3)
        df = df.drop(['images'], axis =1)
        new_df = df.copy()
        new_df = new_df.drop(['images_resized'], axis=1)


    elif image_size == 'full_size':
        target_images = 'images'
        input_size = (450,600,3)
        df = df.drop(['images_resized'], axis =1)
        new_df = df.copy()
        new_df = new_df.drop(['images'], axis=1)

    ## Create np.array of augmented images from original images dataframe. Reshape to feed into dataGen
    images_array = np.array([i.reshape(input_size) for i in df[target_images].values])
    # import ipdb; ipdb.set_trace()
    print("images_array.shape")
    #construct the actual Python generator, iterate over imagegenerator object
    dataGen = aug.flow(images_array, batch_size = images_array.shape[0])
    for i in dataGen:
        break

    ## flatten i before concatenating it into new dataframe copy
    i = i.reshape(len(df), input_size[0]*input_size[1]*input_size[2])

    ## turn i from array into list so it can be converted into pd
    im_list = []
    for im in i:
        im_list.append(im)

    # convert i into the pandas i_df
    i_df = pd.DataFrame({target_images: im_list})


    ## concatenate new_df numpy array and new augmented image array
    com_new_df = pd.concat((new_df, i_df), axis = 1)
    print(com_new_df)

    ## vertically concatenate new dataframes

    frames = [df, com_new_df]
    df = pd.concat(frames)
    df.reset_index(drop=True, inplace=True)
    return df

In [10]:
class ImageScaler():
  def __init__(self, scaler='normalization',image_size='full_size'):
    self.scaler=scaler
    self.image_size=image_size

  def transform(self, X, y=None):
    if self.scaler=='normalization':
      X['pixels_scaled'] = X.image_size.apply(lambda x: x/255)
    if self.scaler=='standardization':
      scaler = StandardScaler()
      X['pixels_scaled'] = X.image_size.apply(lambda x: (x - x.mean(axis=0))/x.std(axis=0))
    if self.scaler=='centering':
      X['pixels_scaled'] = X.image_size.apply(lambda x: ((x - x.mean(axis=0))-(x - x.mean(axis=0)).min())/((x - x.mean(axis=0)).max()-(x - x.mean(axis=0)).min()))

    return X[['pixels_scaled']]


  def fit(self, X, y=None):
    return self

In [11]:
class Trainer(object):

    def __init__(self, X, y, **kwargs):

        self.pipeline = None
        self.kwargs = kwargs
        self.X = X
        self.y = y
        self.split = self.kwargs.get("split", True)

        # Image dimension attributes
        self.scaler = self.kwargs.get('scaler', 'normalization')
        self.image_size = self.kwargs.get('image_size', 'full_size')
        if self.image_size == 'full_size':
          self.target_images = 'images'
          self.input_shape = (450, 600, 3)
        elif self.image_size == 'resized':
          self.target_images = 'images_resized'
          self.input_shape = (75, 100, 3)
        self.input_dim = len(X)

        # Training attributes
        self.history = None
        self.train_results = None
        self.test_results = None


    def get_estimator(self):
        # get different models as self.model
        if self.estimator=='baseline_model':
            self.model = BaselineModel().merge_compile_models(input_dim=self.input_dim, input_shape=self.input_shape, num_labels=self.num_labels)
        elif self.estimator=='tl_vgg':
            self.model = TLModels().tl_merge_compile_models(input_dim=self.input_dim, input_shape=self.input_shape, selection='vgg16', num_labels=self.num_labels)
        elif self.estimator=='tl_resnet':
            self.model = TLModels().tl_merge_compile_models(input_dim=self.input_dim, input_shape=self.input_shape, selection='resnet', num_labels=self.num_labels)
        elif self.estimator=='tl_densenet':
            self.model = TLModels().tl_merge_compile_models(input_dim=self.input_dim, input_shape=self.input_shape, selection='densenet', num_labels=self.num_labels)


    def set_pipeline(self):

        # Define feature engineering pipeline blocks
        self.ohe = OneHotEncoder(handle_unknown='ignore')
        self.rs = RobustScaler()
        self.imsc = ImageScaler(scaler=self.scaler, image_size=self.image_size)

        pipe_cat_feats = make_pipeline(self.ohe)
        pipe_cont_feats = make_pipeline(self.rs)
        pipe_photo_feats = make_pipeline(self.imsc)

        # Define default feature engineering blocs
        feateng_blocks = [
            ('cat_feats', pipe_cat_feats, ['localization', 'dx_type', 'sex']),
            ('cont_features', pipe_cont_feats, ['age']),
            ('photo_feats', pipe_photo_feats, [self.target_images]),
        ]

        self.features_encoder = ColumnTransformer(feateng_blocks, n_jobs=None, remainder="drop")

        self.pipeline = Pipeline(steps=[
            ('features', self.features_encoder)
            ])


    def add_grid_search(self):
        """"
        Apply Gridsearch on self.params defined in get_estimator - using RegressionHyperModel?
        """
        pass


    #@simple_time_tracker
    def preprocess(self, gridsearch=False, image_type="full_size"):
        """
        Add time tracker - if we want?
        """
        # categorise y
        ohe = OneHotEncoder(handle_unknown='ignore')
        import ipdb; ipdb.set_trace()
        self.num_labels = len(np.unique(self.y.values))
        self.y = ohe.fit_transform(self.y.values.reshape(-1, 1)).toarray()
        print("-----------STATUS UPDATE: Y CATEGORISED'-----------")

        # scale/encode X features (metadata + pixel data) via pipeline
        self.set_pipeline()
        self.X = self.pipeline.fit_transform(self.X)

        # convert self.X to pd.df
        self.col_list = []
        list_arrays = self.features_encoder.transformers_[0][1].named_steps['onehotencoder'].categories_
        for i in list_arrays:
            for col_name in i:
                self.col_list.append(col_name)
        self.col_list.append('age_scaled')
        self.col_list.append('pixels_scaled')

        self.X = pd.DataFrame(self.X, columns=self.col_list)
        print("-----------STATUS UPDATE: PIPELINE FITTED'-----------")

        # create train vs test dataframes
        if self.split:
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y, random_state=1, test_size=0.3)

        self.pixels_to_array(image_type=self.image_size)
        print("-----------STATUS UPDATE: DATA SPLIT INTO X/Y TEST/TRAIN MET/IM'-----------")


    def pixels_to_array(self, image_type="full_size"):
        """
        Convert X_train and X_test into [X_met_train + X_im_train] and [X_met_test + X_im_test] respectively
        """
        self.X_met_train = self.X_train.drop(columns=['pixels_scaled']).astype('float64')
        self.X_met_test = self.X_test.drop(columns=['pixels_scaled']).astype('float64')

        if image_type == "full_size":
            self.X_im_train = np.array([i.reshape(450, 600, 3) for i in self.X_train['pixels_scaled'].values])
            self.X_im_test = np.array([i.reshape(450, 600, 3) for i in self.X_test['pixels_scaled'].values])
        elif image_type == "resized":
            self.X_im_train = np.array([i.reshape(75, 100, 3) for i in self.X_train['pixels_scaled'].values])
            self.X_im_test = np.array([i.reshape(75, 100, 3) for i in self.X_test['pixels_scaled'].values])
        print("-----------STATUS UPDATE: PIXEL ARRAGYS EXTRACTED'-----------")


    #@simple_time_tracker
    def train(self, gridsearch=False, estimator='baseline_model'):
        # assign self.estimator as desired estimator and set self.model via get_estimator()
        self.estimator=estimator
        self.get_estimator()

        # define es criteria and fit model
        es = EarlyStopping(monitor='val_loss', mode='min', patience=25, verbose=1, restore_best_weights=True)
        self.history = self.model.fit(x=[self.X_met_train, self.X_im_train], y=self.y_train,
            validation_split=0.3,
            epochs=200,
            callbacks = [es],
            batch_size=8,
            verbose = 1)


    def evaluate(self):
      ## SEE TRAINING MODEL ACCURACY
      self.train_results = self.model.evaluate(x=[self.X_met_train, self.X_im_train], y=self.y_train, verbose=1)
      print('Train Loss: {} - Train Accuracy: {}'.format(self.train_met_results[0], self.train_met_results[1]))
      # print('Train Loss: {} - Train Accuracy: {} - Train Recall: {} - Train Precision: {}'.format(self.train_met_results[0], self.train_met_results[1], self.train_met_results[2], self.train_met_results[3]))

      ## TEST DATA ACCURACY
      self.test_results = self.model.evaluate([self.X_met_test, self.X_im_test], self.y_test, verbose=0)
      print('Train Loss: {} - Train Accuracy: {}'.format(self.test_met_results[0], self.test_met_results[1]))
      # print('Test Loss: {} - Test Accuracy: {} - Test Recall: {} - Test Precision: {}'.format(self.test_met_results[0], self.test_met_results[1], self.test_met_results[2], self.test_met_results[3]))


    def plot_loss_accuracy(history):

        fig, axs = plt.subplots(2)

        axs[0].plot(history.history['loss'])
        axs[0].plot(history.history['val_loss'])
        plt.title("Model Loss")
        plt.xlabel("Epochs")
        plt.legend(['Train', 'val_test'], loc='best')

        axs[1].plot(history.history['accuracy'])
        axs[1].plot(history.history['val_accuracy'])
        plt.title("Model Accuracy")
        plt.xlabel("Epochs")
        plt.legend(['Train', 'val_test'], loc='best')


    def save_model(self):
        """
        Save the model into a .joblib
        """
        joblib.dump(self.pipeline, 'model.joblib')
        print(colored("model.joblib saved locally", "green"))
        pass


In [12]:
class BaselineModel():

    def create_mlp(self, input_dim):
        """
        Create Multi-Layer Perceptron as left_hand fork of mixed neural network for numeric and categorical explanatory variables
        """
        model = Sequential()
        model.add(Dense(8, input_dim=input_dim, activation="relu"))
        model.add(Dense(4, activation="relu"))
        return model

    def create_cnn(self, input_shape, filters=(16, 32, 64)):
        """
        Create Convolutional Neural Network as right-hand fork of mixed neural network for pixel data
        """
        # initialize the input shape and channel dimension, assuming TensorFlow/channels-last ordering
        chanDim = -1

        # define the model input
        inputs = Input(shape=input_shape)

        # loop over the number of filters
        for (i, f) in enumerate(filters):
            # if this is the first CONV layer then set the input appropriately
            if i == 0:
                x = inputs
            # add aspects of each CONV interation: CONV => RELU => BN => POOL
            x = Conv2D(f, (3, 3), padding="same")(x)
            x = Activation("relu")(x)
            x = BatchNormalization(axis=chanDim)(x)
            x = MaxPooling2D(pool_size=(2, 2))(x)

        # flatten then FC => RELU => BN => DROPOUT
        x = Flatten()(x)
        x = Dense(16)(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=chanDim)(x)
        x = Dropout(0.5)(x)

        # apply another FC layer tto match the number of nodes coming out of the MLP
        x = Dense(4)(x)
        x = Activation("relu")(x)

        # construct the CNN and return model
        model = Model(inputs, x)
        return model


    def merge_compile_models(self, input_dim, input_shape, filters=(16, 32, 64), num_labels=7):
        """
        Join forks of network to combine models for all data types
        """
        # create the MLP and CNN models
        mlp = self.create_mlp(input_dim)
        cnn = self.create_cnn(input_shape)

        # create the input to our final set of layers as the output of both the MLP and CNN
        combinedInput = concatenate([mlp.output, cnn.output])

        # add final FC layer head with 2 dense layers with final layer as the multi-classifier head
        x = Dense(4, activation="relu")(combinedInput)
        x = Dense(num_labels, activation="softmax")(x)

        # yield final model integrating categorical/numerical data and images into single diagnostic prediction
        model = Model(inputs=[mlp.input, cnn.input], outputs=x)

        # compile the model using BCE as loss
        opt = Adam(lr=1e-3, decay=1e-3 / 200)
        model.compile(loss="categorical_crossentropy",
          optimizer=opt,
          metrics=['accuracy'])

        #NB have removed  'precision', 'f1'

        return model

In [13]:
if __name__ == "__main__":

    warnings.simplefilter(action='ignore', category=FutureWarning)

    # Get and clean data
    df = get_data()
    print("-----------STATUS UPDATE: DATA IMPORTED-----------")

    df = clean_df(df)
    print("-----------STATUS UPDATE: DATA CLEANED-----------")

    df = balance_nv(df, 1000)
    print("-----------STATUS UPDATE: DATA BALANCED-----------")

    df = data_augmentation(df, image_size='resized')
    print("-----------STATUS UPDATE: DATA AUGMENTED-----------")

    # Assign X and y and instanciate Trainer Class
    X = df.drop(columns=['dx', 'lesion_id', 'image_id'])
    y = df['dx']
    t = Trainer(X, y, image_size='resized')

    # Preprocess data: transfrom and scale
    print("############  Preprocessing data   ############")
    t.preprocess(image_type=t.image_size)

    # Train model
    print("############  Training model   ############")
    t.train(estimator='baseline_model')

    # Evaluate model on X_test/y_preds vs y_test
    print("############  Evaluating model   ############")
    t.evaluate()

-----------STATUS UPDATE: DATA IMPORTED-----------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


-----------STATUS UPDATE: DATA CLEANED-----------


ZeroDivisionError: division by zero