In [1]:
import cv2, numpy as np, pandas as pd, os, shutil, pickle as pkl
import matplotlib.pyplot as plt
from glob import glob

In [2]:
from sklearn.metrics import roc_auc_score, mean_squared_error as mse, roc_curve

In [3]:
from sklearn.utils import shuffle

In [4]:
# !pip3 install keras==2.3.1

In [5]:
from keras.models import Model, load_model
from keras.applications.mobilenet_v2 import MobileNetV2
try: from keras.callbacks.callbacks import ModelCheckpoint
except:  from keras.callbacks import ModelCheckpoint
from keras.layers import Input, Dense, Conv2D, MaxPooling2D, UpSampling2D, LeakyReLU, BatchNormalization, Conv1D
from keras import backend as K
from keras.optimizers import Adam
from keras.layers import Lambda

Using TensorFlow backend.


In [6]:
try:
    # to run on colab
    from google.colab import drive
    def mount_drive():
        drive.mount('/content/drive/')
    mount_drive()
    print("Runnning on GOOGLE COLAB")
    isCloud = True
except:
    print("Runnning on LOCAL SYSTEM")
    isCloud = False

Runnning on LOCAL SYSTEM


In [7]:
class HAM_Data:
    def __init__(self, images_path = "data/HAM10000_images/", data_info = "data/HAM10000_metadata.csv", split_files_path = "data_files/", resolution = (128, 128), toNormalize = True):
        self.extra = '/content/drive/My Drive/' if isCloud else  ""
        self.images_path = self.extra + images_path
        self.total_data_info = pd.read_csv(self.extra + data_info)
        images_labels = self.total_data_info[['image_id', 'dx']]
        self.labels_dict = dict([(label, idx) for idx, label in enumerate(np.unique(images_labels['dx']))])
        self.n_classes = len(self.labels_dict)
        self.data_info = self.total_data_info.where(self.total_data_info['dx'] == 'nv').dropna()
        self.operating_resolution = resolution
        self.toNormalize = toNormalize
        self.split_files_path = self.extra + split_files_path
        self.get_training_info()
        self.get_non_training_data()
        
    def get_training_info(self, isShuffle = True):
        # change this according to data
        images_labels = self.data_info[['image_id', 'dx']]
        self.val_first = True
        
        if (self.split_files_path == self.extra + "") or (self.split_files_path == None):
            self.data = list()
            for idx, row in images_labels.iterrows():
                self.data.append([row[0], row[-1]])
            self.data = np.array(self.data)
            if isShuffle: np.random.shuffle(self.data)
            self.test_data = self.data[:int(len(self.data)*self.test_size)]
            self.train_data = self.data[int(len(self.data)*self.test_size):]
            self.val_data = self.train_data[:int(len(self.data)*self.val_size)]
            self.train_data = self.train_data[int(len(self.data)*self.val_size):]

            self.train_data = self.read_data(self.train_data)
            print("Read train data to memory")
            self.test_data = self.read_data(self.test_data)
            print("Read test data to memory")
            self.val_data = self.read_data(self.val_data)
            print("Read validation data to memory")
            for type_, data_ in zip(["train", "test", "val"], [self.train_data, self.test_data, self.val_data]):
                with open(os.path.join(self.save_path, type_ + ".pkl"), "wb") as f:
                    pkl.dump(data_, f)
        else:
            self.train_data = pkl.load(open(os.path.join(self.split_files_path, "train.pkl"), "rb"))
            self.test_data = pkl.load(open(os.path.join(self.split_files_path, "test.pkl"), "rb"))
            self.val_data = pkl.load(open(os.path.join(self.split_files_path, "val.pkl"), "rb"))
    
    def read_data(self, data):
        return np.array([np.array([self.read_image(image_id), self.labels_dict[label]]) for image_id, label in data])

    def path_from_id(self, image_id, images_path = None):
        if images_path == None: images_path = self.images_path
        if os.path.splitext(image_id)[-1] not in [".jpg", ".jpeg", ".png"]: image_id += ".jpg"
        return os.path.join(images_path, image_id)
    
    def read_image(self, image_id, images_path = None):
        image = cv2.resize(cv2.imread(self.path_from_id(image_id, images_path)), self.operating_resolution)
        if self.toNormalize: image = image.astype('float32') / 255.
        return image
                           
    def train_batch_generator(self, images_only = False):
        while True:
            start = self.batch_read_count
            end = (self.batch_read_count + self.batch_size) if (self.batch_read_count + self.batch_size) < len(self.train_data) else len(self.train_data)
            X, y = np.array([i for i in self.train_data[start:end][:,0]]), np.array([i for i in self.train_data[start:end][:,1]])
            self.batch_read_count += self.batch_size
            if self.batch_read_count >= len(self.train_data): self.batch_read_count = 0
            if not images_only: yield(X, y)
            else: yield(X, X)
            
    def val_batch_generator(self, images_only = False):
        while True:
            if self.val_first:
                np.random.shuffle(self.val_data)
                self.val_X, self.val_y = np.array([i for i in self.val_data[:,0]]), np.array([i for i in self.val_data[:,1]])
                self.val_first = False
            if not images_only: yield(self.val_X, self.val_y)
            else: yield(self.val_X, self.val_X)
                
    def get_non_training_data(self, n_samples = 100):
        self.non_training_data_info = self.total_data_info.where(self.total_data_info['dx']!='nv').dropna()
        # change this according to data
        images_labels = self.non_training_data_info[['image_id', 'dx']]
        shuffle(images_labels)
        try:
            self.nt_data = pkl.load(open(os.path.join(self.split_files_path, "non_train.pkl"), "rb"))
        except:
            self.nt_data = list()
            for idx, row in images_labels.iterrows():
                self.nt_data.append([row[0], row[-1]])
                if idx == n_samples: break
            self.nt_data = np.array(self.nt_data)

            self.nt_data = self.read_data(self.nt_data)
            print("Read non-train data to memory")
            with open(os.path.join(self.save_path, "non_train.pkl"), "wb") as f:
                pkl.dump(self.nt_data, f)
        return self.nt_data

In [8]:
class AutoEncoder:
    def __init__(self, encoding_dim = 300 , resolution = (128,128)):
        self.encoding_dimension = encoding_dim
        self.input_shape = tuple(list(resolution) + [3])
    
    def get_model(self):
        input_layer = Input(shape=(self.input_shape))
        conv1 = Conv2D(16, (3,3), activation = 'relu', padding = 'same', name = "conv1")(input_layer)
        mp1 = MaxPooling2D((2,2), padding = 'same', name = "mp1")(conv1)
        conv2 = Conv2D(8, (3,3), activation = 'relu', padding = 'same', name = "conv2")(mp1)
        mp2 = MaxPooling2D((2,2), padding = 'same', name = "mp2")(conv2)
        conv3 = Conv2D(8, (3,3), activation = 'relu', padding = 'same', name = "conv3")(mp2)
        encoding = MaxPooling2D((4,4), padding = 'same', name = "encoding")(conv3)

        conv4 = Conv2D(8, (3, 3), activation='relu', padding='same', name = "conv4")(encoding)
        up1 = UpSampling2D((4, 4), name = "up1")(conv4)
        conv5 = Conv2D(8, (3, 3), activation='relu', padding='same', name = "conv5")(up1)
        up2 = UpSampling2D((2, 2), name = "up2")(conv5)
        conv6 = Conv2D(16, (3, 3), activation='relu', padding='same', name = "conv6")(up2)
        up3 = UpSampling2D((2, 2), name = "up3")(conv6)
        recons = Conv2D(3, (3, 3), activation='relu', padding='same', name = "recons")(up3)
        return Model(input_layer, recons)

class ContractiveAutoEncoder:
    def __init__(self, encoding_dim = 300 , resolution = (128,128)):
        self.encoding_dimension = encoding_dim
        self.input_shape = tuple(list(resolution) + [3])
    
    def get_model(self):
        input_layer = Input(shape=(self.input_shape))
        conv1 = Conv2D(128, (3,3), padding = 'same', name = "conv1")(input_layer)
        lr1 = LeakyReLU(name = 'lr1')(conv1)
        mp1 = MaxPooling2D((4,4), padding = 'same', name = "mp1")(lr1)
        
        conv2 = Conv2D(512, (2,2), padding = 'same', name = "conv2")(mp1)
        bn1 = BatchNormalization(name = 'bn1')(conv2)
        lr2 = LeakyReLU(name = 'lr2')(bn1)
        mp2 = MaxPooling2D((4,4), padding = 'same', name = "mp2")(lr2)
        
        conv3 = Conv2D(2048, (2,2), padding = 'same', name = "conv3")(mp2)
        bn2 = BatchNormalization(name = 'bn2')(conv3)
        lr3 = LeakyReLU(name = 'lr3')(bn2)
        mp3 = MaxPooling2D((4,4), padding = 'same', name = "mp3")(lr3)
        
        encoding = Conv2D(300, (2,2), padding = 'same', name = 'encoding')(mp3)
        
        conv5 = Conv2D(2048, (2, 2), padding='same', name = "conv5")(encoding)
        bn3 = BatchNormalization(name = 'bn3')(conv5)
        lr4 = LeakyReLU(name = 'lr4')(bn3)
        up1 = UpSampling2D((4, 4), name = "up1")(lr4)
        
        conv6 = Conv2D(512, (2, 2), padding='same', name = "conv6")(up1)
        bn4 = BatchNormalization(name = 'bn4')(conv6)
        lr5 = LeakyReLU(name = 'lr5')(bn4)
        up2 = UpSampling2D((4, 4), name = "up2")(lr5)
        
        conv7 = Conv2D(128, (2, 2), padding='same', name = "conv7")(up2)
        bn5 = BatchNormalization(name = 'bn5')(conv7)
        lr6 = LeakyReLU(name = 'lr6')(bn5)
        up3 = UpSampling2D((4, 4), name = "up3")(lr6)
        
        decoding = Conv2D(3, (2,2), padding = 'same', name = 'decoding')(up3)
        
        return Model(inputs = input_layer, outputs = decoding)
    
    def contractive_loss(self, y_pred, y_true, enc_layer_name = 'encoding', lam = 1e-3):
        mse = K.mean(K.square(y_true - y_pred), axis=1)

        W = K.transpose(K.variable(value=model.get_layer(enc_layer_name).get_weights()[0])) # N_hidden x N
        h = model.get_layer(enc_layer_name).output
        dh = h * (1 - h)  # N_batch x N_hidden

        # N_batch x N_hidden * N_hidden x 1 = N_batch x 1
        contractive = lam * K.sum(dh**2 * K.sum(W**2, axis=1), axis=1)

        return mse + contractive

class ACNet:
    def __init__(self, resolution = (128,128)):
        self.input_shape = tuple(list(resolution) + [3])
        
    # ACNet
    def convACB(self, input_layer, n_kernels, kernels_size, activation = 'relu', padding = 'same'):
        out_dd = Conv2D(n_kernels, (kernels_size,kernels_size), activation = activation, padding = padding)(input_layer) # dxd
        bn1 = BatchNormalization()(out_dd)
        out_d1 = Conv2D(n_kernels, (kernels_size,1), activation = activation, padding = padding)(input_layer) # dx1
        bn2 = BatchNormalization()(out_d1)
        out_1d = Conv2D(n_kernels, (1,kernels_size), activation = activation, padding = padding)(input_layer) # 1xd
        bn3 = BatchNormalization()(out_1d)
        out_sum = Lambda(lambda a: a[0] + a[1] + a[2])([bn1, bn2, bn3])
        return out_sum
    
    def get_model(self,):
        input_layer = Input(shape=self.input_shape)
        acb1 = convACB(input_layer, 16, 3)
        mp1 = MaxPooling2D((2,2), padding = 'same', name = "mp1")(acb1)
        acb2 = convACB(mp1, 8, 3)
        mp2 = MaxPooling2D((2,2), padding = 'same', name = "mp2")(acb2)
        acb3 = convACB(mp2, 8, 3)
        encoding = MaxPooling2D((4,4), padding = 'same', name = "encoding")(acb3)

        acb4 = convACB(encoding, 8, 3)
        up1 = UpSampling2D((4, 4), name = "up1")(acb4)
        acb5 = convACB(up1, 8, 3)
        up2 = UpSampling2D((2, 2), name = "up2")(acb5)
        acb6 = convACB(up2, 16, 3)
        up3 = UpSampling2D((2, 2), name = "up3")(acb6)
        recons = convACB(up3, 1, 3)
        return Model(inputs = input_layer, outputs = recons)

class ImageClassifier:
    def __init__(self, n_classes, model_type = "mobilenet", resolution = (128,128)):
        self.model_type = model_type
        if "mobilenet" in self.model_type.lower():
            self.model = MobileNetV2(input_shape = tuple(list(resolution) + [3]), weights = None, classes = n_classes)

    def get_model(self):
        return self.model    

In [12]:
class Trainer(HAM_Data):
    def __init__(self, nn_type, save_path = "output/"):
        self.nn_type = nn_type
        extra = '/content/drive/My Drive/' if isCloud else  ""
        self.save_path = extra + save_path
        if not os.path.exists(self.save_path): os.mkdir(self.save_path)
        self.train_config()
        HAM_Data.__init__(self)
        self.determine_model()
    
    def determine_model(self):
        if "autoencoder" in self.nn_type.lower():
            self.ae = AutoEncoder(resolution = self.operating_resolution)
            self.model = self.ae.get_model()
            self.train = self.train_autoencoder
        elif "contractive_autoencoder" in self.nn_type.lower():
            self.cae = ContractiveAutoEncoder()
            self.model = self.cae.get_model()
            self.train = self.train_contractive_autoencoder
        elif "classifier" in self.nn_type.lower():
            self.ic = ImageClassifier(n_classes = self.n_classes, model_type = "mobilenet", resolution = self.operating_resolution)
            self.model = self.ic.get_model()
            self.train = self.train_classifier
        elif "ACNet" in self.nn_type.lower():
            self.acn_ae = ACNet()
            self.model = self.acn_ae.get_model()
            self.train = self.train_ACNet_autoencoder
    
    def train_config(self, model_name = ""):
        self.batch_size = 64
        self.mini_batch_size = 16
        self.epochs = 70
        self.test_size = 0.015
        self.val_size = 0.01
        
        self.operating_resolution = (128,128)
        self.batch_read_count = 0
        
        self.model_name = os.path.join(self.save_path, self.nn_type + "-{epoch:02d}-{val_accuracy:.2f}.hdf5")
        self.callback = [ModelCheckpoint(self.model_name, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')]

        self.adam = Adam(learning_rate = 1e-4)

    
    def train_classifier(self):
        self.model.compile(loss='sparse_categorical_crossentropy', optimizer=self.adam, metrics=['accuracy'])        
        self.model.fit_generator(self.train_batch_generator(), validation_data = self.val_batch_generator(), epochs = self.epochs, steps_per_epoch = int(len(self.train_data) / self.batch_size), shuffle = True, validation_steps = 1, callbacks = callback)
    
    def train_autoencoder(self):
        self.model.compile(loss = 'mse', optimizer = self.adam, metrics = ['accuracy'])
        self.model.fit_generator(self.train_batch_generator(True), validation_data = self.val_batch_generator(True), epochs = self.epochs, steps_per_epoch = int(len(self.train_data) / self.batch_size), shuffle = True, validation_steps = 1, callbacks = self.callback)
        
    def train_contractive_autoencoder(self):
        contractive_loss = self.cae.contractive_loss
        self.model.compile(loss = 'contractive_loss', optimizer = self.adam, metrics = ['accuracy'])
        self.model.fit_generator(self.train_batch_generator(True), validation_data = self.val_batch_generator(True), epochs = self.epochs, steps_per_epoch = int(len(self.train_data) / self.batch_size), shuffle = True, validation_steps = 1, callbacks = self.callback)
    
    def train_ACNet_autoencoder(self):
        self.model.compile(loss = 'mse', optimizer = self.adam, metrics = ['accuracy'])
        self.model.fit_generator(self.train_batch_generator(True), validation_data = self.val_batch_generator(True), epochs = self.epochs, steps_per_epoch = int(len(self.train_data) / self.batch_size), shuffle = True, validation_steps = 1, callbacks = self.callback)

In [13]:
class Tester(HAM_Data):
    def __init__(self, model_path = "models/autoencoder_512-49-0.95.hdf5", save_path = "output/"):
        extra = '/content/drive/My Drive/' if isCloud else  ""
        self.save_path = extra + save_path

        self.model_path = model_path
        self.model = load_model(self.model_path)
        HAM_Data.__init__(self)
        if "autoencoder" in self.model_path:
            self.test = self.test_autoencoder
        elif "classifier" in self.model_path:
            self.test = self.test_classifier
    
    def display_AE_result(self, original_image, reconstructed_image):
        fig, ax = plt.subplots(1, 2)
        ax[0].imshow(original_image)
        ax[1].imshow(reconstructed_image)
        plt.show()         
    
    def test_autoencoder(self, images, read_images = True, show_result = True):
        if not isinstance(images, np.ndarray): images = np.array(images)
        mse_results = list()
        reconstruction_results = list()
        for image in images:
            original_image = self.read_image(image) if read_images else image
            reconstructed_image = np.squeeze(self.model.predict(np.expand_dims(original_image, axis = 0)))
            mse_ = mse(original_image.flatten(), reconstructed_image.flatten())
            if show_result: self.display_AE_result(original_image, reconstructed_image); print("MSE:", mse_)
            mse_results.append(mse_)
            reconstruction_results.append(reconstructed_image)
        return np.array(mse_results), np.array(reconstruction_results)
    
    def test_roc_score(self):
        anomalous_data = self.get_non_training_data()
        non_anomalous_images = self.test_data[:, 0]
        non_anomalous_labels = np.zeros(len(non_anomalous_images))
        anomalous_images = anomalous_data[:, 0]
        anomalous_labels = np.ones(len(anomalous_images))
        test_images = np.hstack((non_anomalous_images, anomalous_images))
        test_labels = np.hstack((non_anomalous_labels, anomalous_labels))
        mse_results, reconstruction_results = self.test(test_images, read_images = False, show_result = False)
        AUCROC_score = roc_auc_score(test_labels, mse_results)
        print("ROC AUC Score:", AUCROC_score)

In [None]:
acn_ae_tr = Trainer(nn_type = "ACNet", save_path = "ACN_AE_12042020")
acn_ae_tr.train()

In [13]:
# cae_tr = Trainer(nn_type = "contractive_autoencoder", save_path = "CAE_05042020")
# cae_tr.train()

In [14]:
# # Testing De-Noising AutoEncoder
# ts = Tester(model_path = "/content/drive/My Drive/AE_05042020/autoencoder-70-0.94.hdf5", save_path = "data_files/")
# ts.test_roc_score()

In [15]:
# # Testing Contractive AutoEncoder
# ts = Tester(model_path = "/content/drive/My Drive/CAE_05042020/contractive_autoencoder-70-0.95.hdf5", save_path = "data_files/")
# ts.test_roc_score()

In [83]:
m = get_ACNet()

In [84]:
m.summary()

Model: "model_20"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_21 (InputLayer)           (None, 128, 128, 1)  0                                            
__________________________________________________________________________________________________
conv2d_325 (Conv2D)             (None, 128, 128, 16) 160         input_21[0][0]                   
__________________________________________________________________________________________________
conv2d_326 (Conv2D)             (None, 128, 128, 16) 64          input_21[0][0]                   
__________________________________________________________________________________________________
conv2d_327 (Conv2D)             (None, 128, 128, 16) 64          input_21[0][0]                   
___________________________________________________________________________________________