Import necessary modules

In [1]:
import numpy as np
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization, AveragePooling2D, Input
from keras import layers
from keras import Sequential, optimizers, layers
from keras.models import load_model
from keras import backend as K
import tensorflow.random as random_tf
from imblearn.over_sampling import RandomOverSampler
import random

2024-01-12 12:28:06.063958: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-12 12:28:07.761864: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-12 12:28:07.763069: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-12 12:28:07.885943: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-12 12:28:08.251479: I tensorflow/core/platform/cpu_feature_guar

Load Data from npz file,
.npz is used to save numpy arrays

In [2]:
data = np.load("bloodmnist.npz")
train_images = data["train_images"]
print(np.shape(data["train_images"]))
val_images = data["val_images"]
print(np.shape(data["val_images"]))
test_images = data["test_images"]
print(np.shape(data["test_images"]))
train_labels = data["train_labels"]
print(np.shape(data["train_labels"]))
val_labels = data["val_labels"]
print(np.shape(data["val_labels"]))
test_labels = data["test_labels"]
print(np.shape(data["test_labels"]))

(11959, 28, 28, 3)
(1712, 28, 28, 3)
(3421, 28, 28, 3)
(11959, 1)
(1712, 1)
(3421, 1)


Set random seeds for reproduceability

In [3]:
random.seed(0)
np.random.seed(0)

Add Dataset Class

In [4]:
class ImageDataset:
    def __init__(self, images, labels):
        self.images = images/255
        self.labels = labels
        self.class_num = len(np.unique(labels))
        self.counts = []
        self.proportions = []
        self.length = np.shape(images)[0]
        self.width = np.shape(images)[1]
        self.one_hot_labels = self.one_hot_encode()
        self.update_counts()

    def update_counts(self):
        self.counts = []
        self.proportions = []
        
        for i in range(self.class_num):
            self.counts.append(len(np.where(self.labels == i)[0]))
        
        self.proportions = [count/self.length for count in self.counts]

    def oversample(self):
        ros = RandomOverSampler(random_state=0)
        self.images= self.images.reshape((self.length, self.width*self.width*3))
        self.images, self.labels = ros.fit_resample(self.images, self.labels)
        self.length = self.images.shape[0]
        print(self.length)
        print(np.shape(self.images))
        print(self.labels.shape)
        self.images = self.images.reshape((self.length, self.width, self.width, 3))
        # print(self.images[0])
        self.one_hot_labels = self.one_hot_encode()
        self.update_counts()

    def one_hot_encode(self):
        one_hot_labels = np.array([np.zeros(self.class_num) for i in range(self.length)])
        for i in range(self.length):
            one_hot_labels[i][self.labels[i]] = 1
        return one_hot_labels
    
    # def apply_CNN(self, model):

    def shuffle(self):
        p = np.random.permutation(self.length)
        self.images, self.labels, self.one_hot_labels = self.images[p], self.labels[p], self.one_hot_labels[p]

Initialise classes for training, validation and testing validations

In [5]:
# Initialise Class for training, validation, test
train_dataset = ImageDataset(train_images, train_labels)
val_dataset = ImageDataset(val_images, val_labels)
test_dataset = ImageDataset(test_images, test_labels)

# print counts and proportions to see if data needs to be balanced
print(train_dataset.counts, train_dataset.proportions)
print(val_dataset.counts, val_dataset.proportions)
print(test_dataset.counts, test_dataset.proportions)

[852, 2181, 1085, 2026, 849, 993, 2330, 1643] [0.07124341500125428, 0.182373108119408, 0.09072664938540012, 0.16941215820720795, 0.07099255790617945, 0.08303369846977172, 0.1948323438414583, 0.1373860690693202]
[122, 312, 155, 290, 122, 143, 333, 235] [0.07126168224299065, 0.1822429906542056, 0.0905373831775701, 0.169392523364486, 0.07126168224299065, 0.08352803738317757, 0.19450934579439252, 0.13726635514018692]
[244, 624, 311, 579, 243, 284, 666, 470] [0.07132417421806489, 0.1824028061970184, 0.09090909090909091, 0.16924875767319497, 0.0710318620286466, 0.08301666179479684, 0.19467991815258695, 0.1373867290266004]


Datset found to be imbalanced, oversample training dataset:

In [6]:
train_dataset.oversample()
# print(train_dataset.counts)
# print(np.shape(train_dataset.images))
# print(np.shape(train_dataset.one_hot_labels))
# one hot encode labels

# def one_hot_encode(label):
#     return utils.to_categorical(label, num_classes=train_dataset.class_num)
# one_hot_train = np.array([utils.to_categorical(label, num_classes=train_dataset.class_num) for label in train_dataset.labels])
# print(np.shape(one_hot_train))
# one_hot_train = one_hot_train.reshape(np.shape(one_hot_train)[0], train_dataset.class_num)
# print(np.shape(one_hot_train))

train_dataset.shuffle()

18640
(18640, 2352)
(18640,)


Add Data Augmentation Layers

In [7]:
def add_preprocessing(model):
    # model.add(layers.RandomBrightness(factor=0.2))
    model.add(layers.RandomFlip(mode="horizontal_and_vertical"))
    model.add(layers.RandomZoom(height_factor=0.2))
    # model.add(layers.RandomRotation(factor=0.2))
    model.add(layers.RandomContrast(factor=0.2))
    return model

Make model

In [8]:
model = Sequential()
# model.add(BatchNormalization())
model.add(Input(shape = (28, 28, 3)))
model.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu', input_shape = (28, 28, 3), strides=1))
model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
model.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu', strides=1))
model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
model.add(Conv2D(filters=32, kernel_size=(3,3), activation='relu', strides=1))
model.add(MaxPooling2D(pool_size=(2, 2), strides=2))
model.add(Flatten())
model.add(Dropout(.1))
model.add(Dense(100, activation='relu'))
model.add(Dense(8, activation='softmax'))
optimizer = optimizers.SGD(0.005)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['AUC', 'categorical_accuracy'])

Train Model

In [9]:
model.fit(train_dataset.images, train_dataset.one_hot_labels, validation_data=(val_dataset.images, val_dataset.one_hot_labels), batch_size=8, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.src.callbacks.History at 0x7f355cf3dd80>

In [10]:
model.evaluate(val_dataset.images, val_dataset.one_hot_labels)
model.save("./CNN_model.h5")



  saving_api.save_model(


Prepare data for ensemble model:

In [11]:
class OVRDataset:
    def __init__(self, images, labels):
        self.class_num = len(np.unique(labels))
        self.images = images/255
        self.labels = labels
        self.splits = self.create_splits()

    def create_splits(self):
        splits = []
        
        for i in range(self.class_num):
            binary_labels = np.array([1 if label == i else 0 for label in self.labels])
            splits.append(ImageDataset(self.images.copy(), binary_labels))
        
        return splits
    
    def oversample_dataset(self):
        for split in self.splits:
            split.oversample()
    

In [12]:
model = load_model("./CNN_model.h5")

In [13]:
train_

NameError: name 'train_' is not defined

In [None]:
# from sklearn.datasets import make_classification
# from sklearn.linear_model import LogisticRegression
# # define dataset
# X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, n_redundant=5, n_classes=3, random_state=1)
# # define model
# model = LogisticRegression(multi_class='ovr')
# # fit model
# model.fit(X, y)
# # make predictions
# yhat = model.predict(X)

# trainOVR.oversample_dataset()

: 