In [1]:
import numpy as np
from keras.layers import Dense, Conv2D, MaxPooling2D, Flatten, Dropout, BatchNormalization, AveragePooling2D
from keras import Sequential, optimizers, layers, Model
from keras.models import load_model
from keras import backend as K
import tensorflow.random as random_tf
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression, SGDOneClassSVM
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.naive_bayes import GaussianNB

2024-01-13 12:38:16.256428: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-13 12:38:16.307388: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-13 12:38:16.307436: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-13 12:38:16.308690: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-13 12:38:16.316130: I tensorflow/core/platform/cpu_feature_guar

In [2]:
# Load every split from the archive. The context manager closes the
# underlying NpzFile once the arrays have been read, so no file descriptor is
# leaked; the arrays extracted inside the block remain usable afterwards.
with np.load("bloodmnist.npz") as data:
    train_images = data["train_images"]
    val_images = data["val_images"]
    test_images = data["test_images"]
    train_labels = data["train_labels"]
    val_labels = data["val_labels"]
    test_labels = data["test_labels"]

# Report the shape of each split: images first, then labels.
for split in (train_images, val_images, test_images,
              train_labels, val_labels, test_labels):
    print(np.shape(split))

(11959, 28, 28, 3)
(1712, 28, 28, 3)
(3421, 28, 28, 3)
(11959, 1)
(1712, 1)
(3421, 1)


In [3]:
class ImageDataset:
    """One split of an image-classification dataset plus label bookkeeping.

    Pixel values are divided by 255 on construction. Labels are expected to be
    integer class indices usable as column indices in ``[0, class_num)``;
    they may be shaped ``(n,)`` or ``(n, 1)``.

    Attributes:
        images: scaled image array, first axis indexes samples.
        labels: class index per sample (becomes 1-D after ``oversample``).
        one_hot_labels: float array of shape ``(length, class_num)``.
        class_num: number of distinct label values seen at construction.
        counts: number of samples per class index.
        proportions: ``counts`` divided by ``length``.
        length: number of samples.
        width: size of the second axis of ``images``.
        image_features: per-sample features; the scaled images themselves
            until ``get_features`` replaces them with model outputs.
    """

    def __init__(self, images, labels):
        """Scale ``images`` and derive the label statistics.

        Raises:
            ValueError: if ``images`` and ``labels`` differ in sample count.
        """
        images = np.asarray(images)
        labels = np.asarray(labels)
        if images.shape[0] != labels.shape[0]:
            raise ValueError(
                f"images and labels disagree on sample count: "
                f"{images.shape[0]} != {labels.shape[0]}"
            )

        self.images = images / 255
        self.labels = labels
        self.class_num = len(np.unique(labels))
        self.counts = []
        self.proportions = []
        self.length = images.shape[0]
        self.width = images.shape[1]
        # Always defined, so downstream code can read features even when
        # neither oversample() nor get_features() has been called yet.
        self.image_features = self.images
        self.one_hot_labels = self.one_hot_encode()
        self.update_counts()

    def update_counts(self):
        """Recompute ``counts`` and ``proportions`` from the current labels."""
        flat_labels = np.ravel(self.labels)
        self.counts = [
            int(np.count_nonzero(flat_labels == class_idx))
            for class_idx in range(self.class_num)
        ]
        self.proportions = [count / self.length for count in self.counts]

    def oversample(self, random_state=0):
        """Balance the classes by random oversampling.

        Updates ``images``, ``labels``, ``length``, ``one_hot_labels`` and the
        counts. ``image_features`` is reset to the resampled images, so any
        features computed earlier by ``get_features`` are discarded.

        Args:
            random_state: seed forwarded to ``RandomOverSampler``.
        """
        sampler = RandomOverSampler(random_state=random_state)
        # Remember the per-sample shape instead of assuming square RGB images.
        sample_shape = self.images.shape[1:]
        # Work on a local flattened view so the instance is left untouched if
        # resampling raises.
        flat_images = self.images.reshape((self.length, -1))
        flat_images, resampled_labels = sampler.fit_resample(
            flat_images, np.ravel(self.labels)
        )

        self.labels = resampled_labels
        self.length = flat_images.shape[0]
        self.images = flat_images.reshape((self.length,) + sample_shape)
        self.one_hot_labels = self.one_hot_encode()
        self.update_counts()
        self.image_features = self.images

    def one_hot_encode(self):
        """Return a ``(length, class_num)`` float array with a 1 per row at the label index."""
        label_idx = np.ravel(self.labels).astype(np.intp)
        one_hot_labels = np.zeros((self.length, self.class_num))
        one_hot_labels[np.arange(self.length), label_idx] = 1
        return one_hot_labels

    def shuffle(self):
        """Apply one random permutation to every per-sample array.

        ``image_features`` is permuted together with the images and labels so
        that features computed by ``get_features`` stay aligned with their
        labels.
        """
        p = np.random.permutation(self.length)
        features_are_images = self.image_features is self.images
        shuffled_features = None if features_are_images else self.image_features[p]

        self.images, self.labels, self.one_hot_labels = (
            self.images[p], self.labels[p], self.one_hot_labels[p]
        )
        self.image_features = self.images if features_are_images else shuffled_features

    def get_features(self, model):
        """Store ``model.predict(self.images)`` in ``image_features``.

        Prints the image and one-hot label shapes before predicting.
        """
        print(np.shape(self.images), np.shape(self.one_hot_labels))
        self.image_features = model.predict(self.images)

In [4]:
# Load the saved CNN and expose the output of its third-from-last layer,
# turning the network into a feature extractor.
cnn_model = load_model("./CNN_model.h5")
feature_output = cnn_model.layers[-3].output
model_use = Model(inputs=cnn_model.input, outputs=feature_output)

In [5]:
# Wrap the training and validation splits; the test split is not wrapped here.
train_dataset = ImageDataset(images=train_images, labels=train_labels)
val_dataset = ImageDataset(images=val_images, labels=val_labels)

In [6]:
# Balance the training classes only; the validation split keeps its original
# class distribution.
train_dataset.oversample()

# Replace the raw images of both splits with the CNN-derived features.
for dataset in (train_dataset, val_dataset):
    dataset.get_features(model_use)

(18640, 28, 28, 3) (18640, 8)
(1712, 28, 28, 3) (1712, 8)


In [7]:
# Fit a bagging ensemble on the CNN-derived training features. Any other
# scikit-learn classifier imported at the top of the notebook can be swapped
# in here, since only fit/predict are used below.
# A fixed random_state makes the bootstrap samples, and therefore the reported
# accuracies, reproducible across Restart & Run All.
model = BaggingClassifier(random_state=0)
# ravel() guarantees the 1-D target vector scikit-learn expects, whether the
# labels are stored as (n,) or (n, 1).
model.fit(train_dataset.image_features, np.ravel(train_dataset.labels))

# Predictions on the training data itself, used for the training accuracy.
yhat = model.predict(train_dataset.image_features)

In [8]:
# Accuracy of the fitted classifier on the data it was trained on.
acc = metrics.accuracy_score(y_true=train_dataset.labels, y_pred=yhat)
print(acc)

0.9952253218884121


In [9]:
# Accuracy of the fitted classifier on the held-out validation features.
valhat = model.predict(val_dataset.image_features)
acc = metrics.accuracy_score(y_true=val_dataset.labels, y_pred=valhat)
print(acc)

0.9001168224299065
