Import Required Models

In [1]:
import pandas as pd
import numpy as np
from keras import Sequential, optimizers, layers, Model
from keras.models import load_model
from imblearn.pipeline import Pipeline
from sklearn import metrics
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression, SGDOneClassSVM
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier, HistGradientBoostingClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV, StratifiedKFold, GridSearchCV
from sklearn.metrics import auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix, balanced_accuracy_score, f1_score

2024-01-15 00:47:05.109830: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-01-15 00:47:07.019206: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-15 00:47:07.019898: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-15 00:47:07.318405: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-15 00:47:08.353723: I tensorflow/core/platform/cpu_feature_guar

Load data from npz file, this is already broken down into training validation and testing datasets, put these into numpy arrays

In [2]:
data = np.load("bloodmnist.npz")
train_images = data["train_images"]
print(np.shape(data["train_images"]))
val_images = data["val_images"]
print(np.shape(data["val_images"]))
test_images = data["test_images"]
print(np.shape(data["test_images"]))
train_labels = data["train_labels"]
print(np.shape(data["train_labels"]))
val_labels = data["val_labels"]
print(np.shape(data["val_labels"]))
test_labels = data["test_labels"]
print(np.shape(data["test_labels"]))

combine_train_images = np.append(train_images, val_images, axis=0)
combine_train_labels = np.append(train_labels, val_labels, axis=0)

(11959, 28, 28, 3)
(1712, 28, 28, 3)
(3421, 28, 28, 3)
(11959, 1)
(1712, 1)
(3421, 1)


In [3]:
class ImageDataset:
    def __init__(self, images, labels):
        self.images = images/255
        self.labels = labels
        self.class_num = len(np.unique(labels))
        self.counts = []
        self.proportions = []
        self.length = np.shape(images)[0]
        self.width = np.shape(images)[1]
        # self.iamge_features = []
        self.one_hot_labels = self.one_hot_encode()
        self.update_counts()
        # self.oversample()

    def update_counts(self):
        self.counts = []
        self.proportions = []
        
        for i in range(self.class_num):
            self.counts.append(len(np.where(self.labels == i)[0]))
        
        self.proportions = [count/self.length for count in self.counts]

    def oversample(self):
        ros = RandomOverSampler(random_state=0)
        self.images= self.images.reshape((self.length, self.width*self.width*3))
        self.images, self.labels = ros.fit_resample(self.images, self.labels)
        self.length = self.images.shape[0]
        self.images = self.images.reshape((self.length, self.width, self.width, 3))
        self.one_hot_labels = self.one_hot_encode()
        self.update_counts()
        self.image_features = self.images

    def one_hot_encode(self):
        one_hot_labels = np.array([np.zeros(self.class_num) for i in range(self.length)])
        for i in range(self.length):
            one_hot_labels[i][self.labels[i]] = 1
        return np.argmax(one_hot_labels, axis=1)
    
    # def apply_CNN(self, model):

    def shuffle(self):
        p = np.random.permutation(self.length)
        self.images, self.labels, self.one_hot_labels = self.images[p], self.labels[p], self.one_hot_labels[p]

    def get_features(self, model):
        print(np.shape(self.images), np.shape(self.one_hot_labels))
        self.image_features = model.predict(self.images)

In [4]:
cnn_model = load_model("./CNN_model.h5")
model_use = Model(
    inputs = cnn_model.input,
    outputs = cnn_model.layers[-3].output
)

In [5]:
train_dataset = ImageDataset(train_images, train_labels)
val_dataset = ImageDataset(val_images, val_labels)
combine_train_dataset = ImageDataset(combine_train_images, combine_train_labels)

In [6]:
train_dataset.oversample()
train_dataset.get_features(model_use)
val_dataset.get_features(model_use)
combine_train_dataset.get_features(model_use)

(18640, 28, 28, 3) (18640,)
(1712, 28, 28, 3) (1712,)
(13671, 28, 28, 3) (13671,)


In [11]:
# define dataset
# bcl = LogisticRegression(max_iter=1000)
# model = DecisionTreeClassifier(splitter='random')
# estimators = [
#     ('rf', RandomForestClassifier(max_depth=10, n_estimators=50)),
#     # ('lr', LogisticRegression(C=0.5, tol=0.001, solver="sag", max_iter=10000)),
#     ('lr', LogisticRegression(max_iter=10000, C=0.5))
#     # ('knn', KNeighborsClassifier())
# ]
# bcl = RandomForestClassifier(max_depth=10)
# model = GaussianNB()
# model = KNeighborsClassifier()
# model = SGDOneClassSVM()
# fit model
# bcl = HistGradientBoostingClassifier(random_state=0, max_depth=2, max_iter=20, learning_rate=0.1, scoring='loss', l2_regularization=0.1, max_leaf_nodes=20)
# bcl = AdaBoostClassifier()
param_grid = {
    'classification__learning_rate': [0.01, 0.1, 1],
    'classification__n_estimators': [10, 20, 50],
    'classification__estimator__max_depth': [2,5,10],
    'classification__estimator__min_samples_split': [5, 10, 20],
    'classification__estimator__min_samples_leaf': [5]
}

scorers = {
    'balanced_accuracy_score': make_scorer(balanced_accuracy_score)
}

model = Pipeline([
    ('sampling', RandomOverSampler()),
    ('classification', AdaBoostClassifier(DecisionTreeClassifier()))
])

# bcl = GridSearchCV(estimator=model, param_grid=param_grid, scoring='balanced_accuracy', cv=5, refit=True)
# bcl = BaggingClassifier(DecisionTreeClassifier(min_samples_leaf=20, max_depth=3, min_samples_split=50), n_estimators=100)
bcl = LogisticRegression(solver='sag', max_iter=1000)
bcl.fit(train_dataset.image_features, train_dataset.one_hot_labels)
# print(bcl.best_params_)
# make predictions
# results_df = pd.DataFrame(bcl.results)
# results_df.to_csv('results.csv', encoding='utf-8', index=False)
# yhat = bcl.predict(train_dataset.image_features)
# print(yhat)
# models = []
# for i in range(train_dataset.class_num * train_dataset.class_num-1):
#     models.add(Dense(100, activation='relu'))
#     models.add(Dense(1, activation='sigmoid'))



In [12]:
# get accuracy and auc
# results_df = pd.DataFrame(bcl.results)
# results_df.to_csv('results.csv', encoding='utf-8', index=False)
# yhat = bcl.predict(combine_train_dataset.image_features)
yhat = bcl.predict(train_dataset.image_features)

acc = metrics.accuracy_score(train_dataset.labels, yhat)
# metrics.auc(acc)
print(acc)

0.9746244635193133


In [15]:
# test_dataset = ImageDataset(test_images, test_labels)
# test_dataset.get_features(model_use)
valhat = bcl.predict(val_dataset.image_features)
acc = metrics.accuracy_score(val_dataset.one_hot_labels, valhat)
# print(search.best_params_)
print(acc)

(3421, 28, 28, 3) (3421,)
0.9193218357205496


In [None]:
import pylab as plt

In [None]:
train_scores = bcl.train_score_
plt.plot(np.arange(train_scores.shape[0]), train_scores, 'b-')

AttributeError: 'AdaBoostClassifier' object has no attribute 'train_score_'