# Image Classification with Hybrid Ensemble

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import tensorflow as tf
import os
import csv

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


2023-04-19 10:08:24.647845: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-19 10:08:24.847446: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-19 10:08:28.059103: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-04-19 10:08:28.059264: W tensorflow/strea

In [2]:
import os
print(os.cpu_count())

16


### Load Dataset

In [3]:
%%time

# Load the compressed array from disk
images_mini = np.load('CNN_Xs_downsample_forvgg16.npz')['data']

CPU times: user 10.5 s, sys: 878 ms, total: 11.3 s
Wall time: 11.3 s


In [4]:
# load the labels from disk
with open('CNN_labels_downsampled_forvgg16.csv', 'r') as file:
    reader = csv.reader(file)
    labels_mini = list(reader)[0]
    labels_mini = [int(x) for x in labels_mini]
    y_mini= np.array(labels_mini)

In [5]:
print('images_mini.shape:', images_mini.shape)
print('\nnumber of labels:', len(labels_mini))
print('\nnumber of positive cancer cases:', sum(labels_mini))

images_mini.shape: (5600, 224, 224, 3)

number of labels: 5600

number of positive cancer cases: 600


### Normalize

In [6]:
print('Original max pixel value:', images_mini.max())
print('Confirm min pixel value is 0:', images_mini.min())

Original max pixel value: 255.0
Confirm min pixel value is 0: 0.0


In [7]:
#Normalize image pixel values
images_mini_norm = images_mini / images_mini.max() 

In [8]:
print('New max pixel value:', images_mini_norm.max())
print('Confirm min pixel value is 0:', images_mini_norm.min())

New max pixel value: 1.0
Confirm min pixel value is 0: 0.0


### Reshape

In [9]:
images_mini_norm.shape

(5600, 224, 224, 3)

In [10]:
arr_mean = np.mean(images_mini_norm, axis=-1)
arr_mean.shape
resized_images_mini = np.expand_dims(arr_mean, axis=-1)
resized_images_mini.shape

(5600, 224, 224, 1)

In [11]:
print('Original max pixel value:', resized_images_mini.max())
print('Confirm min pixel value is 0:', resized_images_mini.min())

Original max pixel value: 1.0
Confirm min pixel value is 0: 0.0


## Settings

In [12]:
random_state = 1234

In [13]:
random.seed(random_state)
tf.random.set_seed(random_state)
np.random.seed(random_state)

### Inputs

In [14]:
X= resized_images_mini
y = y_mini

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
X_train.shape

(4480, 224, 224, 1)

In [16]:
def preprocess_augm(X_train, y_train):
    
    # Augment only cancer=1 data
    X_train_w_cancer = X_train[y_train == 1, :, :, :]
    y_train_w_cancer = y_train[y_train == 1]
    
    ### image augmentation on training data ###
    ###########################################
    # adjust brightness
    X_train_augm1 = tf.image.adjust_brightness(X_train_w_cancer, delta=.1)
    X_train_augm2 = tf.image.adjust_brightness(X_train_w_cancer, delta=.2)
    
    # adjust contrast
    X_train_augm5 = tf.image.adjust_contrast(X_train_w_cancer, contrast_factor=1)
    X_train_augm6 = tf.image.adjust_contrast(X_train_w_cancer, contrast_factor=2)

    # random flip
    X_train_augm8 = tf.image.random_flip_left_right(X_train_w_cancer)
    
    # concatenate augmented X_train data
    X_train_augm = tf.concat([X_train_augm1, X_train_augm2, X_train_augm5,
                        X_train_augm6, X_train_augm8],axis=0)
    order = []
    for i in range(5):
        order += [j for j in range(len(X_train_w_cancer))]
    
    # concatenate y_train (note the label is preserved)
    y_train_augm = tf.convert_to_tensor(np.ones(len(X_train_augm)))
    
    # shuffle X_train and y_train, i.e., shuffle two tensors in the same order
    shuffle = tf.random.shuffle(tf.range(tf.shape(X_train_augm)[0], dtype=tf.int32))
    X_train_augm = tf.gather(X_train_augm, shuffle)
    y_train_augm = tf.gather(y_train_augm, shuffle)
    
    shuffled_order = tf.gather(order, shuffle)

    return X_train_augm, y_train_augm, shuffled_order.numpy()



2023-04-19 10:08:52.560695: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-04-19 10:08:52.560749: W tensorflow/stream_executor/cuda/cuda_driver.cc:263] failed call to cuInit: UNKNOWN ERROR (303)
2023-04-19 10:08:52.560788: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (099efb1cae5d): /proc/driver/nvidia/version does not exist
2023-04-19 10:08:52.561255: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in 

In [17]:
def preprocess_augm_part2(X_train, y_train, X_train_augm, y_train_augm, order, n_augm):
    X_train_augm = X_train_augm[0:n_augm]
    y_train_augm = y_train_augm[0:n_augm]
    selected_order = order[0:n_augm]
    
    X_train_order = [j for j in range(tf.shape(X_train)[0])]
    selected_order = np.concatenate((np.array(X_train_order), selected_order))
    
    
    X_train = tf.concat([X_train, X_train_augm],axis=0)
    y_train = tf.concat([y_train, y_train_augm],axis=0)
    
    # shuffle X_train and y_train, i.e., shuffle two tensors in the same order
    shuffle = tf.random.shuffle(tf.range(tf.shape(X_train)[0], dtype=tf.int32))
    X_train = tf.gather(X_train, shuffle)
    y_train = tf.gather(y_train, shuffle).numpy() #also transforms y_train to numpy array
    
    groups = tf.gather(selected_order, shuffle)
    
    return X_train, y_train, groups.numpy()

In [None]:
# Run the function
# preprocess_augm(X_train, y_train)
X_train_augm, y_train_augm, order = preprocess_augm(X_train, y_train)

In [18]:
num_aug = 1800
X_train, y_train, groups = preprocess_augm_part2(X_train, y_train, X_train_augm, y_train_augm, order, n_augm=num_aug)

In [19]:
# reshape
X_train = np.reshape(X_train, (len(X_train), -1))
X_test = np.reshape(X_test, (len(X_test), -1))

In [20]:
X_train.shape

(6280, 50176)

## Build Model

In [21]:
lr = LogisticRegression(max_iter = 1000)

knn = KNeighborsClassifier(
    n_jobs=-1,
    **{'weights': 'distance', 'p': 1, 'n_neighbors': 25, 'algorithm': 'auto'}
)


gnb = GaussianNB(
    **{'var_smoothing': 0.0013768459590682242}
)

dt = DecisionTreeClassifier(
    **{'min_samples_split': 2, 'max_features': 0.1, 'max_depth': 15, 'criterion': 'gini'}
)

rf = RandomForestClassifier(
    **{'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 30}
)

lsvm = LinearSVC(
    **{'tol': 0.00031622776601683794, 'penalty': 'l2', 'loss': 'squared_hinge', 'dual': False, 'C': 0.01}
)


In [26]:
classifiers = [
    ['KNN', knn],
    ['LogisticRegression', lr],
    ['NaiveBayes', gnb],
    ['DecisionTree',dt],
    ['RandomForest', rf], 
    ['Linear SVM', lsvm],
    # ['XGB :', xgb],
]

voting_clf = VotingClassifier(classifiers, voting='hard')

In [27]:
X_train.shape

(6280, 50176)

In [28]:
y_train.shape

(6280,)

In [29]:

voting_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[['KNN',
                              KNeighborsClassifier(n_jobs=-1, n_neighbors=25,
                                                   p=1, weights='distance')],
                             ['LogisticRegression',
                              LogisticRegression(max_iter=1000)],
                             ['NaiveBayes',
                              GaussianNB(var_smoothing=0.0013768459590682242)],
                             ['DecisionTree',
                              DecisionTreeClassifier(max_depth=15,
                                                     max_features=0.1)],
                             ['RandomForest',
                              RandomForestClassifier(max_depth=30,
                                                     max_features='log2',
                                                     min_samples_split=5)],
                             ['Linear SVM',
                              LinearSVC(C=0.01, dual=False,
            

### Scoring

### Hard Voting

In [30]:
train_roc = voting_clf.score(X_train, y_train)
print(f"Train acc score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = voting_clf.score(X_test, y_test)
print(f"Test acc score: {test_roc:.3f}")

Train acc score: 0.995
Test acc score: 0.892


In [31]:
y_pred_train = voting_clf.predict(X_train)
y_pred_test = voting_clf.predict(X_test)


In [32]:
train_roc = f1_score(y_train, y_pred_train)
print(f"Train f1 score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = f1_score(y_test, y_pred_test)
print(f"Test f1 score: {test_roc:.3f}")

Train f1 score: 0.993
Test f1 score: 0.000


In [33]:
train_roc = precision_score(y_train, y_pred_train)
print(f"Train precision score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = precision_score(y_test, y_pred_test)
print(f"Test precision score: {test_roc:.3f}")


Train precision score: 1.000
Test precision score: 0.000
Train recall score: 0.987
Test recall score: 0.000


In [None]:
train_roc = recall_score(y_train, y_pred_train)
print(f"Train recall score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = recall_score(y_test, y_pred_test)
print(f"Test recall score: {test_roc:.3f}")

In [1]:
from sklearn.metrics import roc_auc_score
train_roc = roc_auc_score(y_train, y_pred_train)
print(f"Train f1 score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = roc_auc_score(y_test, y_pred_test)
print(f"Test f1 score: {test_roc:.3f}")

NameError: name 'y_train' is not defined

### Soft Voting

In [39]:
classifiers = [
    ['KNN', knn],
    ['LogisticRegression', lr],
    ['NaiveBayes', gnb],
    ['DecisionTree',dt],
    ['RandomForest', rf], 
    # ['Linear SVM', lsvm],
    # ['XGB :', xgb],
]

In [40]:
soft_voting_clf = VotingClassifier(classifiers, voting='soft')

In [41]:
soft_voting_clf.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


VotingClassifier(estimators=[['KNN',
                              KNeighborsClassifier(n_jobs=-1, n_neighbors=25,
                                                   p=1, weights='distance')],
                             ['LogisticRegression',
                              LogisticRegression(max_iter=1000)],
                             ['NaiveBayes',
                              GaussianNB(var_smoothing=0.0013768459590682242)],
                             ['DecisionTree',
                              DecisionTreeClassifier(max_depth=15,
                                                     max_features=0.1)],
                             ['RandomForest',
                              RandomForestClassifier(max_depth=30,
                                                     max_features='log2',
                                                     min_samples_split=5)]],
                 voting='soft')

In [42]:
y_pred_train = soft_voting_clf.predict(X_train)
y_pred_test = soft_voting_clf.predict(X_test)

In [43]:
train_roc = f1_score(y_train, y_pred_train)
print(f"Train f1 score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = f1_score(y_test, y_pred_test)
print(f"Test f1 score: {test_roc:.3f}")

Train f1 score: 0.997
Test f1 score: 0.030


In [44]:
train_roc = precision_score(y_train, y_pred_train)
print(f"Train precision score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = precision_score(y_test, y_pred_test)
print(f"Test precision score: {test_roc:.3f}")


Train precision score: 1.000
Test precision score: 0.125
Train recall score: 0.994
Test recall score: 0.017


In [None]:

train_roc = recall_score(y_train, y_pred_train)
print(f"Train recall score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = recall_score(y_test, y_pred_test)
print(f"Test recall score: {test_roc:.3f}")

In [45]:
train_roc = roc_auc_score(y_train, y_pred_train)
print(f"Train ROC_AUC score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = roc_auc_score(y_test, y_pred_test)
print(f"Test ROC_AUC score: {test_roc:.3f}")

Train f1 score: 0.997
Test f1 score: 0.501
