# Image Classification with Hybrid Ensemble

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import tensorflow as tf
import os
import csv

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report


2023-04-19 06:59:26.944246: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-19 06:59:28.820093: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-04-19 06:59:35.471847: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-04-19 06:59:35.472233: W tensorflow/strea

In [2]:
import os
print(os.cpu_count())

16


### Load Dataset

In [3]:
%%time

# Load the compressed array from disk
images_mini = np.load('CNN_Xs_downsample_forvgg16.npz')['data']

CPU times: user 10.7 s, sys: 932 ms, total: 11.7 s
Wall time: 12 s


In [4]:
# load the labels from disk
with open('CNN_labels_downsampled_forvgg16.csv', 'r') as file:
    reader = csv.reader(file)
    labels_mini = list(reader)[0]
    labels_mini = [int(x) for x in labels_mini]
    y_mini= np.array(labels_mini)

In [5]:
print('images_mini.shape:', images_mini.shape)
print('\nnumber of labels:', len(labels_mini))
print('\nnumber of positive cancer cases:', sum(labels_mini))

images_mini.shape: (5600, 224, 224, 3)

number of labels: 5600

number of positive cancer cases: 600


### Normalize

In [6]:
print('Original max pixel value:', images_mini.max())
print('Confirm min pixel value is 0:', images_mini.min())

Original max pixel value: 255.0
Confirm min pixel value is 0: 0.0


In [7]:
#Normalize image pixel values
images_mini_norm = images_mini / images_mini.max() 

In [8]:
print('New max pixel value:', images_mini_norm.max())
print('Confirm min pixel value is 0:', images_mini_norm.min())

New max pixel value: 1.0
Confirm min pixel value is 0: 0.0


### Reshape

In [9]:
images_mini_norm.shape

(5600, 224, 224, 3)

In [10]:
arr_mean = np.mean(images_mini_norm, axis=-1)
arr_mean.shape
resized_images_mini = np.expand_dims(arr_mean, axis=-1)
resized_images_mini.shape

(5600, 224, 224, 1)

In [11]:
print('Original max pixel value:', resized_images_mini.max())
print('Confirm min pixel value is 0:', resized_images_mini.min())

Original max pixel value: 1.0
Confirm min pixel value is 0: 0.0


## Settings

In [12]:
random_state = 1234

In [13]:
random.seed(random_state)
tf.random.set_seed(random_state)
np.random.seed(random_state)

### Inputs

In [14]:
X= resized_images_mini
y = y_mini

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random_state)
X_train.shape

(4480, 224, 224, 1)

In [18]:
# X_train, y_train, groups = preprocess_augm_part2(X_train, y_train, X_train_augm, y_train_augm, order, n_augm=200)

In [19]:
# reshape
X_train = np.reshape(X_train, (len(X_train), -1))
X_test = np.reshape(X_test, (len(X_test), -1))

In [20]:
X_train.shape

(4480, 50176)

## Build Model

In [21]:
lr = LogisticRegression(max_iter = 1000)

knn = KNeighborsClassifier(
    n_jobs=-1,
    **{'weights': 'distance', 'p': 1, 'n_neighbors': 10, 'algorithm': 'auto'}
)


gnb = GaussianNB(
    **{'var_smoothing': 0.011289431757440382}
)

dt = DecisionTreeClassifier(
    **{'min_samples_split': 249, 'max_features': 0.1, 'max_depth': 9, 'criterion': 'entropy'}
)

rf = RandomForestClassifier(
    **{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_features': 'log2', 'max_depth': 15}
)

lsvm = LinearSVC(
    **{'tol': 1.778279410038923e-05, 'penalty': 'l2', 'loss': 'squared_hinge', 'dual': False, 'C': 10000.0}
)


In [35]:
classifiers = [
    ['KNN', knn],
    ['LogisticRegression', lr],
    ['NaiveBayes', gnb],
    ['DecisionTree',dt],
    ['RandomForest', rf], 
    ['Linear SVM', lsvm],
    # ['XGB :', xgb],
]

voting_clf = VotingClassifier(classifiers, voting='hard')

In [23]:
X_train.shape

(4480, 50176)

In [24]:
y_train.shape

(4480,)

In [25]:
voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[['KNN',
                              KNeighborsClassifier(n_jobs=-1, n_neighbors=10,
                                                   p=1, weights='distance')],
                             ['LogisticRegression',
                              LogisticRegression(max_iter=1000)],
                             ['NaiveBayes',
                              GaussianNB(var_smoothing=0.011289431757440382)],
                             ['DecisionTree',
                              DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=9,
                                                     max_features=0.1,
                                                     min_samples_split=249)],
                             ['RandomForest',
                              RandomForestClassifier(max_depth=15,
                                                     max_features='log2',
                                            

### Scoring

### Hard Voting

In [None]:
train_roc = voting_clf.score(X_train, y_train)
print(f"Train acc score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = voting_clf.score(X_test, y_test)
print(f"Test acc score: {test_roc:.3f}")

In [26]:
y_pred_train = voting_clf.predict(X_train)
y_pred_test = voting_clf.predict(X_test)


In [29]:
train_roc = f1_score(y_train, y_pred_train)
print(f"Train f1 score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = f1_score(y_test, y_pred_test)
print(f"Test f1 score: {test_roc:.3f}")

Train f1 score: 0.726
Test f1 score: 0.000


In [30]:
train_roc = precision_score(y_train, y_pred_train)
print(f"Train precision score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = precision_score(y_test, y_pred_test)
print(f"Test precision score: {test_roc:.3f}")



Train precision score: 1.000
Test precision score: 0.000
Train recall score: 0.570
Test recall score: 0.000


In [None]:

train_roc = recall_score(y_train, y_pred_train)
print(f"Train recall score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = recall_score(y_test, y_pred_test)
print(f"Test recall score: {test_roc:.3f}")

In [31]:
from sklearn.metrics import roc_auc_score
train_roc = roc_auc_score(y_train, y_pred_train)
print(f"Train f1 score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = roc_auc_score(y_test, y_pred_test)
print(f"Test f1 score: {test_roc:.3f}")

Train f1 score: 0.785
Test f1 score: 0.500


In [27]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3999
           1       1.00      0.57      0.73       481

    accuracy                           0.95      4480
   macro avg       0.98      0.78      0.85      4480
weighted avg       0.96      0.95      0.95      4480



In [28]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

           0       0.89      1.00      0.94      1001
           1       0.00      0.00      0.00       119

    accuracy                           0.89      1120
   macro avg       0.45      0.50      0.47      1120
weighted avg       0.80      0.89      0.84      1120



### Soft Voting

In [36]:
soft_voting_clf = VotingClassifier(classifiers, voting='soft')

In [37]:
soft_voting_clf.fit(X_train, y_train)

VotingClassifier(estimators=[['KNN',
                              KNeighborsClassifier(n_jobs=-1, n_neighbors=10,
                                                   p=1, weights='distance')],
                             ['LogisticRegression',
                              LogisticRegression(max_iter=1000)],
                             ['NaiveBayes',
                              GaussianNB(var_smoothing=0.011289431757440382)],
                             ['DecisionTree',
                              DecisionTreeClassifier(criterion='entropy',
                                                     max_depth=9,
                                                     max_features=0.1,
                                                     min_samples_split=249)],
                             ['RandomForest',
                              RandomForestClassifier(max_depth=15,
                                                     max_features='log2',
                                            

In [38]:
y_pred_train = soft_voting_clf.predict(X_train)
y_pred_test = soft_voting_clf.predict(X_test)

In [39]:
train_roc = f1_score(y_train, y_pred_train)
print(f"Train f1 score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = f1_score(y_test, y_pred_test)
print(f"Test f1 score: {test_roc:.3f}")

Train f1 score: 0.776
Test f1 score: 0.000


In [40]:
train_roc = precision_score(y_train, y_pred_train)
print(f"Train precision score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = precision_score(y_test, y_pred_test)
print(f"Test precision score: {test_roc:.3f}")



Train precision score: 1.000
Test precision score: 0.000
Train recall score: 0.634
Test recall score: 0.000


In [None]:

train_roc = recall_score(y_train, y_pred_train)
print(f"Train recall score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = recall_score(y_test, y_pred_test)
print(f"Test recall score: {test_roc:.3f}")

In [41]:
train_roc = roc_auc_score(y_train, y_pred_train)
print(f"Train f1 score: {train_roc:.3f}")

# Evaluate the test accuracy
test_roc = roc_auc_score(y_test, y_pred_test)
print(f"Test f1 score: {test_roc:.3f}")

Train f1 score: 0.817
Test f1 score: 0.499
