Uploading the dataset

In [1]:
from google.colab import drive
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
drive.mount('/content/drive')

# List files in a directory
!ls '/content/drive/My Drive/AN2DL'


# Load the NPZ file
data = np.load('/content/drive/My Drive/AN2DL/training_set.npz')

Mounted at /content/drive
training_set.npz


Preprocessing Dataset - Removing shrek and troll images

In [2]:
images = data['images']
labels = data["labels"]

print(images.shape)
print(labels.shape)

(13759, 96, 96, 3)
(13759, 1)


In [3]:
labels # labels are already numerical values so no need to one-hot-encode them or anything

array([[7],
       [3],
       [6],
       ...,
       [5],
       [5],
       [5]], dtype=uint8)

In [4]:
def show_example(index_example):
  for i in range(0,100):
    if labels[i] == [index_example]:
      plt.imshow(images[i]/255.0)
      plt.show()
      break

In [None]:
show_example(0)
show_example(1)
show_example(2)
show_example(3)
show_example(4)
show_example(5)
show_example(6)
show_example(7)

Check the class imbalance

In [5]:
# check to see the number of data for each label
data_labels = np.array(["basophil", "eosinophil", "erythroblast", "immature granulocytes", "lymphocyte", "monocyte", "neutrophil", "platelet"])

# Count occurrences of each label
unique_labels, counts = np.unique(labels, return_counts=True)

# Print the counts for each label
for label, count in zip(unique_labels, counts):
    print(f"{label}: {count}")

0: 1052
1: 2381
2: 1285
3: 2226
4: 1049
5: 1393
6: 2530
7: 1843


In [6]:
import numpy as np
import tensorflow as tf

# Indices for the troll images (known examples)
shrekIdx = 12691
manIdx = 13610

shrek_indices = []
trol_indices = []
new_data = []
new_labels = []
for i, image in enumerate(images):
  if np.sum(images[12691] - image) == 0:
    shrek_indices.append(i)
  elif np.sum(images[13610] - image) == 0:
    trol_indices.append(i)
  else:
    new_data.append(image)
    new_labels.append(labels[i])

new_images = np.array(new_data)
new_dataLabels = np.array(new_labels)

Train/Test Split Data

In [7]:
from sklearn.model_selection import train_test_split

X = new_images
y = new_dataLabels
# Step 1: First, split into train and remaining (test + validation)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 2: Then, split the remaining data into validation and test sets
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

Simple CNN Model

In [8]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# Initialize the model
model = Sequential()

# First Convolutional Layer
model.add(Conv2D(32, (3, 3), activation='relu', input_shape=(96, 96, 3)))  # Example input shape 96x96x3 (RGB)
model.add(MaxPooling2D((2, 2)))

# Second Convolutional Layer
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))  # Dropout layer to prevent overfitting

# Third Convolutional Layer
model.add(Conv2D(128, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))  # Dropout layer to prevent overfitting

# Fourth Convolutional Layer
model.add(Conv2D(256, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))  # Dropout layer to prevent overfitting

# Fifth Convolutional Layer
model.add(Conv2D(512, (3, 3), activation='relu'))
model.add(MaxPooling2D((2, 2)))
model.add(Dropout(0.25))  # Dropout layer to prevent overfitting

# Flatten the output to pass it into the fully connected layer
model.add(Flatten())

# Fully connected classifier layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.25))  # Dropout layer to prevent overfitting
model.add(Dense(8, activation='softmax'))  # Output layer for 8 classes (change the number of classes as needed)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Display the model architecture
model.summary()
history = model.fit(
    x=X_train,  # Input data (features)
    y=y_train,  # Labels (targets)
    batch_size=32,  # Number of samples per gradient update
    epochs=10,  # Number of times to iterate over the training data
    verbose=1,  # Display progress bar (0 = silent, 1 = progress bar, 2 = one line per epoch)
    validation_data=(X_val,y_val),
    shuffle=True,  # Whether to shuffle the training data before each epoch
)



  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 41ms/step - accuracy: 0.3089 - loss: 7.9985 - val_accuracy: 0.6405 - val_loss: 1.1167
Epoch 2/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 12ms/step - accuracy: 0.6099 - loss: 1.0918 - val_accuracy: 0.7375 - val_loss: 0.7055
Epoch 3/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 13ms/step - accuracy: 0.7255 - loss: 0.7789 - val_accuracy: 0.8060 - val_loss: 0.5499
Epoch 4/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 11ms/step - accuracy: 0.7683 - loss: 0.6641 - val_accuracy: 0.8495 - val_loss: 0.4207
Epoch 5/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.8064 - loss: 0.5509 - val_accuracy: 0.8704 - val_loss: 0.4189
Epoch 6/10
[1m299/299[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 12ms/step - accuracy: 0.8355 - loss: 0.4758 - val_accuracy: 0.8395 - val_loss: 0.4330
Epoch 7/10
[1m299/29

Get Validation Accuracy

In [9]:
# Predict class probabilities
predictions = model.predict(X_test)

# Get the class with the highest probability for each test sample
predicted_classes = np.argmax(predictions, axis=1)

# Evaluate the model on the test set
test_loss, test_accuracy = model.evaluate(X_test, y_test, verbose=1)


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.8733 - loss: 0.4442
