In [1]:
import random
random.seed(42)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import time

from PIL import Image
import imagesize

from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import layers, models

2024-06-05 22:36:35.273541: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Dataset folder has 8 different folders, which represent 8 different bloodcells we will be classifying
# Open up folder names and remove folder names that are not bloodcell types

# Original kaggle dataset has images stored in each of the 8 folders, so we made a folder that contained all the images
# so that it is easier to convert the images to numpy arrays later on

bloodcells = os.listdir("bloodcells_dataset")
bloodcells = [x for x in bloodcells if x not in ['.DS_Store', 'All_Images']]

bloodcells

['basophil',
 'neutrophil',
 'ig',
 'monocyte',
 'eosinophil',
 'erythroblast',
 'lymphocyte',
 'platelet']

In [None]:
# initialize empty dataframe to store image strings and bloodcell type
df = pd.DataFrame(np.nan, 
                  index = [0], 
                  columns = ['images', 'type'])

# loop through bloodcell types and store image paths and bloodcell categories
for i in range(len(bloodcells)):

    images = os.listdir('bloodcells_dataset/' + bloodcells[i]) # jpg string paths
    
    images_df = pd.DataFrame(data = {'images': images, 
                                     'type': bloodcells[i]})
    
    df = pd.concat([df, images_df])

# drop row that was first initialized with NaNs
df = df.dropna(how = 'all')

# Convert bloodcel types to numbers for our model
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Store dimensions of image incase we find different dimensions 
df['width'] = df['images'].apply(lambda x: imagesize.get('bloodcells_dataset/All_Images/' + x)[0])
df['height'] = df['images'].apply(lambda x: imagesize.get('bloodcells_dataset/All_Images/' + x)[1])

# Reset index and remove image paths that may have been accidentally copied
df = df[df['images'].str.contains('copy') == False]
df = df.reset_index(drop = True)


df

In [None]:
# Visuals for count of height/width and count of each blood cell type
# Maybe use function to plot

# df[['height', 'width']].value_counts().reset_index(name = 'count') 
# df[['type']].value_counts().reset_index(name = 'count')

In [None]:
class Sampling:

    def __init__(self, data, sampling_method):

        self.data = data
        
        self.sampling_method = sampling_method

    def sample_data(self, sampling_percent = 0.8):

        df = self.data.copy()

        category_counts = df[['type']].value_counts().reset_index(name = 'count')

        train = pd.DataFrame(np.nan, index = [0], columns = list(df.columns))

        if self.sampling_method == 'weighted':

            for i in range(len(category_counts)):

                type = category_counts['type'][i]
            
                if category_counts['count'][i] >= 2000:
                    add_samples = df[df['type'] == type].sample(1500)
                else: 
                    add_samples = df[df['type'] == type].sample(1000)
            
                train = pd.concat([train, add_samples])

        elif self.sampling_method == 'proportional': 

            num_samples = int(sampling_percent * len(df))
            
            category_counts['prop'] = category_counts['count'] / len(df)
            category_counts['prop_samples'] = category_counts['prop'] * num_samples
            category_counts['prop_samples'] = category_counts['prop_samples'].astype('int32')

            for i in range(len(category_counts)):

                type = category_counts['type'][i]
            
                samples = category_counts['prop_samples'][i]
            
                add_samples = df[df['type'] == type].sample(samples)
            
                train = pd.concat([train, add_samples])

        train = train.dropna(how = 'all')

        float_cols = train.select_dtypes(np.number)

        train[float_cols.columns] = float_cols.astype('int32')

        test = df[~df['images'].isin(train['images'])]    

        return train, test
            

In [None]:
weighted_sampling = Sampling(df, 'weighted')
proportional_sampling = Sampling(df, 'proportional')

weighted_train, weighted_test = weighted_sampling.sample_data()
prop_train, prop_test = proportional_sampling.sample_data()

In [None]:
class Convert_Images:

    def __init__(self, data):

        self.data = data

        self.file_names = (self.data)['images'].apply(lambda x: 'bloodcells_dataset/All_Images/' + x)

        self.labels = self.data['type']

    def load_image(self, file_name, resize):
        
        raw = tf.io.read_file(file_name)
        
        tensor = tf.io.decode_image(raw, expand_animations = False)
        
        tensor = tf.image.resize(tensor, size = [resize, resize])
        
        tensor = tf.cast(tensor, tf.float32) / 255.0
        
        return tensor

    def image_arrays_and_labels(self, resize = 32):

      file_names = self.file_names

      dataset = tf.data.Dataset.from_tensor_slices(file_names)
        
      dataset = dataset.map(lambda file_name: self.load_image(file_name, resize))
        
      images = np.array(list(dataset))
        
      return images, self.labels

In [None]:
weighted_train_image_df, weighted_test_image_df = Convert_Images(weighted_train), Convert_Images(weighted_test)
prop_train_image_df, prop_test_image_df = Convert_Images(prop_train), Convert_Images(prop_test)


resize_pixels = 32

weighted_train_images, weighted_train_labels = weighted_train_image_df.image_arrays_and_labels(resize_pixels)
weighted_test_images, weighted_test_labels = weighted_test_image_df.image_arrays_and_labels(resize_pixels)
prop_train_images, prop_train_labels = prop_train_image_df.image_arrays_and_labels(resize_pixels)
prop_test_images, prop_test_labels = prop_test_image_df.image_arrays_and_labels(resize_pixels)

In [None]:
def train_model(model, train_data, train_labels, test_data, test_labels, optimizer = 'adam', epochs = 5, batch_size = 64):

    model.compile(optimizer = optimizer,
                  loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), 
                  metrics = ['accuracy'])

    history = model.fit(train_data, 
                        train_labels, 
                        epochs = epochs, 
                        batch_size = batch_size)

    predictions = (model.predict(test_data)).argmax(axis = 1)

    test_accuracy = np.sum(predictions == test_labels) / len(test_labels)

    return history, predictions, test_accuracy

In [None]:
# 1st of 3 models
# Simple model
# One for weighted sampling and proportional sampling

# Simple Model - Weighted Sampling
sm_w = models.Sequential(
    
    [
        layers.Conv2D(32, (3, 3), activation = 'relu', input_shape = (resize_pixels, resize_pixels, 3)),
        
        layers.MaxPooling2D((2, 2)),
        
        layers.Conv2D(32, (3, 3), activation = 'relu'),
        
        layers.MaxPooling2D((2, 2)),
        
        layers.Conv2D(64, (3, 3), activation = 'relu'),

        # flatten into 1d array
        layers.Flatten(),

        # Neural network
        layers.Dense(64, activation = 'relu'),

        layers.Dropout(rate = 0.2),
        
        # 8 different categories
        layers.Dense(8) 
    ]
    
)

# Simple Model - Proportional Sampling
sm_p = models.clone_model(sm_w)

In [None]:
sm_w_history, sm_w_predictions, sm_w_test_accuracy = train_model(sm_w,
                                                                 weighted_train_images,
                                                                 weighted_train_labels,
                                                                 weighted_test_images,
                                                                 weighted_test_labels,
                                                                 epochs = 20)

In [None]:
sm_p_history, sm_p_predictions, sm_p_test_accuracy = train_model(sm_p,
                                                                 prop_train_images,
                                                                 prop_train_labels,
                                                                 prop_test_images,
                                                                 prop_test_labels, 
                                                                 epochs = 20)

In [None]:
sm_w_test_accuracy, sm_p_test_accuracy