## Import Libraries

Mix of libraries for data preprocessing, visualization, and modeling.

In [1]:
import os
import random
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image
import imagesize

from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import layers, models

# Seed every random number generator the notebook relies on.
# random.seed alone only affects Python's `random` module; pandas `.sample`
# draws from NumPy's global RNG and Keras draws from TensorFlow's RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

2024-06-06 21:33:18.617841: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Exploration

Here, we create a dataframe containing every image file name (as a string), its blood cell type, and its image dimensions. We then use the counts per blood cell type and per image dimension to check whether the data is balanced.

In [4]:
# The dataset directory holds one folder per blood cell type (8 types to classify).
# The original Kaggle dataset stores the images inside those 8 folders; an extra
# 'All_Images' folder containing every image was added so that converting the
# images to numpy arrays later on is easier.

# Directory entries that are not blood cell type folders
non_bloodcell_entries = ['.DS_Store', 'All_Images']

bloodcells = []
for entry in os.listdir("bloodcells_dataset"):
    if entry not in non_bloodcell_entries:
        bloodcells.append(entry)

bloodcells

['basophil',
 'neutrophil',
 'ig',
 'monocyte',
 'eosinophil',
 'erythroblast',
 'lymphocyte',
 'platelet']

In [5]:
def image_df(folder_names):

    '''
    Outputs a dataframe of image file names (as strings), bloodcell type, and image dimension information.

    Args:
        1) folder_names (list): list of bloodcell type folders inside 'bloodcells_dataset/'

    Returns:
        Dataframe with one row per image and the columns:
            images (file name), type (integer label), type_category (folder name),
            width, height
    '''

    # one small dataframe per bloodcell type: file names found in that folder + the folder name
    dfs = []
    for folder_name in folder_names:
        images = os.listdir('bloodcells_dataset/' + folder_name)
        dfs.append(pd.DataFrame(data = {'images': images, 'type': folder_name}))

    # combine all dataframes
    all_data = pd.concat(dfs)

    # Remove file names that may have been accidentally copied or are .DS_Store.
    # regex = False: match the literal text ('.' would otherwise match any character).
    unwanted = (all_data['images'].str.contains('.DS_Store', regex = False)
                | all_data['images'].str.contains('copy', regex = False))

    # reset_index returns a fresh frame, so the column assignments below
    # do not write into a filtered slice of the concatenated data
    all_data = all_data[~unwanted].reset_index(drop = True)

    # Convert bloodcell types to numbers for our model
    le = LabelEncoder()
    all_data['type_category'] = all_data['type'] # keep a copy of bloodcell types by name
    all_data['type'] = le.fit_transform(all_data['type'])

    # Store dimensions of each image in case we find different dimensions.
    # NOTE: dimensions are read from the combined 'All_Images' folder, so every
    # file listed in a type folder is assumed to exist there as well.
    dimensions = [imagesize.get('bloodcells_dataset/All_Images/' + x) for x in all_data['images']]
    all_data['width'] = [width for width, _ in dimensions]
    all_data['height'] = [height for _, height in dimensions]

    return all_data


    

In [6]:
# Build the image dataframe from every bloodcell type folder listed above
df = image_df(bloodcells)

# Display the resulting dataframe
df

Unnamed: 0,images,type,type_category,width,height
0,BA_689200.jpg,0,basophil,360,363
1,BA_883452.jpg,0,basophil,360,363
2,BA_382161.jpg,0,basophil,366,369
3,BA_175579.jpg,0,basophil,360,363
4,BA_775722.jpg,0,basophil,360,363
...,...,...,...,...,...
17087,PLATELET_495918.jpg,7,platelet,360,363
17088,PLATELET_897238.jpg,7,platelet,360,363
17089,PLATELET_750430.jpg,7,platelet,360,363
17090,PLATELET_810431.jpg,7,platelet,360,363


In [8]:
# TODO: plot the counts of image height/width combinations and of each blood cell type
# (consider writing a reusable plotting function)

# df[['height', 'width']].value_counts().reset_index(name = 'count') 
# df[['type_category']].value_counts().reset_index(name = 'count')

In [6]:
class Sampling:

    '''
    Splits an image dataframe into train, validation, and test dataframes.

    The dataframe must contain an 'images' column (used to tell rows apart)
    and a 'type' column (the class label).

    Sampling methods:
        'weighted':     a fixed number of training rows per class
        'proportional': training/validation rows per class follow the class proportions
    '''

    SAMPLING_METHODS = ('weighted', 'proportional')

    # Weighted sampling: classes with at least LARGE_CLASS_THRESHOLD rows contribute
    # LARGE_CLASS_SAMPLES training rows, every other class SMALL_CLASS_SAMPLES
    LARGE_CLASS_THRESHOLD = 2000
    LARGE_CLASS_SAMPLES = 1500
    SMALL_CLASS_SAMPLES = 1000

    def __init__(self, data, sampling_method):

        self.data = data

        self.sampling_method = sampling_method

    def sample_data(self, sampling_percent = 0.8, random_state = None):

        '''
        Draws the train, validation, and test splits.

        Args:
            1) sampling_percent (float): fraction of all rows used for train + validation
               (only used by the 'proportional' method)
            2) random_state (int or None): seed for reproducible sampling; None keeps
               pandas' default behavior (NumPy's global random state)

        Returns:
            train, validation, test dataframes (rows are disjoint by 'images')

        Raises:
            ValueError: if the sampling method is not 'weighted' or 'proportional'
        '''

        if self.sampling_method not in self.SAMPLING_METHODS:
            raise ValueError(f"Unknown sampling_method {self.sampling_method!r}; "
                             f"expected one of {self.SAMPLING_METHODS}")

        df = self.data.copy()

        # A single generator shared by every draw keeps the whole split reproducible
        rng = None if random_state is None else np.random.RandomState(random_state)

        category_counts = df[['type']].value_counts().reset_index(name = 'count')

        train_dfs = []

        if self.sampling_method == 'weighted':

            for cell_type, count in zip(category_counts['type'], category_counts['count']):

                if count >= self.LARGE_CLASS_THRESHOLD:
                    n_train = self.LARGE_CLASS_SAMPLES
                else:
                    n_train = self.SMALL_CLASS_SAMPLES

                train_dfs.append(df[df['type'] == cell_type].sample(n_train, random_state = rng))

            train = pd.concat(train_dfs)

            remaining_data = df[~df['images'].isin(train['images'])]

            # half of what is left becomes validation, the other half test
            validation = remaining_data.sample(len(remaining_data) // 2, random_state = rng)

        else: # 'proportional'

            num_samples = int(sampling_percent * len(df))

            # per-class budget, then 20% of each class budget is moved to validation
            category_counts['prop'] = category_counts['count'] / len(df)
            category_counts['train_samples'] = (category_counts['prop'] * num_samples).astype('int32')
            category_counts['val_samples'] = (category_counts['train_samples'] * 0.2).astype('int32')
            category_counts['train_samples'] = category_counts['train_samples'] - category_counts['val_samples']

            for cell_type, n_train in zip(category_counts['type'], category_counts['train_samples']):

                train_dfs.append(df[df['type'] == cell_type].sample(n_train, random_state = rng))

            train = pd.concat(train_dfs)

            remaining_data = df[~df['images'].isin(train['images'])]

            val_dfs = []

            for cell_type, n_val in zip(category_counts['type'], category_counts['val_samples']):

                candidates = remaining_data[remaining_data['type'] == cell_type]

                val_dfs.append(candidates.sample(n_val, random_state = rng))

            validation = pd.concat(val_dfs)

        train = train.dropna(how = 'all')

        # every numeric column of the training split is stored as int32
        numeric_cols = train.select_dtypes(np.number).columns

        train = train.astype({col: 'int32' for col in numeric_cols})

        # whatever is neither train nor validation is the test split
        test = remaining_data[~remaining_data['images'].isin(validation['images'])]

        return train, validation, test
            

In [7]:
# Weighted sampling: build the sampler, then draw its train/validation/test splits
weighted_sampling = Sampling(data = df, sampling_method = 'weighted')
proportional_sampling = Sampling(data = df, sampling_method = 'proportional')

weighted_splits = weighted_sampling.sample_data()
weighted_train, weighted_val, weighted_test = weighted_splits

# Proportional sampling: same procedure with the proportional sampler
prop_splits = proportional_sampling.sample_data()
prop_train, prop_val, prop_test = prop_splits

In [8]:
class Convert_Images:

    '''
    Loads the images listed in a dataframe as resized, rescaled numpy arrays.

    The dataframe must contain an 'images' column (file names inside
    'bloodcells_dataset/All_Images/') and a 'type' column (labels).
    '''

    def __init__(self, data):

        self.data = data

        # full path of every image inside the combined image folder
        self.file_names = 'bloodcells_dataset/All_Images/' + self.data['images']

        self.labels = self.data['type']

    def load_image(self, file_name, resize):

        '''
        Reads one image file and returns it as a float32 tensor.

        Args:
            1) file_name: path of the image file
            2) resize (int): the image is resized to resize x resize pixels

        Returns:
            Tensor of shape (resize, resize, 3) with pixel values divided by 255
        '''

        raw = tf.io.read_file(file_name)

        # channels = 3 forces every decoded image to 3 channels, so grayscale or RGBA
        # files cannot produce tensors of a different shape than the model's input
        tensor = tf.io.decode_image(raw, channels = 3, expand_animations = False)

        tensor = tf.image.resize(tensor, size = [resize, resize])

        tensor = tf.cast(tensor, tf.float32) / 255.0

        return tensor

    def image_arrays_and_labels(self, resize = 32):

        '''
        Loads every image of the dataframe.

        Args:
            1) resize (int): side length in pixels that each image is resized to

        Returns:
            images (numpy array with one entry per image), labels (the 'type' column)
        '''

        dataset = tf.data.Dataset.from_tensor_slices(self.file_names)

        # images are decoded in parallel; map keeps the element order by default
        dataset = dataset.map(lambda file_name: self.load_image(file_name, resize),
                              num_parallel_calls = tf.data.AUTOTUNE)

        images = np.array(list(dataset))

        return images, self.labels

In [9]:
# Side length (in pixels) every image is resized to
resize_pixels = 32

# One image converter per split
weighted_train_image_df = Convert_Images(weighted_train)
weighted_val_image_df = Convert_Images(weighted_val)
weighted_test_image_df = Convert_Images(weighted_test)
prop_train_image_df = Convert_Images(prop_train)
prop_val_image_df = Convert_Images(prop_val)
prop_test_image_df = Convert_Images(prop_test)

# Weighted sampling: image arrays and labels
weighted_train_images, weighted_train_labels = weighted_train_image_df.image_arrays_and_labels(resize = resize_pixels)
weighted_val_images, weighted_val_labels = weighted_val_image_df.image_arrays_and_labels(resize = resize_pixels)
weighted_test_images, weighted_test_labels = weighted_test_image_df.image_arrays_and_labels(resize = resize_pixels)

# Proportional sampling: image arrays and labels
prop_train_images, prop_train_labels = prop_train_image_df.image_arrays_and_labels(resize = resize_pixels)
prop_val_images, prop_val_labels = prop_val_image_df.image_arrays_and_labels(resize = resize_pixels)
prop_test_images, prop_test_labels = prop_test_image_df.image_arrays_and_labels(resize = resize_pixels)

2024-06-06 20:56:16.159803: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [10]:
# Number of images in every split, keyed by the name of the array
split_images = {
    'weighted_train_images': weighted_train_images,
    'weighted_val_images': weighted_val_images,
    'weighted_test_images': weighted_test_images,
    'prop_train_images': prop_train_images,
    'prop_val_images': prop_val_images,
    'prop_test_images': prop_test_images,
}

for split_name, split_array in split_images.items():
    print(f'{split_name}: {len(split_array)}')

weighted_train_images: 10000
weighted_val_images: 3546
weighted_test_images: 3546
prop_train_images: 10938
prop_val_images: 2731
prop_test_images: 3423


In [11]:
def train_model(model, train_data, train_labels, val_data, val_test, test_data, test_labels, optimizer = 'adam', epochs = 5, batch_size = 64):

    '''
    Compiles and fits a model, then scores it on the test data.

    Args:
        1) model: Keras model to compile and fit (its outputs are treated as logits)
        2) train_data, train_labels: training images and labels
        3) val_data, val_test: validation images and validation labels
        4) test_data, test_labels: test images and labels
        5) optimizer: optimizer passed to model.compile
        6) epochs (int): number of training epochs
        7) batch_size (int): batch size used while fitting

    Returns:
        history of the fit, predicted class per test image, test accuracy
    '''

    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True)

    model.compile(optimizer = optimizer, loss = loss_fn, metrics = ['accuracy'])

    history = model.fit(
        train_data,
        train_labels,
        validation_data = (val_data, val_test),
        epochs = epochs,
        batch_size = batch_size,
    )

    # predicted class = index of the largest output per test image
    class_scores = model.predict(test_data)
    predictions = class_scores.argmax(axis = 1)

    # share of test images whose predicted class equals the label
    n_correct = np.sum(predictions == test_labels)
    test_accuracy = n_correct / len(test_labels)

    return history, predictions, test_accuracy

In [12]:
# Model 1 of 3: a simple CNN.
# Two copies are built, one per sampling method (weighted and proportional).

input_dims = (resize_pixels, resize_pixels, 3)

simple_model_layers = [
    # convolution + pooling stages
    layers.Conv2D(32, (3, 3), activation = 'relu', input_shape = input_dims),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(32, (3, 3), activation = 'relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation = 'relu'),

    # flatten into a 1d array for the dense layers
    layers.Flatten(),

    # fully connected part
    layers.Dense(64, activation = 'relu'),
    layers.Dropout(rate = 0.2),

    # one output per category (8 different categories)
    layers.Dense(8),
]

# Simple Model - Weighted Sampling
sm_w = models.Sequential(simple_model_layers)

# Simple Model - Proportional Sampling
sm_p = models.clone_model(sm_w)

In [13]:
# Train and evaluate the simple model on the weighted-sampling splits
sm_w_history, sm_w_predictions, sm_w_test_accuracy = train_model(
    model = sm_w,
    train_data = weighted_train_images,
    train_labels = weighted_train_labels,
    val_data = weighted_val_images,
    val_test = weighted_val_labels,
    test_data = weighted_test_images,
    test_labels = weighted_test_labels,
    epochs = 25,
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [14]:
# Train and evaluate the simple model on the proportional-sampling splits
sm_p_history, sm_p_predictions, sm_p_test_accuracy = train_model(
    model = sm_p,
    train_data = prop_train_images,
    train_labels = prop_train_labels,
    val_data = prop_val_images,
    val_test = prop_val_labels,
    test_data = prop_test_images,
    test_labels = prop_test_labels,
    epochs = 25,
)

Epoch 1/25
Epoch 2/25

KeyboardInterrupt: 

In [None]:
# Test accuracy of the simple model: weighted sampling vs. proportional sampling.
# Both training cells above must run to completion before these names exist.
sm_w_test_accuracy, sm_p_test_accuracy