In [16]:
import random

random.seed(42)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os

from PIL import Image
import imagesize

from sklearn.preprocessing import LabelEncoder

import tensorflow as tf
from tensorflow.keras import layers, models

In [17]:
bloodcells = os.listdir("bloodcells_dataset")

print(bloodcells)

bloodcells.remove('.DS_Store')
bloodcells.remove('All_Images')

['basophil', 'neutrophil', '.DS_Store', 'ig', 'monocyte', 'All_Images', 'eosinophil', 'erythroblast', 'lymphocyte', 'platelet']


In [18]:
images = os.listdir('bloodcells_dataset/' + bloodcells[0])
df = pd.DataFrame(data = {'images': images, 'type': bloodcells[0], 'height': np.nan, 'width': np.nan})

for i in range(1, len(bloodcells)):

    images = os.listdir('bloodcells_dataset/' + bloodcells[i]) # jpg string paths
    
    images_df = pd.DataFrame(data = {'images': images, 'type': bloodcells[i], 'height': np.nan, 'width': np.nan})
    
    df = pd.concat([df, images_df])

le = LabelEncoder()

df['width'] = df['images'].apply(lambda x: imagesize.get('bloodcells_dataset/All_Images/' + x)[0])
df['height'] = df['images'].apply(lambda x: imagesize.get('bloodcells_dataset/All_Images/' + x)[1])
df['type'] = le.fit_transform(df['type'])
df.index = range(len(df.index))

df = df[df['images'].str.contains('copy') == False]


df

Unnamed: 0,images,type,height,width
0,BA_689200.jpg,0,363,360
1,BA_883452.jpg,0,363,360
2,BA_382161.jpg,0,369,366
3,BA_175579.jpg,0,363,360
4,BA_775722.jpg,0,363,360
...,...,...,...,...
17087,PLATELET_495918.jpg,7,363,360
17088,PLATELET_897238.jpg,7,363,360
17089,PLATELET_750430.jpg,7,363,360
17090,PLATELET_810431.jpg,7,363,360


In [19]:
df[['height', 'width']].value_counts().reset_index(name = 'count')

Unnamed: 0,height,width,count
0,363,360,16639
1,369,366,250
2,360,360,198
3,361,360,2
4,360,359,1
5,360,361,1
6,360,362,1


In [20]:
category_counts = df[['type']].value_counts().reset_index(name = 'count')

total_num_images = category_counts['count'].sum()

category_counts

Unnamed: 0,type,count
0,6,3329
1,1,3117
2,3,2895
3,7,2348
4,2,1551
5,5,1420
6,0,1218
7,4,1214


In [21]:
# Weighted Sampling

# high representation - types 1, 3, 6, 7 (sample 1500)
# low representation - types 0, 2, 4, 5 (sample 1000)

df_weighted_sampling_train = df[df['type'] == 6].sample(1500)

for i in range(1, len(category_counts)):

    type = category_counts['type'][i]

    if category_counts['count'][i] >= 2000:
        add_samples = df[df['type'] == type].sample(1500)
    else: 
        add_samples = df[df['type'] == type].sample(1000)

    df_weighted_sampling_train = pd.concat([df_weighted_sampling_train, add_samples])

df_weighted_sampling_test = df[~df['images'].isin(df_weighted_sampling_train['images'])]

resize_pixels = 64

# Train
weighted_sampling_train_images = np.array(
    [np.array(Image.open('bloodcells_dataset/All_Images/' + image).resize((resize_pixels, resize_pixels))) for image in df_weighted_sampling_train['images']]
)

weighted_sampling_train_images = tf.convert_to_tensor(weighted_sampling_train_images / 255.0)

weighted_sampling_train_labels = tf.convert_to_tensor(df_weighted_sampling_train['type'])

# Test
weighted_sampling_test_images = np.array(
    [np.array(Image.open('bloodcells_dataset/All_Images/' + image).resize((resize_pixels, resize_pixels))) for image in df_weighted_sampling_test['images']]
)

weighted_sampling_test_images = tf.convert_to_tensor(weighted_sampling_test_images / 255.0)

weighted_sampling_test_labels = tf.convert_to_tensor(df_weighted_sampling_test['type'])

In [22]:
# Proportional Sampling

num_samples = int(0.8 * total_num_images)

category_counts['prop'] = category_counts['count'] / total_num_images
category_counts['prop_samples'] = category_counts['prop'] * num_samples
category_counts['prop_samples'] = category_counts['prop_samples'].astype('int32')

df_prop_sampling_train = df[df['type'] == 6].sample(2663)

for i in range(1, len(category_counts)):

    type = category_counts['type'][i]

    samples = category_counts['prop_samples'][i]

    add_samples = df[df['type'] == type].sample(samples)

    df_prop_sampling_train = pd.concat([df_prop_sampling_train, add_samples])

df_prop_sampling_test = df[~df['images'].isin(df_prop_sampling_train['images'])]

resize_pixels = 64

# # Train
weighted_prop_train_images = np.array(
     [np.array(Image.open('bloodcells_dataset/All_Images/' + image).resize((resize_pixels, resize_pixels))) for image in df_prop_sampling_train['images']]
 )

weighted_prop_train_images = tf.convert_to_tensor(weighted_prop_train_images / 255.0)

weighted_prop_train_labels = tf.convert_to_tensor(df_prop_sampling_train['type'])

# Test
weighted_prop_test_images = np.array(
  [np.array(Image.open('bloodcells_dataset/All_Images/' + image).resize((resize_pixels, resize_pixels))) for image in df_prop_sampling_test['images']]
)

weighted_prop_test_images = tf.convert_to_tensor(weighted_prop_test_images / 255.0)

weighted_prop_test_labels = tf.convert_to_tensor(df_prop_sampling_test['type'])

In [8]:
# Opening an image
# test_img = Image.open('bloodcells_dataset/All_Images/' + df['images'][14000])

# Resize
# test_img.resize((320, 320))

# Crop
# test_img.crop((20, 20, 340, 340))

In [9]:


model = models.Sequential(
    
    [
        # 32 kernels, 3 by 3 kernel
        # input is 300 by 300 pixels with 3 color channels
        layers.Conv2D(32, (3, 3), activation = 'relu', input_shape = (resize_pixels, resize_pixels, 3)),
        
        layers.MaxPooling2D((2, 2)),
        
        layers.Conv2D(32, (3, 3), activation = 'relu'),
        
        layers.MaxPooling2D((2, 2)),
        
        layers.Conv2D(64, (3, 3), activation = 'relu'),

        # flatten into 1d array
        layers.Flatten(),

        # Neural network
        layers.Dense(64, activation = 'relu'),

        # 10 different categories
        layers.Dense(8) 
    ]
    
)

model.compile(optimizer = 'adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), # using from_logits = True b/c no softmax layer
              metrics = ['accuracy'])

history = model.fit(weighted_sampling_train_images, 
                    weighted_sampling_train_labels, 
                    epochs = 5, 
                    batch_size = 64)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
predictions = model.predict(weighted_sampling_test_images)

In [14]:
# codes for average pooling:
# ##tf.keras.layers.AveragePooling2D(
#     pool_size=(2, 2),
#     strides=None,
#     padding='same',
#     data_format=None
# )

random.seed(102849)

avgPool_model = models.Sequential(
    [
        layers.Conv2D(32, (3, 3), activation = 'relu', input_shape = (resize_pixels, resize_pixels, 3)),
        
        layers.AveragePooling2D((2, 2), strides=None, padding='valid', data_format=None),
        
        layers.Conv2D(32, (3, 3), activation = 'relu'),
        
        layers.AveragePooling2D((2, 2), strides=None, padding='valid', data_format=None),
        
        layers.Conv2D(64, (3, 3), activation = 'relu'),

        # flatten into 1d array
        layers.Flatten(),

        # Neural network
        layers.Dense(64, activation = 'relu'),

        # 10 different categories
        layers.Dense(8) 
    ]
)

avgPool_model.compile(optimizer = 'adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), # using from_logits = True b/c no softmax layer
              metrics = ['accuracy'])

history = avgPool_model.fit(weighted_sampling_train_images, 
                    weighted_sampling_train_labels, 
                    epochs = 5, 
                    batch_size = 64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [26]:
random.seed(10439)

from keras.layers import Input,Conv2D
from tensorflow.keras.layers import DepthwiseConv2D

def matlab_style_gauss2D(shape=(3,3),sigma=0.5):
    """
    2D gaussian mask - should give the same result as MATLAB's
    fspecial('gaussian',[shape],[sigma])
    """
    m,n = [(ss-1.)/2. for ss in shape]
    y,x = np.ogrid[-m:m+1,-n:n+1]
    h = np.exp( -(x*x + y*y) / (2.*sigma*sigma) )
    h[ h < np.finfo(h.dtype).eps*h.max() ] = 0
    sumh = h.sum()
    if sumh != 0:
        h /= sumh
    return h

kernel_size = 3
kernel_weights = matlab_style_gauss2D()

kernel_weights = np.expand_dims(kernel_weights, axis=-1)
kernel_weights = np.repeat(kernel_weights, 3, axis=-1)
kernel_weights = np.expand_dims(kernel_weights, axis=-1)

gaussian_model = models.Sequential(
    [
        layers.DepthwiseConv2D((kernel_size,kernel_size), use_bias=False, padding='same', data_format=None),
        
        layers.AveragePooling2D((2, 2), strides=None, padding='same', data_format=None),
        
        layers.DepthwiseConv2D((kernel_size,kernel_size), use_bias=False, padding='same', data_format=None),
        
        layers.AveragePooling2D((2, 2), strides=None, padding='same', data_format=None),
        
        layers.DepthwiseConv2D((kernel_size,kernel_size), use_bias=False, padding='same', data_format=None),

        # flatten into 1d array
        layers.Flatten(),

        # Neural network
        layers.Dense(64, activation = 'relu'),

        # 10 different categories
        layers.Dense(8) 
    ]
)

gaussian_model.compile(optimizer = 'adam',
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True), # using from_logits = True b/c no softmax layer
              metrics = ['accuracy'])

In [27]:
# Average pooling model using proportional sampling method
history = gaussian_model.fit(weighted_prop_train_images, 
                    weighted_prop_train_labels, 
                    epochs = 5, 
                    batch_size = 64)

# # Average pooling model using weighted sampling method
# history = avgPool_model.fit(weighted_sampling_train_images, 
#                     weighted_sampling_train_labels, 
#                     epochs = 5, 
#                     batch_size = 64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
