# Kaggle Cell Protein Classification - Create model on Kaggle Environment

Link to competition: https://www.kaggle.com/c/human-protein-atlas-image-classification

This notebook was converted from my prior Kaggle notebook.  It was migrated to TF 2.x, and various methods were converted to be more TF-native.  It creates a model that classifies all of the proteins in a cell.  This is a multi-label classification problem, since more than one protein can be present in a sample.  It was run in the Kaggle environment because the training files are large.  The cell protein links below are very good and help with understanding the different proteins and color stains.  


## Cell Protein Links:
https://www.proteinatlas.org/humanproteome/cell

https://www.proteinatlas.org/learn/dictionary/cell


## Other links:
Great analysis: https://www.kaggle.com/allunia/protein-atlas-exploration-and-baseline

3rd place solution: https://www.kaggle.com/c/human-protein-atlas-image-classification/discussion/77320

## Final Classification Report from Training images:

Accuracy : 0.4034256559766764

              precision    recall  f1-score   support

           0    0.81948   0.83375   0.82655      2412
           1    0.73514   0.62100   0.67327       219
           2    0.74301   0.64492   0.69050       659
           3    0.65948   0.51689   0.57955       296
           4    0.73810   0.66159   0.69775       328
           5    0.61420   0.43355   0.50830       459
           6    0.51479   0.51176   0.51327       170
           7    0.72519   0.62363   0.67059       457
           8    0.00000   0.00000   0.00000        11
           9    1.00000   0.28571   0.44444         7
          10    1.00000   0.50000   0.66667         4
          11    0.73885   0.61053   0.66859       190
          12    0.73000   0.52518   0.61088       139
          13    0.65789   0.48544   0.55866       103
          14    0.83158   0.78607   0.80818       201
          15    0.00000   0.00000   0.00000         8
          16    0.60000   0.16484   0.25862        91
          17    0.31034   0.30000   0.30508        30
          18    0.41727   0.33143   0.36943       175
          19    0.49242   0.23050   0.31401       282
          20    0.66667   0.14815   0.24242        27
          21    0.71386   0.62779   0.66806       763
          22    0.59821   0.47183   0.52756       142
          23    0.77897   0.72167   0.74923       503
          24    0.72917   0.64815   0.68627        54
          25    0.70211   0.71169   0.70687      1540
          26    0.46429   0.22034   0.29885        59
          27    0.00000   0.00000   0.00000         3
          
   micro avg    0.73121   0.65559   0.69134      9332
   
   macro avg    0.60646   0.45059   0.50156      9332
   
weighted avg    0.71972   0.65559   0.68136      9332

 samples avg    0.72583   0.68251   0.67804      9332
 

In [None]:
from __future__ import absolute_import, division, print_function, unicode_literals

import os, sys, random, warnings, time, copy, csv, gc
import numpy as np 

import IPython.display as display
from PIL import Image

import matplotlib.pyplot as plt
%matplotlib inline

from tqdm import tqdm_notebook, tnrange, tqdm
import pandas as pd

import tensorflow as tf
print(tf.__version__)

from tensorflow.keras.models import load_model 

from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

AUTOTUNE = tf.data.experimental.AUTOTUNE
print("AUTOTUNE: ", AUTOTUNE)

#warnings.filterwarnings("ignore", category=DeprecationWarning)
#warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning)

## Initial Setup


In [None]:
# Copied in class from personal library

class GlobalParms(object):
    """Holds the notebook's global configuration as instance attributes.

    Every key of ``keys_and_defaults`` becomes an attribute, optionally
    overridden by a keyword argument of the same name.  Keyword arguments
    whose name is not a known key are silently ignored.  A few derived
    attributes (SCALED_ROW_DIM, SCALED_COL_DIM, IMAGE_DIM, COLOR_MODE and
    the resolved *_PATH values) are computed from the configured ones.
    """

    def __init__(self, **kwargs):
        self.keys_and_defaults = {
         "MODEL_NAME": "",  # if you leave .h5 off, puts into a subdirectory
         "ROOT_PATH": "",  # Location of the data for storing any data or files
         "TRAIN_DIR": "",  # Subdirectory in the Root for Training files
         "TEST_DIR": "",  # Optional subdirectory in  Root for Testing file
         "SUBMISSION_PATH": None,  # Optional subdirectory for Contest files
         "MODEL_PATH": None,  # Optional, subdirectory for saving/loading model
         "TRAIN_PATH": None,  # Subdirectory in the Root for Training files
         "TEST_PATH": None,  # Optional subdirectory in  Root for Testing file
         "SMALL_RUN": False,   # Optional, run size will be reduced
         "NUM_CLASSES": 0,  # Number of classes
         "CLASS_NAMES": [],  # list of class names
         "IMAGE_ROWS": 0,  # Row size of the image
         "IMAGE_COLS": 0,  # Col size of the image
         "IMAGE_CHANNELS": 0,  # Num of Channels, 1 for Greyscale, 3 for color
         "BATCH_SIZE": 0,  # Number of images in each batch
         "EPOCS": 0,  # Max number of training EPOCS
         "ROW_SCALE_FACTOR": 1,  # Optional, allows scaling of an image.
         "COL_SCALE_FACTOR": 1,  # Optional, allows scaling of an image.
         "IMAGE_EXT": ".jpg",  # Extent of the image file_ext
         # Optional, default is np.float64, reduce memory by using np.float32
         # or np.float16
         "IMAGE_DTYPE": np.float32,
         # Optional, change default if needed, can save memory space.
         # The builtin int replaces np.int, which was only an alias for it
         # and has been removed from NumPy.
         "Y_DTYPE": int,
         "LOAD_MODEL": False,  # Optional, If you want to load a saved model
         "SUBMISSION": "submission.csv",  # Optional, Mainly used for Kaggle
         "METRICS": ['accuracy'],  # ['categorical_accuracy'], ['accuracy']
         "FINAL_ACTIVATION": 'sigmoid',  # sigmoid, softmax
         "LOSS": ""  # 'binary_crossentropy', 'categorical_crossentropy'
        }

        # Start from the defaults, then apply only the recognised overrides
        self.__dict__.update(self.keys_and_defaults)
        self.__dict__.update((k, v) for k, v in kwargs.items()
                             if k in self.keys_and_defaults)

        # Automatically reduce the training parms, change as needed
        if self.SMALL_RUN:
            self.BATCH_SIZE = 1
            self.EPOCS = 2
            self.ROW_SCALE_FACTOR = 1
            self.COL_SCALE_FACTOR = 1

        # Use configuration items to create real ones; int() truncates the
        # quotient exactly as the former np.int() call did
        self.SCALED_ROW_DIM = int(self.IMAGE_ROWS / self.ROW_SCALE_FACTOR)
        self.SCALED_COL_DIM = int(self.IMAGE_COLS / self.COL_SCALE_FACTOR)

        if self.TRAIN_PATH is None:  # Not passed, so set it
            self.TRAIN_PATH = os.path.join(self.ROOT_PATH, self.TRAIN_DIR)

        if self.TEST_PATH is None:  # Not passed, so set it
            self.TEST_PATH = os.path.join(self.ROOT_PATH, self.TEST_DIR)

        # The submission file lives under ROOT_PATH unless a directory
        # was passed explicitly
        submission_dir = (self.ROOT_PATH if self.SUBMISSION_PATH is None
                          else self.SUBMISSION_PATH)
        self.SUBMISSION_PATH = os.path.join(submission_dir, self.SUBMISSION)

        # Same rule for the model file
        model_dir = (self.ROOT_PATH if self.MODEL_PATH is None
                     else self.MODEL_PATH)
        self.MODEL_PATH = os.path.join(model_dir, self.MODEL_NAME)

        self.IMAGE_DIM = (self.SCALED_ROW_DIM,
                          self.SCALED_COL_DIM,
                          self.IMAGE_CHANNELS)

        if self.IMAGE_CHANNELS == 1:
            self.COLOR_MODE = "grayscale"
        else:
            self.COLOR_MODE = "rgb"

    def set_train_path(self, train_path):
        """Replace the resolved training path."""
        self.TRAIN_PATH = train_path

    def set_class_names(self, class_name_list):
        """Store the class names and check them against NUM_CLASSES.

        Raises:
            ValueError: if the number of names differs from NUM_CLASSES.
                The names are stored before the check, so CLASS_NAMES is
                already updated when the error is raised.
        """
        self.CLASS_NAMES = class_name_list

        if self.NUM_CLASSES != len(self.CLASS_NAMES):
            raise ValueError("ERROR number of classses do not match, Classes: "
                             + str(self.NUM_CLASSES)
                             + " Class List: "
                             + str(self.CLASS_NAMES))

    def print_contents(self):
        """Print the whole attribute dictionary on one line."""
        print(self.__dict__)

    def print_key_value(self):
        """Print one ``key : value`` line per attribute."""
        for key, value in self.__dict__.items():
            print(key, ":", value)

In [None]:
# Setup GLOBALS/CONFIG ITEMS

USING_KAGGLE = True

# Root directory of the competition data.
###### CHANGE FOR SPECIFIC ENVIRONMENT
if USING_KAGGLE:
    ROOT_PATH = "../input/human-protein-atlas-image-classification/"
else:
    ROOT_PATH = "/Users/john/Documents/ImageData/KaggleCellProteins"

# Establish global dictionary.
# FINAL_ACTIVATION is 'sigmoid': an image can carry several proteins at once
# (multi-label), so every class needs an independent probability.  This also
# matches the binary_crossentropy loss and the sigmoid output layer built in
# build_compile_model.
parms = GlobalParms(MODEL_NAME="model-cell-protein-all-V01.h5",
                    ROOT_PATH=ROOT_PATH,
                    MODEL_PATH="",
                    TRAIN_DIR="train",
                    NUM_CLASSES=28,
                    IMAGE_ROWS=224,
                    IMAGE_COLS=224,
                    IMAGE_CHANNELS=3,
                    BATCH_SIZE=16,  # 32
                    EPOCS=20,
                    IMAGE_EXT=".png",
                    FINAL_ACTIVATION='sigmoid',
                    LOSS='binary_crossentropy',
                    METRICS=['accuracy'])

parms.print_contents()

In [None]:
# Simple helper method to display batches of images with labels....   

def show_batch(image_batch, label_batch, number_to_show=25, r=5, c=5, print_shape=False):
    """Plot images from a batch, each titled with its active class indices.

    Args:
        image_batch : batch of images (tensor or array); a single un-batched
            image is also accepted when only one image is shown
        label_batch : matching multi-hot labels, one row per image
        number_to_show : upper bound on the images drawn; the count is also
            capped at parms.BATCH_SIZE
        r, c : subplot grid rows / columns; forced to 4 x 2 when fewer than
            8 images are shown
        print_shape : when True, print shape, max and min of every image

    Returns:
        nothing
    """
    show_number = min(number_to_show, parms.BATCH_SIZE)

    if show_number < 8:  # if small number, then change row, col and figure size
        large_image = parms.IMAGE_COLS > 64 or parms.IMAGE_ROWS > 64
        plt.figure(figsize=(25, 25) if large_image else (10, 10))
        r = 4
        c = 2
    else:
        plt.figure(figsize=(10, 10))

    # A single un-batched sample has 1-D labels: give it a batch axis.
    # A real batch of size one already has the axis and is left alone.
    if show_number == 1 and np.asarray(label_batch).ndim == 1:
        image_batch = np.expand_dims(image_batch, axis=0)
        label_batch = np.expand_dims(label_batch, axis=0)

    for n in range(show_number):
        image = image_batch[n]
        if print_shape:
            print("Image shape: {}  Max: {}  Min: {}".format(image.shape, np.max(image), np.min(image)))
        plt.subplot(r, c, n + 1)

        # 3-channel images use the colour map, everything else gray
        cmap = "gray"
        if len(image.shape) == 3 and image.shape[2] == 3:
            cmap = "viridis"
        plt.imshow(tf.keras.preprocessing.image.array_to_img(image), cmap=plt.get_cmap(cmap))

        # np.asarray accepts both tensors and arrays; the title lists the
        # indices of the classes flagged with 1, e.g. "0, 25"
        flags = np.asarray(label_batch[n])
        title = ", ".join(str(i) for i, val in enumerate(flags) if val == 1)
        plt.title(title)
        plt.axis('off')

In [None]:
# Establish labels

# Class index -> protein location name.  The position in the list is the
# class index used in the Target column of train.csv.
label_names = dict(enumerate([
    "Nucleoplasm",
    "Nuclear membrane",
    "Nucleoli",
    "Nucleoli fibrillar center",
    "Nuclear speckles",
    "Nuclear bodies",
    "Endoplasmic reticulum",
    "Golgi apparatus",
    "Peroxisomes",
    "Endosomes",
    "Lysosomes",
    "Intermediate filaments",
    "Actin filaments",
    "Focal adhesion sites",
    "Microtubules",
    "Microtubule ends",
    "Cytokinetic bridge",
    "Mitotic spindle",
    "Microtubule organizing center",
    "Centrosome",
    "Lipid droplets",
    "Plasma membrane",
    "Cell junctions",
    "Mitochondria",
    "Aggresome",
    "Cytosol",
    "Cytoplasmic bodies",
    "Rods & rings",
]))

# Raises ValueError when the table size differs from parms.NUM_CLASSES
parms.set_class_names(label_names)

# Protein location name -> class index
reverse_class_names = {name: index for index, name in label_names.items()}



## Load and process training csv

In [None]:
# Load and process csv

# Fills all of the protein targets using row apply method
def fill_targets(row):
    """Row-wise ``DataFrame.apply`` helper that one-hot encodes the targets.

    Converts the space-separated ``Target`` string of the row into an
    integer array (stored back into ``Target``) and sets the column named
    after each listed protein to 1.

    Args:
        row : one row of the training DataFrame
    Returns:
        the updated row
    """
    # Builtin int replaces np.int, which has been removed from NumPy
    row.Target = np.array(row.Target.split(" ")).astype(int)
    for num in row.Target:
        name = label_names[int(num)]
        row.loc[name] = 1
    return row

# Read the training labels
train_csv_path = os.path.join(parms.ROOT_PATH, "train.csv")
all_df = pd.read_csv(train_csv_path)

# Add one indicator column per protein, initialised to 0
for protein_name in label_names.values():
    all_df[protein_name] = 0

# Set the indicator of every protein listed in a row's Target to 1
all_df = all_df.apply(fill_targets, axis=1)

# Count the targets of each row by summing its indicator columns
indicator_columns = all_df.drop(["Id", "Target"], axis=1)
all_df["number_of_targets"] = indicator_columns.sum(axis=1)

all_df.head()

In [None]:
# Builds a string label to pass into datasets
# Datasets needs something hashable, so passing a string that will be converted to an array was the easiest

def build_label(row):
    """Row-wise ``DataFrame.apply`` helper that builds the ``Labels`` string.

    Creates a multi-hot vector of length parms.NUM_CLASSES with a 1 at
    every index listed in ``row.Target`` and stores it as a space-separated
    string such as "1 0 0 ... 0".

    Args:
        row : one row of the training DataFrame, with ``Target`` already
            converted to an integer array
    Returns:
        the updated row
    """
    row_label = np.zeros((parms.NUM_CLASSES), dtype=np.int32)
    row_label[row.Target] = 1
    # Join the flags explicitly instead of trimming the brackets off
    # str(array): the array repr depends on NumPy's print options and can
    # wrap or abbreviate, which would corrupt the label string.
    row["Labels"] = " ".join(str(flag) for flag in row_label)
    return row


all_df["Labels"] = ""
all_df = all_df.apply(build_label, axis=1)
all_df.head()

In [None]:
# Shows targets

# Distribution of the number of targets per image before any balancing.
# The balanced frame is built in the next cell, where SAMPLES_PER_GROUP is
# defined; using that constant here raised a NameError on a fresh run.
all_df['number_of_targets'].hist()

In [None]:
# Limits the number targets to help balance classes

# Largest number of rows kept for any single number_of_targets value
SAMPLES_PER_GROUP = 12000

# Groups at or below the cap are kept whole, larger ones are randomly sampled
balanced_all_df = (
    all_df
    .groupby('number_of_targets')
    .apply(lambda group: group
           if len(group) <= SAMPLES_PER_GROUP
           else group.sample(SAMPLES_PER_GROUP))
)
balanced_all_df['number_of_targets'].hist()

In [None]:
# Split train and val, stratify by number of targets

# 80/20 split, stratified so both parts keep the same mix of target counts
train_df, valid_df = train_test_split(balanced_all_df,
                                      test_size=0.2,
                                      stratify=balanced_all_df['number_of_targets'])

# Add some more training examples from the sparse examples: every training
# row with more than two targets is appended two extra times
print('Original Training len: ', train_df.shape[0], "  Validation len: ", valid_df.shape[0])
add_more_df = train_df.loc[train_df["number_of_targets"] > 2]
add_more_df = pd.concat([add_more_df, add_more_df])
train_df = pd.concat([train_df, add_more_df])

# Shuffle, then renumber the rows.  reset_index returns a new frame, so its
# result has to be assigned (the previous bare call had no effect).
train_df = shuffle(train_df).reset_index(drop=True)

print('After Adjust, Training len: ', train_df.shape[0], "  Validation len: ", valid_df.shape[0])

# set lengths and steps
train_len = len(train_df)
val_len = len(valid_df)
images_list_len = train_len + val_len


In [None]:
# Set counts

# Number of complete batches in each split.  The floor division already
# yields an integer, so the former np.ceil wrapper only converted it to a
# float; integer step counts are what Keras and Dataset.take expect.
steps_per_epoch = train_len // parms.BATCH_SIZE  # set step sizes based on train & batch
validation_steps = val_len // parms.BATCH_SIZE  # set step sizes based on val & batch

print("Total number: ", images_list_len, "  Train number: ", train_len, "  Val number: ", val_len)
print("Steps/EPOC: ", steps_per_epoch, "  Steps/Validation: ", validation_steps)


In [None]:
# Double check training and validation counts

# Rows per number_of_targets value, training split first, then validation
for split_df in (train_df, valid_df):
    print(split_df['number_of_targets'].value_counts())

## Build an input pipeline

In [None]:
# Augments training images
def image_mask_aug(image):
    """Randomly rotate and flip one training image.

    Three independent draws, each acted on when it exceeds 0.5, decide
    whether the image is rotated by a random number of quarter turns (drawn
    from minval=1, maxval=4), mirrored left/right and mirrored up/down.

    Args:
        image : image tensor to augment
    Returns:
        the augmented image
    """
    rotate = tf.random.uniform(()) > 0.5
    if rotate:
        quarter_turns = tf.random.uniform(shape=[], minval=1, maxval=4, dtype=tf.int32)
        image = tf.image.rot90(image, quarter_turns)

    mirror_horizontal = tf.random.uniform(()) > 0.5
    if mirror_horizontal:
        image = tf.image.flip_left_right(image)

    mirror_vertical = tf.random.uniform(()) > 0.5
    if mirror_vertical:
        image = tf.image.flip_up_down(image)

    return image

# Read, decode the image, convert to float
def _read_channel(image_id, color):
    """Load one stain image (<TRAIN_PATH>/<id>_<color><ext>) as a 1-channel float image."""
    file_path = parms.TRAIN_PATH + "/" + image_id + "_" + color + parms.IMAGE_EXT
    raw = tf.io.read_file(file_path)
    channel = tf.image.decode_png(raw, channels=1)
    return tf.image.convert_image_dtype(channel, parms.IMAGE_DTYPE)


def read_decode_image(image_id: tf.Tensor, label_string: tf.Tensor) -> tuple:
    """Build the model input and label for one sample.

    The green, blue, red and yellow stain files of the sample are read, red
    and yellow are averaged into one channel, and the three resulting
    channels are stacked in the order (green, blue, red/yellow) and resized
    to (parms.IMAGE_ROWS, parms.IMAGE_COLS).

    Args:
        image_id : string tensor with the sample id (file name prefix)
        label_string : string tensor with space-separated 0/1 flags
    Returns:
        (image, label) : float image clipped to [0, 1] and float32 label
        vector
    """
    # load the raw data from the files
    image_g = _read_channel(image_id, "green")
    image_b = _read_channel(image_id, "blue")
    image_r = _read_channel(image_id, "red")
    image_y = _read_channel(image_id, "yellow")

    # Merge Red and Yellow images into a single image.  Both channels are
    # non-negative, so the plain average is already 0 wherever both are 0;
    # the former tf.where guard around it was redundant.
    image_ry = (image_r + image_y) / 2

    # Build label from string
    label = tf.strings.to_number(tf.strings.split(label_string, sep=" "), tf.float32)

    # Stack files to create a 3 dim image
    image = tf.stack([image_g[:, :, 0], image_b[:, :, 0], image_ry[:, :, 0]], axis=2)
    image = tf.image.resize(image, (parms.IMAGE_ROWS, parms.IMAGE_COLS))

    # Brighten dark images: when the mean is below 0.10, every non-zero
    # pixel is raised by (mean + 0.10); zero pixels stay black
    image_mean = tf.math.reduce_mean(image)
    image_mean_adj = tf.cond(image_mean < 0.10, lambda: image_mean + 0.10, lambda: 0.0)
    image = tf.where(image > 0, image + image_mean_adj, image)
    image = tf.clip_by_value(image, clip_value_min=0, clip_value_max=1)

    return image, label

# Apply method for training files
def process_train(image_id: tf.Tensor, label_string: tf.Tensor) -> tf.Tensor:
    """Dataset map function for training: load the sample, then augment its image.

    Returns the (augmented image, label) pair.
    """
    decoded, target = read_decode_image(image_id, label_string)
    return image_mask_aug(decoded), target

# Apply method for validation files
def process_val(image_id: tf.Tensor, label_string: tf.Tensor) -> tf.Tensor:
    """Dataset map function for validation: load the sample without augmentation.

    Returns the (image, label) pair produced by read_decode_image.
    """
    return read_decode_image(image_id, label_string)


In [None]:
# Create Dataset from df
# Dataset of (image id, label string) pairs taken from the training frame
train_dataset = tf.data.Dataset.from_tensor_slices((train_df["Id"].values,
                                                    train_df["Labels"].values)
                                                  )

# Verify image paths were loaded
for image_id, label in train_dataset.take(2):
    print("Image ID: ", image_id.numpy().decode("utf-8"), "  Label: ", label.numpy())

# map training images to processing, includes any augmentation
train_dataset = train_dataset.map(process_train, num_parallel_calls=AUTOTUNE)

# Verify the mapping worked
for image, label in train_dataset.take(1):
    some_image = image.numpy()
    some_label = label.numpy()
    print("Image shape: {}  Max: {}  Min: {}".format(some_image.shape, np.max(some_image), np.min(some_image)))
    print("Label: ", some_label)

# Batch, repeat indefinitely (fit is bounded by steps_per_epoch) and put
# prefetch last so the next batch is prepared while the current one trains,
# including across the repeat boundary.
train_dataset = train_dataset.batch(parms.BATCH_SIZE) \
                             .repeat() \
                             .prefetch(AUTOTUNE)

# Show the images, execute this cell multiple times to see the images
for image, label in train_dataset.take(1):
    sample_image, sample_label = image, label
show_batch(sample_image, sample_label)

In [None]:
# Create Dataset from df
# Dataset of (image id, label string) pairs taken from the validation frame
val_dataset = tf.data.Dataset.from_tensor_slices((valid_df["Id"].values,
                                                  valid_df["Labels"].values)
                                                 )

# Verify image paths were loaded
for image_id, label in val_dataset.take(2):
    print("Image ID: ", image_id.numpy().decode("utf-8"), "  Label: ", label.numpy())

# map validation images to processing, no augmentation is applied
val_dataset = val_dataset.map(process_val, num_parallel_calls=AUTOTUNE)

# Verify the mapping worked
for image, label in val_dataset.take(1):
    some_image = image.numpy()
    some_label = label.numpy()
    print("Image shape: {}  Max: {}  Min: {}".format(some_image.shape, np.max(some_image), np.min(some_image)))
    print("Label: ", some_label)

# Batch, repeat indefinitely (validation is bounded by validation_steps)
# and put prefetch last so the next batch is prepared while the current
# one is being evaluated.
val_dataset = val_dataset.batch(parms.BATCH_SIZE) \
                         .repeat() \
                         .prefetch(AUTOTUNE)


In [None]:
# Final visual check before model training: display one full batch together
# with its labels to make sure images and labels were built correctly.
# (got bit by this one after a small change....)

# To inspect the training pipeline instead, swap val_dataset for train_dataset.
image, label = next(iter(val_dataset.take(1)))
show_batch(image, label)  # Will show all of the batch


## Build  model
- add and validate pretrained model as a baseline

In [None]:
# Create any call backs for training...These are the most common.

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, CSVLogger

# Lower the learning rate after 2 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    patience=2,
    min_lr=1e-6,
    verbose=1,
)

# Stop training after 8 epochs without improvement of the default monitor
earlystopper = EarlyStopping(
    patience=8,
    verbose=1,
)

# Keep only the model with the highest validation simple_F1
checkpointer = ModelCheckpoint(
    parms.MODEL_PATH,
    monitor='val_simple_F1',
    mode="max",
    save_best_only=True,
    verbose=1,
)


In [None]:
# https://towardsdatascience.com/metrics-for-imbalanced-classification-41c71549bbb5
def simple_F1(y_true, y_score):
    """Micro-averaged F1 over every label of the batch, for use as a Keras metric.

    The scores are thresholded at 0.5 (the same cut-off used when the
    predictions are evaluated later in this notebook) before counting.
    Comparing raw sigmoid scores with 0, as was done before, almost never
    matched, so the false-negative count was effectively always 0.

    Args:
        y_true : multi-hot ground-truth labels
        y_score : predicted probabilities, same shape as y_true
    Returns:
        scalar tensor with the F1 score (0 when there are no positives at all)
    """
    y_true = tf.cast(y_true, tf.float32)
    y_pred = tf.cast(y_score > 0.5, tf.float32)

    # True positive
    tp = tf.math.reduce_sum(y_true * y_pred)
    # False positive
    fp = tf.math.reduce_sum((1.0 - y_true) * y_pred)
    # False negative
    fn = tf.math.reduce_sum(y_true * (1.0 - y_pred))

    # F1 score; epsilon keeps the result finite when tp = fp = fn = 0
    f1 = 2 * tp / (2 * tp + fp + fn + tf.keras.backend.epsilon())
    return f1


In [None]:
# Create model and compile it

from tensorflow.keras.models import Sequential, load_model, Model
from tensorflow.keras.layers import Dense, Dropout, Flatten, Input, Conv2D, MaxPooling2D, BatchNormalization, UpSampling2D, Conv2DTranspose, Concatenate, Activation
from tensorflow.keras.losses import binary_crossentropy, categorical_crossentropy
from tensorflow.keras.optimizers import Adadelta, Adam, Nadam, SGD
from tensorflow.keras.layers import Dense,GlobalAveragePooling2D

# DenseNet121 backbone without its classification head.  The input size
# follows the resize applied in read_decode_image, which always produces
# 3 channels.
densenet = tf.keras.applications.DenseNet121(
    include_top=False,
    input_shape=(parms.IMAGE_ROWS, parms.IMAGE_COLS, 3))


# Build and compile model.  I used this model before, did not adjust parms.
# You can change to try different configurations.  (DO percentages, Dense layers, etc)
def build_compile_model(parms):
    """Stack a small classification head on the DenseNet backbone and compile it.

    Args:
        parms : configuration object; only NUM_CLASSES is read
    Returns:
        compiled Keras model with one sigmoid output per class
    """
    model = Sequential()
    model.add(densenet)
    model.add(GlobalAveragePooling2D())

    model.add(Dropout(0.3))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(0.4))

    # Sigmoid + binary cross-entropy: every class is an independent yes/no
    # decision because several proteins can be present in one image
    model.add(Dense(parms.NUM_CLASSES, activation='sigmoid'))

    model.compile(
        loss='binary_crossentropy',
        # learning_rate replaces the deprecated lr argument, which newer
        # Keras versions reject
        optimizer=Adam(learning_rate=0.0005),
        metrics=[simple_F1])

    return model


## Train model

In [None]:
# Build and Train model

model = build_compile_model(parms)

# Both datasets repeat forever, so the step counts bound each epoch
training_callbacks = [reduce_lr, earlystopper, checkpointer]
history = model.fit(
    train_dataset,
    epochs=parms.EPOCS,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_dataset,
    validation_steps=validation_steps,
    callbacks=training_callbacks,
)

In [None]:
# Plot the training history

history_df = pd.DataFrame(history.history)

# DataFrame.plot opens its own figure and returns the axes, so no separate
# plt.figure() call is needed (it only produced an extra empty figure)
loss_ax = history_df[['loss', 'val_loss']].plot(title="Loss")
loss_ax.set_xlabel('Epocs')
loss_ax.set_ylabel('Loss')

f1_ax = history_df[['simple_F1', 'val_simple_F1']].plot(title="F1")
f1_ax.set_xlabel('Epocs')
f1_ax.set_ylabel('F1')  # this plot shows the F1 metric, not accuracy
plt.show()

## Validate model's predictions
- Create actual_labels and predict_labels
- Calculate Confusion Matrix & Accuracy
- Display results


In [None]:
#Load saved model
from tensorflow.keras.models import load_model 

# On Kaggle the model is read from parms.MODEL_PATH, otherwise from
# ROOT_PATH/MODEL_NAME
file_name = (parms.MODEL_PATH
             if USING_KAGGLE
             else os.path.join(parms.ROOT_PATH, parms.MODEL_NAME))

print(file_name)
# The custom metric has to be supplied so the saved model can be rebuilt
model = load_model(file_name, custom_objects={'simple_F1': simple_F1})
print("loaded...")

In [None]:
def predictions_using_dataset_cell(model_actual,
                              dataset,
                              steps,
                              batch_size,
                              create_bad_results_list=False):
    """
      Uses dataset to predict results.  Builds actual_labels, predict_labels
      and predict_probabilities

      Args:
        model_actual : trained model to use for predictions
        dataset : batched dataset yielding (image_batch, label_batch)
        steps : number of batches to process
        batch_size : maximum number of samples used from each batch; a
            batch holding fewer samples is used as it is
        create_bad_results_list : bool default False.  Lets you turn on/off
            the creation of the bad results lists.

      Returns:
        actual_labels : list of actual labels
        predict_labels : list of predicted labels
        predict_probabilities : list of predicted probability array
        bad_results : list of bad results [actual_labels, predict_labels,
                      predict_probabilities, image]
    """

    bad_cnt = 0.0
    good_cnt = 0.0
    total_cnt = 0
    actual_labels = []
    predict_labels = []
    predict_probabilities = []
    bad_results = []

    # take() needs an integer count, steps may arrive as a float
    for image_batch, label_batch in tqdm(dataset.take(int(steps))):
        images = np.asarray(image_batch)
        targets = np.asarray(label_batch)

        # Never index past the end of a short (e.g. final) batch
        sample_cnt = min(batch_size, len(images))
        if sample_cnt == 0:
            continue
        images = images[:sample_cnt]
        targets = targets[:sample_cnt]

        # One predict call per batch instead of one per image
        batch_probabilities = model_actual.predict(images, verbose=0)
        # Create binary predictions
        batch_predictions = np.where(batch_probabilities > 0.5, 1., 0.)

        for image, actual_label, predict_label, predict_probabilities_tmp in zip(
                images, targets, batch_predictions, batch_probabilities):
            total_cnt += 1

            #print(actual_label, predict_label, predict_probabilities_tmp)

            actual_labels.append(actual_label)
            predict_labels.append(predict_label)
            predict_probabilities.append(predict_probabilities_tmp)

            # A sample is good only when every one of its labels matches
            if np.array_equal(actual_label, predict_label):
                good_cnt = good_cnt + 1
            else:
                bad_cnt = bad_cnt + 1
                if create_bad_results_list:
                    # The image keeps a leading batch axis of size 1
                    bad_results.append([[actual_label],
                                        [predict_label],
                                        predict_probabilities_tmp,
                                        np.expand_dims(image, axis=0)])

    # Avoid a ZeroDivisionError when nothing was processed
    percent_good = good_cnt / total_cnt if total_cnt else 0.0
    print(" ")
    print("total: ", total_cnt, "  Good: ", good_cnt, "  Bad: ",
          bad_cnt, "  percent good: ", str(percent_good))

    return actual_labels, predict_labels, predict_probabilities, \
        bad_results



In [None]:
# Run the model over the validation batches to collect the actual labels,
# the predicted labels and the predicted probabilities
labels, predict_labels, predict_probabilities, bad_results = predictions_using_dataset_cell(
    model_actual=model,
    dataset=val_dataset,
    steps=validation_steps,
    batch_size=parms.BATCH_SIZE,
)

# For troubleshooting, uncomment the print statement in
# predictions_using_dataset_cell and pass 1 for steps instead of
# validation_steps so that only a single batch is processed.


In [None]:
# Copied from personal library

def plot_confusion_matrix(cm,
                          classes,
                          normalize=False,
                          cmap=plt.cm.Blues):
    """
    Helper of show_confusion_matrix: draws the confusion matrix as an
    annotated heat map.  With `normalize=True` every row is divided by its
    sum before plotting and the cells are printed with two decimals.
    """
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        cm = cm.astype('float') / row_totals
    title = 'Normalized confusion matrix' if normalize else 'Confusion matrix'
    cell_format = '.2f' if normalize else 'd'

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    # One tick per class on both axes
    positions = np.arange(len(classes))
    plt.xticks(positions, classes, rotation=45)
    plt.yticks(positions, classes)

    # Write each cell value; white text on the darker (above half-max) cells
    half_max = cm.max() / 2.
    n_rows, n_cols = cm.shape[0], cm.shape[1]
    for row, col in itertools.product(range(n_rows), range(n_cols)):
        value = cm[row, col]
        text_color = "white" if value > half_max else "black"
        plt.text(col, row, format(value, cell_format),
                 horizontalalignment="center",
                 color=text_color)

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()


def show_confusion_matrix(labels,
                          predict_labels,
                          class_names,
                          show_graph=True):
    """
      Shows various accuracy measurements.

      Args:
        labels : actual labels
        predict_labels : predicted labels
        class_names : class names, either a list or a dict whose values
                      are the names
        show_graph : flag to show or not show the confusion matrix.  For
                     1-D (single-label) input the matrix is printed and
                     plotted.  For 2-D (multi-label) input one 2x2 matrix
                     per class is printed instead, because a single
                     confusion matrix is not defined for multi-label data.
                     Set to False for large number of classes.
      Returns:
        nothing
    """
    y_true = np.array(labels)
    y_pred = np.array(predict_labels)

    # Accuracy score
    print("Accuracy : " + str(accuracy_score(y_true, y_pred)))

    print("")

    # Classification report
    print("Classification Report")
    print(classification_report(y_true, y_pred, digits=5))

    if not show_graph:
        return

    if y_true.ndim > 1:
        # confusion_matrix raises a ValueError on multi-label indicator
        # arrays, so fall back to the per-class variant
        from sklearn.metrics import multilabel_confusion_matrix

        if isinstance(class_names, dict):
            names = list(class_names.values())
        else:
            names = list(class_names)
        for name, class_matrix in zip(names, multilabel_confusion_matrix(y_true, y_pred)):
            print(name)
            print(class_matrix)
    else:
        # Plot confusion matrix
        cnf_matrix = confusion_matrix(y_true, y_pred)
        print(cnf_matrix)
        plot_confusion_matrix(cnf_matrix, classes=class_names)



In [None]:
import itertools
from sklearn.metrics import confusion_matrix, classification_report, \
                            accuracy_score

# Print accuracy and the classification report; the matrix display is skipped
show_confusion_matrix(labels=labels,
                      predict_labels=predict_labels,
                      class_names=parms.CLASS_NAMES,
                      show_graph=False)