## Model Inference ##

This notebook runs inference on the Kaggle test dataset and generates 'submission.csv' for contest scoring.

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras


from PIL import Image, ImageFilter
Image.MAX_IMAGE_PIXELS = None

import skimage

import os
import shutil
from tqdm import tqdm

from random import shuffle

import time

In [2]:
# Root folder of the competition data; every input path below is built from it.
DATA_DIR = '/kaggle/input/UBC-OCEAN/'
# List the folder contents to confirm the expected files and subfolders are mounted.
os.listdir(DATA_DIR)

['updated_image_ids.json',
 'test_thumbnails',
 'sample_submission.csv',
 'train_images',
 'train_thumbnails',
 'train.csv',
 'test.csv',
 'test_images']

In [3]:
# Load the test metadata table; later cells add path and tile-coordinate columns to it.
test_df=pd.read_csv(DATA_DIR+'test.csv')
# Preview the last rows of the table.
test_df.tail()

Unnamed: 0,image_id,image_width,image_height
0,41,28469,16987


In [4]:
#Make sure the trained model file is present in the attached input folder
os.listdir('/kaggle/input/tl-gpu-072')

['__results__.html',
 'OCEANs_train072.h5',
 '__notebook__.ipynb',
 'model.h5',
 '__output__.json',
 'custom.css']

In [7]:
# Load the trained Keras model from its .h5 file
model = tf.keras.models.load_model('/kaggle/input/tl-gpu-072/OCEANs_train072.h5')

# Check its architecture (input size and layer stack)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 sequential (Sequential)     (None, 256, 256, 3)       0         
                                                                 
 rescaling_1 (Rescaling)     (None, 256, 256, 3)       0         
                                                                 
 efficientnetv2-m (Functiona  (None, 8, 8, 1280)       53150388  
 l)                                                              
                                                                 
 gap (GlobalMaxPooling2D)    (None, 1280)              0         
                                                                 
 dense (Dense)               (None, 128)               163968    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                      

In [8]:
# File names found in the full-resolution and thumbnail test folders.
# addimgpath / addthumbpath look names up in these lists to decide whether a file exists.
test_images=os.listdir(DATA_DIR+'test_images')
test_thumbnails=os.listdir(DATA_DIR+'test_thumbnails')

In [9]:
#Add image path and label information to dataframe
def addimgpath(row):
    """Return the path of the full-resolution PNG for this row's image_id.

    The file name '<image_id>.png' is looked up in the module-level
    `test_images` listing; None is returned when it is not listed.
    """
    filename=str(row['image_id'])+'.png'
    if filename not in test_images:
        return None
    return DATA_DIR+'test_images/'+filename
    
def addthumbpath(row):
    """Return the path of the thumbnail PNG for this row's image_id.

    The file name '<image_id>_thumbnail.png' is looked up in the module-level
    `test_thumbnails` listing; None is returned when it is not listed.
    """
    filename=str(row['image_id'])+'_thumbnail.png'
    if filename not in test_thumbnails:
        return None
    return DATA_DIR+'test_thumbnails/'+filename

#Class names in alphanumeric order, as produced by image_dataset_from_directory;
#the position of each name is used to decode the model's argmax index
labels=['CC', 'EC', 'HGSC', 'LGSC', 'MC']

#Resolve both path columns row by row from image_id (None where the file is not listed)
for path_column, resolver in (('img_path', addimgpath), ('thumb_path', addthumbpath)):
    test_df[path_column]=test_df[['image_id']].apply(resolver, axis=1)

In [10]:
# Preview the table to confirm the img_path and thumb_path columns were added.
test_df.head()

Unnamed: 0,image_id,image_width,image_height,img_path,thumb_path
0,41,28469,16987,/kaggle/input/UBC-OCEAN/test_images/41.png,/kaggle/input/UBC-OCEAN/test_thumbnails/41_thu...


In [11]:
#Same as in Image Preparation - decides whether to keep a tile
def keep_patch(patch, size):
    """Decide whether a colour tile should be kept.

    Parameters
    ----------
    patch : array indexed as [row, column, channel]; channels 0, 1, 2 are read
        as red, green and blue.
    size : edge length of the tile; size**2 is the reference pixel count.

    Returns
    -------
    bool
        True when more than 80% of size**2 pixels are not black, more than 50%
        of size**2 pixels are below the near-white level, and more than 40% of
        the patch's pixels are strongly blue (blue > green and blue >= 100).
    """
    # Multiplying by 1.0 promotes integer channels to float, so the channel sum
    # cannot wrap around the uint8 limit of 255; the small offset keeps the
    # blue/green ratio defined where green is 0.
    offset = 0.001
    red, green, blue = (patch[:, :, channel] * 1.0 + offset for channel in range(3))
    brightness = red + blue + green

    # Pixel counts for the two brightness criteria
    not_black_count = np.count_nonzero(brightness > 3)    # not black/masked
    not_white_count = np.count_nonzero(brightness < 630)  # darker than near-white

    # A pixel is "strongly blue" unless blue does not exceed green or blue is below 100
    strongly_blue = ~(((blue / green) <= 1.0) | (blue < 100))

    area = size ** 2
    if not_black_count <= 0.8 * area:
        return False
    if not_white_count <= 0.5 * area:
        return False
    return bool(strongly_blue.mean() > 0.4)

In [12]:
#Step over slide to determine tiles to keep
def get_thumb_coords(image_array, ws):
    """Scan an image with a non-overlapping ws x ws window and list the tiles to keep.

    Parameters
    ----------
    image_array : array whose first two axes are height and width.
    ws : window edge length in pixels.

    Returns
    -------
    (coords, dims)
        coords is a list of [top, bottom, left, right] pixel bounds, in row-major
        scan order, for every window accepted by keep_patch; dims is
        image_array.shape. Partial windows at the bottom/right edges are not visited.
    """
    dims = image_array.shape

    # Number of whole windows that fit along height and width
    n_rows = dims[0] // ws
    n_cols = dims[1] // ws

    coords = []
    for top in range(0, n_rows * ws, ws):
        bottom = top + ws
        for left in range(0, n_cols * ws, ws):
            right = left + ws
            if keep_patch(image_array[top:bottom, left:right], ws):
                coords.append([top, bottom, left, right])

    return coords, dims

In [13]:
#Window size (pixels) for searching in the thumbnail; also the tile size passed to the model
ws=256
#Edge length (pixels) of each full-resolution crop; crops are resized to ws x ws
fs=1024

#Predicted label and image_id per test image, appended in the same order
testlabels=[]
image_id=[]

#Number of thumbnail sites sampled per image; each site can add up to 4 full-resolution tiles
samples_per_image=10 

#Limit of tiles per slide
samples_per_id=9999

#Images with fewer kept thumbnail tiles than this are labelled 'Other' without prediction
min_tiles=2

#The best mean class probability must exceed this; otherwise the label is 'Other'
min_confidence=0.3

#columns for the kept thumbnail tile bounds [top, bottom, left, right] and the scanned array shape
test_df['thumbcoords']=None
test_df['thumbdims']=None

#initialize seed - make repeatable
np.random.seed(seed=41)


def _fullres_patches(fullimg, ctrwd, ctrht):
    """Yield the kept ws x ws tiles from the four fs x fs squares that meet at (ctrwd, ctrht).

    Squares are visited in the order upper-left, upper-right, lower-left,
    lower-right; each is cropped from `fullimg`, resized to ws x ws and yielded
    only if keep_patch accepts it.
    """
    #PIL crop boxes are (left, upper, right, lower)
    boxes=(
        (ctrwd-fs, ctrht-fs, ctrwd, ctrht),   #upper left
        (ctrwd, ctrht-fs, ctrwd+fs, ctrht),   #upper right
        (ctrwd-fs, ctrht, ctrwd, ctrht+fs),   #lower left
        (ctrwd, ctrht, ctrwd+fs, ctrht+fs),   #lower right
    )
    for box in boxes:
        patch=np.array(fullimg.crop(box).resize((ws,ws)))
        if keep_patch(patch, ws):
            yield patch


#Loop over samples in Test dataset
for idx, row in test_df.iterrows():
    image_id.append(row['image_id'])

    #Without a thumbnail (is_tma=True case) the image itself is scanned and no
    #separate full-resolution crops are taken
    thumb=row['thumb_path']
    fullsize=thumb is not None
    if not fullsize:
        thumb=row['img_path']

    #Read the pixels, then release the file handle straight away
    with Image.open(thumb) as img:
        imgarray=np.array(img)

    #search image
    coords, dims=get_thumb_coords(imgarray, ws)

    #assign to df
    test_df.at[idx,'thumbcoords']=coords
    test_df.at[idx,'thumbdims']=dims

    #Assign Label 'Other' if very few tiles meet criteria
    if len(coords)<min_tiles:
        testlabels.append('Other')
        continue

    #shuffle a copy - otherwise, tiles would be in the same order within every slide.
    #Copying keeps the list stored in test_df in scan order; the random draw is the
    #same as shuffling the original list because the length is unchanged.
    sampled=list(coords)
    np.random.shuffle(sampled)
    sampled=sampled[:samples_per_image]

    data=[]
    fullimg=Image.open(row['img_path']) if fullsize else None
    try:
        if fullsize:
            #ratios to find same spot as thumbnail (constant for the whole slide)
            htratio=row['image_height']/dims[0]
            wdratio=row['image_width']/dims[1]

        for top, bottom, left, right in sampled:
            #low-res tile straight from the scanned array
            data.append(imgarray[top:bottom, left:right])

            if fullimg is None:
                continue

            #Center of small image, base for 4 large images
            ctrht=int(htratio*(top+bottom)//2)
            ctrwd=int(wdratio*(left+right)//2)
            data.extend(_fullres_patches(fullimg, ctrwd, ctrht))
    finally:
        #Release the decoded slide before prediction instead of holding it
        #until the next iteration replaces it
        if fullimg is not None:
            fullimg.close()

    #Prep for prediction
    data=np.array(data)

    #shuffle and prune - images are sequential prior to shuffle
    #shuffle is along first axis
    np.random.shuffle(data)
    data=data[:samples_per_id]

    #Predictions
    predictions=model.predict(data)

    #Finding most-favored
    #Average each class over all tiles, choose max if over min_confidence; otherwise, other
    predictions=predictions.mean(axis=0)
    if predictions.max()>min_confidence:
        testlabels.append(labels[predictions.argmax()])
    else:
        testlabels.append('Other')

#Every test image must have received exactly one label
assert len(testlabels)==len(image_id)



In [16]:
#Column data for the submission: one image_id and one predicted label per test image
types={'image_id': image_id, 'label': testlabels}

#Build the submission table and write it as CSV without the index column
dftest_submission=pd.DataFrame(types)
dftest_submission.to_csv("submission.csv", index=False)