Creating the dataset for the inter- and intra-rater reliability (IRR) study

In [None]:
import os

import imageio as io
import numpy as np
import pandas as pd
from ipywidgets import interact
from PIL import Image

def imshow(fn):
    """Read an image file and hand it back as a PIL image.

    The file is decoded into an array with imageio and then wrapped by
    PIL, so ending a notebook cell with the result displays the picture.

    Args:
        fn (str): path of the image file to read

    Returns:
        Image: PIL image built from the decoded pixel array
    """
    return Image.fromarray(io.v3.imread(fn))

Show some samples of the dataset

In [6]:
@interact
def showSamples(seed:(0,100)=1, i:(0,99)=0):
    """Preview the i-th image of a seeded random permutation of the image ids.

    A private ``RandomState`` is used instead of ``np.random.seed`` so that
    moving the sliders never reseeds NumPy's global generator. The
    permutation is the same one the global legacy generator produces right
    after being seeded with the same value.

    Args:
        seed (int): seed of the permutation (slider range 0-100)
        i (int): position within the permutation (slider range 0-99)

    Returns:
        Image: the selected image, as returned by ``imshow``
    """
    # Ids 0..N-1 are assumed to exist as training_224x224/<id>.png — TODO confirm
    N  = 55750
    xi = np.arange(N)

    # Local generator: same stream as np.random.seed(seed), no global side effect
    np.random.RandomState(seed).shuffle(xi)

    return imshow(f"training_224x224/{xi[i]}.png")

interactive(children=(IntSlider(value=42, description='seed'), IntSlider(value=0, description='i', max=99), Ou…

Generate the dataset

In [None]:
# Take a random subset of 100 images using random seed 1
# Ids 0..N-1 are assumed to exist as training_224x224/<id>.png — TODO confirm
N  = 55750
xi = np.arange(N)
np.random.seed(1)
np.random.shuffle(xi)

# Save the filenames
# (copy, so that shuffling fns below does not write through to xi)
fns = xi[:100].copy()

# One record per written image; becomes the look-up table afterwards
to_df = []

# We create three sets of the same images
# to compute inter- and intra-rater reliability
for round_i in range(1,4):
    folder = f"IRR/Round {round_i}"

    # Create the output folder if it is missing so the cell also runs
    # on a fresh checkout; existing folders are left untouched
    os.makedirs(folder, exist_ok=True)

    # New presentation order for this round (continues the RNG stream seeded above)
    np.random.shuffle(fns)

    for i, fn in enumerate(fns):
        # Re-save the image under its per-round position
        im = io.v3.imread(f"training_224x224/{fn}.png")
        io.v3.imwrite(folder+f"/{i}.png", im)

        # Remember which original image ended up at position i in this round
        to_df.append({
            'round_i': round_i,
            'new_im_id': i,
            'im_id': fn
        })

Save look-up table as CSV

In [None]:
# Turn the records collected during dataset generation into a table that
# maps each (round, new image id) pair back to the original image id.
df = pd.DataFrame(data=to_df)

# Write the table to disk; the default row index is kept as the first column.
df.to_csv(path_or_buf="IRR_LUT.csv")