## Notebook 3: Split data into training, validation and test sets ##

In [1]:
# Imports
import os
import numpy as np
import pandas as pd

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib import patches

# Appearance of the Notebook
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))
np.set_printoptions(linewidth=110)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import dentexmodel as dm
from dentexmodel.fileutils import FileOP
from dentexmodel.imageproc import ImageData

print(f'Project module version: {dm.__version__}')

Project module version: 0.0.1.post1.dev5+gb946bb2.d20231214


In [2]:
# Path settings 
dentex_dir = os.path.join(os.environ['HOME'], 'data', 'dentex')
data_dir = os.path.join(dentex_dir, 'dentex_disease')
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays')
cropped_image_dir = os.path.join(image_dir, 'crop')
df_box_file_name = 'dentex_disease_cropped_dataset.parquet'
df_box_file = os.path.join(dentex_dir, df_box_file_name)

In [3]:
# Load the data frame with image paths and bounding boxes
data_df = pd.read_parquet(df_box_file)
display(data_df.head(2))

Unnamed: 0,height,width,id,file_name,image_number,file_path,quadrant,position,label,area,bbox,box_name,annotations,box_file,cropped_width,cropped_height
0,1316,2850,57,train_0.png,0,/app/data/dentex/dentex_disease/quadrant-enume...,2,6,Caries,33015,"[1791.0, 445.72727272727263, 147.4545454545452...",train_0_2_6,2,/app/data/dentex/dentex_disease/quadrant-enume...,147.0,323.0
1,1316,2850,57,train_0.png,0,/app/data/dentex/dentex_disease/quadrant-enume...,4,8,Caries,35006,"[682.090909090909, 687.5454545454545, 200.0, 3...",train_0_4_8,2,/app/data/dentex/dentex_disease/quadrant-enume...,200.0,300.0


In [4]:
# We mark the images for the data sets in the data frame
seed = 123
dset_df = data_df.assign(dataset=None).\
                sample(frac=1, random_state=seed)
# Set the number of images for the training set
n_images_train = int(np.ceil(0.70 * len(data_df)))
# Set the number of images for the test set
n_images_test = int(np.ceil(0.15 * len(data_df)))
# Take the remaining images for validation
n_images_val = len(data_df) - n_images_train - n_images_test

dataset_names = ['train', 'val', 'test']
dataset_numbers = [n_images_train, n_images_val, n_images_test]
startindex = 0
for d, dset in enumerate(dataset_names):
    dset_images = dataset_numbers[d]
    index_list = np.arange(startindex, startindex+dset_images)
    startindex+=dset_images
    dset_df.loc[dset_df.index.isin(index_list), 'dataset'] = dset

# Shuffle this one more time and then save the data splits
seed = 234
dset_df = dset_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Make sure that we have three non-overlapping data sets
train_set = set(dset_df.loc[dset_df['dataset']=='train', 'box_name'].values)
print(f'We have {len(train_set)} images in the test set.')
val_set = set(dset_df.loc[dset_df['dataset']=='val', 'box_name'].values)
print(f'We have {len(val_set)} images in the validation set.')
test_set = set(dset_df.loc[dset_df['dataset']=='test', 'box_name'].values)
print(f'We have {len(test_set)} images in the test set.')

# Make sure that these data sets are distinct
print(train_set.intersection(val_set))
print(train_set.intersection(test_set))
print(val_set.intersection(test_set))

We have 2445 images in the test set.
We have 504 images in the validation set.
We have 507 images in the test set.
set()
set()
set()


In [5]:
# Save the data split
datasplit_file_name = 'dentex_disease_datasplit.parquet'
datasplit_file = os.path.join(dentex_dir, datasplit_file_name)
dset_df.to_parquet(datasplit_file)