## Notebook 3: Split data into training, validation and test sets ##

In [1]:
# Imports
import os
import numpy as np
import pandas as pd

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib import patches

# Appearance of the Notebook
from IPython.display import display, HTML
# Inject a CSS rule so that elements with the "container" class use the full available width
display(HTML("<style>.container { width:100% !important; }</style>"))
# Allow up to 110 characters per line when numpy arrays are printed
np.set_printoptions(linewidth=110)
# Limits for displaying pandas objects: up to 500 rows and 100 columns, 1000 characters wide
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import dentexmodel as dm
from dentexmodel.fileutils import FileOP
from dentexmodel.imageproc import ImageData

print(f'Project module version: {dm.__version__}')

Project module version: 0.0.post1.dev48+g39b077d.d20240115


In [2]:
# Path settings
# All data is expected under <home directory>/data/dentex.
# os.path.expanduser('~') is used instead of os.environ['HOME'] because the HOME
# variable is not guaranteed to exist on every platform (a missing key raises KeyError).
# When HOME is set, both give the same directory.
dentex_dir = os.path.join(os.path.expanduser('~'), 'data', 'dentex')
data_dir = os.path.join(dentex_dir, 'dentex_disease')
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays')
cropped_image_dir = os.path.join(image_dir, 'crop')
# Parquet file with one row per cropped bounding box (loaded in the next cell)
df_box_file_name = 'dentex_disease_cropped_dataset.parquet'
df_box_file = os.path.join(dentex_dir, df_box_file_name)

In [3]:
# Read the parquet table with the image paths and bounding boxes
# and preview its first two rows
data_df = pd.read_parquet(path=df_box_file)
display(data_df.head(n=2))

Unnamed: 0,height,width,id,file_name,image_number,file_path,quadrant,position,label,area,bbox,box_name,annotations,box_file,im_width,im_height
0,1316,2850,57,train_0.png,0,/home/andreas/data/dentex/dentex_disease/quadr...,2,6,Caries,33015,"[1791.0, 445.72727272727263, 147.4545454545452...",train_0_334_2_6,2,/home/andreas/data/dentex/dentex_disease/quadr...,147,323
1,1316,2850,57,train_0.png,0,/home/andreas/data/dentex/dentex_disease/quadr...,4,8,Caries,35006,"[682.090909090909, 687.5454545454545, 200.0, 3...",train_0_335_4_8,2,/home/andreas/data/dentex/dentex_disease/quadr...,200,300


In [4]:
# The number of distinct box names should match the number of rows in the table
print(data_df['box_name'].nunique(dropna=False))
print(data_df.shape)

3529
(3529, 16)


In [5]:
# Check for duplicated boxes: count the rows for each box name.
# A box name with more than one row means that the table has too many rows.
n = data_df[['box_name', 'label']].\
                groupby('box_name').count().\
                reset_index(drop=False).\
                rename(columns={'label': 'nrows'})
# Show all rows that belong to a duplicated box name.
# The displayed frame is empty when every box name occurs exactly once.
duplicated_box_names = n.loc[n['nrows'] > 1, 'box_name']
display(data_df.loc[data_df['box_name'].isin(duplicated_box_names)])

Unnamed: 0,height,width,id,file_name,image_number,file_path,quadrant,position,label,area,bbox,box_name,annotations,box_file,im_width,im_height


In [8]:
# Function to create the data splits
seed = 123
def val_test_split(df, label_col, n_test_per_class=30, n_val_per_class=15, seed=123):
    """
    Assign every row of a data frame to a 'train', 'val' or 'test' data set.

    For each label, a fixed number of rows is drawn at random (without replacement),
    first for the test set and then, from the rows that are still unassigned,
    for the validation set. All rows that were not drawn are used for training.

    Parameters:
        df (pd.DataFrame): Input data with one row per sample. It is not modified.
        label_col (str): Name of the column in df that contains the class labels.
        n_test_per_class (int): Number of test rows to draw for each label.
        n_val_per_class (int): Number of validation rows to draw for each label.
        seed (int): Seed used for shuffling the rows and for every random draw.

    Returns:
        pd.DataFrame: Shuffled copy of df with a new 0..n-1 index and two added columns:
            'dataset': one of 'train', 'val' or 'test'
            'cl': position of the row's label in the sorted list of unique labels

    Raises:
        ValueError: If a label has fewer unassigned rows than the number requested.

    NOTE(review): rows are assigned individually. If several rows originate from the
    same source image, they can end up in different data sets - confirm that this is intended.
    """
    # Shuffle the rows. The seed is passed directly as random_state:
    # np.random.seed() returns None and must not be used as the random_state argument.
    # DataFrame.sample already returns a new object, so df itself is left untouched.
    dset_df = df.sample(frac=1, random_state=seed). \
        assign(dataset=None, cl=None).reset_index(drop=True)
    label_list = sorted(list(dset_df[label_col].unique()))

    # The class number depends only on the label, so it is assigned once per label
    for c, cl in enumerate(label_list):
        dset_df.loc[dset_df[label_col] == cl, 'cl'] = c

    # Draw the test rows first, then the validation rows from what is left
    dataset_list = ['test', 'val']
    dataset_n_list = [n_test_per_class, n_val_per_class]
    for dataset, n_per_class in zip(dataset_list, dataset_n_list):
        for cl in label_list:
            # Rows of this label that have not been assigned to a data set yet
            available_idx = dset_df.loc[(dset_df[label_col] == cl) & (dset_df['dataset'].isnull())].index
            if len(available_idx) < n_per_class:
                raise ValueError(f'Cannot draw {n_per_class} {dataset} samples for label {cl!r}: '
                                 f'only {len(available_idx)} unassigned rows available.')
            # A local generator, created from the seed for every draw, keeps the result
            # reproducible without changing the state of the global numpy random generator
            rng = np.random.RandomState(seed)
            idx_list = rng.choice(available_idx, size=n_per_class, replace=False)
            dset_df.loc[dset_df.index.isin(idx_list), 'dataset'] = dataset

    # Use the remaining samples for training
    dset_df.loc[dset_df['dataset'].isnull(), 'dataset'] = 'train'
    return dset_df

# Create the splits. The label column is named once and the seed defined at the top
# of this cell is passed explicitly, so that changing either one actually takes effect.
label_col = 'label'
dset_df = val_test_split(df=data_df, label_col=label_col, n_test_per_class=30, n_val_per_class=15, seed=seed)

# Print the splits: class number(s) and number of images per data set for each label
label_list = sorted(list(dset_df[label_col].unique()))
for l, label in enumerate(label_list):
    dset_label = dset_df.loc[dset_df[label_col] == label]
    cl_numbers = list(dset_label['cl'].unique())
    print(f'Label {label} with class number: {cl_numbers}')
    for dataset in ['train', 'val', 'test']:
        label_dataset_n_images = dset_label.loc[dset_label['dataset'] == dataset].shape[0]
        print(f'Label {label} {dataset.upper()}: {label_dataset_n_images}')
    print()

Label Caries with class number: [0]
Label Caries TRAIN: 2144
Label Caries VAL: 15
Label Caries TEST: 30

Label Deep Caries with class number: [1]
Label Deep Caries TRAIN: 533
Label Deep Caries VAL: 15
Label Deep Caries TEST: 30

Label Impacted with class number: [2]
Label Impacted TRAIN: 559
Label Impacted VAL: 15
Label Impacted TEST: 30

Label Periapical Lesion with class number: [3]
Label Periapical Lesion TRAIN: 113
Label Periapical Lesion VAL: 15
Label Periapical Lesion TEST: 30



In [9]:
# Make sure that we have three non-overlapping data sets
train_set = set(dset_df.loc[dset_df['dataset']=='train', 'box_name'].values)
print(f'We have {len(train_set)} images in the train set.')

val_set = set(dset_df.loc[dset_df['dataset']=='val', 'box_name'].values)
print(f'We have {len(val_set)} images in the validation set.')

test_set = set(dset_df.loc[dset_df['dataset']=='test', 'box_name'].values)
print(f'We have {len(test_set)} images in the test set.')
print()

# Make sure that these data sets are distinct: each intersection should print as set()
print(train_set.intersection(val_set))
print(train_set.intersection(test_set))
print(val_set.intersection(test_set))

# Enforce the check so that an overlap stops the notebook instead of relying on visual inspection
assert train_set.isdisjoint(val_set), 'train and validation sets share box names'
assert train_set.isdisjoint(test_set), 'train and test sets share box names'
assert val_set.isdisjoint(test_set), 'validation and test sets share box names'

We have 3349 images in the train set.
We have 60 images in the validation set.
We have 120 images in the test set.

set()
set()
set()


In [10]:
# Write the data frame with the 'dataset' and 'cl' columns to a parquet file
# in dentex_dir, then show the file path and the first rows of the saved table
datasplit_file_name = 'dentex_disease_datasplit.parquet'
datasplit_file = os.path.join(dentex_dir, datasplit_file_name)
dset_df.to_parquet(path=datasplit_file)
print(datasplit_file)
display(dset_df.head(n=5))

/home/andreas/data/dentex/dentex_disease_datasplit.parquet


Unnamed: 0,height,width,id,file_name,image_number,file_path,quadrant,position,label,area,bbox,box_name,annotations,box_file,im_width,im_height,dataset,cl
0,1504,2880,272,train_191.png,191,/home/andreas/data/dentex/dentex_disease/quadr...,1,8,Impacted,39275,"[677.0, 446.0, 276.0, 207.0]",train_191_1435_1_8,9,/home/andreas/data/dentex/dentex_disease/quadr...,276,207,train,2
1,1316,2840,600,train_418.png,418,/home/andreas/data/dentex/dentex_disease/quadr...,1,7,Caries,34312,"[790.2912621359224, 372.81553398058253, 141.74...",train_418_3035_1_7,9,/home/andreas/data/dentex/dentex_disease/quadr...,142,310,train,0
2,1504,2884,202,train_391.png,391,/home/andreas/data/dentex/dentex_disease/quadr...,3,6,Caries,49152,"[1892.0, 767.0, 286.0, 316.0]",train_391_1068_3_6,17,/home/andreas/data/dentex/dentex_disease/quadr...,286,316,train,0
3,1316,2932,535,train_659.png,659,/home/andreas/data/dentex/dentex_disease/quadr...,4,8,Caries,42396,"[753.0, 611.0, 269.0, 277.0]",train_659_2767_4_8,1,/home/andreas/data/dentex/dentex_disease/quadr...,269,277,train,0
4,1316,2714,632,train_587.png,587,/home/andreas/data/dentex/dentex_disease/quadr...,4,6,Caries,45572,"[798.9655172413794, 655.2068965517242, 170.689...",train_587_3155_4_6,9,/home/andreas/data/dentex/dentex_disease/quadr...,171,324,train,0
