## Notebook 3: Split data into training, validation and test sets ##

In [1]:
# Imports
import os
import numpy as np
import pandas as pd

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib import patches

# Appearance of the Notebook
from IPython.display import display, HTML

# Inject a CSS rule that stretches elements of class "container" to the full width
display(HTML("<style>.container { width:100% !important; }</style>"))

# Wider lines for printed numpy arrays
np.set_printoptions(linewidth=110)

# Raise the pandas display limits (rows, columns, total character width)
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 100),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import dentexmodel as dm
from dentexmodel.fileutils import FileOP
from dentexmodel.imageproc import ImageData

print(f'Project module version: {dm.__version__}')

Project module version: 0.0.post1.dev25+ga99b50c.d20231228


In [2]:
# Path settings
# The home directory is resolved with expanduser('~'): it gives the same result as
# $HOME when that variable is set, but does not raise a KeyError on systems where
# HOME is not defined (e.g. Windows).
dentex_dir = os.path.join(os.path.expanduser('~'), 'data', 'dentex')
# Directory of the disease data set below the dentex directory
data_dir = os.path.join(dentex_dir, 'dentex_disease')
# Directory with the x-ray images and its 'crop' sub-directory
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays')
cropped_image_dir = os.path.join(image_dir, 'crop')
# Parquet file with the data frame that is loaded in the next cell
df_box_file_name = 'dentex_disease_cropped_dataset.parquet'
df_box_file = os.path.join(dentex_dir, df_box_file_name)

In [3]:
# Read the parquet file with the image paths and bounding boxes
# and show its first two rows
data_df = pd.read_parquet(path=df_box_file)
display(data_df.head(n=2))

Unnamed: 0,height,width,id,file_name,image_number,file_path,quadrant,position,label,area,bbox,box_name,annotations,box_file,im_width,im_height
0,1316,2850,57,train_0.png,0,/app/data/dentex/dentex_disease/quadrant-enume...,2,6,Caries,33015,"[1791.0, 445.72727272727263, 147.4545454545452...",train_0_334_2_6,2,/app/data/dentex/dentex_disease/quadrant-enume...,147,323
1,1316,2850,57,train_0.png,0,/app/data/dentex/dentex_disease/quadrant-enume...,4,8,Caries,35006,"[682.090909090909, 687.5454545454545, 200.0, 3...",train_0_335_4_8,2,/app/data/dentex/dentex_disease/quadrant-enume...,200,300


In [4]:
# Number of distinct box names, followed by the shape of the data frame.
# Both numbers agree when every row has its own box name.
n_unique_box_names = data_df['box_name'].nunique(dropna=False)
print(n_unique_box_names)
print(data_df.shape)

3529
(3529, 16)


In [5]:
# Sanity check: look for box names that occur in more than one row.
# Count the rows per box_name; size() counts every row of a group,
# whereas count() on the 'label' column would skip rows with a missing label.
n = data_df.groupby('box_name').size().reset_index(name='nrows')

# Derive the duplicated names from the counts instead of filtering for one
# hard-coded box name. The displayed frame is empty when all names are unique.
duplicate_box_names = n.loc[n['nrows'] > 1, 'box_name']
display(data_df.loc[data_df['box_name'].isin(duplicate_box_names)])

Unnamed: 0,height,width,id,file_name,image_number,file_path,quadrant,position,label,area,bbox,box_name,annotations,box_file,im_width,im_height


In [6]:
# Work on a copy of the data frame with an additional 'dataset' column.
# None marks a row that has not been assigned to a split yet.
dset_df = data_df.copy().assign(dataset=None)

# Show how many rows there are for each class label
display(dset_df['label'].value_counts())

# Number of rows drawn per class: 30 for the test set, 15 for the validation set
n_images_test_per_class = 30
n_images_val_per_class = 15

dataset_list = ['test', 'val']
n_image_list = [n_images_test_per_class, n_images_val_per_class]
seed = 123

# NOTE(review): rows are sampled individually, so crops that share a file_name
# (same radiograph) can end up in different splits -- confirm that this is intended.
class_names = data_df['label'].unique()
for dataset, n_per_class in zip(dataset_list, n_image_list):
    for cl in class_names:
        # Candidates are the rows of this class that are still unassigned
        is_candidate = (dset_df['label'] == cl) & (dset_df['dataset'].isnull())
        candidate_index = dset_df.loc[is_candidate].index
        # The generator is re-seeded before every draw
        np.random.seed(seed)
        index_list = np.random.choice(candidate_index, size=n_per_class, replace=False)
        is_selected = dset_df.index.isin(index_list)
        dset_df.loc[is_selected, 'dataset'] = dataset

# Everything that was not drawn for 'test' or 'val' becomes training data
is_unassigned = dset_df['dataset'].isnull()
dset_df.loc[is_unassigned, 'dataset'] = 'train'

label
Caries               2189
Impacted              604
Deep Caries           578
Periapical Lesion     158
Name: count, dtype: int64

In [7]:
# Shuffle the rows one more time before the data splits are saved
seed = 234
dset_df = dset_df.sample(frac=1, random_state=seed).reset_index(drop=True)

# Collect the box names of each split and report the number of distinct names
train_set = set(dset_df.loc[dset_df['dataset']=='train', 'box_name'].values)
print(f'We have {len(train_set)} images in the train set.')

val_set = set(dset_df.loc[dset_df['dataset']=='val', 'box_name'].values)
print(f'We have {len(val_set)} images in the validation set.')

test_set = set(dset_df.loc[dset_df['dataset']=='test', 'box_name'].values)
print(f'We have {len(test_set)} images in the test set.')
print()

# Make sure that these data sets are distinct.
# The intersections are printed for inspection (empty sets are expected) ...
print(train_set.intersection(val_set))
print(train_set.intersection(test_set))
print(val_set.intersection(test_set))

# ... and enforced, so that an overlapping split stops the notebook here
# instead of relying on a visual check of the printed sets.
assert train_set.isdisjoint(val_set), 'Train and validation sets share box names.'
assert train_set.isdisjoint(test_set), 'Train and test sets share box names.'
assert val_set.isdisjoint(test_set), 'Validation and test sets share box names.'

We have 3349 images in the train set.
We have 60 images in the validation set.
We have 120 images in the test set.

set()
set()
set()


In [8]:
# Write the data frame with the 'dataset' column to a parquet file
# in the dentex directory, then print the file path and show the first rows
datasplit_file_name = 'dentex_disease_datasplit.parquet'
datasplit_file = os.path.join(dentex_dir, datasplit_file_name)
dset_df.to_parquet(path=datasplit_file)
print(datasplit_file)
display(dset_df.head(n=5))

/app/data/dentex/dentex_disease_datasplit.parquet


Unnamed: 0,height,width,id,file_name,image_number,file_path,quadrant,position,label,area,bbox,box_name,annotations,box_file,im_width,im_height,dataset
0,1316,2765,122,train_246.png,246,/app/data/dentex/dentex_disease/quadrant-enume...,3,8,Impacted,32453,"[1981.0, 737.0, 166.0, 252.0]",train_246_665_3_8,7,/app/data/dentex/dentex_disease/quadrant-enume...,166,252,train
1,1316,2850,703,train_344.png,344,/app/data/dentex/dentex_disease/quadrant-enume...,1,5,Deep Caries,20697,"[1042.0, 418.0, 96.0, 293.0]",train_344_3511_1_5,3,/app/data/dentex/dentex_disease/quadrant-enume...,96,293,train
2,1316,2938,138,train_573.png,573,/app/data/dentex/dentex_disease/quadrant-enume...,4,6,Deep Caries,42532,"[1037.0, 674.0, 183.0, 303.0]",train_573_725_4_6,6,/app/data/dentex/dentex_disease/quadrant-enume...,183,303,train
3,1316,2862,242,train_358.png,358,/app/data/dentex/dentex_disease/quadrant-enume...,3,8,Deep Caries,29479,"[2039.4455445544554, 745.4356435643564, 181.18...",train_358_1275_3_8,12,/app/data/dentex/dentex_disease/quadrant-enume...,182,230,train
4,1504,2868,353,train_374.png,374,/app/data/dentex/dentex_disease/quadrant-enume...,3,8,Caries,33774,"[2185.0, 812.0, 197.0, 255.0]",train_374_1863_3_8,2,/app/data/dentex/dentex_disease/quadrant-enume...,197,255,train
