## Split data into training, validation and test sets ##

In [1]:
# Imports
import os
import numpy as np
import pandas as pd

# Matplotlib for plotting
from matplotlib import pyplot as plt
from matplotlib.pyplot import cm
from matplotlib import patches

# Appearance of the Notebook
from IPython.display import display, HTML
# Let the notebook container use the full width of the browser window
full_width_css = "<style>.container { width:100% !important; }</style>"
display(HTML(full_width_css))

# Wider text output for numpy arrays
np.set_printoptions(linewidth=110)

# Show more rows/columns before pandas truncates a displayed frame
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 100),
                                  ('display.width', 1000)):
    pd.set_option(option_name, option_value)

# Import this module with autoreload
%load_ext autoreload
%autoreload 2
import computervision as dm
from computervision.fileutils import FileOP
from computervision.imageproc import ImageData
from computervision.dentexdata import val_test_split

# Report which build of the project package is loaded in this kernel
module_version = dm.__version__
print(f'Project module version: {module_version}')

Project module version: 0.0.post1.dev48+g56bd7cc.d20240808


In [2]:
def get_required_env(var_name):
    """Return the value of the environment variable `var_name`.

    Raises:
        RuntimeError: if the variable is not set. Failing here names the
            missing variable, instead of letting a `None` value fail later
            inside `os.path.join` with an unrelated TypeError.
    """
    value = os.environ.get(var_name)
    if value is None:
        raise RuntimeError(
            f'Environment variable {var_name} is not set. '
            f'Define it (e.g. in docker-compose.yml) before running this notebook.'
        )
    return value


# Main data directory (defined as environment variable in docker-compose.yml)
data_root = get_required_env('DATA_ROOT')

# Download directory (change as needed)
dentex_dir = os.path.join(data_root, 'dentex')
model_dir = os.path.join(data_root, 'model')
data_dir = os.path.join(dentex_dir, 'dentex_classification')

# This image directory is where the xrays are in the archive, so should be left as-is
image_dir = os.path.join(data_dir, 'quadrant-enumeration-disease', 'xrays')
cropped_image_dir = os.path.join(image_dir, 'crop')

# Directory for the output
output_dir = os.path.join(data_dir, 'output')

# Parquet file with the cropped bounding-box data set (read in the next cell)
df_box_file_name = 'dentex_disease_cropped_dataset.parquet'
df_box_file = os.path.join(data_dir, df_box_file_name)

In [3]:
# Read the parquet table with image paths and bounding boxes, then preview two rows
data_df = pd.read_parquet(path=df_box_file)
preview_rows = data_df.head(n=2)
display(preview_rows)

Unnamed: 0,image_id,file_name,image_number,file_path,quadrant,position,label,cl,area,bbox,box_name,annotations,box_file,im_width,im_height
0,57,train_0.png,0,/app/data/dentex/dentex_classification/quadran...,2,6,Caries,1,33015,"[1791.0, 445.72727272727263, 147.4545454545452...",train_0_334_2_6,2,/app/data/dentex/dentex_classification/quadran...,147,323
1,57,train_0.png,0,/app/data/dentex/dentex_classification/quadran...,4,8,Caries,1,35006,"[682.090909090909, 687.5454545454545, 200.0, 3...",train_0_335_4_8,2,/app/data/dentex/dentex_classification/quadran...,200,300


In [4]:
# Create the data splits with the project helper `val_test_split`.
# The requested validation/test sizes are given per value of the label column.
label_col = 'label'
split_options = {
    'label_col': label_col,
    'n_test_per_class': 30,
    'n_val_per_class': 30,
}
dset_df = val_test_split(data=data_df, **split_options)

In [5]:
# Make sure that we have three non-overlapping data sets.
# Each set holds the unique 'box_name' values assigned to one split.
def split_box_names(split_name):
    """Return the set of 'box_name' values whose 'dataset' entry equals `split_name`."""
    in_split = dset_df['dataset'] == split_name
    return set(dset_df.loc[in_split, 'box_name'].values)


train_set = split_box_names('train')
print(f'We have {len(train_set)} images in the train set.')

val_set = split_box_names('val')
print(f'We have {len(val_set)} images in the validation set.')

test_set = split_box_names('test')
print(f'We have {len(test_set)} images in the test set.')
print()

# Make sure that these data sets are distinct: print every pairwise
# intersection (an empty set means no shared box names)
split_overlaps = {
    'train/val': train_set.intersection(val_set),
    'train/test': train_set.intersection(test_set),
    'val/test': val_set.intersection(test_set),
}
for shared_names in split_overlaps.values():
    print(shared_names)

# Printing alone would let a leaky split pass unnoticed and be saved,
# so stop the notebook here if any pair of splits shares a box name.
leaks = {pair: len(shared) for pair, shared in split_overlaps.items() if shared}
assert not leaks, f'Data splits overlap (pair: number of shared box names): {leaks}'

We have 3289 images in the train set.
We have 120 images in the validation set.
We have 120 images in the test set.

set()
set()
set()


In [6]:
# Write the frame with the added 'dataset' column next to the input data,
# then report the destination and preview the first rows
datasplit_file_name = 'dentex_disease_datasplit.parquet'
datasplit_file = os.path.join(data_dir, datasplit_file_name)
dset_df.to_parquet(path=datasplit_file)

print(datasplit_file)
display(dset_df.head(n=5))

/app/data/dentex/dentex_classification/dentex_disease_datasplit.parquet


Unnamed: 0,image_id,file_name,image_number,file_path,quadrant,position,label,cl,area,bbox,box_name,annotations,box_file,im_width,im_height,dataset
0,272,train_191.png,191,/app/data/dentex/dentex_classification/quadran...,1,8,Impacted,0,39275,"[677.0, 446.0, 276.0, 207.0]",train_191_1435_1_8,9,/app/data/dentex/dentex_classification/quadran...,276,207,train
1,600,train_418.png,418,/app/data/dentex/dentex_classification/quadran...,1,7,Caries,1,34312,"[790.2912621359224, 372.81553398058253, 141.74...",train_418_3035_1_7,9,/app/data/dentex/dentex_classification/quadran...,142,310,train
2,202,train_391.png,391,/app/data/dentex/dentex_classification/quadran...,3,6,Caries,1,49152,"[1892.0, 767.0, 286.0, 316.0]",train_391_1068_3_6,17,/app/data/dentex/dentex_classification/quadran...,286,316,train
3,535,train_659.png,659,/app/data/dentex/dentex_classification/quadran...,4,8,Caries,1,42396,"[753.0, 611.0, 269.0, 277.0]",train_659_2767_4_8,1,/app/data/dentex/dentex_classification/quadran...,269,277,train
4,632,train_587.png,587,/app/data/dentex/dentex_classification/quadran...,4,6,Caries,1,45572,"[798.9655172413794, 655.2068965517242, 170.689...",train_587_3155_4_6,9,/app/data/dentex/dentex_classification/quadran...,171,324,train
