## Data splits ##
Split the annotated data into training, validation, and test sets.

In [1]:
import os
import sys
import glob
import random
import logging
import json
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
import cv2

import albumentations as alb

# Notebook-level logger; records at INFO level and above pass this logger's level check
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Load the IPython autoreload extension in mode 2 so that edits to imported
# modules (e.g. the local computervision package) are picked up on re-execution
%load_ext autoreload
%autoreload 2
import computervision
from computervision.dentex import Dentex
from computervision.imageproc import ImageData

# Print version info
print(f'Package version: {computervision.__version__}')
print(f'Python version:  {sys.version}')

Package version: v0.0.1
Python version:  3.12.3 (main, Jun 18 2025, 17:59:45) [GCC 13.3.0]


In [2]:
# Root directory for all data. HOME is used when it is set (unchanged behavior);
# when it is missing or empty we fall back to the user's home directory, because
# os.path.join(None, 'data') would otherwise fail with an opaque TypeError.
home_dir = os.environ.get('HOME') or os.path.expanduser('~')
data_dir = os.path.join(home_dir, 'data')
print(f'data_dir: {data_dir}')

# Directory to store the data
dataset_dir = os.path.join(data_dir, 'dentex')

# After extraction from the .tar.gz archive, the images should be on the local fs (see the first notebook)
xrays_dir = os.path.join(dataset_dir, 'cropped')

data_dir: /app/data


## Load the annotations ##

In [3]:
# Annotation table for the cropped images, stored next to the images themselves
parquet_file_name = 'train_quadrant_enumeration_cropped.parquet'
parquet_file = os.path.join(xrays_dir, parquet_file_name)
try:
    df = pd.read_parquet(parquet_file)
except Exception as e:
    print(f'ERROR: Could not load file: \n {e}')
    print('Make sure to run the first notebook which downloads the data.')
    # Stop here: without the annotations, the next statement would only fail
    # with a misleading NameError on `df` and hide the real cause.
    raise
display(df.head())

# Create a file name for the data split
# NOTE(review): the save cell further down builds its own name (dset_file_name,
# without the '_cropped' token), so the path printed here is not the path that
# is actually written. Confirm which of the two names is intended.
split_file_name = f'{os.path.splitext(parquet_file_name)[0]}_dset.parquet'
split_file = os.path.join(xrays_dir, split_file_name)
print(split_file)

# Let's make sure that we have the expected number of images available
# If the image numbers don't match, please run the first three notebooks.
expected_images = len(df['file_name'].unique())
file_list = glob.glob(os.path.join(xrays_dir, '*.png'))
print(f'Found {len(file_list)} images in {xrays_dir}')
print(f'Expected number of images is {expected_images}.')

# We want to be sure that the number of images is correct before we continue.
# Only the counts are compared here, not the individual file names.
assert expected_images == len(file_list), \
    f'WARNING: expected images ({expected_images}) != images on file system ({len(file_list)})'

Unnamed: 0,bbox,segmentation,height,width,file_name,file_base_name,quadrants,quadrant,pos,fdi,ada
0,"[666, 102, 103, 376]","[[757, 478, 769, 102, 678, 113, 666, 469]]",494,1473,train_0_12.png,train_0,12,1,1,11,8
1,"[593, 107, 85, 377]","[[666, 484, 678, 110, 607, 107, 604, 299, 619,...",494,1473,train_0_12.png,train_0,12,1,2,12,7
2,"[531, 69, 85, 368]","[[587, 437, 616, 357, 607, 72, 534, 69, 531, 4...",494,1473,train_0_12.png,train_0,12,1,3,13,6
3,"[457, 31, 115, 403]","[[522, 434, 572, 378, 543, 31, 463, 40, 457, 3...",494,1473,train_0_12.png,train_0,12,1,4,14,5
4,"[369, 10, 100, 406]","[[437, 416, 469, 378, 466, 10, 381, 31, 378, 2...",494,1473,train_0_12.png,train_0,12,1,5,15,4


/app/data/dentex/cropped/train_quadrant_enumeration_cropped_dset.parquet
Found 2534 images in /app/data/dentex/cropped
Expected number of images is 2534.


In [4]:
# Number of panoramic x-rays for testing and validation
n_test = 20
n_val = 16

# Start with a sorted list of panoramic x-rays so that the shuffle below only
# depends on the seed and not on the row order of the data frame
image_list = sorted(list(df['file_base_name'].unique()))
print(len(image_list))
# Shuffle the list
seed = 123
random.seed(seed)
random.shuffle(image_list)
print(image_list[:5])

# Now we can pick the images for validation and testing.
# The split is done on panoramic x-rays (file_base_name), so all crops of one
# x-ray end up in the same data set.
val_image_list = image_list[:n_val]
# NOTE(review): the test slice starts at n_val + 1, so the x-ray at index n_val
# is skipped and stays in the training set. This looks like an off-by-one, but
# it is kept unchanged on purpose: moving the slice would change the test set
# membership relative to split files that were already written with this code.
test_image_list = image_list[n_val+1:n_val+n_test+1]
print(len(val_image_list))
print(len(test_image_list))

# Slicing never raises, so a too large n_val / n_test would silently produce
# smaller splits than requested. Fail loudly instead.
assert len(val_image_list) == n_val and len(test_image_list) == n_test, \
    f'Not enough x-rays ({len(image_list)}) for n_val={n_val} and n_test={n_test}'

# Assign the data sets to the images: everything is training data unless the
# panoramic x-ray was picked for validation or testing
df = df.assign(dset='train')
df.loc[df['file_base_name'].isin(val_image_list), 'dset'] = 'val'
df.loc[df['file_base_name'].isin(test_image_list), 'dset'] = 'test'

# Count images and annotations for each data set
for dset in ['train', 'val', 'test']:
    df_dset = df.loc[df['dset'] == dset]
    n_base_images = len(df_dset['file_base_name'].unique())
    n_images = len(df_dset['file_name'].unique())
    # One row per annotation
    n_annotations = df_dset.shape[0]
    print()
    print(f'{dset.upper()}')
    print(f'Panoramic x-rays: {n_base_images}')
    print(f'Cropped images:   {n_images}')
    print(f'Annotations:      {n_annotations}')

634
['train_185', 'train_143', 'train_36', 'train_609', 'train_242']
16
20

TRAIN
Panoramic x-rays: 598
Cropped images:   2391
Annotations:      34166

VAL
Panoramic x-rays: 16
Cropped images:   63
Annotations:      886

TEST
Panoramic x-rays: 20
Cropped images:   80
Annotations:      1138


In [5]:
# Save the data frame
# Save the data frame
# The output name is derived from the annotation file name: drop the extension
# and the last '_'-separated token, then append '_dset.parquet'. The name is
# built in a separate statement because nesting the same quote type and a line
# continuation inside an f-string expression is only valid on Python >= 3.12.
dset_base_name = os.path.splitext(parquet_file_name)[0].rsplit('_', maxsplit=1)[0]
dset_file_name = f'{dset_base_name}_dset.parquet'
dset_file = os.path.join(xrays_dir, dset_file_name)
if not os.path.exists(dset_file):
    print(f'Saving data: {dset_file}')
    df.to_parquet(dset_file)
else:
    # An existing file is never overwritten, so the split held in memory is not
    # guaranteed to match the one on disk. Delete the file to regenerate it.
    logger.warning('File %s exists. Skipping.', dset_file)
display(df.sample(5))

Saving data: /app/data/dentex/cropped/train_quadrant_enumeration_dset.parquet


Unnamed: 0,bbox,segmentation,height,width,file_name,file_base_name,quadrants,quadrant,pos,fdi,ada,dset
3160,"[730, 392, 89, 240]","[[730, 392, 800, 397, 819, 632, 779, 626]]",694,829,train_148_14.png,train_148,14,4,1,41,25,train
35485,"[199, 39, 109, 317]","[[208, 273, 199, 305, 211, 341, 233, 341, 252,...",745,816,train_88_23.png,train_88,23,2,4,24,12,train
21993,"[539, 41, 81, 278]","[[620, 41, 620, 248, 611, 274, 615, 298, 617, ...",345,1443,train_447_12.png,train_447,12,1,2,12,7,train
11141,"[766, 54, 108, 289]","[[766, 71, 782, 110, 785, 171, 785, 229, 785, ...",381,1223,train_276_12.png,train_276,12,2,3,23,11,train
15878,"[919, 154, 135, 363]","[[919, 179, 937, 156, 974, 154, 992, 190, 1021...",555,1657,train_35_34.png,train_35,34,3,3,33,22,train


In [6]:
# Making sure that the data sets are distinct: no cropped image may appear in
# more than one of the train / val / test sets.
train_images = set(df.loc[df['dset'] == 'train', 'file_name'].unique())
val_images = set(df.loc[df['dset'] == 'val', 'file_name'].unique())
test_images = set(df.loc[df['dset'] == 'test', 'file_name'].unique())

print(f'Training images:   {len(train_images)}')
print(f'Validation images: {len(val_images)}')
print(f'Test images:       {len(test_images)}')

# Show the overlaps first so that the offending files are visible in the output
train_val_overlap = train_images.intersection(val_images)
train_test_overlap = train_images.intersection(test_images)
val_test_overlap = val_images.intersection(test_images)
print(train_val_overlap)
print(train_test_overlap)
print(val_test_overlap)

# Enforce the check: a printed, non-empty set is easy to miss on "Run All"
assert not (train_val_overlap or train_test_overlap or val_test_overlap), \
    'Data sets are not distinct: some images are assigned to more than one data set'

Training images:   2391
Validation images: 63
Test images:       80
set()
set()
set()
