## Data splits ##
Split the data into training, validation and testing data

In [1]:
import os
import sys
import glob
import random
import logging
import json
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path
from matplotlib import pyplot as plt
from matplotlib.patches import Rectangle
import cv2

import albumentations as alb

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

%load_ext autoreload
%autoreload 2
import computervision
from computervision.dentex import Dentex
from computervision.imageproc import ImageData, is_image
from computervision.transformations import AugmentationTransform, DETRansform

# Print version info
print(f'Package version: {computervision.__version__}')
print(f'Python version:  {sys.version}')

Package version: v0.0.2
Python version:  3.12.3 (main, Jun 18 2025, 17:59:45) [GCC 13.3.0]


In [2]:
data_dir = os.environ.get('DATA')
print(f'data_dir: {data_dir}')
image_dir = os.path.join(data_dir, 'dataset_object_roboflow_240930')
print(f'data_dir: {data_dir}')

parquet_file_name = 'roboflow_240930.parquet'
parquet_file = os.path.join(image_dir, parquet_file_name)
try:
    df = pd.read_parquet(parquet_file)
except Exception as e:
    print(f'ERROR: Could not load file: \n {e}')
    print('Make sure to run the first notebook which downloads the data.')

file_col = 'multi_file'
bbox_col = 'bbox'
pos_col = 'pos'

# Let's get the data with the position information
print(df.shape[0])
df = df.loc[~df[pos_col].isnull()].astype({pos_col: int})
print(df.shape[0])
print(len(df[file_col].unique()))
display(df.head(5))

# Check the images
file_name_list = sorted(list(df[file_col].unique()))
file_list = [os.path.join(image_dir, file) for file in file_name_list]
checked = np.sum([is_image(file) for file in file_list])
print(f'Images with position: {len(file_list)}')
print(f'Images checked:       {checked}')

data_dir: /app/data_model
data_dir: /app/data_model
12559
8788
991


Unnamed: 0,id,license,file_name,height,width,date_captured,file_name_hash,dset,multi_file,bbox,category,disease,pos,box_id
1,0,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:33+00:00,40762886f4,train,40762886f4.jpg,"[8.0, 0.0, 130.293, 176.736]",tooth 23,teeth,11,40762886f4_1
2,0,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:33+00:00,40762886f4,train,40762886f4.jpg,"[130.0, 0.0, 258.408, 186.778]",tooth 25,teeth,13,40762886f4_2
3,0,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:33+00:00,40762886f4,train,40762886f4.jpg,"[258.0, 0.0, 438.382, 192.803]",tooth 26,teeth,14,40762886f4_3
4,0,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:33+00:00,40762886f4,train,40762886f4.jpg,"[436.0, 1.0, 607.21, 185.77]",tooth 27,teeth,15,40762886f4_4
5,0,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:33+00:00,40762886f4,train,40762886f4.jpg,"[5.0, 207.0, 79.395, 473.109]",tooth 33,teeth,22,40762886f4_5


Images with position: 991
Images checked:       991


In [3]:
# Function to plot an image with the bounding boxes
def plot_boxes(image, box_list, ax, label_list=None, color=None, cmap='grey'):
    offset_xy = (10, 100)
    # Take a ratio that looks good
    offset = (image.shape[1]*offset_xy[0]/2500,
              image.shape[0]*offset_xy[1]/1250)
    if color is None:
        # If no color is provided, color each box in a different color
        color_list = list(plt.cm.rainbow(np.linspace(0, 1, len(box_list))))
    else:
        color_list = [color]*len(box_list)
    ax.set(xticks=[], yticks=[])
    ax.imshow(image, cmap=cmap)
    # Loop over the bounding boxes
    for b, box in enumerate(box_list):
        rect = Rectangle(xy=(box[0], box[1]),
                         width=box[2],
                         height=box[3],
                         linewidth=1.5,
                         edgecolor=color_list[b],
                         facecolor='none',
                         alpha=0.7)
        ax.add_patch(rect)
        if label_list is not None:
            ax.text(x=box[0]+offset[0], y=box[1]+offset[1], s=label_list[b], color=color_list[b], fontsize=8)
    return ax

In [4]:
# Positions represented in the data
def pos_images(df, file_col, pos_col):
    n_pos = df[[file_col, pos_col]].\
        drop_duplicates().\
        groupby(pos_col).nunique().\
        reset_index(drop=False).\
        rename(columns={file_col: 'n_images'}).\
        sort_values(by='pos', ascending=True)
    return n_pos

n_pos = pos_images(df=df.copy(), file_col=file_col, pos_col=pos_col)
display(n_pos)

Unnamed: 0,pos,n_images
0,1,148
1,2,475
2,3,477
3,4,472
4,5,452
5,6,231
6,7,13
7,10,4
8,11,227
9,12,453


In [5]:
# Split a test set
n_test = 48
n_val = 48

# Start with a sorted list of panoramic x-rays
image_list = sorted(list(df[file_col].unique()))
print(len(image_list))

# Shuffle the list
seed = 123
random.seed(seed)
random.shuffle(image_list)
print(image_list[:5])

# Now we can pick the images for validation and testing
val_image_list = image_list[:n_val]
test_image_list = image_list[n_val+1:n_val+n_test+1]
print(len(val_image_list))
print(len(test_image_list))

# Assign the data sets to the images
df = df.assign(dset='train')
df.loc[df[file_col].isin(val_image_list), 'dset'] = 'val'
df.loc[df[file_col].isin(test_image_list), 'dset'] = 'test'

# Count images and annotations for each data set
for dset in ['train', 'val', 'test']:
    df_dset = df.loc[df['dset'] == dset]
    n_images = len(df_dset[file_col].unique())
    n_annotations = df_dset.shape[0]
    n_pos = pos_images(df=df_dset.copy(), file_col=file_col, pos_col=pos_col)
    
    print()
    print(f'{dset.upper()}')
    print(f'Images:      {n_images}')
    print(f'Annotations: {n_annotations}')
    display(n_pos)

991
['54e92edb7c.jpg', 'c7573f7a10.jpg', 'cd418c876a.jpg', 'f4dfdff926.jpg', 'eb79ef10bf.jpg']
48
48

TRAIN
Images:      895
Annotations: 7923


Unnamed: 0,pos,n_images
0,1,129
1,2,423
2,3,424
3,4,420
4,5,406
5,6,204
6,7,11
7,10,4
8,11,206
9,12,414



VAL
Images:      48
Annotations: 421


Unnamed: 0,pos,n_images
0,1,9
1,2,25
2,3,25
3,4,24
4,5,20
5,6,11
6,11,11
7,12,19
8,13,23
9,14,23



TEST
Images:      48
Annotations: 444


Unnamed: 0,pos,n_images
0,1,10
1,2,27
2,3,28
3,4,28
4,5,26
5,6,16
6,7,2
7,11,10
8,12,20
9,13,20


In [6]:
# Save the data frame
# Create a file name for the data split
split_file_name = f'{os.path.splitext(parquet_file_name)[0]}_dset.parquet'
split_file = os.path.join(image_dir, split_file_name)
print(split_file)
print(len(file_list))

if not os.path.exists(split_file):
    print(f'Saving data: {split_file}')
    df.to_parquet(split_file)
else:
    logger.warning(f'File {split_file} exists. Skipping.')
display(df.sample(5))

/app/data_model/dataset_object_roboflow_240930/roboflow_240930_dset.parquet
991
Saving data: /app/data_model/dataset_object_roboflow_240930/roboflow_240930_dset.parquet


Unnamed: 0,id,license,file_name,height,width,date_captured,file_name_hash,dset,multi_file,bbox,category,disease,pos,box_id
6156,488,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:33+00:00,3ece40fcfa,train,3ece40fcfa.jpg,"[527.0, 255.0, 637.769, 476.0]",tooth 44,teeth,28,3ece40fcfa_6156
4038,315,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:33+00:00,fab58294ce,test,fab58294ce.jpg,"[354.0, 4.0, 531.325, 227.071]",tooth 26,teeth,14,fab58294ce_4038
586,44,1,pbws-super-set-1-completed__PBWs_Super_Set_3-0...,480,640,2024-09-17T23:44:33+00:00,d2285fe196,train,d2285fe196.jpg,"[5.0, 228.0, 221.81, 477.53]",tooth 47,teeth,31,d2285fe196_586
11737,105,1,pbws-super-set-1-completed__PBWs_Super_Set_2-0...,480,640,2024-09-17T23:44:34+00:00,701c459a79,train,701c459a79.jpg,"[593.0, 251.0, 637.841, 458.137]",tooth 44,teeth,28,701c459a79_1354
9848,782,1,pbws-super-set-1-completed__PBWs_Super_Set_1-0...,480,640,2024-09-17T23:44:33+00:00,b687b51b5c,test,b687b51b5c.jpg,"[1.0, 0.0, 130.427, 209.0]",tooth 23,teeth,11,b687b51b5c_9848


In [7]:
# Making sure that the data sets are distinct
train_images = set(sorted(list(df.loc[df['dset'] == 'train', 'file_name'].unique())))
val_images = set(sorted(list(df.loc[df['dset'] == 'val', 'file_name'].unique())))
test_images = set(sorted(list(df.loc[df['dset'] == 'test', 'file_name'].unique())))

print(f'Training images:   {len(train_images)}')
print(f'Validation images: {len(val_images)}')
print(f'Test images:       {len(test_images)}')

print(train_images.intersection(val_images))
print(train_images.intersection(test_images))
print(val_images.intersection(test_images))

Training images:   895
Validation images: 48
Test images:       48
set()
set()
set()
