In [11]:
# Mount Google Drive at /content/drive; every path used below lives under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import shutil
from PIL import Image
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Split Datasets for YOLO Training

In [12]:
# Parent folder of the generated training_case_* datasets.
# NOTE(review): this variable is not referenced in the visible cells; the case folders below repeat the same prefix.
datasets_training_path = '/content/drive/MyDrive/capstone_periodontal/Datasets_for_training/checked_datasets_corrected_test'
# Folder searched for the '<id>.txt' label file of each image.
text_label_path = '/content/drive/MyDrive/capstone_periodontal/data_all_fin/data_labels/checked_labels'
# CSV files read into the split tables (the splitter reads their 'id' and 'split' columns).
pano_split_csv_path = '/content/drive/MyDrive/capstone_periodontal/Splited_HN_Datasets/check_datasets_corrected_test/pano_split.csv'
all_split_csv_path = '/content/drive/MyDrive/capstone_periodontal/Splited_HN_Datasets/check_datasets_corrected_test/all_split.csv'

In [13]:
# Load the split tables: pano_split_df is used for cases 1 and 2, all_split_df for case 3.
pano_split_df = pd.read_csv(pano_split_csv_path)
all_split_df = pd.read_csv(all_split_csv_path)

In [14]:
def split_dataset_by_patient_id(image_folder_path, label_folder_path, split_dataset_folder_path, split_df):
    """Copy images and their label files into train/val/test folders.

    Creates ``<split_dataset_folder_path>/<split>/{image,label}`` for each of
    'train', 'val' and 'test', then copies every image found directly inside
    ``image_folder_path`` into the split that ``split_df`` assigns to it.

    An image whose stem ends in ``_aug_<suffix>`` is treated as an augmented
    copy: its split and its label file are looked up under the stem with the
    trailing ``_aug_<suffix>`` removed, and the label is saved under the
    augmented stem so that image and label file names stay paired.

    Args:
        image_folder_path: Folder containing the image files
            (.png/.jpg/.jpeg/.bmp/.tif/.tiff, case-insensitive).
        label_folder_path: Folder containing one ``<id>.txt`` label per id.
        split_dataset_folder_path: Output root; created if missing.
        split_df: DataFrame with an 'id' column and a 'split' column. If an
            id appears more than once, the first row wins.

    Returns:
        None. Skipped files (no split row, invalid split value, missing
        label) are reported with ``print``; an image whose label is missing
        is still copied.
    """
    valid_splits = ('train', 'val', 'test')
    image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.tif', '.tiff')

    # Create folder structure
    # NOTE(review): Ultralytics YOLO conventionally pairs 'images'/'labels'
    # folders; these are named 'image'/'label' - confirm the training config
    # downstream expects that.
    for split in valid_splits:
        for subfolder in ('image', 'label'):
            os.makedirs(os.path.join(split_dataset_folder_path, split, subfolder), exist_ok=True)

    # Build the id -> split lookup once instead of filtering the frame for
    # every image. setdefault keeps the first row for duplicated ids.
    split_by_id = {}
    for key, value in zip(split_df['id'], split_df['split']):
        split_by_id.setdefault(key, value)

    # Sorted so that the log output is deterministic across runs.
    for filename in sorted(os.listdir(image_folder_path)):
        if not filename.lower().endswith(image_extensions):
            continue

        # splitext strips only the final extension, so stems that contain
        # dots (e.g. 'p.002.png') keep their full id.
        image_stem, _ = os.path.splitext(filename)

        # '<original>_aug_<suffix>' marks an augmented copy of '<original>'.
        stem_parts = image_stem.split('_')
        is_aug = len(stem_parts) >= 2 and stem_parts[-2] == 'aug'
        original_id = '_'.join(stem_parts[:-2]) if is_aug else image_stem

        # Determine the split from the lookup
        if original_id not in split_by_id:
            print(f"No split info for id: {original_id}, skipping.")
            continue

        split_value = split_by_id[original_id]
        if split_value not in valid_splits:
            print(f"Invalid split value for id {original_id}: {split_value}, skipping.")
            continue

        # Copy image
        src_image_path = os.path.join(image_folder_path, filename)
        dst_image_path = os.path.join(split_dataset_folder_path, split_value, 'image', filename)
        shutil.copy2(src_image_path, dst_image_path)

        # Copy corresponding label: read from the original id, save under the
        # image's own stem.
        # NOTE(review): augmented images reuse the original label unchanged;
        # that is only valid if the augmentation does not move the annotated
        # regions - confirm.
        src_label_path = os.path.join(label_folder_path, f"{original_id}.txt")
        if os.path.exists(src_label_path):
            dst_label_path = os.path.join(split_dataset_folder_path, split_value, 'label', f"{image_stem}.txt")
            shutil.copy2(src_label_path, dst_label_path)
        else:
            print(f"Label not found for: {filename}, skipping label.")


In [15]:
def check_split_dataset(split_dataset_folder_path):
    """Print a per-split report of the image and label folders.

    For each of 'train', 'val' and 'test', reports whether the 'image' and
    'label' subfolders exist under ``split_dataset_folder_path`` and, when
    they do, how many regular files each one contains (subdirectories are
    not counted).

    Args:
        split_dataset_folder_path: Root folder of a split dataset.

    Returns:
        None; the report is written with ``print``.
    """
    for split_name in ('train', 'val', 'test'):
        print(f"\nChecking split: {split_name}")
        for kind in ('image', 'label'):
            target_dir = os.path.join(split_dataset_folder_path, split_name, kind)

            # Guard clause: report the missing folder and move on.
            if not os.path.exists(target_dir):
                print(f"  {kind.capitalize()} folder MISSING!")
                continue

            # Count regular files only.
            n_files = sum(
                1
                for entry in os.listdir(target_dir)
                if os.path.isfile(os.path.join(target_dir, entry))
            )
            print(f"  {kind.capitalize()} folder exists. File count: {n_files}")

## Train Case 1: Pano

In [18]:
# Case 1 input: folder of pano images, passed to the splitter as the image folder.
pano_img_folder_path = '/content/drive/MyDrive/capstone_periodontal/data_all_fin/pano'
# Case 1 output root; the splitter creates train/val/test subfolders beneath it.
split_case1_folder_path = '/content/drive/MyDrive/capstone_periodontal/Datasets_for_training/checked_datasets_corrected_test/training_case_1'

In [19]:
# Entry count of the pano image folder (os.listdir counts every entry, not only images).
len(os.listdir(pano_img_folder_path))

189

In [20]:
# Copy the pano images and their labels into training_case_1/{train,val,test} using the pano split table.
split_dataset_by_patient_id(pano_img_folder_path, text_label_path, split_case1_folder_path, pano_split_df)

In [21]:
# Print the per-split image/label file counts for case 1.
check_split_dataset(split_case1_folder_path)


Checking split: train
  Image folder exists. File count: 157
  Label folder exists. File count: 157

Checking split: val
  Image folder exists. File count: 14
  Label folder exists. File count: 14

Checking split: test
  Image folder exists. File count: 18
  Label folder exists. File count: 18


## Train Case 2: Augmented Pano

In [22]:
# Case 2 input: folder of augmented pano images, passed to the splitter as the image folder.
augmented_pano_img_folder_path = '/content/drive/MyDrive/capstone_periodontal/augmented_datasets/pano'
# Case 2 output root; the splitter creates train/val/test subfolders beneath it.
split_case2_folder_path = '/content/drive/MyDrive/capstone_periodontal/Datasets_for_training/checked_datasets_corrected_test/training_case_2'

In [23]:
# Entry count of the augmented pano folder (os.listdir counts every entry, not only images).
len(os.listdir(augmented_pano_img_folder_path))

945

In [24]:
# Copy the augmented pano images into training_case_2/{train,val,test}; '<id>_aug_<suffix>' files take the split and label of '<id>'.
split_dataset_by_patient_id(augmented_pano_img_folder_path, text_label_path, split_case2_folder_path, pano_split_df)

In [25]:
# Print the per-split image/label file counts for case 2.
check_split_dataset(split_case2_folder_path)


Checking split: train
  Image folder exists. File count: 785
  Label folder exists. File count: 785

Checking split: val
  Image folder exists. File count: 70
  Label folder exists. File count: 70

Checking split: test
  Image folder exists. File count: 90
  Label folder exists. File count: 90


## Train Case 3: Augmented Pano + Peri

In [26]:
# Case 3 inputs: the augmented pano folder (same value as assigned for case 2) and the augmented peri folder.
augmented_pano_img_folder_path = '/content/drive/MyDrive/capstone_periodontal/augmented_datasets/pano'
augmented_peri_img_folder_path= '/content/drive/MyDrive/capstone_periodontal/augmented_datasets/peri'
# Case 3 output root; both inputs are split into this single folder.
split_case3_folder_path = '/content/drive/MyDrive/capstone_periodontal/Datasets_for_training/checked_datasets_corrected_test/training_case_3'

In [27]:
# Combined entry count of the augmented pano and peri folders (every directory entry is counted, not only images).
len(os.listdir(augmented_pano_img_folder_path)) + len(os.listdir(augmented_peri_img_folder_path))

1675

In [29]:
# First pass for case 3: copy the augmented pano images into training_case_3 using the combined split table.
split_dataset_by_patient_id(augmented_pano_img_folder_path, text_label_path, split_case3_folder_path, all_split_df)

In [30]:
# Second pass for case 3: copy the augmented peri images into the same training_case_3 folders, alongside the pano files.
split_dataset_by_patient_id(augmented_peri_img_folder_path, text_label_path, split_case3_folder_path, all_split_df)

In [31]:
# Print the per-split image/label file counts for case 3 (pano and peri combined).
check_split_dataset(split_case3_folder_path)


Checking split: train
  Image folder exists. File count: 890
  Label folder exists. File count: 890

Checking split: val
  Image folder exists. File count: 465
  Label folder exists. File count: 465

Checking split: test
  Image folder exists. File count: 320
  Label folder exists. File count: 320
