In [1]:
from IPython.display import Image 
import os 
import shutil
import xml.etree.ElementTree as ET
from xml.dom import minidom
from tqdm import tqdm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import cv2
from pathlib import Path 
 
def extract_info_from_xml(xml_file):
    root = ET.parse(xml_file).getroot()
    
    # Initialise the info dict 
    info_dict = {}
    info_dict['bboxes'] = []

    # Parse the XML Tree
    for elem in root:
        # Get the file name 
        if elem.tag == "filename":
            info_dict['filename'] = elem.text
            
        # Get the image size
        elif elem.tag == "size":
            image_size = []
            for subelem in elem:
                image_size.append(int(subelem.text))
            
            info_dict['image_size'] = tuple(image_size)
        
        # Get details of the bounding box 
        elif elem.tag == "object":
            bbox = {}
            for subelem in elem:
                if subelem.tag == "name":
                    bbox["class"] = subelem.text
                    
                elif subelem.tag == "bndbox":
                    for subsubelem in subelem:
                        bbox[subsubelem.tag] = int(subsubelem.text)            
            info_dict['bboxes'].append(bbox)
    
    return info_dict

In [2]:
# Dictionary that maps class names to IDs
class_name_to_id_mapping = {"vehicle":0, "rider":1, "pedestrian":2}

images = []
annotations = []

rootpath = '/root/ubi/UBI_Dataset/'        
for catalog_folder in tqdm(os.listdir(rootpath)):
    file_folder = os.path.join(rootpath, catalog_folder)
    if catalog_folder == 'Annotations':
        file_folder = file_folder + '/All'
        for i in os.listdir(file_folder):
            filepath = os.path.join(file_folder, i)
            for j in os.listdir(filepath):
                annotations.append(os.path.join(filepath, j))
    elif catalog_folder == 'JPEGImages':
        file_folder = file_folder + '/All'
        for i in os.listdir(file_folder):
            filepath = os.path.join(file_folder, i)
            for j in os.listdir(filepath):
                images.append(os.path.join(filepath, j))

images.sort() 
annotations.sort()
# Split the dataset into train-valid-test splits by 7:2:1 
train_img, val_img, train_anns, val_anns = train_test_split(images, annotations, test_size = 0.3, random_state = 113)
val_img, test_img, val_anns, test_anns = train_test_split(val_img, val_anns, test_size = 0.3, random_state = 113)

print("Training set:", len(train_img), "validation set:", len(val_img), "test set:", len(test_img))
# print(train_img[0], train_anns[0].rstrip('.xml'))


100%|██████████| 4/4 [00:00<00:00, 14.67it/s]

Training set: 43760 validation set: 13128 test set: 5627





In [3]:
def move_files_to_folder(list_of_files, destination_folder):
    if not os.path.isdir(destination_folder):
        os.mkdir(destination_folder)
        
    for f in tqdm(list_of_files):
        try:
            shutil.copy(f, destination_folder)
        except:
            print(f)
            assert False

train_files = Path("../../Data/train")
train_files.mkdir(parents=True, exist_ok=True)
val_files = Path("../../Data/val")
val_files.mkdir(parents=True, exist_ok=True)
test_files = Path("../../Data/test")
test_files.mkdir(parents=True, exist_ok=True)

# Move the splits into their folders
move_files_to_folder(train_anns, '../../Data/train/labels')
move_files_to_folder(train_img, '../../Data/train/images')

move_files_to_folder(val_anns, '../../Data/val/labels')
move_files_to_folder(val_img, '../../Data/val/images')

move_files_to_folder(test_anns, '../../Data/test/labels')
move_files_to_folder(test_img, '../../Data/test/images')

100%|██████████| 50012/50012 [06:00<00:00, 138.57it/s]
100%|██████████| 50012/50012 [10:06<00:00, 82.44it/s] 
100%|██████████| 8752/8752 [01:09<00:00, 125.05it/s]
100%|██████████| 8752/8752 [01:47<00:00, 81.24it/s] 
100%|██████████| 3751/3751 [00:31<00:00, 118.69it/s]
100%|██████████| 3751/3751 [00:44<00:00, 84.34it/s] 


In [4]:
rootpath = '../../Data'        
data = {'train':[], 'val':[], 'test':[]}

for catalog_folder in tqdm(os.listdir(rootpath)):  #train、val、test
    abspath = os.path.join(rootpath, catalog_folder)
    file_folder = os.path.join(rootpath, catalog_folder, 'images')
    for filename in os.listdir(file_folder):        #filename
        data[catalog_folder].append(filename.rstrip('.jpg'))
    data[catalog_folder].sort()

100%|██████████| 3/3 [00:00<00:00, 28.87it/s]


In [6]:
for item in tqdm(data):
    
    with open(f'../../Data/{item}/{item}.txt', 'w') as f:
        for line in data[item]:
            f.write(f"{line}\n")

100%|██████████| 3/3 [00:00<00:00, 174.58it/s]
