In [35]:
import os 
import shutil
import xml.etree.ElementTree as ET
from tqdm import tqdm
import numpy as np
from sklearn.model_selection import train_test_split
import json
from pathlib import Path 
 
# Lookup table from annotation class name to the integer label id written to
# the JSON files. Id 0 is reserved for the background class.
class_name_to_id_mapping = {
    "BACKGROUND": 0,
    "vehicle": 1,
    "rider": 2,
    "pedestrian": 3,
}

In [36]:
# Recursively and forcibly delete /root/ubi/Data/ (no confirmation prompt), so
# that the cells below start from an empty output tree.
# NOTE(review): the cells below write to the relative path '../../Data';
# presumably that resolves to this same directory from the notebook's working
# directory -- confirm before running, since this deletion is irreversible.
!rm -rf /root/ubi/Data/

In [37]:
# Collect the full path of every image and every annotation file found under
# <rootpath>/JPEGImages/All/<subfolder>/ and <rootpath>/Annotations/All/<subfolder>/.
images = []
annotations = []

rootpath = '/root/ubi/UBI_Dataset/'
for catalog_folder in tqdm(os.listdir(rootpath)):
    # Only these two top-level folders are harvested; anything else is ignored.
    if catalog_folder == 'Annotations':
        collected = annotations
    elif catalog_folder == 'JPEGImages':
        collected = images
    else:
        continue

    file_folder = os.path.join(rootpath, catalog_folder) + '/All'
    for i in os.listdir(file_folder):
        filepath = os.path.join(file_folder, i)
        for j in os.listdir(filepath):
            collected.append(os.path.join(filepath, j))

images.sort()
annotations.sort()

# train_test_split pairs images[k] with annotations[k], so the two sorted lists
# must describe the same samples in the same order. Verify that explicitly:
# a missing or extra file would otherwise silently shift every later pair and
# send an image and its annotation to different splits.
if len(images) != len(annotations):
    raise ValueError(
        f"Found {len(images)} images but {len(annotations)} annotations under {rootpath}")
for image_path, annotation_path in zip(images, annotations):
    image_stem = os.path.splitext(os.path.basename(image_path))[0]
    annotation_stem = os.path.splitext(os.path.basename(annotation_path))[0]
    if image_stem != annotation_stem:
        raise ValueError(
            f"Image/annotation mismatch: {image_path} is paired with {annotation_path}")

# Split the dataset into train / validation / test.
# 30% is held out first, and 30% of that hold-out becomes the test set, so the
# effective ratio is 70 : 21 : 9 (close to, but not exactly, 7:2:1).
# Use test_size=1/3 in the second call if an exact 7:2:1 split is required.
train_img, val_img, train_anns, val_anns = train_test_split(images, annotations, test_size = 0.3, random_state = 113)
val_img, test_img, val_anns, test_anns = train_test_split(val_img, val_anns, test_size = 0.3, random_state = 113)

print("Training set:", len(train_img), "validation set:", len(val_img), "test set:", len(test_img))

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:01<00:00,  2.32it/s]

Training set: 43760 validation set: 13128 test set: 5627





In [38]:
def move_files_to_folder(list_of_files, destination_folder):
    """Copy every file in ``list_of_files`` into ``destination_folder``.

    Despite the name, the files are copied with ``shutil.copy``; the sources
    are left in place.

    Args:
        list_of_files: Iterable of source file paths.
        destination_folder: Directory to copy into. It is created, together
            with any missing parent directories, if it does not exist yet.

    Raises:
        Exception: Whatever ``shutil.copy`` raised for the first file that
            could not be copied (e.g. ``FileNotFoundError``). The offending
            path is printed before the error propagates.
    """
    # makedirs also creates missing parents and tolerates an existing folder.
    os.makedirs(destination_folder, exist_ok=True)

    for f in tqdm(list_of_files):
        try:
            shutil.copy(f, destination_folder)
        except Exception:
            # Report which file failed, then re-raise the original error so
            # its type and traceback are preserved (a bare `assert False`
            # would hide the cause and is stripped under `python -O`).
            print(f)
            raise

# Root directory of each split; created up front (parents included) so the
# per-split 'labels' and 'images' sub-folders can be added below.
train_files = Path("../../Data/train")
val_files = Path("../../Data/val")
test_files = Path("../../Data/test")
for split_root in (train_files, val_files, test_files):
    split_root.mkdir(parents=True, exist_ok=True)

# Copy each split into its folders: annotations first, then images,
# in train -> val -> test order.
copy_plan = [
    (train_anns, '../../Data/train/labels'),
    (train_img, '../../Data/train/images'),
    (val_anns, '../../Data/val/labels'),
    (val_img, '../../Data/val/images'),
    (test_anns, '../../Data/test/labels'),
    (test_img, '../../Data/test/images'),
]
for split_files, split_destination in copy_plan:
    move_files_to_folder(split_files, split_destination)

100%|██████████| 43760/43760 [05:36<00:00, 130.12it/s]
100%|██████████| 43760/43760 [09:49<00:00, 74.25it/s] 
100%|██████████| 13128/13128 [01:45<00:00, 124.55it/s]
100%|██████████| 13128/13128 [02:44<00:00, 79.92it/s] 
100%|██████████| 5627/5627 [00:46<00:00, 120.35it/s]
100%|██████████| 5627/5627 [01:06<00:00, 84.53it/s] 


In [39]:
# Write <rootpath>/<split>/<split>.txt for each split: one sample id per line,
# where the id is the image file name without its extension, sorted.
rootpath = '../../Data'
data = {'train':[], 'val':[], 'test':[]}

# Iterate over the known split names rather than os.listdir(rootpath), so a
# stray entry in the data root cannot raise a KeyError on data[...].
for catalog_folder in tqdm(data):  # train, val, test
    file_folder = os.path.join(rootpath, catalog_folder, 'images')
    for filename in os.listdir(file_folder):
        # splitext removes exactly the extension. str.rstrip('.jpg') would
        # instead strip any trailing run of the characters '.', 'j', 'p', 'g'
        # and corrupt ids that end in one of those letters.
        data[catalog_folder].append(os.path.splitext(filename)[0])
    data[catalog_folder].sort()

for item in tqdm(data):
    with open(f'{rootpath}/{item}/{item}.txt', 'w') as f:
        for line in data[item]:
            f.write(f"{line}\n")

100%|██████████| 3/3 [00:00<00:00, 35.70it/s]
100%|██████████| 3/3 [00:00<00:00, 55.54it/s]


In [40]:
def extract_info_from_xml(path, xml_id):
    """Parse one VOC-style XML annotation file into plain Python lists.

    Args:
        path: Directory that contains the annotation file.
        xml_id: File name without the ``.xml`` extension.

    Returns:
        dict with keys
        ``'filename'`` (``xml_id`` as given),
        ``'boxes'`` (list of ``[xmin, ymin, xmax, ymax]`` floats, taken
        verbatim from the file and reordered so that min <= max),
        ``'labels'`` (ids looked up in ``class_name_to_id_mapping``) and
        ``'is_difficult'`` (0/1 ints), all aligned by index.
        Objects whose class name is not in the mapping, or that have no
        ``<name>``, are skipped.
    """
    objects = ET.parse(f"{path}/{xml_id}.xml").findall("object")
    boxes = []
    labels = []
    is_difficult = []
    for obj in objects:
        # Names are lower-cased before lookup, so only lower-case keys of
        # class_name_to_id_mapping can ever match.
        class_name = (obj.findtext('name') or '').lower().strip()
        # We are only concerned with classes in our list.
        if class_name not in class_name_to_id_mapping:
            continue

        bbox = obj.find('bndbox')

        # Coordinates are used exactly as stored; no index-base offset is applied.
        x1 = float(bbox.find('xmin').text)
        y1 = float(bbox.find('ymin').text)
        x2 = float(bbox.find('xmax').text)
        y2 = float(bbox.find('ymax').text)

        # Normalise boxes whose corners were annotated in the wrong order.
        if x1 > x2:
            x1, x2 = x2, x1
        if y1 > y2:
            y1, y2 = y2, y1

        boxes.append([x1, y1, x2, y2])
        labels.append(class_name_to_id_mapping[class_name])

        # Compare against None: an Element without children is falsy, so a
        # plain truth test would treat every <difficult> tag as absent.
        difficult = obj.find('difficult')
        difficult_text = difficult.text if difficult is not None else None
        is_difficult.append(
            int(difficult_text) if difficult_text and difficult_text.strip() else 0)

    return {'filename': xml_id, 'boxes': boxes, 'labels': labels, 'is_difficult': is_difficult}

In [41]:
# For every split folder under rootpath, parse the annotation of each id listed
# in <split>.txt and dump the parsed records to <split>.json.
for dataset_type in os.listdir(rootpath):
    labels_dir = f"{rootpath}/{dataset_type}/labels"
    jsonfile = f"{rootpath}/{dataset_type}/{dataset_type}.json"
    # Number of label files; only used as the progress-bar total.
    length = len(os.listdir(labels_dir))

    with open(f"{rootpath}/{dataset_type}/{dataset_type}.txt", 'r') as id_file:
        # Each line holds one id; strip the trailing newline before lookup.
        data = [
            extract_info_from_xml(labels_dir, xml_id.rstrip())
            for xml_id in tqdm(id_file, total=length)
        ]

    with open(jsonfile, 'w', encoding='utf8') as json_out:
        json.dump(data, json_out, ensure_ascii=False, indent=3)

100%|██████████| 43760/43760 [09:29<00:00, 76.79it/s] 
100%|██████████| 13128/13128 [00:01<00:00, 8801.52it/s] 
100%|██████████| 5627/5627 [00:00<00:00, 9064.34it/s]
