# build_new_dataset.ipynb

In [10]:
# Root of the Guam07 training-set workspace; the three paths used by this
# notebook are all derived from it.
BASE_DIR = '/home/aubrey/Desktop/Guam07-training-set'

# Dataset folder that this notebook deletes and rebuilds from scratch.
NEW_DATASET_DIR = f'{BASE_DIR}/datasets/rawdatasubset'
# Folder of the original images (not referenced by the cells shown here;
# presumably the parent of the `imagepath` values stored in the database).
IMAGE_DIR_PATH = f'{BASE_DIR}/rawdatasubset'
# SQLite database holding the `detected_objects` table.
OBJECTS_DB_FILE_PATH = f'{BASE_DIR}/code/rawdatasubset.sqlite3'

In [11]:
import os
import shutil
import subprocess
import sqlite3

## Create data structure for the new dataset

In [12]:
# Rebuild the dataset folder from scratch so the cell is safe to re-run:
# any previous copy (including stale symlinks and label files) is removed.
if os.path.isdir(NEW_DATASET_DIR):
    shutil.rmtree(NEW_DATASET_DIR)

# data.yaml goes in the top-level folder. The dataset root must be given under
# the `path:` key; a bare path on the first line makes the file invalid YAML
# (a plain scalar followed by mapping entries).
yaml = f'''path: {NEW_DATASET_DIR}
train: train
val: val
test: test
names:
  0: zero
  1: low
  2: medium
  3: high 
  4: fatal
  5: vcut'''

# classes.txt lists the class names, one per line, in class-index order
# (same order as `names` in data.yaml).
classes = '''zero
low
medium
high
fatal
vcut'''

# Create each subset folder and drop its classes.txt into it.
# (`subset` instead of `dir`, which would shadow the builtin.)
for subset in ['train', 'val', 'test']:
    subset_dir = f'{NEW_DATASET_DIR}/{subset}'
    os.makedirs(subset_dir, exist_ok=True)
    with open(f'{subset_dir}/classes.txt', 'w') as f:
        f.write(classes)

with open(f'{NEW_DATASET_DIR}/data.yaml', 'w') as f:
    f.write(yaml)

## Populate folders with symlinks to the original images (\*.jpg) and newly written label files (\*.txt)

In [13]:
# For every (imagepath, subset) pair in `detected_objects`:
#   1. symlink the original image into <NEW_DATASET_DIR>/<subset>/
#   2. write a label file next to it, one line per detected object:
#      "<cls> <x> <y> <w> <h>" with six decimals, ordered by x.
conn = sqlite3.connect(OBJECTS_DB_FILE_PATH)
conn.row_factory = sqlite3.Row   # enables accessing values in results by field name

try:
    # Materialise the outer result first so the per-image query below does not
    # run while another cursor on the same connection is still being iterated.
    image_rows = conn.execute(
        'SELECT imagepath, subset FROM detected_objects GROUP BY imagepath, subset;'
    ).fetchall()

    for row in image_rows:
        imagepath = row['imagepath']
        subset = row['subset']
        print(imagepath, subset)

        subset_dir = f'{NEW_DATASET_DIR}/{subset}'
        image_name = os.path.basename(imagepath)

        # Create symlink <subset_dir>/<image_name> -> imagepath.
        # os.symlink raises on failure (e.g. missing subset folder) instead of
        # failing silently like an unchecked `ln -s`; a stale link from an
        # earlier run is replaced so the cell can be re-run on its own.
        link_path = os.path.join(subset_dir, image_name)
        if os.path.lexists(link_path):
            os.remove(link_path)
        os.symlink(imagepath, link_path)

        # Collect the labels for this image. The path is bound as a parameter,
        # so quotes or other special characters in it cannot break the query.
        # NOTE(review): rows are selected by imagepath only, not by subset --
        # assumes an image belongs to exactly one subset; confirm.
        label_rows = conn.execute(
            'SELECT cls, x, y, w, h FROM detected_objects WHERE imagepath = ? ORDER BY x;',
            (imagepath,),
        )
        labels = '\n'.join(
            f"{r['cls']} {r['x']:.6f} {r['y']:.6f} {r['w']:.6f} {r['h']:.6f}"
            for r in label_rows
        )

        # Write labels file (<image stem>.txt). Using splitext rather than
        # replace('.jpg', '.txt') matters: for any other extension (.JPG,
        # .jpeg, .png) the replace left the name unchanged, so the labels were
        # written *through the symlink* and overwrote the original image.
        filename = os.path.splitext(image_name)[0] + '.txt'
        filepath = os.path.join(subset_dir, filename)
        with open(filepath, 'w') as f:
            f.write(labels)
finally:
    # Close the connection even if a symlink or file write fails part-way.
    conn.close()

/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111715.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111717.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111718.jpg test
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111719.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111720.jpg test
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111721.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111722.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111723.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111724.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111725.jpg test
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_111728.jpg train
/home/aubrey/Desktop/Guam07-training-set/rawdatasubset/IMG_20221115_