# Dataset Formatting
The Berkeley Deep Drive dataset is surely cool, but we need to filter the enormous amount of info that are given with each image.

In [2]:
#Importing main packages

import numpy as np
import tensorflow as tf
import cv2
import pathlib
import json

## Files preprocessing
The main problem is that a 1.5GB JSON file is pretty much untractable, so i decided to split it in one JSON per image, deleting all useless information like lines and drivable areas and saving the new files elsewhere.

Other than that, it seems like there are 140 images w/o label. That's pretty strange since i didn't see anyone ever mentioning it on the forums, but for now i will delete them and act like they never existed.

In [3]:
main_folder = str(pathlib.Path.cwd().joinpath('dataset_bdd', 'labels'))

json_train_dir = main_folder + '/bdd100k_labels_images_train.json'
json_val_dir = main_folder + '/bdd100k_labels_images_validation.json'

folder_train = main_folder + '/train_jsons/'
folder_train_csv = main_folder + '/train_label_raw/'
folder_val = main_folder + '/val_jsons/'
folder_val_csv = main_folder + '/val_label_raw/'

### New data form
And using more useful coordinates to describe the bounding boxes $(x, y, w, h)$ instead of $(x_1, x_2, y_1, y_2)$ and deleting all useless info

In [4]:
#A function to get more meaningful values to describe the bounding boxes
def pov_change(x1,x2,y1,y2):
    xb = (x1+x2)/(2)
    yb = (y1+y2)/(2)
    wb = abs(x1-x2)/2
    hb = abs(y1-y2)/2
    return xb,yb,wb,hb

#Loading a JSON file into a Python variable
def json_parser(path):
    with open(path, 'r') as read_file:
        data = json.load(read_file)
    return data

In [5]:
#Cleaning a JSON file from all the useless information
def json_cleaner(data):
    # Cleaning timestamps and not so useful attributes
    for item in data:
        del item['attributes']
        del item['timestamp']

    # Cleaning drivable area and lanes
    for item in data:
        storing_indexes = []
        for index, i in enumerate(item['labels']):
            del i['attributes']
            del i['manualShape']
            del i['manualAttributes']
            if 'box2d' in i:
                xb,yb,wb,hb = pov_change(i['box2d']['x1'],i['box2d']['x2'],i['box2d']['y1'],i['box2d']['y2'])
                i['box2d']['xb'] = round(xb)
                i['box2d']['yb'] = round(yb)
                i['box2d']['wb'] = round(wb)
                i['box2d']['hb'] = round(hb)
                del i['box2d']['x1']
                del i['box2d']['x2']
                del i['box2d']['y1']
                del i['box2d']['y2']
            # Checking if anything is corrupted
            if not 'poly2d' in i and not 'box2d' in i: print('no box2d?' + str(i['id']))
            if 'box3d' in i: print('wtf')
                
            del i['id']
            if i['category'] == 'lane' or i['category'] == 'drivable area':
                storing_indexes.append(index)
        storing_indexes.sort(reverse=True)
        for indexes in storing_indexes:
            del item['labels'][indexes]
    return data

### Splitting labels in single files
We're gonna create a json file for every image, with the same name + '.json'

In [6]:
#Dividing a single JSON into multiple ones
def split_data(data,path):
    for item in data:
        name = item['name']
        with open(path + name + '.json', 'w') as file_to_write:
            json.dump(item, file_to_write, indent = 4)
    return

## Update: splitting in raw files
It doesn't seem like a good idea to stick with json data format, since there are way more efficient way to store info and load them to tensorflow. The best choice may be TFRecord, but we'll make it simple using raw files.
I also decided to encode the category label into one-hot vector and add the objectiveness label, in such a way i get the 1x15 target vector already ok.

In [27]:
class_names = ['bus', 'traffic light', 'traffic sign', 'person', 'bike', 'truck', 'motor', 'car', 'train', 'rider']

def split_data_csv(data, path):
    for item in data:
        name = item['name']
        line = []
        for objects in item['labels']:
            one_hot = [int(objects['category'] == x) for x in class_names]
            line_to_append = [objects['box2d']['xb'],
                    objects['box2d']['yb'], 
                    objects['box2d']['wb'],
                    objects['box2d']['hb'],
                    1]
            for logic in one_hot: line_to_append.append(logic)
            line.append(line_to_append)
        array = np.array(line, dtype = 'int16')
        array.tofile(path + name + '.rawlabel')


May take a while to complete the code below, be aware!

In [25]:
!pwd

/home/andrea/AI/ispr_yolo/data


In [28]:
runtime = True
if runtime:
    data_train = json_parser(json_train_dir)
    data_train = json_cleaner(data_train)
    print('Starting...')
    split_data_csv(data_train, folder_train_csv)
    print('Training data: done')
    data_val = json_parser(json_val_dir)
    data_val = json_cleaner(data_val)
    split_data_csv(data_val, folder_val_csv)
    print('Validation data: done')

Starting...
Training data: done
Validation data: done


Starting...
Training data: done
Validation data: done


### Deleting corrupted images
Making sure every image in the train folder has a json file attached to it, otherwise delete the image.

In [8]:
def detect_missing_labels(delete = False):
    counter = 1
    main_dir = pathlib.Path.cwd().joinpath('dataset_bdd')
    img_dir = main_dir.joinpath('images', '100k', 'train').glob('*.jpg')
    label_dir = main_dir.joinpath('labels', 'train_jsons')
    for img in img_dir:
        img_name = img.name
        label_path = label_dir.joinpath(img_name + '.json')
        if not label_path.is_file():
            counter = counter + 1
            if delete:
                img.unlink()
                print('deleted')
    print('Images w/o label:' + str(counter))
    
detect_missing_labels()

Images w/o label:1


## New Data Format

Here below you can see how data was initially formatted, and how we cleaned it up

In [9]:
#Example function to show the difference between the data before and after the cleaning
def showme_data_format(path):
    with open(path, 'r') as read_file:
        data = json.load(read_file)
    print('Data type: ' + str(type(data)))
    print('Element of the list: ' + str(type(data[0])))
    print('Keys of the dictionaries: ')
    for key in data[0]:
        print('    ' + str(key))
    print('Dict example:')
    for key, value in data[0].items():
        if key != 'labels':
            print('    Key: ' + str(key))
            print('      Value: ' + str(value))
        else:
            print('    Key: ' + str(key))
            print('      Value: it is a ' + str(type(value)) + ' made of ' + str(type(value[0])))
            for ondex, obj in enumerate(value):
                if ondex < 2: print(obj)
    print('\n\n\n')

print('Initial Setup of the dataset:')
showme_data_format(main_folder + '/try.json')
data = json_parser(main_folder + '/try.json')
data_cleaned = json_cleaner(data)

Initial Setup of the dataset:
Data type: <class 'list'>
Element of the list: <class 'dict'>
Keys of the dictionaries: 
    name
    attributes
    timestamp
    labels
Dict example:
    Key: name
      Value: b1c66a42-6f7d68ca.jpg
    Key: attributes
      Value: {'weather': 'overcast', 'scene': 'city street', 'timeofday': 'daytime'}
    Key: timestamp
      Value: 10000
    Key: labels
      Value: it is a <class 'list'> made of <class 'dict'>
{'category': 'traffic sign', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'none'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 1000.698742, 'y1': 281.992415, 'x2': 1040.626872, 'y2': 326.91156}, 'id': 0}
{'category': 'traffic sign', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'none'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 214.613695, 'y1': 172.190058, 'x2': 274.505889, 'y2': 229.586743}, 'id': 1}






In [10]:
print('Down here we have a full example of a cleaned single JSON \n')
print(data_cleaned[0])

Down here we have a full example of a cleaned single JSON 

{'name': 'b1c66a42-6f7d68ca.jpg', 'labels': [{'category': 'traffic sign', 'box2d': {'xb': 1020.66, 'yb': 304.45, 'wb': 19.96, 'hb': 22.46}}, {'category': 'traffic sign', 'box2d': {'xb': 244.56, 'yb': 200.89, 'wb': 29.95, 'hb': 28.7}}, {'category': 'traffic sign', 'box2d': {'xb': 813.54, 'yb': 327.54, 'wb': 16.22, 'hb': 14.35}}, {'category': 'traffic sign', 'box2d': {'xb': 668.8, 'yb': 309.44, 'wb': 16.22, 'hb': 6.24}}, {'category': 'traffic light', 'box2d': {'xb': 711.84, 'yb': 320.05, 'wb': 4.37, 'hb': 8.11}}, {'category': 'traffic light', 'box2d': {'xb': 631.36, 'yb': 306.32, 'wb': 4.99, 'hb': 10.61}}, {'category': 'traffic light', 'box2d': {'xb': 323.17, 'yb': 298.21, 'wb': 6.24, 'hb': 8.73}}, {'category': 'traffic sign', 'box2d': {'xb': 290.1, 'yb': 295.09, 'wb': 19.34, 'hb': 6.86}}, {'category': 'traffic sign', 'box2d': {'xb': 226.47, 'yb': 306.32, 'wb': 5.61, 'hb': 5.61}}, {'category': 'car', 'box2d': {'xb': 243.94, 'yb'