# Data Preprocessing and Loading

In [88]:
#Importing main packages

import json
import pathlib

import numpy as np
import tensorflow as tf

### Dividing data into different single JSONs
The main problem is that a 1.5 GB JSON file is pretty much intractable, so I decided to split it into one JSON file per image, dropping the information that is not needed here, such as lanes and drivable areas.

In [58]:
# Root folder of the label files, kept as a plain string because the cells
# below build their paths by string concatenation.
main_folder = str(pathlib.Path.cwd() / 'dataset_bdd' / 'labels')

# Monolithic label files, one per split.
json_train_dir = f'{main_folder}/bdd100k_labels_images_train.json'
json_val_dir = f'{main_folder}/bdd100k_labels_images_validation.json'

# Output folders for the per-image JSON files. The trailing slash matters:
# split_data() appends the image name directly to this prefix.
folder_train = f'{main_folder}/train_jsons/'
folder_val = f'{main_folder}/val_jsons/'

In [52]:
import json
import pathlib


def pov_change(x1,x2,y1,y2):
    """Convert a corner-format box into centre / half-extent format.

    Returns ``(xb, yb, wb, hb)`` where ``xb``/``yb`` is the midpoint of the two
    corners and ``wb``/``hb`` are HALF of the absolute width and height (the
    distance from the centre to each edge), not the full extents.
    """
    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    half_width = abs(x1 - x2) / 2
    half_height = abs(y1 - y2) / 2
    return center_x, center_y, half_width, half_height

def json_parser(path):
    """Read the JSON file at ``path`` and return the decoded Python object."""
    with open(path, 'r') as handle:
        return json.load(handle)

def json_cleaner(data):
    """Drop the fields and label categories this notebook discards.

    ``data`` is a list of per-image dicts. The list, its dicts and their
    ``labels`` lists are modified in place, and the same list is returned.

    Per image: the ``attributes`` and ``timestamp`` keys are removed.
    Per label: ``attributes``, ``manualShape``, ``manualAttributes`` and ``id``
    are removed; a ``box2d`` holding the corner keys x1/x2/y1/y2 has them
    replaced by the xb/yb/wb/hb values returned by ``pov_change`` (rounded to
    two decimals); labels whose category is ``lane`` or ``drivable area`` are
    dropped.

    Missing optional keys are tolerated, and a ``box2d`` that no longer carries
    all four corner keys is left untouched, so running the cleaner on already
    cleaned data is a no-op instead of a KeyError.
    """
    corner_keys = ('x1', 'x2', 'y1', 'y2')
    label_noise_keys = ('attributes', 'manualShape', 'manualAttributes')
    dropped_categories = ('lane', 'drivable area')

    for item in data:
        # Image-level metadata that is not kept in the per-image files.
        item.pop('attributes', None)
        item.pop('timestamp', None)

        labels = item.get('labels')
        if not labels:
            # Nothing to clean for images without (or with empty) labels.
            continue

        kept = []
        for label in labels:
            for key in label_noise_keys:
                label.pop(key, None)

            box = label.get('box2d')
            if box is not None and all(key in box for key in corner_keys):
                # corner_keys is ordered like pov_change's parameters.
                x1, x2, y1, y2 = (box.pop(key) for key in corner_keys)
                xb, yb, wb, hb = pov_change(x1, x2, y1, y2)
                box['xb'] = round(xb, 2)
                box['yb'] = round(yb, 2)
                box['wb'] = round(wb, 2)
                box['hb'] = round(hb, 2)

            # Checking if anything is corrupted
            if 'poly2d' not in label and 'box2d' not in label:
                print('no box2d?' + str(label.get('id')))
            if 'box3d' in label:
                print('wtf')

            # The id is only needed for the diagnostic above.
            label.pop('id', None)

            if label.get('category') not in dropped_categories:
                kept.append(label)

        # Slice-assign so the original list object is updated in place.
        labels[:] = kept
    return data

def showme_data_format(path):
    """Print a short description of the JSON labels file at ``path``.

    The file must decode to a non-empty sequence whose first element is a
    dict. The function prints the container types, the keys of the first
    element, each key/value pair of that element and, for the ``labels`` key,
    only the first two entries. Nothing is returned.
    """
    with open(path, 'r') as read_file:
        records = json.load(read_file)

    print(f'Data type: {type(records)!s}')
    sample = records[0]
    print(f'Element of the list: {type(sample)!s}')

    print('Keys of the dictionaries: ')
    for key in sample:
        print(f'    {key!s}')

    print('Dict example:')
    for key, value in sample.items():
        print(f'    Key: {key!s}')
        if key == 'labels':
            # Labels can be long: describe the container and show two entries.
            print(f'      Value: it is a {type(value)!s} made of {type(value[0])!s}')
            for position, obj in enumerate(value):
                if position >= 2:
                    break
                print(obj)
        else:
            print(f'      Value: {value!s}')
    print('\n\n\n')
            
def split_data(data,path):
    """Write every image record in ``data`` to its own JSON file.

    Each record is dumped with ``indent=4`` to
    ``path + record['name'] + '.json'``. ``path`` is used as a plain string
    prefix, so it has to end with a path separator to denote a folder.

    The destination folder is created when it does not exist yet, so the
    function also works on a fresh checkout instead of raising
    FileNotFoundError on the first write.
    """
    prepared_dirs = set()
    for item in data:
        target = path + item['name'] + '.json'
        parent = pathlib.Path(target).parent
        # Create each destination folder only once, not once per image.
        if parent not in prepared_dirs:
            parent.mkdir(parents=True, exist_ok=True)
            prepared_dirs.add(parent)
        with open(target, 'w') as file_to_write:
            json.dump(item, file_to_write, indent = 4)
    return

In [59]:
# Small sample file used to inspect the raw format and to try out the cleaner.
sample_path = main_folder + '/try.json'

print('Initial Setup of the dataset:')
showme_data_format(sample_path)

# json_cleaner() edits the parsed list in place, so `data` and `data_cleaned`
# refer to the same object after this call.
data = json_parser(sample_path)
data_cleaned = json_cleaner(data)

print('Down here we have a full example of a cleaned single JSON \n')
print(data_cleaned[0])

Initial Setup of the dataset:
Data type: <class 'list'>
Element of the list: <class 'dict'>
Keys of the dictionaries: 
    name
    attributes
    timestamp
    labels
Dict example:
    Key: name
      Value: b1c66a42-6f7d68ca.jpg
    Key: attributes
      Value: {'weather': 'overcast', 'scene': 'city street', 'timeofday': 'daytime'}
    Key: timestamp
      Value: 10000
    Key: labels
      Value: it is a <class 'list'> made of <class 'dict'>
{'category': 'traffic sign', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'none'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 1000.698742, 'y1': 281.992415, 'x2': 1040.626872, 'y2': 326.91156}, 'id': 0}
{'category': 'traffic sign', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'none'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 214.613695, 'y1': 172.190058, 'x2': 274.505889, 'y2': 229.586743}, 'id': 1}




Down here we have a full example of a clea

In [60]:
# Set to False to skip regenerating the per-image JSON files.
runtime = True
if runtime:
    # Training split: parse, clean in place, then write one JSON per image.
    data_train = json_cleaner(json_parser(json_train_dir))
    split_data(data_train, folder_train)
    print('Training data: done')

    # Validation split: same pipeline.
    data_val = json_cleaner(json_parser(json_val_dir))
    split_data(data_val, folder_val)
    print('Validation data: done')

Training data: done
Validation data: done


### Defining dataset parameters

In [3]:
# Batch size and target image size (in pixels) handed to the image loader.
batch_size = 32
img_height, img_width = 720, 1280

### Loading data from disk

In [85]:
import pathlib

# Folder that contains the image sub-folders.
data_dir = pathlib.Path.cwd() / 'dataset_bdd' / 'images' / '100k'

# Build a tf.data.Dataset that yields image batches only (label_mode=None).
# NOTE(review): the recorded output reports 3 classes under this folder, so
# the dataset appears to include every sub-folder of `100k`, not a single
# split. Confirm that this is intended for something named `train_ds`.
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    data_dir,
    label_mode=None,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)


Found 100000 files belonging to 3 classes.
