# Data Processing and Loading

In [1]:
#Importing main packages

import numpy as np
import tensorflow as tf

## Data preprocessing
The main problem is that a 1.5GB JSON file is pretty much untractable, so i decided to split in one JSON per each image, deleting all useless information like lines and drivable areas.

Other than that, it seems like there are 140 images w/o label. That's pretty strange since i didn't see anyone ever mentioning it on the forums, but for now i will delete them and act like they never existed.

In [2]:
import pathlib
main_folder = str(pathlib.Path.cwd().joinpath('dataset_bdd', 'labels'))

json_train_dir = main_folder + '/bdd100k_labels_images_train.json'
json_val_dir = main_folder + '/bdd100k_labels_images_validation.json'

folder_train = main_folder + '/train_jsons/'
folder_val = main_folder + '/val_jsons/'

### Dividing data into different single JSONs
And using more useful coordinates to describe the bounding boxes

In [3]:
import json

#A function to get more meaningful values to describe the bounding boxes
def pov_change(x1,x2,y1,y2):
    xb = (x1+x2)/(2)
    yb = (y1+y2)/(2)
    wb = abs(x1-x2)/2
    hb = abs(y1-y2)/2
    return xb,yb,wb,hb

#Loading a JSON file into a Python variable
def json_parser(path):
    # Parsing
    with open(path, 'r') as read_file:
        data = json.load(read_file)
    return data

#Cleaning a JSON file from all the useless information (in respect to our task)
def json_cleaner(data):
    # Cleaning timestamps and not so useful attributes
    for item in data:
        del item['attributes']
        del item['timestamp']

    # Cleaning drivable area and lanes
    for item in data:
        storing_indexes = []
        for index, i in enumerate(item['labels']):
            del i['attributes']
            del i['manualShape']
            del i['manualAttributes']
            if 'box2d' in i:
                xb,yb,wb,hb = pov_change(i['box2d']['x1'],i['box2d']['x2'],i['box2d']['y1'],i['box2d']['y2'])
                i['box2d']['xb'] = round(xb,2)
                i['box2d']['yb'] = round(yb,2)
                i['box2d']['wb'] = round(wb,2)
                i['box2d']['hb'] = round(hb,2)
                del i['box2d']['x1']
                del i['box2d']['x2']
                del i['box2d']['y1']
                del i['box2d']['y2']
            # Checking if anything is corrupted
            if not 'poly2d' in i and not 'box2d' in i: print('no box2d?' + str(i['id']))
            if 'box3d' in i: print('wtf')
                
            del i['id']
            if i['category'] == 'lane' or i['category'] == 'drivable area':
                storing_indexes.append(index)
        storing_indexes.sort(reverse=True)
        for indexes in storing_indexes:
            del item['labels'][indexes]
    return data

#Dividing a single JSON into multiple ones
def split_data(data,path):
    for item in data:
        name = item['name']
        with open(path + name + '.json', 'w') as file_to_write:
            json.dump(item, file_to_write, indent = 4)
    return

Down below you can see how ugly formatted was the main JSON file

In [7]:
#Example function to show the difference between the data before and after the cleaning
def showme_data_format(path):
    with open(path, 'r') as read_file:
        data = json.load(read_file)
    print('Data type: ' + str(type(data)))
    print('Element of the list: ' + str(type(data[0])))
    print('Keys of the dictionaries: ')
    for key in data[0]:
        print('    ' + str(key))
    print('Dict example:')
    for key, value in data[0].items():
        if key != 'labels':
            print('    Key: ' + str(key))
            print('      Value: ' + str(value))
        else:
            print('    Key: ' + str(key))
            print('      Value: it is a ' + str(type(value)) + ' made of ' + str(type(value[0])))
            for ondex, obj in enumerate(value):
                if ondex < 2: print(obj)
    print('\n\n\n')

print('Initial Setup of the dataset:')
showme_data_format(main_folder + '/try.json')
data = json_parser(main_folder + '/try.json')
data_cleaned = json_cleaner(data)
print('Down here we have a full example of a cleaned single JSON \n')
print(data_cleaned[0])

Initial Setup of the dataset:
Data type: <class 'list'>
Element of the list: <class 'dict'>
Keys of the dictionaries: 
    name
    attributes
    timestamp
    labels
Dict example:
    Key: name
      Value: b1c66a42-6f7d68ca.jpg
    Key: attributes
      Value: {'weather': 'overcast', 'scene': 'city street', 'timeofday': 'daytime'}
    Key: timestamp
      Value: 10000
    Key: labels
      Value: it is a <class 'list'> made of <class 'dict'>
{'category': 'traffic sign', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'none'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 1000.698742, 'y1': 281.992415, 'x2': 1040.626872, 'y2': 326.91156}, 'id': 0}
{'category': 'traffic sign', 'attributes': {'occluded': False, 'truncated': False, 'trafficLightColor': 'none'}, 'manualShape': True, 'manualAttributes': True, 'box2d': {'x1': 214.613695, 'y1': 172.190058, 'x2': 274.505889, 'y2': 229.586743}, 'id': 1}




Down here we have a full example of a clea

Here we have the actual code that splits the original file: be aware that if you run it it may take really long!

In [5]:
runtime = False
if runtime:
    data_train = json_parser(json_train_dir)
    data_train = json_cleaner(data_train)
    split_data(data_train, folder_train)
    print('Training data: done')
    data_val = json_parser(json_val_dir)
    data_val = json_cleaner(data_val)
    split_data(data_val, folder_val)
    print('Validation data: done')

### Deleting corrupted image
Making sure every image in the train folder has a json file attached to it, otherwise delete the image.

In [76]:
def detect_missing_labels(delete = False):
    counter = 1
    main_dir = pathlib.Path.cwd().joinpath('dataset_bdd')
    img_dir = main_dir.joinpath('images', '100k', 'train').glob('*.jpg')
    label_dir = main_dir.joinpath('labels', 'train_jsons')
    for img in img_dir:
        img_name = img.name
        label_path = label_dir.joinpath(img_name + '.json')
        if not label_path.is_file():
            counter = counter + 1
            if delete:
                img.unlink()
                print('deleted')
    print('Images w/o label:' + str(counter))
    
detect_missing_labels()

Images w/o label:138


In [78]:
detect_missing_labels(delete=True)

Images w/o label:1


## Data Processing

In [6]:
batch_size = 32
img_height = 720
img_width = 1280

### Loading data from disk

In [None]:
#Here below a function to convert a file path to a tuple with both image and labels

data_dir = str(pathlib.Path.cwd().joinpath('dataset_bdd', 'images', '100k'))
pathlist_img = pathlib.Path(data_dir).glob('train/*.jpg')

def data_encoding(data): #Convert the class labels given as a dict into one-hot vector encoding, and concatenate it with the bounding box coordinates.
    #It return a Tensorflow Tensor
    labels = tf.constant([])
    class_names = ['bus', 'traffic light', 'traffic sign', 'person', 'bike', 'truck', 'motor', 'car', 'train', 'rider']
    for index,obj in enumerate(data['labels']):
        label_class = class_names.index(obj['category'])
        temp1 = tf.one_hot(label_class, len(class_names))
        temp2 = tf.constant([x for x in obj['box2d'].values()])
        temp3 = tf.concat([temp2, temp1], 0)
        labels = tf.concat([labels,temp3], 0)
    labels = tf.reshape(labels, [-1,14])
    return labels

def get_label(image_path): #Given the image path, it loads the labels from the JSON
    if isinstance(image_path, str):
        image_path = pathlib.Path(image_path)
    file_name = image_path.name + '.json'
    label_path = image_path.parent.parent.parent.parent.joinpath('labels', 'train_jsons', file_name)
    with open(label_path, 'r') as read_file:
        data = json.load(read_file)
    labels = data_encoding(data)
    return labels

def process_path(file_path): #Given an image path, it return a tuple (img, labels) where both are tensors
    labels = get_label(file_path)
    # load the raw data from the file as a string
    filepath = tf.constant(str(file_path))
    img = tf.io.read_file(filepath)
    img = tf.image.decode_jpeg(img, channels=3)
    return img, labels


def dataset_generator():
    for path in pathlist_img:
        img, labels = process_path(path)
        yield img, labels 

### Generating the tf.data.Dataset

In [None]:
dataset = tf.data.Dataset.from_generator(dataset_generator, (tf.uint8, tf.float32) , (tf.TensorShape([720, 1280, 3]), tf.TensorShape([None, 14])))
print(dataset)

dataset_s = dataset.shuffle(70000, reshuffle_each_iteration=False)
print(dataset)

val_size = int(7000)
train_ds = dataset_s.skip(val_size)
val_ds = dataset_s.take(val_size)

print(train_ds)
print(val_ds)
