# Reads in the dataset

In [1]:
# Imports libraries
import json

In [2]:
# Declares constants
dataset_filename = "bdd100k_labels_images_train.json"

In [3]:
# Reads in the declared file
with open(dataset_filename, "r") as read_file:
    dataset = json.load(read_file)

# Gets properties of the dataset
dataset_len = len(dataset)
dataset_keys = dataset[0].keys()

## Describes the dataset

In [4]:
print('Number of Groups: {0}'.format(dataset_len))

Number of Groups: 69863


In [5]:
print('Number of keys per Group: {0}'.format(len(dataset_keys)))

# Print each key
for key in dataset_keys:
    print('  key: {0}'.format(key))

Number of keys per Group: 4
  key: name
  key: attributes
  key: timestamp
  key: labels


In [6]:
import random

sample_index = random.randint(0, dataset_len)
sample = dataset[sample_index]

sample

{'name': '45cee415-3919c3d5.jpg',
 'attributes': {'weather': 'clear',
  'scene': 'city street',
  'timeofday': 'night'},
 'timestamp': 10000,
 'labels': [{'category': 'traffic light',
   'attributes': {'occluded': False,
    'truncated': False,
    'trafficLightColor': 'red'},
   'manualShape': True,
   'manualAttributes': True,
   'box2d': {'x1': 423.868242,
    'y1': 239.22934,
    'x2': 459.752644,
    'y2': 247.030297},
   'id': 757793},
  {'category': 'traffic light',
   'attributes': {'occluded': False,
    'truncated': False,
    'trafficLightColor': 'red'},
   'manualShape': True,
   'manualAttributes': True,
   'box2d': {'x1': 494.076855,
    'y1': 239.22934,
    'x2': 517.479726,
    'y2': 250.15068},
   'id': 757794},
  {'category': 'traffic light',
   'attributes': {'occluded': False,
    'truncated': False,
    'trafficLightColor': 'red'},
   'manualShape': True,
   'manualAttributes': True,
   'box2d': {'x1': 1076.028249,
    'y1': 240.789532,
    'x2': 1099.43112,
    'y

## Helper functions

In [7]:
def get_labels(group, include):
    def filter_categories(label):
        return label['category'] in include
    
    return list(filter(filter_categories, group['labels']))

car_labels = get_labels(dataset[0], {'car'})

In [8]:
car_labels

[{'category': 'car',
  'attributes': {'occluded': False,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1': 45.240919,
   'y1': 254.530367,
   'x2': 357.805838,
   'y2': 487.906215},
  'id': 4},
 {'category': 'car',
  'attributes': {'occluded': False,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1': 507.82755,
   'y1': 221.727518,
   'x2': 908.367588,
   'y2': 442.715126},
  'id': 5}]

## Converts the dataset

In [9]:
def flatten_labels(include):
    labels = []
    
    # Iterates through each group in the dataset
    for i in range(dataset_len):
        group_labels = get_labels(dataset[i], include)
        
        # Iterates through each label from this group
        for j in range(len(group_labels)):
            group_labels[j].update( {'name'       : dataset[i]['name']} )
            group_labels[j].update( {'properties' : dataset[i]['attributes']} )
            group_labels[j].update( {'timestamp'  : dataset[i]['timestamp']} )
        
        # Adds this group's labels to the list of labels
        labels = labels + group_labels
    return labels

pedestrian_data = flatten_labels({'person'})

In [10]:
pedestrian_data

[{'category': 'person',
  'attributes': {'occluded': False,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1': 423.158909,
   'y1': 336.241987,
   'x2': 434.176547,
   'y2': 358.277263},
  'id': 14,
  'name': '0000f77c-62c2a288.jpg',
  'properties': {'weather': 'clear',
   'scene': 'highway',
   'timeofday': 'dawn/dusk'},
  'timestamp': 10000},
 {'category': 'person',
  'attributes': {'occluded': False,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes': True,
  'box2d': {'x1': 563.939839,
   'y1': 324.000165,
   'x2': 573.733294,
   'y2': 348.483804},
  'id': 15,
  'name': '0000f77c-62c2a288.jpg',
  'properties': {'weather': 'clear',
   'scene': 'highway',
   'timeofday': 'dawn/dusk'},
  'timestamp': 10000},
 {'category': 'person',
  'attributes': {'occluded': True,
   'truncated': False,
   'trafficLightColor': 'none'},
  'manualShape': True,
  'manualAttributes'

In [11]:
# Transforms the obtained data
def transform_data(transform, data):
    transformed_data = []
    
    # Transforms each row in the given data
    for label in data:
        new_label = dict(label)
        
        # Transforms the data with the given transform function
        transformed_label = transform(new_label)
        transformed_data.append(transformed_label)
    
    return transformed_data

In [12]:
# Adds new columns
def flatten_data(label):
    # flattens the box2d data
    label.update( {'x1' : label['box2d']['x1']} )
    label.update( {'y1' : label['box2d']['y1']} )
    label.update( {'x2' : label['box2d']['x2']} )
    label.update( {'y2' : label['box2d']['y2']} )
    
    # flattens the attributes data
    label.update( {'occluded'          : label['attributes']['occluded']} )
    label.update( {'truncated'         : label['attributes']['truncated']} )
    label.update( {'trafficLightColor' : label['attributes']['trafficLightColor']} )
    
    # flattens the attributes data
    label.update( {'weather'   : label['properties']['weather']} )
    label.update( {'scene'     : label['properties']['scene']} )
    label.update( {'timeofday' : label['properties']['timeofday']} )
    
    # removes the hierarchical data
    del label['box2d']
    del label['attributes']
    del label['properties']
    
    # returns the transformed data
    return label

# Deletes unused columns
def dump_columns(label):
    del label['manualShape']
    del label['manualAttributes']
    
    # returns the transformed data
    return label

In [13]:
flattened_data = transform_data(flatten_data, pedestrian_data)
cleaned_data = transform_data(dump_columns, flattened_data)

In [14]:
cleaned_data

[{'category': 'person',
  'id': 14,
  'name': '0000f77c-62c2a288.jpg',
  'timestamp': 10000,
  'x1': 423.158909,
  'y1': 336.241987,
  'x2': 434.176547,
  'y2': 358.277263,
  'occluded': False,
  'truncated': False,
  'trafficLightColor': 'none',
  'weather': 'clear',
  'scene': 'highway',
  'timeofday': 'dawn/dusk'},
 {'category': 'person',
  'id': 15,
  'name': '0000f77c-62c2a288.jpg',
  'timestamp': 10000,
  'x1': 563.939839,
  'y1': 324.000165,
  'x2': 573.733294,
  'y2': 348.483804,
  'occluded': False,
  'truncated': False,
  'trafficLightColor': 'none',
  'weather': 'clear',
  'scene': 'highway',
  'timeofday': 'dawn/dusk'},
 {'category': 'person',
  'id': 16,
  'name': '0000f77c-62c2a288.jpg',
  'timestamp': 10000,
  'x1': 656.977672,
  'y1': 315.838956,
  'x2': 666.771127,
  'y2': 345.219323,
  'occluded': True,
  'truncated': False,
  'trafficLightColor': 'none',
  'weather': 'clear',
  'scene': 'highway',
  'timeofday': 'dawn/dusk'},
 {'category': 'person',
  'id': 60,
  'na

## Outputs the dataset

In [15]:
# Deletes unused columns
def formats_data(label):
    del label['category']
    del label['name']
    del label['occluded']
    del label['truncated']
    del label['weather']
    del label['scene']
    del label['timeofday']
    
    # returns the transformed data
    return label

In [16]:
output_data = transform_data(formats_data, cleaned_data)

In [17]:
output_data

[{'id': 14,
  'timestamp': 10000,
  'x1': 423.158909,
  'y1': 336.241987,
  'x2': 434.176547,
  'y2': 358.277263,
  'trafficLightColor': 'none'},
 {'id': 15,
  'timestamp': 10000,
  'x1': 563.939839,
  'y1': 324.000165,
  'x2': 573.733294,
  'y2': 348.483804,
  'trafficLightColor': 'none'},
 {'id': 16,
  'timestamp': 10000,
  'x1': 656.977672,
  'y1': 315.838956,
  'x2': 666.771127,
  'y2': 345.219323,
  'trafficLightColor': 'none'},
 {'id': 60,
  'timestamp': 10000,
  'x1': 695.316564,
  'y1': 343.179158,
  'x2': 712.961393,
  'y2': 386.140482,
  'trafficLightColor': 'none'},
 {'id': 133,
  'timestamp': 10000,
  'x1': 912.122852,
  'y1': 350.279126,
  'x2': 949.046862,
  'y2': 436.1025,
  'trafficLightColor': 'none'},
 {'id': 206,
  'timestamp': 10000,
  'x1': 199.056526,
  'y1': 335.816133,
  'x2': 316.518755,
  'y2': 604.510982,
  'trafficLightColor': 'none'},
 {'id': 207,
  'timestamp': 10000,
  'x1': 727.531679,
  'y1': 390.56191,
  'x2': 745.151014,
  'y2': 444.888191,
  'traffic

In [18]:
import pandas as pd

output = pd.DataFrame(output_data)

In [19]:
output

Unnamed: 0,id,timestamp,trafficLightColor,x1,x2,y1,y2
0,14,10000,none,423.158909,434.176547,336.241987,358.277263
1,15,10000,none,563.939839,573.733294,324.000165,348.483804
2,16,10000,none,656.977672,666.771127,315.838956,345.219323
3,60,10000,none,695.316564,712.961393,343.179158,386.140482
4,133,10000,none,912.122852,949.046862,350.279126,436.102500
5,206,10000,none,199.056526,316.518755,335.816133,604.510982
6,207,10000,none,727.531679,745.151014,390.561910,444.888191
7,208,10000,none,862.613240,877.296017,384.688798,433.141967
8,209,10000,none,884.637407,896.383632,383.220520,419.927465
9,210,10000,none,872.891186,890.510521,384.688798,434.610245


In [20]:
output.to_csv(dataset_filename + '.output.csv')