# Dataset Creation with nuImages devkit.

### Mounting Google Drive (optional; requires a lot of disk space)

In [None]:
# Mount Google Drive into the Colab filesystem, then switch the working
# directory to the project folder so the relative paths used by the later
# cells resolve inside it.
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive/nuImages-yolo

## Downloading and Installing Dependencies

In [None]:
# Fetch the nuImages v1.0 samples and metadata archives into the current
# working directory and unpack both under ./data/sets/nuimages.
# The samples archive is large (the recorded run below reports about 15G).
%mkdir -p ./data/sets/nuimages  # Make the directory to store the nuImages dataset in.

!wget https://d36yt3mvayqw5m.cloudfront.net/public/nuimages-v1.0/nuimages-v1.0-all-samples.tgz # Download samples.

!tar -xf nuimages-v1.0-all-samples.tgz -C ./data/sets/nuimages  # Uncompress.

!wget https://d36yt3mvayqw5m.cloudfront.net/public/nuimages-v1.0/nuimages-v1.0-all-metadata.tgz # Download metadata

!tar -xf nuimages-v1.0-all-metadata.tgz -C ./data/sets/nuimages  # Uncompress.

--2024-05-24 17:03:28--  https://d36yt3mvayqw5m.cloudfront.net/public/nuimages-v1.0/nuimages-v1.0-all-samples.tgz
Resolving d36yt3mvayqw5m.cloudfront.net (d36yt3mvayqw5m.cloudfront.net)... 108.156.78.96, 108.156.78.61, 108.156.78.205, ...
Connecting to d36yt3mvayqw5m.cloudfront.net (d36yt3mvayqw5m.cloudfront.net)|108.156.78.96|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16381401772 (15G) [application/x-tar]
Saving to: ‘nuimages-v1.0-all-samples.tgz’


2024-05-24 17:16:15 (20.4 MB/s) - ‘nuimages-v1.0-all-samples.tgz’ saved [16381401772/16381401772]

--2024-05-24 17:21:14--  https://d36yt3mvayqw5m.cloudfront.net/public/nuimages-v1.0/nuimages-v1.0-all-metadata.tgz
Resolving d36yt3mvayqw5m.cloudfront.net (d36yt3mvayqw5m.cloudfront.net)... 108.156.78.205, 108.156.78.96, 108.156.78.61, ...
Connecting to d36yt3mvayqw5m.cloudfront.net (d36yt3mvayqw5m.cloudfront.net)|108.156.78.205|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 639386088 

## Initialization

In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os
import json
import yaml
from shutil import copyfile

In [None]:
# Notebook-wide settings read by the cells below.
root = './data/sets/nuimages/' # directory of the downloaded data; metadata tables are read from <root>/v1.0-<split>/
split = 'test' # the split of the dataset. Can be train/val/test; also names the images/labels output subfolder
path = './datasets/nuImages' # the root directory of the created dataset (images/ and labels/ are written below it)

In [None]:
# Read each metadata table needed for the chosen split from its JSON file.

table_names = ('sample_data', 'object_ann', 'category')
tables = dict.fromkeys(table_names)  # every table starts out as None
for name in table_names:
  table_path = os.path.join(root, f'v1.0-{split}', f'{name}.json')
  with open(table_path) as table_file:
    tables[name] = json.load(table_file)

## Obtaining the classes

In [None]:
class_indices = {}  # category name -> integer class index (filled by the category loop)
classes = {}  # category token -> integer class index (filled by the category loop)
c = 0  # NOTE(review): not referenced by any other visible cell — confirm before removing

In [None]:
def get_class(category_token):
  """Return the name of the category identified by `category_token`.

  Scans the module-level `tables['category']` list for the entry whose
  'token' equals `category_token`.

  Raises:
    KeyError: if no category entry has that token; the exception carries
      the token that was looked up.
  """
  for category in tables['category']:
    if category['token'] == category_token:
      return category['name']
  # Include the missing token so a failed lookup can be diagnosed.
  raise KeyError(category_token)

In [None]:
# Give every category a zero-based index, recorded both by name and by token.
cat_count = len(tables['category'])
for i, category in enumerate(tables['category']):
  print('category: {}/{}'.format(i + 1, cat_count))
  class_indices.update({category['name']: i})
  classes.update({category['token']: i})

## Creating the samples.

In [None]:
samples = {}  # sample token -> {'filename': ..., 'label': ...}, filled by the cells below
start = 0  # index (among key frames) of the first sample to process
sample_count = 8000  # maximum number of key frames to process, counted from `start`

In [None]:
def get_filename(data_token):
  """Return the 'filename' of the sample_data entry identified by `data_token`.

  Scans the module-level `tables['sample_data']` list for the entry whose
  'token' equals `data_token`.

  Raises:
    KeyError: if no sample_data entry has that token; the exception carries
      the token that was looked up.
  """
  for sample in tables['sample_data']:
    if sample['token'] == data_token:
      return sample['filename']
  # Include the missing token so a failed lookup can be diagnosed.
  raise KeyError(data_token)


def get_class(category_token):
  """Return the name of the category identified by `category_token`.

  Scans the module-level `tables['category']` list for the entry whose
  'token' equals `category_token`.

  Raises:
    KeyError: if no category entry has that token; the exception carries
      the token that was looked up.
  """
  for category in tables['category']:
    if category['token'] == category_token:
      return category['name']
  # Include the missing token so a failed lookup can be diagnosed.
  raise KeyError(category_token)


def relative_bbox(raw_bbox, size):
  """Convert a corner-format box into a normalised centre/size box.

  Args:
    raw_bbox: sequence read as [x_min, y_min, x_max, y_max].
    size: sequence whose element 0 scales the y values and whose element 1
      scales the x values (i.e. it is used as [height, width]).

  Returns:
    [x_center, y_center, width, height], each divided by the matching image
    dimension and rounded to 6 decimal places.

  Raises:
    ValueError: if either centre is greater than 1 or either side length is
      negative. The message reports the offending box and image size.
  """
  img_height, img_width = size[0], size[1]
  x_center = round((raw_bbox[0] + raw_bbox[2]) / (2 * img_width), 6)
  y_center = round((raw_bbox[1] + raw_bbox[3]) / (2 * img_height), 6)
  width = round((raw_bbox[2] - raw_bbox[0]) / img_width, 6)
  height = round((raw_bbox[3] - raw_bbox[1]) / img_height, 6)
  if x_center > 1 or y_center > 1 or width < 0 or height < 0:
    # Say which box failed and what it normalised to, instead of raising an
    # empty exception.
    raise ValueError(
        f'bounding box {list(raw_bbox)} is invalid for image size {list(size)}: '
        f'normalised to {[x_center, y_center, width, height]}')
  return [x_center, y_center, width, height]

In [None]:
# create the dataset dictionary

# Group the annotations by the frame they belong to in one pass, so that each
# frame's objects are a dictionary lookup instead of a scan of the whole
# object_ann table. Annotation order within a frame is preserved.
objects_by_sample = {}
for obj in tables['object_ann']:
  objects_by_sample.setdefault(obj['sample_data_token'], []).append(obj)

# Only key frames are kept; the slice then selects the window
# [start, start + sample_count) of them.
sample_data_list = list(filter(lambda s: s['is_key_frame'], tables['sample_data']))
selected_samples = sample_data_list[start:start+sample_count]
for i, sample in enumerate(selected_samples):
  # Report progress against the number of frames actually selected, which can
  # be smaller than sample_count when the list runs out.
  print(f'sample {i+1}/{len(selected_samples)}')
  token = sample['token']
  objects = objects_by_sample.get(token, [])
  labels = []
  for obj in objects:
    # Use the annotation's own mask size when it has one, else [900, 1600].
    size = obj['mask']['size'] if obj.get('mask', None) else [900, 1600]
    bbox = relative_bbox(obj['bbox'], size)
    obj_class = classes[obj['category_token']]
    # One label line: class index followed by the four normalised box values.
    obj_label = ' '.join(str(x) for x in [obj_class] + bbox)
    labels.append(obj_label)
  # A frame without annotations gets an empty label string.
  samples[token] = {
      'filename': sample['filename'],
      'label': '\n'.join(labels),
  }


In [None]:
# save the samples to the files

# Both output directories depend only on `path` and `split`, so they are
# created once up front; exist_ok=True keeps the cell safe to re-run.
data_new_path = os.path.join(path, 'images', split)
label_path = os.path.join(path, 'labels', split)
os.makedirs(data_new_path, exist_ok=True)
os.makedirs(label_path, exist_ok=True)

for i, (token, sample) in enumerate(samples.items()):
  # Progress is reported against the number of samples actually collected.
  print(f'sample {i + 1}/{len(samples)}')
  data_full_path = os.path.join(root, sample['filename'])
  # The image copy and its label file share the sample token as file stem.
  copyfile(data_full_path, os.path.join(data_new_path, f'{token}.jpg'))
  with open(os.path.join(label_path, f'{token}.txt'), 'w') as label_file:
    label_file.write(sample['label'])

In [None]:
# Pack the generated ./datasets/ tree into a gzip-compressed tar archive.
!tar -czvf nuImages-val-2.tar.gz ./datasets/

In [None]:
# Recursively delete the generated ./datasets tree without prompting.
!rm -rf ./datasets

## YAML Creation

In [None]:
# Run this if the previous processes were not executed in the current runtime.
# `path` must be assigned before it is used, and exist_ok=True keeps the cell
# from failing when the directory is already present.
path = './datasets/nuImages'
os.makedirs(path, exist_ok=True)

In [None]:
def create_coco_yaml(class_dict, file_name, dataset_path=None):
    """Write a dataset description YAML file.

    The file holds four keys: 'path' (dataset root), 'train' and 'val'
    (image folders, given as 'images/train' and 'images/val') and 'names'
    (class index -> class name).
    NOTE(review): this key layout looks like a YOLO-style dataset YAML rather
    than COCO — confirm against the training tool that consumes it.

    Args:
        class_dict: mapping of class name -> class index; it is inverted to
            build the 'names' entry.
        file_name: destination path of the YAML file.
        dataset_path: value stored under 'path'. When omitted, the
            module-level `path` variable is used, as before.

    Errors raised while opening or dumping the file are caught and reported
    with a printed message; nothing is re-raised.
    """
    # Fall back to the notebook-level setting only when no root was passed in.
    if dataset_path is None:
        dataset_path = path

    # Invert name -> index into index -> name for the 'names' entry.
    categories = {idx: name for name, idx in class_dict.items()}

    coco_dict = {
        'path': dataset_path,
        'train': 'images/train',
        'val': 'images/val',
        'names': categories
    }

    try:
        with open(file_name, 'w') as yaml_file:
            yaml.dump(coco_dict, yaml_file, default_flow_style=False)
        print(f"COCO dataset YAML successfully written to {file_name}")
    except Exception as e:
        print(f"Error writing COCO dataset to YAML file: {e}")

In [None]:
# Write the class-name -> index mapping to data.yaml inside the dataset root.
create_coco_yaml(class_indices, os.path.join(path, 'data.yaml'))

COCO dataset YAML successfully written to /datasets/nuImages/data.yaml


In [None]:
# Pack the datasets directory into a gzip-compressed tar archive.
# NOTE(review): this packs ../datasets/ while `path` points at
# ./datasets/nuImages and the other archive cells pack ./datasets/ — confirm
# which directory is intended.
!tar -czvf nuImages-yaml.tar.gz ../datasets/

## The Test Set

In [None]:
samples = {}  # start from an empty sample dictionary for this section
start = 0  # NOTE(review): not read by the visible cells of this section — confirm it is still needed

9752

In [None]:
 # run this if training set procedure was not completed in the current runtime

 def get_filename(data_token):
  """Return the 'filename' of the sample_data entry identified by `data_token`.

  Scans the module-level `tables['sample_data']` list for the entry whose
  'token' equals `data_token`.

  Raises:
    KeyError: if no sample_data entry has that token; the exception carries
      the token that was looked up.
  """
  for sample in tables['sample_data']:
    if sample['token'] == data_token:
      return sample['filename']
  # Include the missing token so a failed lookup can be diagnosed.
  raise KeyError(data_token)

In [None]:
# Collect every key frame of the loaded split; no labels are attached here,
# so each entry stores an empty label list.
sample_data_list = list(filter(lambda s: s['is_key_frame'], tables['sample_data']))
for i, sample in enumerate(sample_data_list):
  # Progress uses the real number of key frames, so this cell does not depend
  # on `sample_count` having been defined by an earlier section.
  print(f'sample {i+1}/{len(sample_data_list)}')
  token = sample['token']
  samples[token] = {'filename': sample['filename'], 'label': []}


# The image directory depends only on `path` and `split`, so it is created
# once; exist_ok=True keeps the cell safe to re-run.
data_new_path = os.path.join(path, 'images', split)
os.makedirs(data_new_path, exist_ok=True)
for i, (token, sample) in enumerate(samples.items()):
  print(f'sample {i + 1}/{len(samples)}')
  data_full_path = os.path.join(root, sample['filename'])
  # Each image is copied under its sample token as the file name.
  copyfile(data_full_path, os.path.join(data_new_path, f'{token}.jpg'))

In [None]:
# Pack the generated ./datasets/ tree into a gzip-compressed tar archive.
!tar -czvf nuImages-test.tar.gz ./datasets/