# Dataset Creation with nuImages devkit.

### Mounting Google Drive (Optional, Requires lots of disk space)

In [None]:
%pip install google-colab

In [None]:
# Mount Google Drive at /content/drive/ (only available inside Google Colab).
from google.colab import drive
drive.mount('/content/drive/')
# Make the project folder on the mounted drive the working directory; the
# relative home_path ('./') defined further down refers to this directory.
%cd /content/drive/MyDrive/nuImages-yolo

## Initialization

In [None]:
%pip install wget
%pip install pyyaml

In [None]:
import os
import wget
import tarfile
import json
import yaml
from shutil import copyfile, rmtree

In [None]:
# Root folder for everything this notebook creates ('./' = working directory).
home_path = './'
# NOTE: the sub-paths must stay relative. os.path.join() discards all earlier
# components as soon as it meets an absolute one, so a leading '/' would
# silently place these folders at the filesystem root instead of home_path.
raw_path = os.path.join(home_path, 'data/raw')  # downloaded archives
data_path = os.path.join(home_path, 'data/sets/nuimages/')  # extracted dataset
dataset_path = os.path.join(home_path, 'datasets/nuImages')  # generated output
# exist_ok=True makes the cell safe to re-run without a separate exists() test.
for folder in (raw_path, data_path, dataset_path):
    os.makedirs(folder, exist_ok=True)

## Downloading and Extracting the Dataset

In [None]:
# Download the sample and the metadata archive of nuImages v1.0 into raw_path.
archive_urls = (
    'https://d36yt3mvayqw5m.cloudfront.net/public/nuimages-v1.0/nuimages-v1.0-all-samples.tgz',
    'https://d36yt3mvayqw5m.cloudfront.net/public/nuimages-v1.0/nuimages-v1.0-all-metadata.tgz',
)
for archive_url in archive_urls:
    wget.download(archive_url, out=raw_path)

In [None]:
# Unpack every archive found in raw_path into data_path and delete each
# archive as soon as it has been extracted.
for archive_name in os.listdir(raw_path):
    archive_file = os.path.join(raw_path, archive_name)
    with tarfile.open(archive_file) as archive:
        archive.extractall(path=data_path)
    os.remove(archive_file)

## Loading Dataset Tables

In [None]:
# Dataset split to process. The value selects the `v1.0-<split>` metadata
# folder that is read and names the images/labels sub-folders that are written.
split = 'test' # Can be train/val/test

In [None]:
# Read the three metadata tables of the selected split from their JSON files.
tables = dict.fromkeys(('sample_data', 'object_ann', 'category'))
for name in tables:
    table_json = os.path.join(data_path, f'v1.0-{split}', f'{name}.json')
    with open(table_json) as table_file:
        tables[name] = json.load(table_file)

## Obtaining the Object Classes

In [None]:
# Lookup tables filled by the class registration loop:
# class name -> class index, and category token -> class index.
class_indices, classes = {}, {}
c = 0  # initialised here; not referenced by any other visible cell

In [None]:
# Get class name given the token
def get_class(category_token):
    """Return the name of the category whose token equals *category_token*.

    ``tables['category']`` is scanned in order and the first match wins;
    ``None`` is returned when no category carries that token.
    """
    matching_names = (
        entry['name']
        for entry in tables['category']
        if entry['token'] == category_token
    )
    return next(matching_names, None)

In [None]:
# Register every category: its position in the category table becomes the
# class index, stored both by category name and by category token.
category_table = tables['category']
cat_count = len(category_table)
for i, category in enumerate(category_table):
    name = category['name']
    print(f'category: {i+1}/{cat_count} name: {name}')
    class_indices[name] = classes[category['token']] = i

## Creating the Samples

In [None]:
# Converted samples, keyed by sample_data token.
samples = dict()
# Window of key frames to convert: first index and number of samples.
# Moving `start` allows the dataset to be processed in several parts.
start, sample_count = 0, 8000

In [None]:
# Get the filename given the input token
def get_filename(data_token):
    """Return the ``filename`` of the sample_data record with *data_token*.

    ``tables['sample_data']`` is scanned in order and the filename of the
    first record whose token matches is returned. ``None`` is returned when
    no record matches, which is the same convention ``get_class`` uses.
    """
    for sample in tables['sample_data']:
        if sample['token'] == data_token:
            # The return must sit inside the match test; at loop level it is
            # an indentation error that keeps the whole cell from running.
            return sample['filename']
    return None

# Compute the Normalized (Relative) bbox
def relative_bbox(raw_bbox, size):
    """Convert a corner-style box into a normalized centre/size box.

    Args:
        raw_bbox: Sequence used as ``[x_min, y_min, x_max, y_max]``
            (elements 0/2 are combined on the x axis, 1/3 on the y axis).
        size: Sequence used as ``[height, width]``: element 0 divides the
            y values and element 1 divides the x values.

    Returns:
        ``[x_center, y_center, width, height]``, each divided by the matching
        image dimension and rounded to 6 decimal places.

    Raises:
        ValueError: if a centre is greater than 1 or the width/height is
            negative, i.e. the box does not fit the given image size or its
            corners are swapped.
    """
    x_center = round((raw_bbox[0] + raw_bbox[2]) / (2 * size[1]), 6)
    y_center = round((raw_bbox[1] + raw_bbox[3]) / (2 * size[0]), 6)
    width = round((raw_bbox[2] - raw_bbox[0]) / size[1], 6)
    height = round((raw_bbox[3] - raw_bbox[1]) / size[0], 6)
    if x_center > 1 or y_center > 1 or width < 0 or height < 0:
        # Name the offending values so a bad annotation can be tracked down.
        raise ValueError(
            f'invalid bbox {raw_bbox} for image size {size}: normalized '
            f'x_center={x_center}, y_center={y_center}, '
            f'width={width}, height={height}'
        )
    return [x_center, y_center, width, height]

In [None]:
# Create the dataset dictionary

# Only key frames are converted; `start` and `sample_count` select the window
# of key frames handled in this run.
sample_data_list = [s for s in tables['sample_data'] if s['is_key_frame']]
sample_data_list = sample_data_list[start:start+sample_count]
sample_count = len(sample_data_list)

# Group the annotations of the selected samples by sample token in a single
# pass. Filtering the complete object_ann table once per sample would cost
# (number of samples) x (number of annotations) comparisons.
selected_tokens = {sample['token'] for sample in sample_data_list}
objects_by_token = {}
for obj in tables['object_ann']:
    owner_token = obj['sample_data_token']
    if owner_token in selected_tokens:
        objects_by_token.setdefault(owner_token, []).append(obj)

for i, sample in enumerate(sample_data_list):
    # Every entry is already a key frame (filtered above), so no further
    # per-sample check is required.
    print(f'sample {i+1}/{sample_count}')
    token = sample['token']
    labels = []
    for obj in objects_by_token.get(token, []):
        # Image size as [height, width]: taken from the mask when the
        # annotation has one, otherwise 900 x 1600 is assumed.
        size = obj['mask']['size'] if obj.get('mask', None) else [900, 1600]
        bbox = relative_bbox(obj['bbox'], size)
        obj_class = classes[obj['category_token']]
        # One text line per object: "<class> <x_center> <y_center> <w> <h>".
        labels.append(' '.join(str(x) for x in [obj_class] + bbox))
    samples[token] = {
        'filename': sample['filename'],
        'label': '\n'.join(labels),
    }


In [None]:
# Save the samples to the files

# The target folders are the same for every sample, so create them once
# instead of testing for them on each iteration.
data_new_path = os.path.join(dataset_path, 'images', split)
label_path = os.path.join(dataset_path, 'labels', split)
os.makedirs(data_new_path, exist_ok=True)
os.makedirs(label_path, exist_ok=True)

# Count what is actually written; `samples` may hold more entries than the
# last window if the creation cell was run several times.
total_samples = len(samples)
for i, (token, sample) in enumerate(samples.items()):
    print(f'sample {i + 1}/{total_samples}')
    data_full_path = os.path.join(data_path, sample['filename'])
    # Image and label file share the sample token as their base name.
    copyfile(data_full_path, os.path.join(data_new_path, f'{token}.jpg'))
    with open(os.path.join(label_path, f'{token}.txt'), 'w') as label_file:
        label_file.write(sample['label'])

In [None]:
# Optionally, archive the obtained samples and delete the folder to repeat the process.

# The sub-path must stay relative: os.path.join() drops home_path as soon as
# the following component starts with '/'.
archive_path = os.path.join(home_path, 'data/archives/')
os.makedirs(archive_path, exist_ok=True)
archive_name = 'nuImages-train-1.tar.gz'
with tarfile.open(os.path.join(archive_path, archive_name), "w:gz") as archive:
    # Store the dataset under its folder name instead of its full path.
    archive.add(dataset_path, arcname=os.path.basename(dataset_path))

In [None]:
def remove(rm_path):
    """Delete *rm_path*, whether it is a file, a symbolic link or a directory.

    Files and links are unlinked; a directory is removed together with
    everything it contains.

    Raises:
        ValueError: if *rm_path* is neither a file, a link nor a directory
            (for instance because it does not exist).
    """
    is_file_like = os.path.isfile(rm_path) or os.path.islink(rm_path)
    if not is_file_like and not os.path.isdir(rm_path):
        raise ValueError(f'file {rm_path} is not a file or dir.')
    if is_file_like:
        os.remove(rm_path)  # plain file or link
        return
    rmtree(rm_path)  # directory including its content
    
# Delete the generated dataset folder together with everything inside it.
remove(dataset_path)

## YAML Creation

In [None]:
# Create COCO format yaml file for the dataset 

def create_coco_yaml(class_dict, file_name):
    """Write the YAML file that describes the generated dataset.

    The file contains the dataset root (the module-level ``dataset_path``),
    the relative image folders of the train and val splits, and the
    class index -> class name mapping.

    Args:
        class_dict: Mapping of class name -> class index; it is inverted to
            index -> name for the ``names`` entry.
        file_name: Path of the YAML file to create. Missing parent folders
            are created.

    Returns:
        None. I/O and YAML errors are reported on stdout, not raised.
    """
    categories = {idx: name for name, idx in class_dict.items()}
    coco_dict = {
        'path': dataset_path,
        'train': 'images/train',
        'val': 'images/val',
        'names': categories
    }
    try:
        # The target folder may be missing (the optional cleanup cell deletes
        # dataset_path), so make sure it exists before opening the file.
        parent_dir = os.path.dirname(file_name)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(file_name, 'w') as yaml_file:
            yaml.dump(coco_dict, yaml_file, default_flow_style=False)
    except (OSError, yaml.YAMLError) as e:
        # Best effort: report file-system and serialisation problems and
        # continue; other exceptions indicate a bug and propagate.
        print(f"Error writing COCO dataset to YAML file: {e}")
    else:
        print(f"COCO dataset YAML successfully written to {file_name}")

In [None]:
# Write data.yaml into the dataset folder from the class name -> index mapping.
create_coco_yaml(class_indices, os.path.join(dataset_path, 'data.yaml'))

In [None]:
# Archive the yaml file

# The whole dataset folder (which holds data.yaml) is packed into a
# gzip-compressed tarball inside archive_path, stored under its folder name.
archive_name = 'nuImages-yaml.tar.gz'
yaml_archive_file = os.path.join(archive_path, archive_name)
dataset_folder_name = os.path.basename(dataset_path)
with tarfile.open(yaml_archive_file, "w:gz") as archive:
    archive.add(dataset_path, arcname=dataset_folder_name)