In [None]:
import hub
import numpy as np
import boto3
import os
from tqdm import tqdm
from pycocotools.coco import COCO
from PIL import Image

## Upload COCO-train with Bounding Boxes Only

#### COCO will be uploaded using linked tensors: instead of copying the image data into Hub format, the Hub dataset stores references to the S3 URLs where the images are stored.

### Define the path to the bucket with the source data and create a new hub dataset

In [None]:
# Name of the S3 bucket that holds the source (non-hub) data
dataset_bucket = 'non-hub-datasets'

# Open an S3 resource; the access key pair is read from environment variables
s3 = boto3.resource(
    's3',
    aws_access_key_id=os.environ.get('aws_access_key_id'),
    aws_secret_access_key=os.environ.get('aws_secret_access_key'),
)

# Handle used below to download files from the source bucket
s3_bucket = s3.Bucket(dataset_bucket)

In [None]:
# Copy the annotation file from the bucket to local disk for easier processing
ann_path = 'coco/annotations/instances_train2017.json'
local_ann_path = 'anns_train.json'
s3_bucket.download_file(ann_path, local_ann_path)

# Parse the local annotation file and load the record of every category id
coco = COCO(local_ann_path)
category_info = coco.loadCats(coco.getCatIds())

In [None]:
# Create an empty hub dataset (the token string is a placeholder to be replaced)
ds = hub.empty(
    'hub://dl-corp/coco-train',
    token='Insert API Token',
)

# Register a managed credentials key on the dataset; the same key name is
# passed to the upload function for the linked images
creds_name = "my_s3_creds"
ds.add_creds_key(creds_name, managed=True)

### Define the dataset tensors and create the parallel uploading function

In [None]:
# Collect the name of every category, in the order given by category_info
category_names = [cat_record['name'] for cat_record in category_info]

# Image ids to upload, in ascending order
img_ids = list(coco.getImgIds())
img_ids.sort()

# Define the dataset tensors: linked images, bounding boxes and class labels
with ds:
    ds.create_tensor('images', htype='link[image]')
    ds.create_tensor('boxes', htype='bbox')
    ds.create_tensor(
        'categories',
        htype='class_label',
        class_names=category_names,
    )

In [None]:
# Parallel uploading function decorated with @hub.compute
@hub.compute
def coco_2_hub(img_id, sample_out, coco_api, bucket, creds_key):
    """Append one COCO image, as a linked sample with boxes and labels, to ``sample_out``.

    Args:
        img_id: COCO image id whose annotations are converted.
        sample_out: Output sample collection supplied by ``hub.compute``; one
            sample is appended to it per call.
        coco_api: Object exposing ``getAnnIds``, ``loadAnns`` and ``loadImgs``.
        bucket: Name of the S3 bucket; the image URL is built as
            ``s3://<bucket>/coco/train2017/<file_name>``.
        creds_key: Credentials key passed to ``hub.link`` for the image URL.

    Returns:
        ``sample_out`` with the new sample appended.

    Raises:
        KeyError: If an annotation's ``category_id`` is not present in the
            module-level ``category_info``.

    Note:
        Reads the module-level ``category_info`` and ``category_names``.
    """
    anns = coco_api.loadAnns(coco_api.getAnnIds(img_id))
    img_coco = coco_api.loadImgs(img_id)[0]

    # Build the category-id -> name lookup once per sample instead of scanning
    # category_info for every annotation. setdefault keeps the first record
    # for an id, matching a first-match linear scan.
    name_by_cat_id = {}
    for category in category_info:
        name_by_cat_id.setdefault(category['id'], category['name'])

    # Allocate the arrays directly in the dtypes that are stored
    boxes = np.zeros((len(anns), 4), dtype='float32')
    categories = np.zeros(len(anns), dtype='uint32')

    # Populate the arrays with the annotation data
    for row, ann in enumerate(anns):
        boxes[row, :] = ann['bbox']
        # The stored label is the position of the category name in category_names
        categories[row] = category_names.index(name_by_cat_id[ann['category_id']])

    img_url = "s3://{}/coco/train2017/{}".format(bucket, img_coco['file_name'])

    # Append the sample only after all the annotations have been parsed
    sample_out.append({"images": hub.link(img_url, creds_key=creds_key),
                       "boxes": boxes,
                       "categories": categories})

    return sample_out

### Run the parallel uploading function and commit the dataset

In [None]:
# Run the uploading function over every image id, using 8 workers
coco_2_hub(
    coco_api=coco,
    bucket=dataset_bucket,
    creds_key=creds_name,
).eval(img_ids, ds, num_workers=8)

In [None]:
# Commit the dataset with a descriptive message
ds.commit('Uploaded the dataset')