## Install Hub, Coco API

In [None]:
!pip install 'git+https://github.com/cocodataset/cocoapi.git#subdirectory=PythonAPI'
!pip install hub

In [None]:
!pip install git+https://github.com/activeloopai/Hub

## Download and Unzip COCO Data

In [None]:
# ONLY RUN ONCE
# Create the target folders. -p also creates ./Datasets when it is missing and
# does not fail when the folders already exist.
!mkdir -p ./Datasets/coco/annotations

# Download the image archives
!wget -P ./Datasets/coco http://images.cocodataset.org/zips/train2017.zip
!wget -P ./Datasets/coco http://images.cocodataset.org/zips/val2017.zip
!wget -P ./Datasets/coco http://images.cocodataset.org/zips/test2017.zip
!wget -P ./Datasets/coco http://images.cocodataset.org/zips/unlabeled2017.zip

# Download the annotation archives
!wget -P ./Datasets/coco http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!wget -P ./Datasets/coco http://images.cocodataset.org/annotations/stuff_annotations_trainval2017.zip
!wget -P ./Datasets/coco http://images.cocodataset.org/annotations/image_info_test2017.zip
!wget -P ./Datasets/coco http://images.cocodataset.org/annotations/image_info_unlabeled2017.zip

# Extract the images
!unzip -q ./Datasets/coco/train2017.zip -d ./Datasets/coco
!unzip -q ./Datasets/coco/val2017.zip -d ./Datasets/coco
!unzip -q ./Datasets/coco/test2017.zip -d ./Datasets/coco
!unzip -q ./Datasets/coco/unlabeled2017.zip -d ./Datasets/coco

# Extract the annotations
!unzip -q ./Datasets/coco/annotations_trainval2017.zip -d ./Datasets/coco
!unzip -q ./Datasets/coco/stuff_annotations_trainval2017.zip -d ./Datasets/coco
!unzip -q ./Datasets/coco/image_info_test2017.zip -d ./Datasets/coco
!unzip -q ./Datasets/coco/image_info_unlabeled2017.zip -d ./Datasets/coco

# The pixel-map archives are not downloaded directly; presumably they are
# unpacked into annotations/ by the stuff_annotations step above.
!unzip -q ./Datasets/coco/annotations/stuff_val2017_pixelmaps.zip -d ./Datasets/coco/annotations
!unzip -q ./Datasets/coco/annotations/stuff_train2017_pixelmaps.zip -d ./Datasets/coco/annotations

# Remove the downloaded archives once everything is extracted
!rm -r ./Datasets/coco/train2017.zip
!rm -r ./Datasets/coco/val2017.zip
!rm -r ./Datasets/coco/test2017.zip
!rm -r ./Datasets/coco/unlabeled2017.zip
!rm -r ./Datasets/coco/stuff_annotations_trainval2017.zip
!rm -r ./Datasets/coco/image_info_unlabeled2017.zip
!rm -r ./Datasets/coco/image_info_test2017.zip
!rm -r ./Datasets/coco/annotations_trainval2017.zip

## Import Dataset To Hub

In [1]:
# Switch off Activeloop reporting before hub is imported.
# NOTE(review): BUGGER_OFF presumably disables Hub's usage/crash reporting - confirm against the Hub docs.
%env BUGGER_OFF=true
!activeloop reporting --off
import hub
import numpy as np
import os
from pycocotools.coco import COCO
from PIL import Image
import time
from tqdm import tqdm

env: BUGGER_OFF=true
Traceback (most recent call last):
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/bin/activeloop", line 5, in <module>
    from hub.cli.commands import cli
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/hub/__init__.py", line 26, in <module>
    from .api.dataset import dataset
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/hub/api/dataset.py", line 6, in <module>
    from hub.auto.unstructured.image_classification import ImageClassification
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/hub/auto/unstructured/image_classification.py", line 14, in <module>
    from hub.core.dataset import Dataset
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/hub/core/dataset/__init__.py", line 1, in <module>
    from .dataset import Dataset
  File "/home/ec2-user/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-package

ModuleNotFoundError: No module named 'hub.core.tiling'

### User Inputs

In [None]:
# Folder that holds the unzipped COCO images and the annotations/ sub-folder
data_dir = './Datasets/coco'

# Split to import. Valid choices are 'train' and 'val'. Testing is a special case at the end of the notebook
data_type = 'val'
if data_type not in ('train', 'val'):
    # Fail here with a clear message instead of a missing-file error further down
    raise ValueError("data_type must be 'train' or 'val', got {!r}".format(data_type))

# Destination of the Hub dataset. For Activeloop storage use e.g. 'hub://my_workspace/coco_{}'.format(data_type)
hub_path = './Datasets/coco_local_{}'.format(data_type)

# Limit the number of images; the default is large enough to import everything
limit = 1e10

### Load Annotations

In [None]:
# Folder with the images of the selected split
img_root = '{}/{}2017/'.format(data_dir, data_type)

# Paths of the three annotation files: object instances, person keypoints, stuff
ann_file = '{}/annotations/instances_{}2017.json'.format(data_dir, data_type)
ann_file_kp = '{}/annotations/person_keypoints_{}2017.json'.format(data_dir, data_type)
ann_file_stuff = '{}/annotations/stuff_{}2017.json'.format(data_dir, data_type)

# One COCO index per annotation file
coco, coco_kp, coco_stuff = COCO(ann_file), COCO(ann_file_kp), COCO(ann_file_stuff)

# Category records (dicts) of every category defined in each file
category_info = coco.loadCats(coco.getCatIds())
category_info_kp = coco_kp.loadCats(coco_kp.getCatIds())
category_info_stuff = coco_stuff.loadCats(coco_stuff.getCatIds())

### Create hub dataset

In [None]:
# Login to activeloop if using Activeloop Storage (hub://.....)
# 'username' and 'password' are placeholders to be replaced with your own account.
# NOTE(review): avoid saving real credentials in the notebook - clear this cell before sharing or committing.
!activeloop login -u 'username' -p 'password'

In [None]:
# Create an empty Hub dataset at the path chosen in the user inputs (hub_path)
ds = hub.empty(hub_path, overwrite = True) # overwrite = True is set, so running this cell starts over. NOTE(review): presumably this replaces any dataset already at hub_path - confirm

### Create lists for all the class_names

In [None]:
# Class names, in the order in which the categories appear in the annotation files
cat_names = [category['name'] for category in category_info]
cat_names_kp = [category['name'] for category in category_info_kp]
cat_names_stuff = [category['name'] for category in category_info_stuff]

# Unique super-category names. They are sorted because the iteration order of a
# set of strings changes between Python processes; with list(set(...)) the
# numeric super-category labels would differ from run to run (e.g. between the
# 'train' and 'val' imports).
super_cat_names = sorted(set(category['supercategory'] for category in category_info))
super_cat_names_kp = sorted(set(category['supercategory'] for category in category_info_kp))
super_cat_names_stuff = sorted(set(category['supercategory'] for category in category_info_stuff))

### Upload data to Hub dataset

In [None]:
def _label_lookup(cat_info, names, key):
    """Map every COCO category id in cat_info to the position of its `key` value in names.

    cat_info is a list of COCO category dicts, names is the class_names list of
    the target tensor and key is 'name' or 'supercategory'. No assumption is made
    about the relationship between category ids and list positions. If an id
    occurs more than once, the first occurrence wins.
    """
    lookup = {}
    for category in cat_info:
        lookup.setdefault(category['id'], names.index(category[key]))
    return lookup


img_ids = sorted(coco.getImgIds()) # Image ids for uploading
count = 1

# Category id -> numeric label tables, built once instead of searching all categories for every annotation
cat_label = _label_lookup(category_info, cat_names, 'name')
supercat_label = _label_lookup(category_info, super_cat_names, 'supercategory')
cat_label_kp = _label_lookup(category_info_kp, cat_names_kp, 'name')
supercat_label_kp = _label_lookup(category_info_kp, super_cat_names_kp, 'supercategory')
cat_label_stuff = _label_lookup(category_info_stuff, cat_names_stuff, 'name')
supercat_label_stuff = _label_lookup(category_info_stuff, super_cat_names_stuff, 'supercategory')

# Keypoint names and skeleton of the first keypoint category
keypoint_names = category_info_kp[0]['keypoints']
keypoint_connections = category_info_kp[0]['skeleton']
# Every keypoint is stored as 3 values in the annotations (51 values for the 17 COCO keypoints)
num_keypoint_values = 3 * len(keypoint_names)

start_time = time.time()

with ds:

    ## ---- Create Tensors ----- ##

    #Primary Data
    ds.create_tensor('images', htype = 'image', sample_compression = 'jpg')
    ds.create_tensor('images_meta', htype = 'json')
    ds.create_tensor('masks', htype = 'binary_mask', sample_compression = 'lz4')
    ds.create_tensor('boxes', htype = 'bbox')
    ds.create_tensor('categories', htype = 'class_label', class_names = cat_names)
    ds.create_tensor('super_categories', htype = 'class_label', class_names = super_cat_names)
    ds.create_tensor('areas', dtype = 'uint32')
    ds.create_tensor('iscrowds', dtype = 'bool')

    #Pose
    ds.create_group('pose')
    ds.pose.create_tensor('categories', htype = 'class_label', class_names = cat_names_kp)
    ds.pose.create_tensor('super_categories', htype = 'class_label', class_names = super_cat_names_kp)
    ds.pose.create_tensor('boxes', htype = 'bbox')
    ds.pose.create_tensor('keypoints', htype = 'keypoints_coco', dtype = 'int32')
    ds.pose.create_tensor('masks', htype = 'binary_mask', sample_compression = 'lz4')

    #Stuff Segmentation
    ds.create_group('stuff')
    ds.stuff.create_tensor('masks', htype = 'binary_mask', sample_compression = 'lz4')
    ds.stuff.create_tensor('boxes', htype = 'bbox')
    # The stuff labels index into the stuff class lists, so those lists (not the instance ones) are the class_names
    ds.stuff.create_tensor('categories', htype = 'class_label', class_names = cat_names_stuff)
    ds.stuff.create_tensor('super_categories', htype = 'class_label', class_names = super_cat_names_stuff)
    ds.stuff.create_tensor('areas', dtype = 'uint32')
    ds.stuff.create_tensor('iscrowds', dtype = 'bool')

    #Further updates to meta information
    ds.categories.info.update(category_info = category_info, notes = 'Numeric labels for categories represent the position of the class in the ds.categories.info.class_names list, and not the COCO category id.')
    ds.super_categories.info.update(category_info = category_info, notes = 'Numeric labels for super_categories represent the position of the class in the ds.super_categories.info.class_names list, and not the COCO category id.')
    ds.masks.info.update(notes = 'All segmentation polygons and RLEs were converted to stacked binary masks')
    ds.pose.masks.info.update(category_info = category_info_kp, notes = 'All segmentation polygons and RLEs were converted to stacked binary masks')
    ds.pose.keypoints.info.update(keypoints = keypoint_names, connections = keypoint_connections)
    ds.stuff.masks.info.update(category_info = category_info_stuff, notes = 'All segmentation polygons and RLEs were converted to stacked binary masks')

    ## ---- Iterate through each image and upload data ----- ##
    for img_id in img_ids:
        anns = coco.loadAnns(coco.getAnnIds(img_id))
        anns_kp = coco_kp.loadAnns(coco_kp.getAnnIds(img_id))
        anns_stuff = coco_stuff.loadAnns(coco_stuff.getAnnIds(img_id))

        img_coco = coco.loadImgs(img_id)[0]
        img_fn = os.path.join(img_root, img_coco['file_name'])

        # Only the image size is needed here; the context manager closes the file right away
        with Image.open(img_fn) as img:
            width, height = img.size


        #Iterate through annotations and parse each

        #First create empty arrays for all annotations. Masks are allocated as bool
        #directly, which avoids a float64 array of height x width x annotations.
        masks = np.zeros((height, width, len(anns)), dtype = 'bool')
        boxes = np.zeros((len(anns), 4))
        categories = np.zeros((len(anns)))
        supercats = np.zeros((len(anns)))
        areas = np.zeros((len(anns)))
        iscrowds = np.zeros((len(anns)))

        #Then populate the arrays with the annotations data
        for i, ann in enumerate(anns):
            if 'segmentation' in ann:
                masks[:, :, i] = coco.annToMask(ann) #Convert annotation to mask
            else:
                # Checked before the conversion so the warning can actually be shown; the mask stays empty
                print('--- No segmentation found in annotations. ---')
                print('Annotation length: {}'.format(len(anns)))
                print('--- image id: {} ---'.format(img_id))

            boxes[i, :] = ann['bbox']
            categories[i] = cat_label[ann['category_id']]
            supercats[i] = supercat_label[ann['category_id']]
            areas[i] = ann['area']
            iscrowds[i] = ann['iscrowd']

        #Iterate through keypoints and parse each

        categories_kp = np.zeros((len(anns_kp)))
        supercats_kp = np.zeros((len(anns_kp)))
        masks_kp = np.zeros((height, width, len(anns_kp)), dtype = 'bool')
        boxes_kp = np.zeros((len(anns_kp), 4))
        keypoints_kp = np.zeros((num_keypoint_values, len(anns_kp)))

        for j, ann_kp in enumerate(anns_kp):
            categories_kp[j] = cat_label_kp[ann_kp['category_id']]
            supercats_kp[j] = supercat_label_kp[ann_kp['category_id']]
            # Decode with the COCO object the annotation was loaded from
            masks_kp[:, :, j] = coco_kp.annToMask(ann_kp) #Convert annotation to mask
            boxes_kp[j, :] = ann_kp['bbox']
            keypoints_kp[:, j] = np.array(ann_kp['keypoints'])


        #Iterate through stuff and parse each

        masks_stuff = np.zeros((height, width, len(anns_stuff)), dtype = 'bool')
        boxes_stuff = np.zeros((len(anns_stuff), 4))
        categories_stuff = np.zeros((len(anns_stuff)))
        supercats_stuff = np.zeros((len(anns_stuff)))
        areas_stuff = np.zeros((len(anns_stuff)))
        iscrowds_stuff = np.zeros((len(anns_stuff)))

        for k, ann_stuff in enumerate(anns_stuff):
            if 'segmentation' in ann_stuff:
                # Decode with the COCO object the annotation was loaded from
                masks_stuff[:, :, k] = coco_stuff.annToMask(ann_stuff) #Convert annotation to mask
            else:
                print('--- No segmentation found in stuff annotations. ---')
                print('Annotation length: {}'.format(len(anns_stuff)))
                print('--- image id: {} ---'.format(img_id))

            # Box of the stuff annotation itself (not of the last instance annotation)
            boxes_stuff[k, :] = ann_stuff['bbox']
            categories_stuff[k] = cat_label_stuff[ann_stuff['category_id']]
            supercats_stuff[k] = supercat_label_stuff[ann_stuff['category_id']]
            areas_stuff[k] = ann_stuff['area']
            iscrowds_stuff[k] = ann_stuff['iscrowd']


        #Append data to hub. Only do this after all annotations have been parsed.
        # NOTE(review): if one append fails, the tensors appended before it keep their
        # new sample, so the tensors can end up with different lengths. The image id
        # is printed to make such samples traceable.
        try:
            ds.images.append(hub.read(img_fn, verify = True))
            ds.images_meta.append(img_coco)
            ds.masks.append(masks)
            ds.boxes.append(boxes.astype('float32'))
            ds.categories.append(categories.astype('uint32'))
            ds.super_categories.append(supercats.astype('uint32'))
            ds.areas.append(areas.astype('uint32'))
            ds.iscrowds.append(iscrowds.astype('bool'))

            ds.pose.categories.append(categories_kp.astype('uint32'))
            ds.pose.super_categories.append(supercats_kp.astype('uint32'))
            ds.pose.boxes.append(boxes_kp.astype('float32'))
            ds.pose.masks.append(masks_kp)
            ds.pose.keypoints.append(keypoints_kp.astype('int32'))

            ds.stuff.masks.append(masks_stuff)
            ds.stuff.boxes.append(boxes_stuff.astype('float32'))
            ds.stuff.categories.append(categories_stuff.astype('uint32'))
            ds.stuff.super_categories.append(supercats_stuff.astype('uint32'))
            ds.stuff.areas.append(areas_stuff.astype('uint32'))
            ds.stuff.iscrowds.append(iscrowds_stuff.astype('bool'))

        except Exception as e:
            print('--- Upload failed for image id: {} ---'.format(img_id))
            print(e)

        # count is the number of images processed so far, including failed uploads
        if count%100==0:
            print('Uploaded {} images'.format(count))

        if count>=limit:
            break

        count+=1

    print('Finished')

end_time = time.time()

print('Upload took {} seconds'.format(end_time-start_time))

## Special case - COCO Test dataset without annotations

In [None]:
# Folder that holds the unzipped COCO data
data_dir = './Datasets/coco'

# The test split is imported on its own in this section
data_type = 'test'

# Limit the number of images
limit = 1e10

# Destination of the Hub dataset; an Activeloop path such as
# 'hub://my_workspace/coco_{}'.format(data_type) can be used instead
hub_path = './Datasets/coco_local_{}'.format(data_type)

In [None]:
# Folder with the images of the selected split
img_root = '{}/{}2017/'.format(data_dir, data_type)

# There are no actual annotations for this split, just images
ann_file = '{}/annotations/image_info_{}2017.json'.format(data_dir, data_type)

coco = COCO(ann_file)

In [None]:
# Create an empty Hub dataset at hub_path (set above for the 'test' split)
ds = hub.empty(hub_path) # Pass overwrite = True if you need to start over

In [None]:
img_ids = sorted(coco.getImgIds()) # Image ids for uploading
count = 1

start_time = time.time()

with ds:

    ## ---- Create Tensors ----- ##

    ds.create_tensor('images', htype = 'image', sample_compression = 'jpg')
    ds.create_tensor('images_meta', htype = 'json')


    ## ---- Iterate through each image and upload data ----- ##

    for img_id in img_ids:

        img_coco = coco.loadImgs(img_id)[0]
        img_fn = os.path.join(img_root, img_coco['file_name'])

        # The image is not opened separately here: its size is not needed, and the
        # read below sits inside the try so a problem with a file is reported like
        # any other failed upload instead of stopping the run.

        #Append data to hub
        # NOTE(review): if the second append fails after the first succeeded, the two
        # tensors end up with different lengths.
        try:
            ds.images.append(hub.read(img_fn, verify = True))
            ds.images_meta.append(img_coco)

        except Exception as e:
            print(e)

        # count is the number of images processed so far, including failed uploads
        if count%100==0:
            print('Uploaded {} images'.format(count))

        if count>=limit:
            break

        count+=1

    print('Finished')

end_time = time.time()

print('Upload took {} seconds'.format(end_time-start_time))