In [1]:
from __future__ import division
import argparse
import os
import os.path as osp
import warnings
import xml.dom.minidom as minidom
import scipy.io as sio
import numpy as np
import json
import PIL
from PIL import Image
from tqdm import tqdm

In [2]:
# Root of the ImageNet data tree (relative path) and the devkit folder below it.
imagenetHome = '../data/imagenet'
devkit_path = os.path.join(imagenetHome, 'devkit')

In [3]:
# Read the two synset tables from the devkit's 'data' folder.
synsets_image = sio.loadmat(osp.join(devkit_path, 'data', 'meta_det.mat'))
synsets_video = sio.loadmat(osp.join(devkit_path, 'data', 'meta_vid.mat'))

# Slot 0 of every table is the background placeholder; the first 200 entries
# of the image table follow it. Field [2] of an entry is stored as the class
# name and field [1] as the wnid.
classes_image = ('__background__',) + tuple(
    synsets_image['synsets'][0][i][2][0] for i in range(200))
wnid_image = (0,) + tuple(
    synsets_image['synsets'][0][i][1][0] for i in range(200))

# Same layout for the first 30 entries of the video table.
classes = ('__background__',) + tuple(
    synsets_video['synsets'][0][i][2][0] for i in range(30))
wnid = (0,) + tuple(
    synsets_video['synsets'][0][i][1][0] for i in range(30))

# Lookup tables from wnid / class name to its position in the tuples above.
wnid_to_ind_image = {key: pos for pos, key in enumerate(wnid_image)}
class_to_ind_image = {key: pos for pos, key in enumerate(classes_image)}

wnid_to_ind = {key: pos for pos, key in enumerate(wnid)}
class_to_ind = {key: pos for pos, key in enumerate(classes)}

# check for valid intersection between video and image classes:
# entry i is 1 when image class i also appears in the video wnid table.
# The background slot (index 0) is always left at 0.
valid_image_flag = [0] + [int(key in wnid_to_ind) for key in wnid_image[1:]]

In [4]:
def get_data_from_tag(node, tag):
    """Return the data of the first child of the first ``tag`` element under ``node``.

    Raises IndexError when ``node`` has no ``tag`` element or that element
    has no children.
    """
    first_match = node.getElementsByTagName(tag)[0]
    return first_match.childNodes[0].data
def has_tag(node, tag):
    """Tell whether at least one ``tag`` element exists under ``node``."""
    matches = node.getElementsByTagName(tag)
    return len(matches) > 0

In [5]:
def parseImagenetAnnObject(objs, imgSize=None, has_trackid=False):
    """Convert ImageNet ``object`` annotation nodes into plain-list annotations.

    Args:
        objs: sequence of XML nodes; each one is read for its ``xmin``,
            ``ymin``, ``xmax``, ``ymax`` and ``name`` tags (plus ``trackid``
            when ``has_trackid`` is set).
        imgSize: optional ``(width, height)`` pair. When given, every
            coordinate is clipped to at most ``width - 1`` / ``height - 1``.
        has_trackid: any truthy value makes the result include track ids.

    Returns:
        dict with ``'bboxes'`` (list of ``[x1, y1, x2, y2]``) and ``'labels'``
        (list of indices looked up in ``wnid_to_ind``); ``'trackids'`` is
        added when ``has_trackid`` is truthy.

    Raises:
        KeyError: when an object's name is not a key of ``wnid_to_ind``.
    """
    num_objs = len(objs)
    if imgSize is not None:
        width, height = imgSize

    # Normalise the flag once so that allocating, filling and returning the
    # track ids all follow the same decision (a falsy non-bool such as 0 or
    # None previously took the "with trackids" return path and crashed).
    has_trackid = bool(has_trackid)

    boxes = np.zeros((num_objs, 4), dtype=np.float32)
    gt_classes = np.zeros((num_objs), dtype=np.int64)
    track_ids = np.zeros((num_objs), dtype=np.int64) if has_trackid else None

    # Load object bounding boxes into the arrays.
    for ix, obj in enumerate(objs):
        # Negative coordinates are clamped to 0.
        x1 = max(float(get_data_from_tag(obj, 'xmin')), 0)
        y1 = max(float(get_data_from_tag(obj, 'ymin')), 0)
        x2 = max(float(get_data_from_tag(obj, 'xmax')), 0)
        y2 = max(float(get_data_from_tag(obj, 'ymax')), 0)
        if imgSize is not None:
            # if image size is known, box bound can be trimmed.
            x1 = min(x1, width - 1)
            y1 = min(y1, height - 1)
            x2 = min(x2, width - 1)
            y2 = min(y2, height - 1)

        cls = wnid_to_ind[str(get_data_from_tag(obj, "name")).lower().strip()]
        boxes[ix, :] = [x1, y1, x2, y2]
        gt_classes[ix] = cls
        if has_trackid:
            track_ids[ix] = int(get_data_from_tag(obj, 'trackid'))

    ann = {'bboxes': boxes.tolist(),
           'labels': gt_classes.tolist()}
    if has_trackid:
        ann['trackids'] = track_ids.tolist()
    return ann
            

In [6]:
def load_imagenet_annotation(index_image, img_prefix):
    """
    Load the image size and bounding-box annotations of one ImageNet image.

    The annotation file is found by taking the joined image path and
    replacing 'Data' with 'Annotations' and '.JPEG' with '.xml'. Only
    objects whose name is a key of ``wnid_to_ind`` are kept.

    Args:
        index_image(str): image path relative to ``img_prefix``.
        img_prefix(str): directory the image path is joined onto.

    Returns:
        dict with 'filename' (``index_image`` as given), 'width', 'height'
        and 'ann' (the dict built by ``parseImagenetAnnObject``).

    Raises:
        AssertionError: when the image or the annotation file does not exist.
    """
    # Get image infos
    img_path = osp.join(img_prefix, index_image)
    assert os.path.exists(img_path), '%s' % (img_path)
    with Image.open(img_path) as img:
        # Read the size while the file is open; the image object is not
        # touched again after this block closes it.
        imgSize = img.size

    # Get ann infos
    # NOTE: str.replace rewrites every occurrence, including any inside
    # img_prefix, not just the leading directory component.
    ann_path = img_path.replace('Data', 'Annotations').replace('.JPEG', '.xml')
    assert os.path.exists(ann_path), '%s' % (ann_path)
    with open(ann_path) as f:
        data = minidom.parseString(f.read())

    # filter the objects not in video synsets.
    objs = [obj for obj in data.getElementsByTagName('object')
            if str(get_data_from_tag(obj, "name")).lower().strip() in wnid_to_ind]

    # Track ids are read only when the first kept object carries a 'trackid' tag.
    has_trackid = bool(len(objs) > 0 and has_tag(objs[0], 'trackid'))
    ann = parseImagenetAnnObject(objs, imgSize=imgSize, has_trackid=has_trackid)
    return {
            'filename': index_image,
            'width': imgSize[0],
            'height': imgSize[1],
            'ann': ann
            }

In [7]:
def getCustomAnnByImagenetIndex(index,img_prefix):
    '''
    Build the custom-style annotation record for a single image.

    This simply forwards to ``load_imagenet_annotation``. The record it
    returns looks like:

        {
            'filename': <index as given>,
            'width': <image width>,
            'height': <image height>,
            'ann': {
                'bboxes': list of [x1, y1, x2, y2],
                'labels': list of class indices,
                'trackids': list of track ids (only when present)
            }
        }

    The 'bboxes_ignore' / 'labels_ignore' fields of the custom format are
    not produced here.

    Args:
        index(str): relative file path.
        img_prefix(str): directory that ``index`` is relative to.

    '''
    record = load_imagenet_annotation(index, img_prefix)
    return record

In [8]:
def ImagenetDETVIDSet2CustomAnn(imageDataPrefix, imagesetPath, cumstomStyleAnnPath):
    '''Convert Imagenet DETVID annotations to custom style annotations.

       The imagenet dataset is configured by 1 txt file in "ImageSets"
       directory(contain image index), while custom style annotations
       have all anns in 1 file.

    Args:
        imageDataPrefix(str): directory the image indexes are relative to.
        imagesetPath(str): imagenet imageset path.
        cumstomStyleAnnPath(str): the path to save annotations; must end
            with '.json'.
    '''
    assert cumstomStyleAnnPath.endswith('.json')
    file_indexes = []
    with open(imagesetPath,'r') as fr:
        for line in fr:
            items = line.split()
            if not items:
                # Blank / whitespace-only line (e.g. a trailing newline at
                # the end of the file): nothing to index.
                continue
            # Only the first column is used; the dataset-root prefix is
            # stripped so the index is relative to imageDataPrefix.
            file_indexes.append(items[0].replace('data/imagenet/ILSVRC/','')+'.JPEG')
    print('total number of files are %d.'%(len(file_indexes)))

    ann_list = [getCustomAnnByImagenetIndex(index, imageDataPrefix)
                for index in tqdm(file_indexes)]

    with open(cumstomStyleAnnPath,'w') as fw:
        json.dump(ann_list, fw, indent=2)

In [9]:
def ImagenetVIDSet2CustomDETAnn(imageDataPrefix, imagesetPath, cumstomStyleAnnPath):
    '''Convert an Imagenet VID image set to one custom style annotation file.

       Every non-blank line of the image set file contributes one image:
       its first column plus '.JPEG' is the image index.

    Args:
        imageDataPrefix(str): directory the image indexes are relative to.
        imagesetPath(str): imagenet imageset path.
        cumstomStyleAnnPath(str): the path to save annotations; must end
            with '.json'.
    '''
    assert cumstomStyleAnnPath.endswith('.json')
    file_indexes = []
    with open(imagesetPath,'r') as fr:
        for line in fr:
            items = line.split()
            if not items:
                # Blank / whitespace-only line (e.g. a trailing newline at
                # the end of the file): nothing to index.
                continue
            file_indexes.append(items[0]+'.JPEG')
    print('total number of files are %d.'%(len(file_indexes)))

    ann_list = [getCustomAnnByImagenetIndex(index, imageDataPrefix)
                for index in tqdm(file_indexes)]

    with open(cumstomStyleAnnPath,'w') as fw:
        json.dump(ann_list, fw, indent=2)

#### DET

In [10]:
def main_DETVID_train(imagesetPath=None):
    """Convert the DET+VID training image set to a custom-style JSON file.

    Args:
        imagesetPath(str, optional): image set txt file to convert. When
            omitted, the original machine-specific location is used.

    The JSON file is written next to the image set file, with the file
    extension swapped for '.json'.
    """
    if imagesetPath is None:
        imagesetPath = '/media/yelyu/18339a64-762e-4258-a609-c0851cd8163e/YeLyu/Work/FastVOD/data/imagenet/ILSVRC/ImageSets/trainr_DETVID.txt'
    # Swap only the extension; a substring replace would also rewrite a
    # '.txt' occurring elsewhere in the path.
    cumstomStyleAnnPath = osp.splitext(imagesetPath)[0] + '.json'
    ImagenetDETVIDSet2CustomAnn(imagenetHome, imagesetPath,cumstomStyleAnnPath)

#### VID

In [11]:
def main_VID_val(imagesetPath=None):
    """Convert the VID validation image set to a custom-style JSON file.

    Args:
        imagesetPath(str, optional): image set txt file to convert. When
            omitted, the original machine-specific location is used.

    Images are looked up under 'Data/VID/val' inside ``imagenetHome``. The
    JSON file is written next to the image set file, with the file
    extension swapped for '.json'.
    """
    if imagesetPath is None:
        imagesetPath = '/media/yelyu/18339a64-762e-4258-a609-c0851cd8163e/YeLyu/Work/FastVOD/data/imagenet/ILSVRC/ImageSets/VID_val.txt'
    # Swap only the extension; a substring replace would also rewrite a
    # '.txt' occurring elsewhere in the path.
    cumstomStyleAnnPath = osp.splitext(imagesetPath)[0] + '.json'
    ImagenetVIDSet2CustomDETAnn(osp.join(imagenetHome,'Data/VID/val'), imagesetPath,cumstomStyleAnnPath)

In [12]:
# Entry point: only the VID validation conversion is run here;
# main_DETVID_train is defined above but not invoked.
if __name__=='__main__':
    main_VID_val()

  0%|          | 27/176126 [00:00<10:58, 267.31it/s]

total number of files are 176126.


100%|██████████| 176126/176126 [15:42<00:00, 298.05it/s]
