In [None]:
# Copyright 2020 Fagner Cunha
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import json
import random
import pandas as pd
from functools import partial 

Loading metadata:

In [2]:
def load_caltech_images_from_json(json_filename):
    """Load a metadata JSON file and keep only images with a single annotation row.

    The file must provide 'images' and 'annotations' lists. Both become
    DataFrames and are outer-joined on images.id == annotations.image_id
    (pandas suffixes columns present in both tables with '_x' / '_y').
    Every image_id that appears in more than one joined row is then
    removed entirely.

    Args:
        json_filename: path of the metadata JSON file to read.

    Returns:
        A copy of the joined DataFrame containing only the rows whose
        image_id is not repeated.
    """
    with open(json_filename) as metadata_file:
        metadata = json.load(metadata_file)

    labeled = pd.DataFrame(metadata['images']).merge(
        pd.DataFrame(metadata['annotations']),
        how='outer',
        left_on='id',
        right_on='image_id')

    # NOTE(review): the outer join leaves image_id as NaN for images without
    # any annotation; pandas appears to treat those NaNs as duplicates of each
    # other, so two or more such images would all be dropped here - confirm
    # this is intended.
    repeated_rows = labeled.duplicated(subset=['image_id'], keep=False)
    repeated_ids = labeled.loc[repeated_rows, 'image_id'].unique()
    keep_rows = ~labeled['image_id'].isin(repeated_ids)

    return labeled[keep_rows].copy()

In [3]:
# Recommended train/val location lists; the 'splits' entry is consumed below.
with open('../data/CaltechCameraTrapsSplits_v0.json') as splits_file:
    recommend_train_val_splits = json.load(splits_file)

In [4]:
# Image-level metadata for the whole dataset.
cct_images_all = load_caltech_images_from_json('../data/caltech_images_20200316.json')
# Metadata for the bounding-box subset, with category_id cast to int64.
cct_images_bbox = load_caltech_images_from_json(
    '../data/caltech_bboxes_20200316.json'
).astype({'category_id': 'int64'})

Initial split based on images with bounding boxes:

In [5]:
# Recommended location lists turned into sets so they support set arithmetic.
recommend_train, recommend_val = (
    set(recommend_train_val_splits['splits'][split_name])
    for split_name in ('train', 'val')
)

In [6]:
# Locations present in the bounding-box subset.
bbox_all_locations = set(cct_images_bbox['location'].unique())
# Every bbox location outside the recommended validation list goes to train...
bbox_train = bbox_all_locations.difference(recommend_val)
# ...and the remaining bbox locations form the bbox validation set.
bbox_val = bbox_all_locations.difference(bbox_train)

In [7]:
# Number of locations: all bbox locations, bbox train, bbox val.
tuple(len(locations) for locations in (bbox_all_locations, bbox_train, bbox_val))

(85, 56, 29)

In [8]:
# Hard-coded dev-validation locations. Presumably the frozen result of a
# one-off draw, set(random.sample(bbox_train, 10)), kept as a literal so the
# split does not change between runs.
bbox_val_dev = {'10', '115', '4', '52', '55', '56', '58', '59', '62', '73'}
# Take the dev-validation locations out of the bbox training locations.
bbox_train = bbox_train.difference(bbox_val_dev)

Extrapolate split to all locations:

In [9]:
# Independent copies, so later updates do not touch the bbox-only sets.
train = set(bbox_train)
val_dev = set(bbox_val_dev)
val = set(recommend_val)

In [10]:
# Hard-coded extra dev-validation locations. Presumably the frozen result of
# set(random.sample(remaining_train, 10)), drawn from the recommended-train
# locations not yet assigned to train or val_dev.
remaining_val_dev = {'106', '107', '113', '118', '119', '122', '123', '137', '83', '96'}
# Recommended-train locations that are in none of train, val_dev or the
# extra dev-validation set above.
remaining_train = recommend_train.difference(train, val_dev, remaining_val_dev)

In [11]:
# Fold the extrapolated locations into the final train / val_dev sets.
train = train | remaining_train
val_dev = val_dev | remaining_val_dev

In [12]:
# Number of locations in train, val_dev and val.
tuple(len(locations) for locations in (train, val_dev, val))

(80, 20, 40)

Visualize partitioning:

In [13]:
def mark_split(train, val_dev, val, row):
    """Name the split that a row belongs to, based on row['location'].

    Membership in ``val_dev`` is checked first, then ``train``; any other
    location is reported as 'val'. The ``val`` argument is accepted but
    never consulted.

    Args:
        train: container of training locations.
        val_dev: container of dev-validation locations.
        val: unused.
        row: mapping-like object with a 'location' entry.

    Returns:
        'val_dev', 'train' or 'val'.
    """
    location = row['location']
    if location in val_dev:
        return 'val_dev'
    if location in train:
        return 'train'
    return 'val'

In [14]:
# Bind the three location sets once so the resulting callable only needs a
# row, which is how DataFrame.apply(..., axis=1) invokes it in later cells.
# partial stores the set objects that `train`, `val_dev` and `val` refer to
# when this cell runs.
# NOTE(review): the capital "K" in the name looks like a typo; it is left
# untouched because later cells refer to the callable by this exact name.
marK_split_fn = partial(mark_split, train, val_dev, val)

In [15]:
# Label every bounding-box image with its split on a fresh frame, leaving
# cct_images_bbox untouched.
instances = cct_images_bbox.assign(
    split=lambda frame: frame.apply(marK_split_fn, axis=1))

In [16]:
# Row counts per category_id and split for the bounding-box subset.
pd.crosstab(index=instances['category_id'], columns=instances['split'])

split,train,val,val_dev
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,6743,4928,335
3,2047,4313,330
5,1631,794,187
6,2011,2664,158
7,640,459,87
8,1409,686,56
9,3565,2047,319
10,3908,705,1101
11,1643,729,246
14,21,123,29


In [17]:
def binarize_categories(row, empty_category_id=30):
    """Collapse a row's category_id into a binary label.

    Args:
        row: mapping-like object with a 'category_id' entry (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).
        empty_category_id: category id that maps to label 0. Defaults to
            30, the value that used to be hard-coded here; later cells
            treat label 0 as the "empty" images.

    Returns:
        0 when row['category_id'] equals ``empty_category_id``, otherwise 1.
    """
    return 0 if row['category_id'] == empty_category_id else 1

In [18]:
# Binary label derived row by row from category_id.
instances['category'] = instances.apply(binarize_categories, axis='columns')

In [19]:
# Row counts per binary category and split for the bounding-box subset.
pd.crosstab(index=instances['category'], columns=instances['split'])

split,train,val,val_dev
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,121,23,36
1,32032,23410,3877


In [20]:
# Label every image of the full dataset with its split on a fresh frame,
# leaving cct_images_all untouched.
instances_all = cct_images_all.assign(
    split=lambda frame: frame.apply(marK_split_fn, axis=1))

In [21]:
# Row counts per category_id and split for the full dataset.
pd.crosstab(index=instances_all['category_id'], columns=instances_all['split'])

split,train,val,val_dev
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,9446,6632,497
3,4705,5347,532
5,2654,1412,350
6,3779,3562,488
7,1151,585,150
8,2636,1156,106
9,7418,6067,2636
10,8064,1817,2303
11,6183,2188,954
14,93,171,45


In [22]:
# Binary label derived row by row from category_id.
instances_all['category'] = instances_all.apply(binarize_categories, axis='columns')

In [23]:
# Row counts per binary category and split for the full dataset.
pd.crosstab(index=instances_all['category'], columns=instances_all['split'])

split,train,val,val_dev
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,59420,19892,46433
1,63284,41773,10729


### Select empty images for the training and val_dev sets (capped per location):

In [24]:
def cap_instances_by_location(instances, cap=1000, random_state=None):
    """Limit the number of rows kept for each location.

    Locations with at most ``cap`` rows are kept whole; larger locations
    are reduced to ``cap`` rows drawn at random without replacement.

    Args:
        instances: DataFrame with a 'location' column.
        cap: maximum number of rows to keep per location.
        random_state: optional seed (or random state object) forwarded to
            ``DataFrame.sample`` for every over-sized location. The default
            ``None`` keeps the sampling unseeded; pass a value to make the
            selection reproducible.

    Returns:
        A DataFrame with the per-location selections concatenated in order
        of each location's first appearance. When no rows are selected
        (for example when the input is empty) an empty frame with the
        input's columns is returned instead of ``pd.concat`` raising on an
        empty list. Rows whose location is missing are not returned.
    """
    selected_per_location = []
    # sort=False keeps locations in order of first appearance; one grouped
    # pass replaces a full boolean-mask scan of the frame per location.
    for _, location_rows in instances.groupby('location', sort=False):
        if len(location_rows) > cap:
            location_rows = location_rows.sample(cap, random_state=random_state)
        selected_per_location.append(location_rows)

    if not selected_per_location:
        # Nothing to concatenate: hand back an empty frame with the same columns.
        return instances.iloc[0:0].copy()

    return pd.concat(selected_per_location)

In [25]:
# Non-empty images (category 1) are taken from the bounding-box subset.
noempty_bbox_locations = instances.loc[instances['category'] == 1].copy()
# Empty images (category 0) are taken from the full dataset.
empty_all_locations = instances_all.loc[instances_all['category'] == 0].copy()
# Empty images are capped per location for train and val_dev only.
empty_train_capped = cap_instances_by_location(
    empty_all_locations.loc[empty_all_locations['split'] == 'train'])
empty_val_dev_capped = cap_instances_by_location(
    empty_all_locations.loc[empty_all_locations['split'] == 'val_dev'])
# Every empty validation image is kept.
empty_val = empty_all_locations.loc[empty_all_locations['split'] == 'val'].copy()
selected_instances = pd.concat([
    noempty_bbox_locations,
    empty_train_capped,
    empty_val_dev_capped,
    empty_val,
])

In [26]:
# Row counts per binary category and split after capping the empty images.
pd.crosstab(index=selected_instances['category'], columns=selected_instances['split'])

split,train,val,val_dev
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,8574,19892,2824
1,32032,23410,3877


### Saving data:

In [27]:
# The splits are sets, and the iteration order of a set of strings can change
# from one interpreter run to the next (string hashing is randomized), so
# list(set) would write the locations in a different order on every run.
# Sorting makes the saved file reproducible; key=str keeps the sort well
# defined even if a location is not a string.
caltech_split_v3 = {
    'train': sorted(train, key=str),
    'val_dev': sorted(val_dev, key=str),
    'val': sorted(val, key=str),
}
with open('../data/caltech_splits_v3.json', 'w') as outfile:
    json.dump(caltech_split_v3, outfile, indent=2)

In [28]:
# Start from the original image-level metadata so every top-level entry other
# than 'images' is carried over as loaded, then stamp the new version string.
with open('../data/caltech_images_20200316.json') as source_file:
    all_images = json.load(source_file)
all_images['info']['version'] = '20201110ufam'

# Keep only the image-level fields, export 'id_x' under the key 'id', and
# turn each selected row into one record.
instances_2_json = (
    selected_instances[[
        'seq_num_frames',
        'date_captured',
        'seq_id',
        'height',
        'width',
        'location',
        'rights_holder',
        'file_name',
        'id_x',
        'frame_num',
    ]]
    .rename(columns={'id_x': 'id'})
    .to_dict('records')
)
all_images['images'] = instances_2_json

with open('../data/caltech_images_20201110ufam.json', 'w') as images_outfile:
    json.dump(all_images, images_outfile, indent=2)