In [None]:
# Copyright 2021 Fagner Cunha
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import json
import random
import numpy as np
import pandas as pd

# Seed both the stdlib and NumPy global generators with one shared value so
# the random draws made later in the notebook repeat on a fresh kernel run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

Loading Caltech metadata:

In [2]:
# Input files: the image/annotation metadata and the location split definition.
images_list_json = '/data/fagner/training/data/caltech_images_20201110ufam.json'
split_json = '../data/caltech_splits_v3.json'

with open(images_list_json) as f:
    images = json.load(f)
with open(split_json) as f:
    split = json.load(f)

# One row per annotation record.
cct_annotations = pd.DataFrame(images['annotations'])

# One row per image, restricted to locations listed under the split's 'val' key.
# `.copy()` detaches the filtered frame from the unfiltered one.
test_images = pd.DataFrame(images['images'])
test_images = test_images.loc[test_images['location'].isin(split['val'])].copy()

# Left join: every selected image row is kept and annotation columns are
# attached where annotation `image_id` equals image `id`. Column names present
# on both sides receive pandas' default `_x` / `_y` suffixes (the export cell
# later reads the image id as `id_x`).
test_images_labeled = test_images.merge(
    cct_annotations,
    how='left',
    left_on='id',
    right_on='image_id',
)

In [3]:
def binarize_categories(row):
    """Collapse a row's ``category_id`` into a binary label.

    Args:
        row: Any object supporting ``row['category_id']`` (for example a
            DataFrame row passed by ``DataFrame.apply(..., axis=1)``).

    Returns:
        int: 0 when ``category_id`` equals 30, otherwise 1.
    """
    # NOTE(review): 30 is presumably the dataset's "empty" category id; the
    # sampling code in this notebook treats label 0 as "empty" -- confirm.
    return 0 if row['category_id'] == 30 else 1

In [4]:
# Add a binary `category` column by running `binarize_categories` on each row
# (0 when category_id == 30, 1 otherwise).
test_images_labeled['category'] = test_images_labeled.apply(
    binarize_categories,
    axis='columns',
)

#### Sampling 5000 instances from the test set
* Must include at least one empty and one nonempty image for each location (whenever the location has images of that class)
* Must preserve the empty/nonempty class proportion of the full test set

In [5]:
def get_instance_per_location(df):
    """Pick one random image per (location, category) pair that has any rows.

    For every distinct value of ``df.location`` the function looks first at
    rows with ``category == 0`` and then at rows with ``category == 1``. When
    a group is non-empty, a single row is drawn with ``DataFrame.sample`` and
    its ``image_id`` is recorded.

    Args:
        df: DataFrame with ``location``, ``category`` and ``image_id`` columns.

    Returns:
        tuple: ``(image_ids, empty_count, nonempty_count)`` where ``image_ids``
        is the list of drawn ids in draw order, ``empty_count`` is the number
        of category-0 draws and ``nonempty_count`` the number of category-1
        draws.
    """
    picked_ids = []
    draws_per_label = {0: 0, 1: 0}

    for loc in df.location.unique():
        at_location = df.location == loc
        # Label 0 is handled before label 1 for each location, which fixes the
        # order in which random draws are made.
        for label in (0, 1):
            candidates = df[at_location & (df.category == label)]
            if len(candidates) == 0:
                continue
            # No random_state is passed, so the draw depends on the global
            # NumPy random state at call time.
            picked_ids.append(candidates.sample(1)['image_id'].values[0])
            draws_per_label[label] += 1

    return picked_ids, draws_per_label[0], draws_per_label[1]

In [6]:
def sample_instances(df, category, num_samples):
    """Randomly draw ``num_samples`` rows of one category and return their ids.

    Args:
        df: DataFrame with ``category`` and ``image_id`` columns.
        category: Value of ``category`` that a row must have to be eligible.
        num_samples: Number of rows to draw; passed straight to
            ``DataFrame.sample``.

    Returns:
        list: The ``image_id`` values of the drawn rows.
    """
    # NOTE(review): rows are sampled, not distinct image ids -- if `image_id`
    # can repeat across rows the returned list may contain repeats; confirm.
    eligible = df.loc[df['category'] == category]
    drawn = eligible.sample(n=num_samples)
    return list(drawn['image_id'])

In [7]:
# Total size of the reduced test sample; the per-class sample counts computed
# later in the notebook are derived from this value.
NUM_SAMPLES = 5000

In [8]:
# Share of NUM_SAMPLES given to category 0: the count of category-0 rows over
# the total row count, truncated to an integer by int(). Category 1 receives
# whatever remains, so the two counts always add up to NUM_SAMPLES.
num_empty_samples = int(
    NUM_SAMPLES
    * test_images_labeled['category'].value_counts().loc[0]
    / len(test_images_labeled)
)
num_nonempty_samples = NUM_SAMPLES - num_empty_samples

In [9]:
# Seed the selection with one randomly drawn image per (location, category)
# pair; the two counts record how many category-0 and category-1 images were
# drawn this way and are subtracted from the per-class quotas further down.
sel_instances, empty_count, nonempty_count = get_instance_per_location(test_images_labeled)

In [10]:
# Top up category 0: draw the part of its quota not already covered by the
# per-location picks, looking only at rows whose image_id is not in
# `sel_instances` yet.
empty_samples = sample_instances(
    test_images_labeled.loc[~test_images_labeled['image_id'].isin(sel_instances)],
    category=0,
    num_samples=num_empty_samples - empty_count,
)

In [11]:
# Top up category 1: draw the part of its quota not already covered by the
# per-location picks, looking only at rows whose image_id is not in
# `sel_instances` yet.
nonempty_samples = sample_instances(
    test_images_labeled.loc[~test_images_labeled['image_id'].isin(sel_instances)],
    category=1,
    num_samples=num_nonempty_samples - nonempty_count,
)

In [12]:
# Combine the per-location picks with both top-up draws into one id list.
# The name is rebound to the longer list, so running this cell a second time
# without re-running the earlier cells appends the top-up ids again.
sel_instances = [*sel_instances, *empty_samples, *nonempty_samples]

#### Save test_sample splitting

In [13]:
# Rows of the labelled test frame whose image_id was selected above.
selected_instances = test_images_labeled.loc[
    test_images_labeled['image_id'].isin(sel_instances)
]

In [14]:
# Re-read the source metadata so the output starts from an untouched copy of
# every top-level entry; only the version string and the image list are
# replaced below.
# NOTE(review): all other entries (e.g. 'annotations') are written out
# unchanged, i.e. not restricted to the sampled images -- confirm intended.
with open(images_list_json) as json_file:
    all_images = json.load(json_file)
all_images['info']['version'] = '20201110ufam_test_small'

# `selected_instances` is derived from an image/annotation merge, so an image
# with more than one annotation row would show up more than once. Only
# image-level columns are exported, which makes such rows exact repeats; keep a
# single record per image id so the 'images' list has no duplicated entries.
instances_2_json = selected_instances[['seq_num_frames',
                                        'date_captured',
                                        'seq_id',
                                        'height',
                                        'width',
                                        'location',
                                        'rights_holder',
                                        'file_name',
                                        'id_x',
                                        'frame_num']]
instances_2_json = instances_2_json.drop_duplicates(subset='id_x')
# The merge suffixed the image id as `id_x`; restore the original key name.
instances_2_json = instances_2_json.rename(columns={'id_x': 'id'})
instances_2_json = instances_2_json.to_dict('records')
all_images['images'] = instances_2_json
metadata_json = '/data/fagner/training/data/caltech_images_20201110ufam_test_sample.json'
with open(metadata_json, 'w') as outfile_images:
    json.dump(all_images, outfile_images, indent=2)