In [1]:
# Copyright 2021 Fagner Cunha
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [2]:
import os
import random

import json
import pandas as pd
import numpy as np
import tensorflow as tf

from functools import partial 

In [3]:
# Seed Python's `random`, NumPy and TensorFlow with the same fixed value.
# pandas' `DataFrame.sample` (called by `sample_empty_class` below) draws from
# NumPy's global generator when no random_state is given, so the sampled
# subset depends on this seed and on the cell execution order.
random_seed = 42

random.seed(random_seed)
np.random.seed(random_seed)
tf.random.set_seed(random_seed)

#### Auxiliary functions

In [4]:
def load_images_from_cocojson(json_filename):
    """Load a COCO-style JSON file and keep the rows with a unique image id.

    The ``images`` and ``annotations`` lists of the file are outer-joined on
    ``images.id == annotations.image_id``. Columns that exist in both tables
    (for instance ``id``) get pandas' default ``_x`` (image side) and ``_y``
    (annotation side) suffixes.

    Every row whose ``image_id`` appears more than once in the joined table
    is discarded, so an image carrying several annotations is removed
    entirely instead of being reduced to a single annotation.

    Args:
        json_filename: Path of the JSON file to read.

    Returns:
        A new ``pandas.DataFrame`` holding the joined rows whose ``image_id``
        is not repeated.
    """
    with open(json_filename) as json_file:
        coco = json.load(json_file)

    joined = pd.DataFrame(coco['images']).merge(
        pd.DataFrame(coco['annotations']),
        how='outer',
        left_on='id',
        right_on='image_id')

    # keep=False flags *every* occurrence of a repeated image_id, so all the
    # rows of a multi-annotation image are dropped together.
    repeated = joined['image_id'].duplicated(keep=False)

    return joined[~repeated].copy()

In [5]:
def get_split(splits, loc_column, row):
    """Return the name of the split that contains the row's location.

    Args:
        splits: Mapping from split name to a container of location values.
        loc_column: Key of ``row`` that holds the location value.
        row: Mapping-like record (e.g. a DataFrame row).

    Returns:
        The first split name, in the mapping's iteration order, whose
        container holds ``row[loc_column]``; the string ``'none'`` when no
        split contains it.
    """
    # The row lookup stays inside the generator so that it is only evaluated
    # when there is at least one split to test against.
    matching = (name for name, members in splits.items()
                if row[loc_column] in members)

    return next(matching, 'none')

In [6]:
def load_images_n_split(dataset_json, split_json, loc_column='location'):
    """Load the labelled images of a dataset and tag each one with its split.

    Args:
        dataset_json: Path of the COCO-style metadata file, read with
            ``load_images_from_cocojson``.
        split_json: Path of a JSON file mapping each split name to the
            location values that belong to it.
        loc_column: Column of the loaded table holding the location value
            that is looked up in the split file.

    Returns:
        The loaded ``pandas.DataFrame`` with an extra ``split`` column; rows
        whose location is in no split get the value ``'none'``.
    """
    labeled = load_images_from_cocojson(dataset_json)

    with open(split_json) as split_file:
        split_locations = json.load(split_file)

    labeled['split'] = labeled.apply(
        lambda row: get_split(split_locations, loc_column, row),
        axis=1)

    return labeled

In [7]:
def check_file_existance(dataset_folder, row):
    """Tell whether the file named by a record exists inside a folder.

    Args:
        dataset_folder: Folder the file name is joined to.
        row: Mapping-like record with a ``'file_name'`` entry.

    Returns:
        ``True`` when ``dataset_folder/row['file_name']`` is an existing
        regular file, ``False`` otherwise (including when it is a directory).
    """
    return os.path.isfile(os.path.join(dataset_folder, row['file_name']))

In [8]:
def check_files(dataset_df, dataset_folder):
    """Keep only the rows whose image file exists in ``dataset_folder``.

    Note that the ``file_ok`` flag column is written onto ``dataset_df``
    itself, so the caller's frame is modified in place.

    Args:
        dataset_df: ``pandas.DataFrame`` with a ``file_name`` column.
        dataset_folder: Folder in which the files are looked up.

    Returns:
        A copy of the rows of ``dataset_df`` whose ``file_ok`` flag is true.
    """
    dataset_df['file_ok'] = dataset_df.apply(
        lambda row: check_file_existance(dataset_folder, row),
        axis=1)
    existing = dataset_df[dataset_df['file_ok']]

    return existing.copy()

In [9]:
def sample_empty_class(dataset_df, empty_class, seq_id_column, random_state=None):
    """Keep one randomly chosen empty image per sequence in the train split.

    Only rows with ``category_id == empty_class`` and ``split == 'train'``
    are subsampled; every other row is returned unchanged.

    Args:
        dataset_df: ``pandas.DataFrame`` with ``category_id`` and ``split``
            columns plus the column named by ``seq_id_column``.
        empty_class: Category id that marks an empty image.
        seq_id_column: Name of the column holding the sequence identifier.
        random_state: Optional seed / generator forwarded to
            ``DataFrame.sample``. With the default ``None`` pandas draws from
            NumPy's global random state, which is the historical behaviour
            of this function.

    Returns:
        A new ``pandas.DataFrame``, sorted by index, containing the untouched
        rows plus one row per sequence of empty train images.
    """
    is_train_empty = ((dataset_df['category_id'] == empty_class)
                      & (dataset_df['split'] == 'train'))
    train_empty = dataset_df[is_train_empty]
    untouched = dataset_df[~is_train_empty]

    # Shuffle first, then keep the first row of every sequence: this selects
    # one uniformly random image per sequence.
    shuffled = train_empty.sample(len(train_empty),
                                  replace=False,
                                  random_state=random_state)
    one_per_sequence = shuffled[
        ~shuffled.duplicated(subset=[seq_id_column], keep='first')]

    # sort_index() restores the original row order and already returns a new frame.
    return pd.concat([untouched, one_per_sequence]).sort_index()

In [10]:
def save_subset_data(selected_instances,
                     source_file,
                     target_file,
                     images_columns,
                     images_columns_rename,
                     annotations_columns,
                     annotations_columns_rename,
                     version='20210928ufam'):
    """Write a copy of a COCO-style JSON file restricted to selected rows.

    The content of ``source_file`` is used as a template: its
    ``info['version']`` entry is overwritten and its ``images`` and
    ``annotations`` lists are replaced by records built from
    ``selected_instances``. Every other top-level entry is kept as read.

    Args:
        selected_instances: ``pandas.DataFrame`` with one row per selected
            image/annotation pair.
        source_file: Path of the JSON file used as template.
        target_file: Path the resulting JSON is written to.
        images_columns: Columns exported to the ``images`` list.
        images_columns_rename: Mapping old name -> new name applied to the
            image columns before export.
        annotations_columns: Columns exported to the ``annotations`` list.
        annotations_columns_rename: Mapping old name -> new name applied to
            the annotation columns before export.
        version: Value stored under ``info['version']``.

    Returns:
        The dictionary that was written to ``target_file``.
    """
    def _records(columns, renames):
        # Project the selection onto `columns`, rename, emit one dict per row.
        return (selected_instances[columns]
                .rename(columns=renames)
                .to_dict('records'))

    with open(source_file) as source:
        dataset = json.load(source)

    dataset['info']['version'] = version
    dataset['images'] = _records(images_columns, images_columns_rename)
    dataset['annotations'] = _records(annotations_columns,
                                      annotations_columns_rename)

    with open(target_file, 'w') as target:
        json.dump(dataset, target, indent=2)

    return dataset

In [11]:
# Version tag of the generated files: it is interpolated into the output file
# names and stored under info['version'] by `save_subset_data`.
version = '20211210ufam'

#### Caltech Splitting

In [12]:
# Source metadata file of the Caltech dataset: loaded below and also used as
# the template for the filtered JSON written by `save_subset_data`.
cct_json = '../data/caltech_images_20200316.json'

In [13]:
# Load the single-annotation images and tag each one with its split.
cct_images = load_images_n_split(cct_json, '../data/caltech_splits_v3.json')

# Remove the 'train_val/' and 'test/' parts from the file names (every
# occurrence, not only a leading one). regex=False makes the literal
# replacement explicit: the default value of `regex` differs between pandas
# versions, and these patterns contain no regex metacharacters.
cct_images['file_name'] = cct_images['file_name'].str.replace('train_val/', '', regex=False)
cct_images['file_name'] = cct_images['file_name'].str.replace('test/', '', regex=False)

# Keep only the rows whose image file exists in the resized image folder.
cct_images = check_files(cct_images, '/data/fagner/coruja/datasets/caltech/cct_images_resized')

# Exclude images from classes not present in the test set
cct_images = cct_images[~cct_images.category_id.isin([39, 66, 97])].copy()

In [14]:
# `id_x` / `id_y` are the image and annotation ids: both tables have an `id`
# column, so the merge in `load_images_from_cocojson` suffixed them.
saved_data = save_subset_data(
    cct_images,
    cct_json,
    '../data/caltech_images_%s.json' % version,
    images_columns=[
        'seq_num_frames', 'date_captured', 'seq_id', 'height', 'width',
        'location', 'rights_holder', 'file_name', 'id_x', 'frame_num',
    ],
    images_columns_rename={'id_x': 'id'},
    annotations_columns=['id_y', 'category_id', 'image_id'],
    annotations_columns_rename={'id_y': 'id'},
    version=version,
)

In [15]:
cct_images['split'].value_counts()

train      122693
val         61665
minival     57162
Name: split, dtype: int64

In [16]:
pd.crosstab(cct_images.category_id, cct_images.split)

split,minival,train,val
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,497,9446,6632
3,532,4705,5347
5,350,2654,1412
6,488,3779,3562
7,150,1151,585
8,106,2636,1156
9,2636,7418,6067
10,2303,8064,1817
11,954,6183,2188
14,45,93,171


#### Snapshot Serengeti Splitting

In [17]:
# Source metadata file of the Snapshot Serengeti dataset: loaded below and
# also used as the template for the filtered JSON written by `save_subset_data`.
ss_json = '../data/SnapshotSerengeti_S1-11_v2.1.json'

In [18]:
# Load the single-annotation images and tag each one with its split.
# 'location_x' is the image-side location column: the annotation table has a
# 'location' field too, so the merge suffixes both of them.
ss_images = load_images_n_split(ss_json,
                                '../data/snapshot_serengeti_splits_v3.json',
                                loc_column='location_x')
# Keep only the rows whose season is one of S1 to S6.
ss_images = ss_images[ss_images.season.isin(['S1', 'S2', 'S3', 'S4', 'S5', 'S6'])].copy()
# Keep only the rows whose image file exists in the resized image folder.
ss_images = check_files(ss_images, '/data/fagner/coruja/datasets/serengeti/serengeti_600x1024')

# Exclude the human class (1) and images from classes not present in the test set
ss_images = ss_images[~ss_images.category_id.isin([1, 48])].copy()

# Keep a single randomly chosen empty image (category 0) per sequence in the
# train split; all other rows are left untouched.
ss_images = sample_empty_class(ss_images, 0, 'seq_id_x')

In [19]:
# The `_x` / `_y` suffixes come from the image/annotation merge: `_x` columns
# belong to the image table and `_y` columns to the annotation table. They
# are renamed back to their plain names on export.
saved_data = save_subset_data(
    ss_images,
    ss_json,
    '../data/snapshot_serengeti_images_%s.json' % version,
    images_columns=[
        'id_x', 'file_name', 'frame_num', 'seq_id_x', 'width', 'height',
        'corrupt', 'location_x', 'seq_num_frames', 'datetime_x',
    ],
    images_columns_rename={
        'id_x': 'id',
        'seq_id_x': 'seq_id',
        'location_x': 'location',
        'datetime_x': 'datetime',
    },
    annotations_columns=[
        'sequence_level_annotation', 'id_y', 'category_id', 'seq_id_y',
        'season', 'datetime_y', 'subject_id', 'count', 'standing',
        'resting', 'moving', 'interacting', 'young_present', 'image_id',
        'location_y',
    ],
    annotations_columns_rename={
        'id_y': 'id',
        'seq_id_y': 'seq_id',
        'location_y': 'location',
        'datetime_y': 'datetime',
    },
    version=version,
)

In [20]:
ss_images['split'].value_counts()

train      1059793
val         738404
minival     360317
Name: split, dtype: int64

In [21]:
pd.crosstab(ss_images.category_id, ss_images.split)

split,minival,train,val
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,278578,553218,535817
2,1510,13650,5828
3,370,3050,942
4,100,2962,427
5,18132,100088,29832
6,41,271,132
7,13114,63272,37539
8,779,6753,3106
9,1933,13347,6011
10,2780,17667,1451


#### WCS Splitting

In [22]:
# Source metadata file of the WCS dataset: loaded below and also used as the
# template for the filtered JSON written by `save_subset_data`.
wcs_json = '../data/wcs_camera_traps.json'

In [23]:
# Load the single-annotation images and tag each one with its split; rows
# whose location is listed in no split get the split value 'none'.
wcs_images = load_images_n_split(wcs_json,
                                '../data/wcs_splits.json',
                                loc_column='location')

# Keep only the rows whose image file exists in the resized image folder.
wcs_images = check_files(wcs_images, '/data/fagner/coruja/datasets/resized/wcs_450x768/')

### Exclude images from classes not present in the test set or the train set
wcs_images = wcs_images[~wcs_images.category_id.isin([22, 33, 34, 39, 41, 42, 43, 45, 46, 47, 48, 49, 52, 55, 57, 58, 59, 61, 63, 64, 65, 66, 67, 68, 82, 84, 87, 88, 89, 93, 98, 107, 109, 124, 126, 131, 135, 143, 148, 158, 164, 165, 168, 172, 173, 174, 179, 180, 183, 184, 185, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 200, 201, 202, 205, 206, 207, 208, 210, 211, 213, 214, 217, 218, 219, 220, 222, 223, 224, 225, 226, 228, 229, 232, 236, 237, 238, 239, 241, 244, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 257, 261, 263, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 287, 288, 297, 298, 301, 304, 310, 311, 313, 315, 323, 324, 325, 326, 329, 330, 331, 332, 333, 335, 336, 338, 339, 341, 342, 343, 345, 346, 349, 350, 351, 352, 353, 355, 356, 357, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 373, 375, 381, 382, 383, 384, 385, 387, 388, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 405, 406, 407, 408, 409, 410, 411, 412, 413, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 440, 441, 448, 449, 450, 455, 456, 457, 458, 459, 460, 462, 463, 464, 465, 471, 479, 487, 488, 489, 491, 492, 493, 494, 495, 496, 498, 499, 500, 501, 502, 503, 504, 505, 506, 507, 509, 511, 512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524, 525, 526, 527, 528, 530, 531, 532, 533, 534, 535, 536, 538, 539, 540, 541, 543, 544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569, 570, 571, 572, 573, 574, 575, 576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 
658, 659, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675
])].copy()

### Exclude images from unknown classes and similar ones (start, end, misfire)
wcs_images = wcs_images[~wcs_images.category_id.isin([75, 79, 177, 178, 182, 183, 186, 187, 189, 192, 198, 203, 226, 270, 290, 312, 314, 347, 348, 404, 454, 469, 505, 511, 512, 534, 561, 567, 569, 529, 599, 608, 609, 631, 650, 654, 656, 664])].copy()

In [24]:
# `id_x` / `id_y` are the image and annotation ids: both tables have an `id`
# column, so the merge in `load_images_from_cocojson` suffixed them.
saved_data = save_subset_data(
    wcs_images,
    wcs_json,
    '../data/wcs_images_%s.json' % version,
    images_columns=[
        'id_x', 'wcs_id', 'file_name', 'frame_num', 'seq_id',
        'country_code', 'match_level', 'datetime', 'location', 'width',
        'height', 'corrupt', 'seq_num_frames', 'status',
    ],
    images_columns_rename={'id_x': 'id'},
    annotations_columns=[
        'count', 'sex', 'age', 'id_y', 'category_id', 'image_id',
    ],
    annotations_columns_rename={'id_y': 'id'},
    version=version,
)

In [25]:
wcs_images['split'].value_counts()

train    599040
none     377458
val       79906
test      62734
Name: split, dtype: int64

In [26]:
pd.crosstab(wcs_images.category_id, wcs_images.split)

split,none,test,train,val
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,255095,34844,269622,32171
1,0,331,6747,1136
2,6804,5279,74620,9085
3,6840,584,7525,911
4,725,631,4104,1199
...,...,...,...,...
508,5,6,14,3
510,0,4,149,45
537,41,277,1181,0
542,38,188,1165,0


In [27]:
wcs_images['split'].value_counts()

train    599040
none     377458
val       79906
test      62734
Name: split, dtype: int64

In [28]:
pd.crosstab(wcs_images.category_id, wcs_images.split)

split,none,test,train,val
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,255095,34844,269622,32171
1,0,331,6747,1136
2,6804,5279,74620,9085
3,6840,584,7525,911
4,725,631,4104,1199
...,...,...,...,...
508,5,6,14,3
510,0,4,149,45
537,41,277,1181,0
542,38,188,1165,0


#### Wellington Splitting

In [29]:
# Source metadata file of the Wellington dataset: loaded below and also used
# as the template for the filtered JSON written by `save_subset_data`.
wlt_json = '../data/wellington_camera_traps.json'

In [30]:
# Load the single-annotation images and tag each one with its split; here the
# split file is keyed by the values of the 'site' column.
wlt_images = load_images_n_split(wlt_json,
                                '../data/wellington_split.json',
                                loc_column='site')
# Keep only the rows whose image file exists in the image folder.
wlt_images = check_files(wlt_images, '/data/fagner/coruja/datasets/wellington/images/')

# Exclude the unclassifiable class (12) and images from classes not present in the test set or the train set
wlt_images = wlt_images[~wlt_images.category_id.isin([4, 10, 12])].copy()

In [31]:
# `id_x` / `id_y` are the image and annotation ids: both tables have an `id`
# column, so the merge in `load_images_from_cocojson` suffixed them.
saved_data = save_subset_data(
    wlt_images,
    wlt_json,
    '../data/wellington_images_%s.json' % version,
    images_columns=[
        'id_x', 'file_name', 'seq_id', 'frame_num', 'site', 'camera',
        'datetime', 'width', 'height',
    ],
    images_columns_rename={'id_x': 'id'},
    annotations_columns=['id_y', 'image_id', 'category_id'],
    annotations_columns_rename={'id_y': 'id'},
    version=version,
)

In [32]:
wlt_images['split'].value_counts()

train     160937
test       55424
val        49127
ignore       521
Name: split, dtype: int64

In [33]:
pd.crosstab(wlt_images.category_id, wlt_images.split)

split,ignore,test,train,val
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,219,29418,99254,25634
1,125,8734,16373,5053
2,91,9709,26957,8964
3,42,1872,6321,2471
5,22,851,3142,336
6,4,1005,1872,3600
7,5,173,435,385
8,0,15,160,18
9,10,558,3974,1172
11,3,608,1513,633
