In [None]:
# Copyright 2020 Fagner Cunha
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import os

import random
import json
import pandas as pd

In [2]:
# Seed Python's `random` module so the random.sample draws below start from a fixed state.
random.seed(42)

### Loading data

In [3]:
# Load the Snapshot Serengeti annotations, parsing only the four columns this
# notebook uses. Restricting the parse with `usecols` avoids reading and
# type-inferring every other column of the CSV.
annotation_columns = ['capture_id', 'season', 'site', 'question__species']
serengeti_annotations = pd.read_csv('../data/SnapshotSerengeti_v2_1_annotations.csv',
                                    usecols=annotation_columns)
# `usecols` keeps the file's own column order, so re-select to pin the order explicitly.
serengeti_annotations = serengeti_annotations[annotation_columns].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Read the per-image table and discard the 'Unnamed: 0' column
# (presumably an index that was saved into the CSV).
serengeti_images = pd.read_csv(
    '../data/SnapshotSerengeti_v2_1_images.csv').drop(columns='Unnamed: 0')

In [5]:
# Join image rows with annotation rows on the shared capture_id key.
# how='outer' keeps rows that appear in only one of the two tables.
serengeti_images_labeled = serengeti_images.merge(
    serengeti_annotations, on='capture_id', how='outer')

We will only use seasons 1-6:

In [6]:
# Restrict the table to rows whose season is one of S1..S6.
kept_seasons = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6']
in_kept_season = serengeti_images_labeled['season'].isin(kept_seasons)
serengeti_images_labeled = serengeti_images_labeled.loc[in_kept_season].copy()

Remove images with more than one species identified:

In [7]:
# Image paths that occur on more than one row (keep=False flags every
# occurrence, not just the repeats), reduced to their distinct values.
non_single_spc_instances = serengeti_images_labeled.loc[
    serengeti_images_labeled['image_path_rel'].duplicated(keep=False),
    'image_path_rel'].unique()

In [8]:
# Drop every row whose image path was flagged as occurring more than once.
is_single_row_image = ~serengeti_images_labeled['image_path_rel'].isin(non_single_spc_instances)
serengeti_images_labeled = serengeti_images_labeled.loc[is_single_row_image].copy()

Verify that the resized images exist on disk, keeping only those that are found:

In [9]:
# Root directory that the following cells check for image files. Defaults to
# the original hard-coded location; set the SERENGETI_IMAGES_DIR environment
# variable to use a different one. os.path.join(..., '') guarantees a trailing
# separator, so plain string concatenation with a relative path keeps working.
ss_path = os.path.join(
    os.environ.get('SERENGETI_IMAGES_DIR',
                   '/data/fagner/coruja/datasets/serengeti/serengeti_600x1024/'),
    '')

In [10]:
# Relative paths of the images that exist as files under `ss_path`.
# Iterating the single column avoids DataFrame.iterrows(), which builds a
# Series object for every row; os.path.join also tolerates an `ss_path`
# that lacks a trailing separator.
all_images_download = [image_path_rel
                       for image_path_rel
                       in serengeti_images_labeled['image_path_rel']
                       if os.path.isfile(os.path.join(ss_path, image_path_rel))]

In [11]:
# Keep only the rows whose image file was found on disk.
was_found_on_disk = serengeti_images_labeled['image_path_rel'].isin(all_images_download)
serengeti_images_labeled = serengeti_images_labeled.loc[was_found_on_disk].copy()

In [12]:
# Preview the first rows of the filtered, labeled image table.
serengeti_images_labeled.head()

Unnamed: 0,capture_id,image_rank_in_capture,image_path_rel,season,site,question__species
0,SER_S1#B04#1#1,1,S1/B04/B04_R1/S1_B04_R1_PICT0001.JPG,S1,B04,human
1,SER_S1#B04#1#2,1,S1/B04/B04_R1/S1_B04_R1_PICT0002.JPG,S1,B04,human
2,SER_S1#B04#1#3,1,S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG,S1,B04,blank
3,SER_S1#B04#1#4,1,S1/B04/B04_R1/S1_B04_R1_PICT0004.JPG,S1,B04,blank
4,SER_S1#B04#1#5,1,S1/B04/B04_R1/S1_B04_R1_PICT0005.JPG,S1,B04,blank


#### Select empty events

In [13]:
# Distinct capture events having a row whose label is anything other than 'blank'.
is_not_blank = serengeti_images_labeled['question__species'] != 'blank'
species_capture_events = serengeti_images_labeled.loc[is_not_blank, 'capture_id'].unique()

In [14]:
# Distinct capture events having a row labeled 'blank'.
is_blank = serengeti_images_labeled['question__species'] == 'blank'
empty_capture_events = serengeti_images_labeled.loc[is_blank, 'capture_id'].unique()

In [15]:
# Number of distinct capture events with a non-blank label.
len(species_capture_events)

309602

In [16]:
# Number of distinct capture events labeled 'blank'.
len(empty_capture_events)

859520

In [17]:
# Randomly pick as many blank capture events as there are non-blank ones,
# so both groups contribute the same number of events.
sampled_empty_capture_events = random.sample(
    list(empty_capture_events),
    k=len(species_capture_events))

In [18]:
# Shallow copy, so changes to sampled_capture_events leave sampled_empty_capture_events untouched.
# NOTE(review): the next cell assigns sampled_capture_events again from the same
# source list, so this cell appears redundant — confirm before removing.
sampled_capture_events = sampled_empty_capture_events[:]

In [19]:
# Combined event list: the sampled blank events first, then every non-blank
# event. A new list is built, so sampled_empty_capture_events is not modified.
sampled_capture_events = list(sampled_empty_capture_events) + list(species_capture_events)

### Split train/val_dev/val

In [20]:
# Share of the sampled capture events assigned to each split.
train_percent = 0.9
val_dev_percent = 0.05
val_percent = 0.05

In [21]:
# Draw the training events: a random subset holding the `train_percent`
# share (rounded down) of all sampled capture events.
train_capture_events = random.sample(
    sampled_capture_events,
    k=int(train_percent*len(sampled_capture_events)))

In [22]:
# Everything not drawn for training is divided between val_dev and val.
# Iteration order of a set of strings varies between interpreter runs (hash
# randomization), so the set difference is sorted before sampling; without
# this, random.seed(42) does not make the val_dev/val partition repeatable.
# Assumes capture ids are mutually comparable (e.g. all strings).
val_capture_events = sorted(set(sampled_capture_events) - set(train_capture_events))
# Rescale the val_dev share so it is relative to the held-out events only.
val_dev_new_percent = val_dev_percent/(val_dev_percent+val_percent)
val_dev_capture_events = random.sample(val_capture_events, int(val_dev_new_percent*len(val_capture_events)))
# Whatever was not picked for val_dev becomes val (sorted for a stable order).
val_capture_events = sorted(set(val_capture_events) - set(val_dev_capture_events))

In [23]:
# Number of capture events in the train split.
len(train_capture_events)

557283

In [24]:
# Number of capture events in the val_dev split.
len(val_dev_capture_events)

30960

In [25]:
# Number of capture events in the val split.
len(val_capture_events)

30961

##### Mark split

In [26]:
# Set versions of the two validation lists; mark_split tests capture_id membership against them.
val_dev = set(val_dev_capture_events)
val = set(val_capture_events)

def mark_split(row):
    """Return the split name ('val_dev', 'val' or 'train') for one row.

    The row's 'capture_id' is looked up in the module-level sets `val_dev`
    and `val`, in that order; an id found in neither is assigned to 'train'.
    """
    capture_id = row['capture_id']
    if capture_id in val_dev:
        return 'val_dev'
    if capture_id in val:
        return 'val'
    return 'train'

In [27]:
# Rows (images) that belong to any of the sampled capture events.
belongs_to_sample = serengeti_images_labeled['capture_id'].isin(sampled_capture_events)
instances = serengeti_images_labeled.loc[belongs_to_sample].copy()

In [28]:
# Label every row with its split by applying mark_split row by row.
instances = instances.assign(split=instances.apply(mark_split, axis=1))

In [29]:
# Row counts per species label (rows) and split (columns).
pd.crosstab(index=instances['question__species'], columns=instances['split'])

split,train,val,val_dev
question__species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aardvark,510,33,25
aardwolf,285,10,17
baboon,4038,212,249
batEaredFox,682,40,31
blank,757548,42004,41933
buffalo,30428,1552,1674
bushbuck,319,16,18
caracal,168,9,6
cheetah,3109,150,143
civet,61,5,6


#### Binarize labels

In [30]:
def binarize_categories(row):
    """Map a row to a binary label: 0 if 'question__species' is 'blank', else 1."""
    is_blank = row['question__species'] == 'blank'
    return 0 if is_blank else 1

In [31]:
# Keep only the columns needed for export, then add the binary label
# produced by binarize_categories as a 'category' column.
instances = (
    instances
    .loc[:, ['image_path_rel', 'question__species', 'split']]
    .assign(category=lambda frame: frame.apply(binarize_categories, axis=1))
)

In [32]:
# Row counts per binary category (rows) and split (columns).
pd.crosstab(index=instances['category'], columns=instances['split'])

split,train,val,val_dev
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,757548,42004,41933
1,735443,41084,41078


### Save csv files:

In [33]:
def save_split(data, split, col_file_name, col_category, file_patern):
    """Write the rows of one split to a two-column CSV file.

    Args:
        data: DataFrame with a 'split' column plus the two source columns.
        split: Split name to export; also substituted into `file_patern`.
        col_file_name: Column whose values become the output 'file_name' column.
        col_category: Column whose values become the output 'category' column.
        file_patern: %-style pattern with one placeholder for the split name.

    The CSV is written without the index; `data` itself is left unmodified.
    """
    rows_in_split = data[data.split == split]
    export = rows_in_split[[col_file_name, col_category]].copy()
    export.columns = ['file_name', 'category']
    export.to_csv(file_patern % split, index=False)

In [34]:
# Export the binary 'category' labels, one CSV per split.
for split_name in ('train', 'val_dev', 'val'):
    save_split(instances, split_name, 'image_path_rel', 'category', '../data/ss_event_%s_empty.csv')

In [35]:
# Export the species labels, one CSV per split.
for split_name in ('train', 'val_dev', 'val'):
    save_split(instances, split_name, 'image_path_rel', 'question__species', '../data/ss_event_%s_species.csv')