In [1]:
import os
import random
import json
import pandas as pd

#### Loading Snapshot Serengeti data

In [2]:
# Parse the split-definition JSON; its ['splits']['train'] entry is used
# further down to decide which sites belong to the training set.
with open('../data/SnapshotSerengetiSplits_v0.json') as splits_fh:
    splits_text = splits_fh.read()
recommend_train_val_splits = json.loads(splits_text)

In [3]:
# Only these four annotation columns are used by the rest of the notebook,
# so ask the parser to read just those instead of loading every column and
# discarding the rest afterwards.
annotation_columns = ['capture_id', 'season', 'site', 'question__species']
serengeti_annotations = pd.read_csv(
    '../data/SnapshotSerengeti_v2_1_annotations.csv',
    usecols=annotation_columns,
)
# usecols keeps the file's column order; re-select to pin the order used here.
serengeti_annotations = serengeti_annotations[annotation_columns].copy()

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# Load the per-image table and discard the 'Unnamed: 0' column.
serengeti_images = (
    pd.read_csv('../data/SnapshotSerengeti_v2_1_images.csv')
    .drop(columns=['Unnamed: 0'])
)

In [5]:
# Attach the capture-level annotations to every image of that capture.
# The outer join keeps rows that have no counterpart on the other side.
serengeti_images_labeled = serengeti_images.merge(
    serengeti_annotations,
    how='outer',
    on='capture_id',
)

We will only use seasons 1-6:

In [6]:
# Restrict the data to seasons S1 through S6.
seasons_kept = ['S1', 'S2', 'S3', 'S4', 'S5', 'S6']
in_kept_seasons = serengeti_images_labeled['season'].isin(seasons_kept)
serengeti_images_labeled = serengeti_images_labeled.loc[in_kept_seasons].copy()

#### Remove images with more than one species identified

In [7]:
# An image path that appears on more than one row carries more than one
# annotation; collect the distinct paths for which that happens.
repeated_path_mask = serengeti_images_labeled.duplicated(
    subset=['image_path_rel'], keep=False)
non_single_spc_instances = (
    serengeti_images_labeled.loc[repeated_path_mask, 'image_path_rel'].unique()
)

In [8]:
# Drop every row whose image path was flagged as having multiple annotations.
is_multi_species = serengeti_images_labeled['image_path_rel'].isin(
    non_single_spc_instances)
serengeti_images_labeled = serengeti_images_labeled.loc[~is_multi_species].copy()

### Split by site:

#### Mark train/val images:

In [9]:
# Sites held out as the 'val_dev' split. The list is frozen here; the
# commented-out line below shows a random draw of 23 sites from the
# recommended training sites.
val_dev = [
    'D09', 'J06', 'C12', 'B12', 'F11', 'C08',
    'E07', 'O09', 'Q07', 'C13', 'E04', 'I06',
    'D10', 'I08', 'M11', 'F02', 'D06', 'G09',
    'N03', 'E10', 'J09', 'H13', 'T13',
]
#val_dev = random.sample(recommend_train_val_splits['splits']['train'], 23)

In [10]:
# Work on a copy so the site-based 'split' column is not added to
# serengeti_images_labeled, which is reused later for the time-based split.
serengeti_images_labeled_split = serengeti_images_labeled.copy()

In [11]:
def mark_split(row):
    """Return the site-based split label for a single row.

    Reads row['site'] and checks it against two module-level collections:
    `val_dev` first, then recommend_train_val_splits['splits']['train'].

    Returns:
        'val_dev' if the site is in `val_dev`, 'train' if it is in the
        recommended training sites, otherwise 'val'.
    """
    site = row['site']
    # val_dev is tested first, so it takes precedence over the train list.
    if site in val_dev:
        return 'val_dev'
    if site in recommend_train_val_splits['splits']['train']:
        return 'train'
    return 'val'

In [12]:
# Label every row with its site-based split by calling mark_split row by row.
serengeti_images_labeled_split['split'] = serengeti_images_labeled_split.apply(mark_split, axis=1)

In [13]:
# Row counts per species label (rows) and site-based split (columns).
pd.crosstab(serengeti_images_labeled_split.question__species, serengeti_images_labeled_split.split)

split,train,val,val_dev
question__species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aardvark,352,162,54
aardwolf,222,75,15
baboon,3734,500,265
batEaredFox,543,168,42
blank,1522420,535832,278594
buffalo,24531,6707,2417
bushbuck,302,37,14
caracal,130,52,1
cheetah,2284,815,303
civet,57,12,3


Select instances:

In [14]:
# Display the labeled frame, now including the 'split' column.
serengeti_images_labeled_split

Unnamed: 0,capture_id,image_rank_in_capture,image_path_rel,season,site,question__species,split
0,SER_S1#B04#1#1,1,S1/B04/B04_R1/S1_B04_R1_PICT0001.JPG,S1,B04,human,train
1,SER_S1#B04#1#2,1,S1/B04/B04_R1/S1_B04_R1_PICT0002.JPG,S1,B04,human,train
2,SER_S1#B04#1#3,1,S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG,S1,B04,blank,train
3,SER_S1#B04#1#4,1,S1/B04/B04_R1/S1_B04_R1_PICT0004.JPG,S1,B04,blank,train
4,SER_S1#B04#1#5,1,S1/B04/B04_R1/S1_B04_R1_PICT0005.JPG,S1,B04,blank,train
...,...,...,...,...,...,...,...
3242474,SER_S6#V10#2#33,2,S6/V10/V10_R2/S6_V10_R2_IMAG0096.JPG,S6,V10,blank,train
3242475,SER_S6#V10#2#33,3,S6/V10/V10_R2/S6_V10_R2_IMAG0097.JPG,S6,V10,blank,train
3242476,SER_S6#V10#2#34,1,S6/V10/V10_R2/S6_V10_R2_IMAG0098.JPG,S6,V10,gazelleGrants,train
3242477,SER_S6#V10#2#34,2,S6/V10/V10_R2/S6_V10_R2_IMAG0099.JPG,S6,V10,gazelleGrants,train


In [15]:
def binarize_categories(row):
    """Collapse the species label of a row into a binary category.

    Returns:
        0 when row['question__species'] equals 'blank', 1 for any other value.
    """
    return 0 if row['question__species'] == 'blank' else 1

In [16]:
# Keep only the columns needed to export the site-based split.
instance_columns = ['image_path_rel', 'question__species', 'split']
instances = serengeti_images_labeled_split.loc[:, instance_columns].copy()

In [17]:
# Binary target: 0 for 'blank' images, 1 for any other label. This is the
# same rule as binarize_categories, evaluated as one vectorized comparison
# instead of a Python call per row.
instances['category'] = (instances['question__species'] != 'blank').astype('int64')

In [18]:
# Row counts per binary category (0 = 'blank', 1 = anything else) and split.
pd.crosstab(instances.category, instances.split)

split,train,val,val_dev
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1522420,535832,278594
1,524804,209185,84537


Verify that the resized image files are present on disk, and keep only those that are:

In [19]:
# Root directory of the image files checked below (machine-specific absolute path).
# The trailing '/' matters: relative paths are appended by plain string concatenation.
ss_path = '/data/fagner/coruja/datasets/serengeti/serengeti_600x1024/'

In [20]:
# Relative paths of the images that actually exist under ss_path.
# Iterating the column directly avoids iterrows(), which builds a full
# Series object for every row just to read a single field.
all_images_download = [
    rel_path
    for rel_path in instances['image_path_rel']
    if os.path.isfile(ss_path + rel_path)
]

In [21]:
# Number of listed images found on disk.
len(all_images_download)

3154176

In [22]:
# Number of rows before the missing files are filtered out in the next cell.
len(instances)

3155372

In [23]:
# Keep only the rows whose image file was found on disk.
file_exists = instances['image_path_rel'].isin(all_images_download)
instances = instances.loc[file_exists].copy()

### Saving csv files

In [24]:
def save_split(data, split, col_file_name, col_category, file_patern):
    """Write one split of `data` to a two-column CSV file.

    Args:
        data: DataFrame with a 'split' column plus the two source columns.
        split: Value of the 'split' column selecting the rows to export;
            also substituted into `file_patern`.
        col_file_name: Column copied into the output 'file_name' column.
        col_category: Column copied into the output 'category' column.
        file_patern: %-style pattern for the output path; formatted with `split`.

    The CSV has exactly the columns 'file_name' and 'category' and no index
    column. `data` itself is not modified.
    """
    rows = data.loc[data['split'] == split]
    export = rows.assign(
        file_name=rows[col_file_name],
        # Resolved after 'file_name' is set, i.e. in the same order as two
        # consecutive column assignments.
        category=lambda frame: frame[col_category],
    )
    target_path = file_patern % split
    export.loc[:, ['file_name', 'category']].to_csv(target_path, index=False)

In [25]:
# Export the binary 'category' labels for each site-based split.
for split_name in ('train', 'val_dev', 'val'):
    save_split(instances, split_name, 'image_path_rel', 'category', '../data/ss_%s_empty.csv')

In [26]:
# Export the species labels for each site-based split.
for split_name in ('train', 'val_dev', 'val'):
    save_split(instances, split_name, 'image_path_rel', 'question__species', '../data/ss_%s_species.csv')

Balancing classes for empty/nonempty model:

In [27]:
# Down-sample the empty (category 0) training rows to the number of
# non-empty (category 1) training rows. The target size is counted from
# `instances` as it is now: rows whose image file was missing have already
# been dropped, so a count copied from the earlier crosstab may be too high.
train_rows = instances['split'] == 'train'
n_train_nonempty = int((train_rows & (instances['category'] == 1)).sum())
train_empty_sample = (
    instances[train_rows & (instances['category'] == 0)]
    # Fixed seed so that re-running the notebook draws the same subset.
    .sample(n=n_train_nonempty, random_state=42)
    .copy()
)

In [28]:
# Balanced training set: the sampled empty rows followed by all non-empty rows.
train_nonempty = instances[(instances.split == 'train') & (instances.category == 1)]
instances_bal = pd.concat([train_empty_sample, train_nonempty])

In [29]:
# Write the balanced training subset; save_split exports the rows whose split is 'train'.
save_split(instances_bal, 'train', 'image_path_rel', 'category', '../data/ss_%s_empty_bal.csv')

### Split by time:

In [30]:
# Start the time-based split from a fresh copy of the cleaned, labeled frame.
serengeti_images_labeled_split_time = serengeti_images_labeled.copy()

In [31]:
def mark_time_split(row):
    """Return the season-based split label for a single row.

    Returns:
        'val' when row['season'] is 'S6', 'val_dev' when it is 'S5',
        and 'train' for every other value.
    """
    held_out_seasons = {'S6': 'val', 'S5': 'val_dev'}
    return held_out_seasons.get(row['season'], 'train')

In [32]:
# Label every row with its season-based split by calling mark_time_split row by row.
serengeti_images_labeled_split_time['split'] = serengeti_images_labeled_split_time.apply(mark_time_split, axis=1)

In [33]:
# Display the labeled frame with the season-based 'split' column.
serengeti_images_labeled_split_time

Unnamed: 0,capture_id,image_rank_in_capture,image_path_rel,season,site,question__species,split
0,SER_S1#B04#1#1,1,S1/B04/B04_R1/S1_B04_R1_PICT0001.JPG,S1,B04,human,train
1,SER_S1#B04#1#2,1,S1/B04/B04_R1/S1_B04_R1_PICT0002.JPG,S1,B04,human,train
2,SER_S1#B04#1#3,1,S1/B04/B04_R1/S1_B04_R1_PICT0003.JPG,S1,B04,blank,train
3,SER_S1#B04#1#4,1,S1/B04/B04_R1/S1_B04_R1_PICT0004.JPG,S1,B04,blank,train
4,SER_S1#B04#1#5,1,S1/B04/B04_R1/S1_B04_R1_PICT0005.JPG,S1,B04,blank,train
...,...,...,...,...,...,...,...
3242474,SER_S6#V10#2#33,2,S6/V10/V10_R2/S6_V10_R2_IMAG0096.JPG,S6,V10,blank,val
3242475,SER_S6#V10#2#33,3,S6/V10/V10_R2/S6_V10_R2_IMAG0097.JPG,S6,V10,blank,val
3242476,SER_S6#V10#2#34,1,S6/V10/V10_R2/S6_V10_R2_IMAG0098.JPG,S6,V10,gazelleGrants,val
3242477,SER_S6#V10#2#34,2,S6/V10/V10_R2/S6_V10_R2_IMAG0099.JPG,S6,V10,gazelleGrants,val


In [34]:
# Row counts per species label (rows) and season-based split (columns).
pd.crosstab(serengeti_images_labeled_split_time.question__species, serengeti_images_labeled_split_time.split)

split,train,val,val_dev
question__species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
aardvark,411,47,110
aardwolf,251,19,42
baboon,2291,1029,1179
batEaredFox,690,35,28
blank,1364312,383985,588549
buffalo,19734,5298,8623
bushbuck,231,48,74
caracal,130,45,8
cheetah,2242,132,1028
civet,60,6,6


In [35]:
def binarize_categories(row):
    """Map the species label of a row to a binary category.

    Returns:
        0 when row['question__species'] equals 'blank', 1 for any other value.
    """
    is_blank = row['question__species'] == 'blank'
    if is_blank:
        return 0
    return 1

In [36]:
# Keep only the columns needed to export the season-based split.
instance_columns = ['image_path_rel', 'question__species', 'split']
instances = serengeti_images_labeled_split_time.loc[:, instance_columns].copy()

In [37]:
# Binary target: 0 for 'blank' images, 1 for any other label. This is the
# same rule as binarize_categories, evaluated as one vectorized comparison
# instead of a Python call per row.
instances['category'] = (instances['question__species'] != 'blank').astype('int64')

In [38]:
# Row counts per binary category (0 = 'blank', 1 = anything else) and season-based split.
pd.crosstab(instances.category, instances.split)

split,train,val,val_dev
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,1364312,383985,588549
1,516635,75332,226559


In [39]:
# Image root directory, the same value assigned to ss_path in the site-split section.
# The trailing '/' matters: relative paths are appended by plain string concatenation.
ss_path = '/data/fagner/coruja/datasets/serengeti/serengeti_600x1024/'

In [40]:
# Relative paths of the images that actually exist under ss_path.
# Iterating the column directly avoids iterrows(), which builds a full
# Series object for every row just to read a single field.
all_images_download = [
    rel_path
    for rel_path in instances['image_path_rel']
    if os.path.isfile(ss_path + rel_path)
]

In [41]:
# Keep only the rows whose image file was found on disk.
file_exists = instances['image_path_rel'].isin(all_images_download)
instances = instances.loc[file_exists].copy()

In [42]:
# Number of rows left after dropping images that are missing on disk.
len(instances)

3154176

In [43]:
# Export the binary 'category' labels for each season-based split.
for split_name in ('train', 'val_dev', 'val'):
    save_split(instances, split_name, 'image_path_rel', 'category', '../data/ss_time_%s_empty.csv')

In [44]:
# Export the species labels for each season-based split.
for split_name in ('train', 'val_dev', 'val'):
    save_split(instances, split_name, 'image_path_rel', 'question__species', '../data/ss_time_%s_species.csv')

In [45]:
# Down-sample the empty (category 0) training rows to the number of
# non-empty (category 1) training rows. The target size is counted from
# `instances` as it is now: rows whose image file was missing have already
# been dropped, so a count copied from the earlier crosstab may be too high.
train_rows = instances['split'] == 'train'
n_train_nonempty = int((train_rows & (instances['category'] == 1)).sum())
train_empty_sample = (
    instances[train_rows & (instances['category'] == 0)]
    # Fixed seed so that re-running the notebook draws the same subset.
    .sample(n=n_train_nonempty, random_state=42)
    .copy()
)

In [46]:
# Balanced training set: the sampled empty rows followed by all non-empty rows.
train_nonempty = instances[(instances.split == 'train') & (instances.category == 1)]
instances_bal = pd.concat([train_empty_sample, train_nonempty])

In [47]:
# Write the balanced training subset; save_split exports the rows whose split is 'train'.
save_split(instances_bal, 'train', 'image_path_rel', 'category', '../data/ss_time_%s_empty_bal.csv')