In [None]:
# Copyright 2021 Fagner Cunha
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

In [1]:
import json
import random
import numpy as np
import pandas as pd

random.seed(42)
np.random.seed(42)

#### Loading Snapshot Serengeti data

In [2]:
def add_location(row):
    """Return the second '/'-separated component of ``row['file_name']``.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a ``file_name``
            string entry.

    Returns:
        The path component at index 1 of ``file_name``.

    Raises:
        IndexError: If ``file_name`` contains no '/'.
    """
    # Only the component at index 1 is needed, so two splits are enough.
    path_parts = row['file_name'].split('/', 2)
    return path_parts[1]

In [3]:
# Load both Snapshot Serengeti CSVs and derive each row's `location` column
# from its `file_name` via add_location.
serengeti_site_test = pd.read_csv(
    '/data/fagner/training/data/ss_val_empty.csv'
).assign(location=lambda frame: frame.apply(add_location, axis=1))

serengeti_time_test = pd.read_csv(
    '/data/fagner/training/data/ss_time_val_empty.csv'
).assign(location=lambda frame: frame.apply(add_location, axis=1))

#### Sampling 5000 instances from the test set
* Must include at least one empty and one nonempty image for each location
* Must keep the binary class proportion

In [4]:
def get_instance_per_location(df):
    """Pick one random image per (location, class) pair that has any rows.

    Locations are visited in order of first appearance in ``df.location``.
    At each location one row with ``category == 0`` (empty) is drawn first,
    then one row with ``category == 1`` (nonempty); a class with no rows at
    that location is skipped.

    Args:
        df: DataFrame with ``location``, ``category`` and ``file_name``
            columns.

    Returns:
        A tuple ``(selected_instances, empty_count, nonempty_count)``: the
        list of drawn ``file_name`` values, followed by how many of them
        came from category 0 and from category 1.
    """
    picked_files = []
    picks_per_category = {0: 0, 1: 0}

    for site in df.location.unique():
        at_site = df.location == site
        # Category 0 before 1 so the random draws happen in a fixed order.
        for label in (0, 1):
            candidates = df[at_site & (df.category == label)]
            if len(candidates) == 0:
                continue
            picked_files.append(candidates.sample(1)['file_name'].values[0])
            picks_per_category[label] += 1

    return picked_files, picks_per_category[0], picks_per_category[1]

In [5]:
def sample_instances(df, category, num_samples):
    """Randomly draw ``num_samples`` file names from a single class.

    Args:
        df: DataFrame with ``category`` and ``file_name`` columns.
        category: Class value whose rows are eligible for sampling.
        num_samples: Number of rows to draw (without replacement).

    Returns:
        List with the ``file_name`` value of every drawn row.
    """
    in_class = df.category == category
    drawn = df.loc[in_class].sample(n=num_samples)
    return list(drawn['file_name'])

In [6]:
def sample_testset(df, num_samples):
    """Draw a subset of ``df`` that keeps the class ratio and covers locations.

    The quota of empty (``category == 0``) rows is ``num_samples`` times the
    fraction of empty rows in ``df``, truncated to an integer; the nonempty
    quota is the remainder. One row per (location, class) pair is reserved
    first via ``get_instance_per_location``, and each quota is then topped up
    with random rows drawn from the rest of ``df``.

    Args:
        df: DataFrame with ``file_name``, ``category`` and ``location``
            columns.
        num_samples: Total number of rows wanted in the subset.

    Returns:
        A copy of the rows of ``df`` whose ``file_name`` was selected. Rows
        are matched by ``file_name``, so the result has ``num_samples`` rows
        only if file names are unique in ``df``.

    Raises:
        ValueError: If ``df`` is empty. pandas' ``sample`` also raises
            ``ValueError`` when a class quota is smaller than the number of
            rows already reserved for it, or larger than the rows available.
    """
    if len(df) == 0:
        raise ValueError('cannot sample from an empty DataFrame')

    # Count empties with a boolean mask instead of `value_counts()[0]`, which
    # raises KeyError when no row has category 0 and depends on integer
    # label lookup on the resulting Series.
    total_empty = int((df.category == 0).sum())
    num_empty_samples = int(num_samples * total_empty / len(df))
    num_nonempty_samples = num_samples - num_empty_samples

    sel_instances, empty_count, nonempty_count = get_instance_per_location(df)

    # Rows already reserved per location must not be drawn a second time.
    remaining = df[~df.file_name.isin(sel_instances)]

    empty_samples = sample_instances(remaining,
                                     category=0,
                                     num_samples=(num_empty_samples - empty_count))

    nonempty_samples = sample_instances(remaining,
                                        category=1,
                                        num_samples=(num_nonempty_samples - nonempty_count))

    chosen_files = sel_instances + empty_samples + nonempty_samples

    return df[df.file_name.isin(chosen_files)].copy()

In [7]:
# Number of rows requested for each reduced split.
NUM_SAMPLES = 5_000

In [8]:
# Draw the reduced subset independently for each of the two loaded splits.
selected_instances_site = sample_testset(df=serengeti_site_test,
                                         num_samples=NUM_SAMPLES)
selected_instances_time = sample_testset(df=serengeti_time_test,
                                         num_samples=NUM_SAMPLES)

#### Saving the test_small splits

In [9]:
def save_to_csv(df, file_name):
    """Write the four exported columns of ``df`` to a CSV without the index.

    Args:
        df: DataFrame holding at least the ``file_name``, ``category``,
            ``split`` and ``location`` columns; any other column is dropped.
        file_name: Destination passed straight to ``DataFrame.to_csv``.
    """
    exported_columns = ['file_name', 'category', 'split', 'location']
    df.loc[:, exported_columns].to_csv(file_name, index=False)

In [10]:
# Tag every row of both reduced subsets with the same split label.
selected_instances_site['split'] = selected_instances_time['split'] = 'val_small'

In [11]:
# Write each reduced subset to its own CSV file.
save_to_csv(df=selected_instances_site,
            file_name='/data/fagner/training/data/ss_val_small_empty.csv')
save_to_csv(df=selected_instances_time,
            file_name='/data/fagner/training/data/ss_time_val_small_empty.csv')