In [None]:
import numpy as np
import scipy.io as sio
import json
import csv
import pandas as pd

In [None]:
# Species table read from a CSV in the working directory; the first CSV
# column becomes the row index. Later cells read its `sci_name` column.
filename = 'species_list.csv'

df_species = pd.read_csv(filename, index_col=0)
    
# Sanity check: (rows, columns) of the loaded table.
print(df_species.shape)

In [None]:
# Preview the first ten rows of the species table.
df_species.head(10)

## iNaturalist 2017 classes

In [None]:
import json
import os

# Root directory of the iNaturalist data. Every annotation and split path in
# this notebook is built by string concatenation from this value.
# The historical location stays the default; set the INAT_DATAPATH environment
# variable to run the notebook against a copy stored elsewhere.
DATAPATH = os.environ.get('INAT_DATAPATH', '/scratch/data/iNaturalist')

In [None]:
# Load the iNaturalist 2017 annotation files. Later cells read the 'images'
# and 'categories' entries of the parsed dictionaries.
train_file = DATAPATH+'/2017/train_val2017/train2017.json'
val_file = DATAPATH+'/2017/train_val2017/val2017.json'
test_file = DATAPATH+'/2017/test2017/test2017.json'

# JSON text is UTF-8; decode explicitly instead of relying on the platform's
# default text encoding, which is not UTF-8 on every system.
with open(train_file, encoding='utf-8') as json_file:
    train_annotations = json.load(json_file)

with open(val_file, encoding='utf-8') as json_file:
    val_annotations = json.load(json_file)

with open(test_file, encoding='utf-8') as json_file:
    test_annotations = json.load(json_file)


In [None]:
# Show the top-level keys of the parsed 2017 training annotations.
train_annotations.keys()

In [None]:
# Names of all iNaturalist 2017 categories whose supercategory is 'Aves'.
names_inat = [
    category['name']
    for category in train_annotations['categories']
    if category['supercategory'] == 'Aves'
]

# Rows of the species table whose scientific name is one of those categories.
is_inat_species = df_species['sci_name'].isin(names_inat)
df1 = df_species.loc[is_inat_species]
print(df1.shape)
sci_names_inat = df1['sci_name'].tolist()


In [None]:
# Keep only the images whose species directory (third '/'-separated component
# of `file_name`) is one of the selected species.
# A set makes each per-image membership test O(1) instead of scanning the
# whole species list for every image.
sci_name_lookup = set(sci_names_inat)

train_subset = [x for x in train_annotations['images']
                if x['file_name'].split('/')[2] in sci_name_lookup]

val_subset = [x for x in val_annotations['images']
              if x['file_name'].split('/')[2] in sci_name_lookup]


In [None]:
# Image counts: full annotation file vs. the selected-species subset.
print('Train  complete {} subset {}'.format(len(train_annotations['images']), len(train_subset)))

print('Val  complete {} subset {}'.format(len(val_annotations['images']), len(val_subset)))

# No subset is built for the test images, so only the full count is reported
# (the previous message ended in a dangling "subset" label with no value).
print('Test  complete {}'.format(len(test_annotations['images'])))

In [None]:
# File names of the training subset with their last four characters removed
# (presumably the '.jpg' extension — TODO confirm).
train_keys = [image['file_name'][:-4] for image in train_subset]

### Create class splits

In [None]:
# Build one record per species with its taxonomy, the number of sibling
# taxa at each level, and a random seen/unseen flag.
# The taxonomy is walked order -> family -> genus -> species with groupby's
# default sorted keys, and exactly one random number is drawn per species, so
# the fixed seed reproduces the same split on every run.
np.random.seed(1)

# A species is flagged for training when its uniform draw exceeds this value.
UNSEEN_FRACTION = 0.2

results = []
for order, group in df1.groupby('order'):
    # Unique counts depend only on the enclosing group, so compute them once
    # per group instead of once per species.
    n_families = group['family_name'].drop_duplicates().shape[0]
    for family, group1 in group.groupby('family_name'):
        n_genus = group1['genus'].drop_duplicates().shape[0]
        for genus, group2 in group1.groupby('genus'):
            n_species = group2['sci_name'].drop_duplicates().shape[0]
            # Only the species name is needed; the grouped rows are unused.
            for species, _species_rows in group2.groupby('sci_name'):
                results.append({'order': order, 'n_families': n_families,
                                'family': family, 'n_genus': n_genus,
                                'genus': genus, 'n_species': n_species,
                                'sci_name': species,
                                'is_train': np.random.random() > UNSEEN_FRACTION,
                                })

In [None]:
# One row per species: taxonomy, sibling counts and the drawn `is_train` flag.
df2 = pd.DataFrame(results)
print(df2.shape)
df2.head(10)

In [None]:
# A species may be used for training only when its order has several families,
# its family several genera, and its genus several species.
index = df2[['n_families', 'n_genus', 'n_species']].gt(1).all(axis=1)

# Every other species is forced into the unseen side of the split.
df2.loc[~index, 'is_train'] = False
print(df2['is_train'].mean())

# Species that satisfy all three conditions.
df3 = df2.loc[index]

In [None]:
# Size of the table restricted to the rows selected by `index`.
print(df3.shape)

# df2.head(10)

In [None]:
# Seen (training) species: eligible rows whose flag is still True.
seen_mask = df3['is_train']
df_train = df3.loc[seen_mask]
df_train.shape

In [None]:
# Unseen species: every row of the full table whose flag is False.
unseen_mask = ~df2['is_train']
df_val = df2.loc[unseen_mask]
df_val.shape

In [None]:
# 1-hop: unseen species whose genus also occurs among the training species.
index_1hop = df_val['genus'].isin(df_train['genus'])

df_val_1hop = df_val.loc[index_1hop]

df_val_1hop.shape

In [None]:
# 2-hop: the family occurs among the training species, but the genus does not.
family_seen = df_val['family'].isin(df_train['family'])
genus_seen = df_val['genus'].isin(df_train['genus'])
index_2hop = family_seen & ~genus_seen

df_val_2hop = df_val.loc[index_2hop]

df_val_2hop.shape

In [None]:
# 3-hop: the order occurs among the training species, but the family does not.
order_seen = df_val['order'].isin(df_train['order'])
family_seen = df_val['family'].isin(df_train['family'])
index_3hop = order_seen & ~family_seen

df_val_3hop = df_val.loc[index_3hop]

df_val_3hop.shape

In [None]:
# 4-hop: not even the order occurs among the training species.
order_seen = df_val['order'].isin(df_train['order'])
index_4hop = ~order_seen

df_val_4hop = df_val.loc[index_4hop]

df_val_4hop.shape

In [None]:
# Named class splits. 'train' and 'val_seen' both map to the training species
# table; the summary cell counts 'train' on training images and every other
# entry on validation images.
val_sets = {'train':df_train,
            'val_seen':df_train,
            'val allhop':df_val,
            'val 1hop':df_val_1hop,
            'val 2hop':df_val_2hop,
            'val 3hop':df_val_3hop,
            'val 4hop':df_val_4hop,}

In [None]:
import pickle

# True: write the splits computed above to disk.
# False: replace `val_sets` with the splits stored on disk.
is_overwrite = False

file = DATAPATH+'/2017/zsl_splits/all_splits.pickle'
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open(file, 'wb' if is_overwrite else 'rb') as handle:
    if is_overwrite:
        pickle.dump(val_sets, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        val_sets = pickle.load(handle)




In [None]:

# Summarise every split: number of images, and per taxonomic level the number
# of distinct values and how many of them also occur in the training split.
df_train = val_sets['train']

list_out = []
for key, val in val_sets.items():
    # Set lookup keeps the per-image membership test O(1).
    class_names = set(val['sci_name'])

    # 'train' is counted on the training images, every other split on the
    # validation images; only the image list that is reported gets scanned.
    images = train_subset if key == 'train' else val_subset
    n_samples = sum(1 for x in images
                    if x['file_name'].split('/')[2] in class_names)

    row = {'Set': key, 'n_samples': n_samples}
    for level in ['order', 'family', 'genus', 'sci_name']:
        # Distinct values at this level, flagged by presence in the training split.
        index_ = val[level].drop_duplicates().isin(df_train[level])
        row[level + '_train'] = index_.sum()
        row[level] = index_.shape[0]
    list_out.append(row)




In [None]:
# One row per split, indexed by the split name.
df_summary_17 = pd.DataFrame(list_out)
df_summary_17 = df_summary_17.set_index('Set')


In [None]:

def use_f_2(x):
    """Format a number with thousands separators and no decimal places."""
    return format(x, ',.0f')

# Render the image and species counts as LaTeX with thousands separators.
# (Pass a path such as 'tables/inat17_splits.tex' as the first argument to
# write the table to a file instead of returning the string.)
df_summary_17[['n_samples', 'sci_name']].to_latex(formatters=[use_f_2] * 2)

In [None]:
save_dir = DATAPATH+'/2017/zsl_splits/'

# Guard flag: set to True to (re)write the class-list text files.
is_overwrite = False
if is_overwrite:
    # (table, output file) pairs, written in this order. Each file holds the
    # `sci_name` column only, without index or header.
    class_lists = [
        (df_train, "seen_classes.txt"),
        (df_val, "unseen_allhop_classes.txt"),
        (df_val_1hop, "unseen_1hop_classes.txt"),
        (df_val_2hop, "unseen_2hop_classes.txt"),
        (df_val_3hop, "unseen_3hop_classes.txt"),
        (df_val_4hop, "unseen_4hop_classes.txt"),
        (df2, "all_classes.txt"),
    ]
    for frame, out_name in class_lists:
        frame['sci_name'].to_csv(save_dir + out_name, index=False, header=False)

## iNaturalist 2021 classes

In [None]:
import json

In [None]:
# Load the iNaturalist 2021 annotation files. Later cells read the 'images'
# and 'categories' entries of the parsed dictionaries.
train_file = DATAPATH+'/2021/train.json'
train_mini_file = DATAPATH+'/2021/train_mini.json'
val_file = DATAPATH+'/2021/val.json'
test_file = DATAPATH+'/2021/public_test.json'

# JSON text is UTF-8; decode explicitly instead of relying on the platform's
# default text encoding, which is not UTF-8 on every system.
with open(train_file, encoding='utf-8') as json_file:
    train_annotations = json.load(json_file)

with open(train_mini_file, encoding='utf-8') as json_file:
    train_mini_annotations = json.load(json_file)

with open(val_file, encoding='utf-8') as json_file:
    val_annotations = json.load(json_file)

with open(test_file, encoding='utf-8') as json_file:
    test_annotations = json.load(json_file)

In [None]:
# Show the top-level keys of the parsed 2021 public-test annotations.
test_annotations.keys()

In [None]:
# Names of all iNaturalist 2021 categories whose supercategory is 'Birds'.
names_inat = [
    category['name']
    for category in val_annotations['categories']
    if category['supercategory'] == 'Birds'
]

In [None]:
# Image directory names of the same 'Birds' categories.
dirs_val = [
    category['image_dir_name']
    for category in val_annotations['categories']
    if category['supercategory'] == 'Birds'
]

In [None]:
# Number of 'Birds' categories found in the 2021 validation annotations.
len(names_inat)

In [None]:
# Columns available in the species table.
df_species.columns

In [None]:
# subseting to only species that have samples in Billow
df_illustrations = pd.read_csv('illustrations_list.txt', index_col=0)

df_merged = df_illustrations.reset_index().set_index('sci_name').join(df_species.set_index('sci_name'), how='left').reset_index().set_index('index').sort_index()
# 
df_count = df_merged.groupby('sci_name')['sample'].count()
df_count = df_count[df_count > 0]
df2 = df_count[df_count.index.isin(names_inat)]
print(df2.shape)

sci_names_inat = list(df2.index)

df1 = df_species[df_species.sci_name.isin(sci_names_inat)]
print(df1.shape)


In [None]:
def get_sciname(x):
    """Extract a two-word scientific name from an image path.

    Takes the second '/'-separated component of ``x`` (the image directory),
    splits it on '_' and joins its last two tokens with a space.

    Parameters
    ----------
    x : str
        Image path with at least two '/'-separated components.

    Returns
    -------
    str
        The last two '_'-separated tokens of the directory name, joined by a
        space (the whole name if it has fewer than two tokens).

    Raises
    ------
    IndexError
        If ``x`` contains no '/'.
    """
    # NOTE(review): assumes the directory name ends in "<genus>_<species>";
    # names with more than two trailing words would be truncated — confirm.
    directory = x.split('/')[1]
    return ' '.join(directory.split('_')[-2:])

In [None]:
# Keep only the images whose scientific name (parsed from `file_name`) is one
# of the selected species.
# A set makes each per-image membership test O(1) instead of scanning the
# whole species list for every image.
sci_name_lookup = set(sci_names_inat)

train_subset = [x for x in train_annotations['images']
                if get_sciname(x['file_name']) in sci_name_lookup]

train_mini_subset = [x for x in train_mini_annotations['images']
                     if get_sciname(x['file_name']) in sci_name_lookup]

val_subset = [x for x in val_annotations['images']
              if get_sciname(x['file_name']) in sci_name_lookup]

In [None]:
# Image counts: full annotation file vs. the selected-species subset.
# No subset is built for the test images, so only their full count is printed.
print('Train  complete {} subset {}'.format(len(train_annotations['images']), len(train_subset)))
print('Train-mini  complete {} subset {}'.format(len(train_mini_annotations['images']), len(train_mini_subset)))
print('Val  complete {} subset {}'.format(len(val_annotations['images']), len(val_subset)))
print('Test  complete {}'.format(len(test_annotations['images'])))

### Create class splits

In [None]:
# Display the species-table rows retained for the 2021 split.
df1

In [None]:
# Build one record per species with its taxonomy, the number of sibling
# taxa at each level, and a random seen/unseen flag.
# The taxonomy is walked order -> family -> genus -> species with groupby's
# default sorted keys, and exactly one random number is drawn per species, so
# the fixed seed reproduces the same split on every run.
np.random.seed(1)

# A species is flagged for training when its uniform draw exceeds this value.
UNSEEN_FRACTION = 0.2

results = []
for order, group in df1.groupby('order'):
    # Unique counts depend only on the enclosing group, so compute them once
    # per group instead of once per species.
    n_families = group['family_name'].drop_duplicates().shape[0]
    for family, group1 in group.groupby('family_name'):
        n_genus = group1['genus'].drop_duplicates().shape[0]
        for genus, group2 in group1.groupby('genus'):
            n_species = group2['sci_name'].drop_duplicates().shape[0]
            # Only the species name is needed; the grouped rows are unused.
            for species, _species_rows in group2.groupby('sci_name'):
                results.append({'order': order, 'n_families': n_families,
                                'family': family, 'n_genus': n_genus,
                                'genus': genus, 'n_species': n_species,
                                'sci_name': species,
                                'is_train': np.random.random() > UNSEEN_FRACTION,
                                })

In [None]:
# One row per species: taxonomy, sibling counts and the drawn `is_train` flag.
# This rebinds `df2`, which held a per-species count Series in an earlier cell.
df2 = pd.DataFrame(results)
print(df2.shape)
df2.head(10)

In [None]:
# A species may be used for training only when its order has several families,
# its family several genera, and its genus several species.
index = df2[['n_families', 'n_genus', 'n_species']].gt(1).all(axis=1)

# Every other species is forced into the unseen side of the split.
df2.loc[~index, 'is_train'] = False
print(df2['is_train'].mean())

# Species that satisfy all three conditions.
df3 = df2.loc[index]

In [None]:
# Size of the table restricted to the rows selected by `index`.
print(df3.shape)

# df2.head(10)

In [None]:
# Seen (training) species: eligible rows whose flag is still True.
seen_mask = df3['is_train']
df_train = df3.loc[seen_mask]
df_train.shape

In [None]:
# Unseen species: every row of the full table whose flag is False.
unseen_mask = ~df2['is_train']
df_val = df2.loc[unseen_mask]
df_val.shape

In [None]:
# 1-hop: unseen species whose genus also occurs among the training species.
index_1hop = df_val['genus'].isin(df_train['genus'])

df_val_1hop = df_val.loc[index_1hop]

df_val_1hop.shape

In [None]:
# 2-hop: the family occurs among the training species, but the genus does not.
family_seen = df_val['family'].isin(df_train['family'])
genus_seen = df_val['genus'].isin(df_train['genus'])
index_2hop = family_seen & ~genus_seen

df_val_2hop = df_val.loc[index_2hop]

df_val_2hop.shape

In [None]:
# 3-hop: the order occurs among the training species, but the family does not.
order_seen = df_val['order'].isin(df_train['order'])
family_seen = df_val['family'].isin(df_train['family'])
index_3hop = order_seen & ~family_seen

df_val_3hop = df_val.loc[index_3hop]

df_val_3hop.shape

In [None]:
# 4-hop: not even the order occurs among the training species.
order_seen = df_val['order'].isin(df_train['order'])
index_4hop = ~order_seen

df_val_4hop = df_val.loc[index_4hop]

df_val_4hop.shape

In [None]:
import pickle

# Named class splits. 'train' and 'val_seen' both map to the training species
# table; the summary cell counts them on different image lists.
val_sets = {
    'train': df_train,
    'val_seen': df_train,
    'val allhop': df_val,
    'val 1hop': df_val_1hop,
    'val 2hop': df_val_2hop,
    'val 3hop': df_val_3hop,
    'val 4hop': df_val_4hop,
}

# True: write the splits computed above to disk.
# False: replace `val_sets` with the splits stored on disk.
is_overwrite = False

file = DATAPATH+'/2021/zsl_splits/all_splits.pickle'
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open(file, 'wb' if is_overwrite else 'rb') as handle:
    if is_overwrite:
        pickle.dump(val_sets, handle, protocol=pickle.HIGHEST_PROTOCOL)
    else:
        val_sets = pickle.load(handle)


In [None]:


# Summarise every split: number of images, and per taxonomic level the number
# of distinct values and how many of them also occur in the training split.
df_train = val_sets['train']

list_out = []
for key, val in val_sets.items():
    # Set lookup keeps the per-image membership test O(1).
    class_names = set(val['sci_name'])

    # 'train' is counted on the training images, every other split on the
    # validation images; only the image list that is reported gets scanned.
    images = train_subset if key == 'train' else val_subset
    n_samples = sum(1 for x in images
                    if get_sciname(x['file_name']) in class_names)

    row = {'Set': key, 'n_samples': n_samples}
    for level in ['order', 'family', 'genus', 'sci_name']:
        # Distinct values at this level, flagged by presence in the training split.
        index_ = val[level].drop_duplicates().isin(df_train[level])
        row[level + '_train'] = index_.sum()
        row[level] = index_.shape[0]
    list_out.append(row)




In [None]:
# One row per split, indexed by the split name.
df_summary_21 = pd.DataFrame(list_out)
df_summary_21 = df_summary_21.set_index('Set')
def use_f_2(x):
    """Format a number with thousands separators and no decimal places."""
    return format(x, ',.0f')

# Render the image and species counts as LaTeX with thousands separators.
# (Pass a path such as 'tables/inat21_splits.tex' as the first argument to
# write the table to a file instead of returning the string.)
df_summary_21[['n_samples', 'sci_name']].to_latex(formatters=[use_f_2] * 2)

In [None]:
save_dir = DATAPATH+'/2021/zsl_splits/'

# Guard flag: set to True to (re)write the class-list text files.
is_overwrite = False
if is_overwrite:
    # (table, output file) pairs, written in this order. Each file holds the
    # `sci_name` column only, without index or header.
    class_lists = [
        (df_train, "seen_classes.txt"),
        (df_val, "unseen_allhop_classes.txt"),
        (df_val_1hop, "unseen_1hop_classes.txt"),
        (df_val_2hop, "unseen_2hop_classes.txt"),
        (df_val_3hop, "unseen_3hop_classes.txt"),
        (df_val_4hop, "unseen_4hop_classes.txt"),
        (df2, "all_classes.txt"),
    ]
    for frame, out_name in class_lists:
        frame['sci_name'].to_csv(save_dir + out_name, index=False, header=False)

In [None]:
# Display the 2017 split summary.
df_summary_17

In [None]:
# Tag each summary with its dataset year so the two can be combined.
df_summary_17 = df_summary_17.assign(Year=2017)
df_summary_21 = df_summary_21.assign(Year=2021)

In [None]:
# Stack the 2017 rows on top of the 2021 rows.
df_summary = pd.concat([df_summary_17, df_summary_21])

In [None]:
# Inspect the row labels and columns of the combined summary.
df_summary.index, df_summary.columns

In [None]:
# Column order of the pivoted table: both 2017 counts first, then both 2021 counts.
column_order = [
    ('n_samples', 2017),
    ('sci_name', 2017),
    ('n_samples', 2021),
    ('sci_name', 2021),
]

# Pivot the year into the columns, then move it to the outer column level.
df_out = (
    df_summary
    .reset_index()
    .set_index(['Set', 'Year'])[['n_samples', 'sci_name']]
    .unstack(1)[column_order]
    .swaplevel(axis=1)
)
df_out.stack().T

In [None]:
# Render the transposed combined summary as LaTeX; with no path given the
# LaTeX source is returned and shown as the cell output.
df_out.stack().T.to_latex(
        # 'tables/inat_splits.T.tex',
        # formatters=4*[use_f_2]
        )

