In [59]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import os

import torch
from torch.utils.data.sampler import WeightedRandomSampler
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset, DataLoader

In [30]:

# Root of the Herbarium 2022 dataset and the image folders beneath it.
path = '../dataset/herbarium_2022/'
train_dir = path + 'train_images/'
test_dir = path + 'test_images/'

# JSON is UTF-8 by specification; pass the encoding explicitly so that parsing
# does not depend on the platform's default locale encoding.
with open(path + "train_metadata.json", encoding="utf-8") as json_file:
    train_meta = json.load(json_file)
with open(path + "test_metadata.json", encoding="utf-8") as json_file:
    test_meta = json.load(json_file)


In [31]:
# List the top-level sections of the parsed training metadata.
train_meta.keys()

dict_keys(['annotations', 'images', 'categories', 'genera', 'institutions', 'distances', 'license'])

In [32]:
# Peek at the first two annotation records to see which fields they carry.
train_meta['annotations'][:2]

[{'genus_id': 1,
  'institution_id': 0,
  'category_id': 0,
  'image_id': '00000__001'},
 {'genus_id': 1,
  'institution_id': 0,
  'category_id': 0,
  'image_id': '00000__002'}]

In [33]:
# Peek at the first two entries of the 'distances' section.
train_meta['distances'][:2]

[{'genus_id_x': 2508, 'genus_id_y': 2576, 'distance': 0.0325632},
 {'genus_id_x': 2508, 'genus_id_y': 2518, 'distance': 0.0254073}]

In [34]:
# Pair every annotation with its image through the shared 'image_id' key rather
# than by list position, so labels stay attached to the right file even if the
# two metadata lists are ordered differently. An annotation that references an
# unknown image raises KeyError instead of silently mislabelling rows.
file_name_by_id = {image['image_id']: image['file_name'] for image in train_meta['images']}
train_annotations = train_meta['annotations']

image_ids = [annotation['image_id'] for annotation in train_annotations]
image_dirs = [train_dir + file_name_by_id[image_id] for image_id in image_ids]
category_ids = [annotation['category_id'] for annotation in train_annotations]
genus_ids = [annotation['genus_id'] for annotation in train_annotations]
institutions_ids = [annotation['institution_id'] for annotation in train_annotations]

# Fail loudly if some image has no annotation at all; otherwise it would be
# dropped from the training frame without any notice.
unannotated = file_name_by_id.keys() - set(image_ids)
if unannotated:
    raise ValueError(f"{len(unannotated)} training images have no annotation")

# test_meta is iterated directly as a list of image records (no labels).
test_ids = [image['image_id'] for image in test_meta]
test_dirs = [test_dir + image['file_name'] for image in test_meta]

# One row per annotated training image.
train_df = pd.DataFrame({
    "image_id" : image_ids,
    "image_dir" : image_dirs,
    "category" : category_ids,
    "genus" : genus_ids,
    "institutions" : institutions_ids})

# One row per test image.
test_df = pd.DataFrame({
    "test_id" : test_ids,
    "test_dir" : test_dirs
})

In [35]:
# Preview the first rows of the freshly built training frame.
train_df.head()

Unnamed: 0,image_id,image_dir,category,genus,institutions
0,00000__001,../dataset/herbarium_2022/train_images/000/00/...,0,1,0
1,00000__002,../dataset/herbarium_2022/train_images/000/00/...,0,1,0
2,00000__003,../dataset/herbarium_2022/train_images/000/00/...,0,1,0
3,00000__004,../dataset/herbarium_2022/train_images/000/00/...,0,1,0
4,00000__005,../dataset/herbarium_2022/train_images/000/00/...,0,1,0


In [36]:
# Lookup from numeric genus id to genus name, taken from the metadata.
genus_map = {genus['genus_id'] : genus['genus'] for genus in train_meta['genera']}
# Derive the names from the raw id list instead of from the column itself:
# mapping the column in place turns every value into NaN when this cell is run
# a second time, because the genus names are not keys of genus_map.
# The assignment aligns on the row index, which for train_df is 0..n-1.
train_df['genus'] = pd.Series(genus_ids).map(genus_map)
train_df.tail()

Unnamed: 0,image_id,image_dir,category,genus,institutions
839767,15504__032,../dataset/herbarium_2022/train_images/155/04/...,15504,Zygophyllum,55
839768,15504__033,../dataset/herbarium_2022/train_images/155/04/...,15504,Zygophyllum,53
839769,15504__035,../dataset/herbarium_2022/train_images/155/04/...,15504,Zygophyllum,13
839770,15504__036,../dataset/herbarium_2022/train_images/155/04/...,15504,Zygophyllum,13
839771,15504__037,../dataset/herbarium_2022/train_images/155/04/...,15504,Zygophyllum,13


In [37]:
# Number of distinct genera, categories and institutions in the training frame.
print(train_df['genus'].nunique())
print(train_df['category'].nunique())
print(train_df['institutions'].nunique())

2564
15501
60


In [38]:
# The counts below are taken over the 'category' column, so the heading names
# categories (it previously said "Genus", which did not match the data shown).
print('Top 15 Categories')
print(train_df['category'].value_counts().head(15))

Top 15 Genus 
2774     80
8854     80
12540    80
10805    80
12530    80
1146     80
4661     80
1145     80
12524    80
2876     80
8800     80
8794     80
4641     80
2889     80
12515    80
Name: category, dtype: int64


In [57]:
# Re-display the head now that the 'genus' column holds names instead of ids.
train_df.head()

Unnamed: 0,image_id,image_dir,category,genus,institutions
0,00000__001,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
1,00000__002,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
2,00000__003,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
3,00000__004,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
4,00000__005,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0


In [60]:
def stratified_split_train_val(df, target_variable_name, test_size=0.2, random_state=None):
    '''
    Split ``df`` into a training part and a validation part, stratified on one column.

    Superseded by ``balance_val_split``, which returns subsets of the original
    dataset instead of subsetted pandas DataFrames.

    Args:
        df: DataFrame to split.
        target_variable_name: name of the column whose values drive the stratification.
        test_size: forwarded to ``train_test_split`` as the validation share.
        random_state: forwarded to ``train_test_split``; ``None`` leaves the
            shuffle unseeded.

    Returns:
        Tuple ``(df_train, df_val, train_idx, val_idx)`` where the index arrays
        hold the positional row numbers selected for each part.
    '''
    labels = df[target_variable_name].tolist()
    row_positions = np.arange(len(labels))

    # Split positions rather than rows so the chosen indices can be returned too.
    train_idx, val_idx = train_test_split(
        row_positions,
        test_size=test_size,
        shuffle=True,
        stratify=labels,
        random_state=random_state,
    )

    return df.iloc[train_idx], df.iloc[val_idx], train_idx, val_idx

In [62]:
# Another look at the frame right before building the train/validation split.
train_df.head()

Unnamed: 0,image_id,image_dir,category,genus,institutions
0,00000__001,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
1,00000__002,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
2,00000__003,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
3,00000__004,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0
4,00000__005,../dataset/herbarium_2022/train_images/000/00/...,0,Abies,0


In [65]:
# Fixed seed so the train/validation partition is identical on every run.
SPLIT_SEED = 42

# Stratify on the 'category' label so both parts keep the same class proportions.
targets = train_df['category']
# train_idx / val_idx are integer row positions: select rows with
# train_df.iloc[train_idx], not train_df[train_idx]. The latter looks the values
# up as column labels, which is presumably what produced the KeyError recorded
# under this cell.
train_idx, val_idx = train_test_split(
                       np.arange(len(train_df)),
                       test_size=0.25,
                       shuffle=True,
                       stratify=targets,
                       random_state=SPLIT_SEED)

KeyError: "None of [Int64Index([504160,  98113, 490231, 378009, 475749, 666755, 559529, 803861,\n            601639, 816016,\n            ...\n             71477, 215289, 243714, 255944, 653411,  49100, 564687,  14382,\n            822062, 384551],\n           dtype='int64', length=629829)] are in the [columns]"