Just follow this process for the datasets you imported in wildlife_datasets.datasets.py

In [1]:
from prepare_data import *

In [2]:
# Registry of the dataset variants handled by this notebook. Each key is a
# dataset name (also used below as its folder name) and each value is the
# preparation function to call for it.
prepare_functions = dict([
    ('FeralCatsAkl_maxim', prepare_feralcatsakl_maxim),
    ('FeralCatsAkl_HIDiff', prepare_feralcatsakl_hidiff),
    ('FeralCatsAkl_SRMNet', prepare_feralcatsakl_srmnet),
])

In [2]:
# !pip install pycocotools

# 1. Process the datasets
**Processing includes:**
- Resize images
- Crop bounding boxes
- Crop black background of segmented images
- If multiple identities exist in one image, we crop them and split them into two images.


**We save two sets of images:**
- For inference with images resized to 518x518: CLIP, DINOv2, and MegaDescriptor-L-384
- For inference with images resized to 256x256: MegaDescriptor-T-224, MegaDescriptor-S-224, MegaDescriptor-B-224, MegaDescriptor-L-224


**Note**: Stored images were further transformed (e.g. resized to 224x224) depending on the model during inference. Inference with smaller models using the stored 518x518 images is possible, but it gives slightly different results than in the paper.

In [3]:
# Input folder for the base dataset; in this notebook it is only referenced by the
# commented-out prepare_feralcatsakl_base call. Presumably holds the cropped source
# images -- confirm against prepare_data.
base_dataset_dir = "../../data/processed/cropped"
# Output root for the size-256 image sets; each dataset gets its own sub-folder
# below it. Relative paths assume the notebook is run from its own directory.
new_root = "../data/images/size-256"

In [4]:
# prepare_feralcatsakl_base(size=256, root=base_dataset_dir, new_root=new_root+'/fca_base')

In [5]:
# Load the fca_base annotations (presumably produced by the commented-out
# prepare_feralcatsakl_base step). The path is derived from `new_root` rather
# than a machine-specific absolute path, so the notebook runs from any checkout.
fca_base_metadata_256 = pd.read_csv(f'{new_root}/fca_base/annotations.csv', index_col=0)

In [None]:
# Folder holding the downloaded datasets, one sub-folder per dataset name.
# Kept relative (like `base_dataset_dir`) instead of an absolute home-directory
# path so the notebook is not tied to a single machine.
datasets_folder = '../../data/formatted_datasets'  # Path to downloaded datasets

# Create folders with images resized to 256 (the 518 variant is currently disabled)
for name, prepare in prepare_functions.items():
    print(name)
    prepare(size=256, root=f'{datasets_folder}/{name}', new_root=f'{new_root}/{name}')
    # prepare(size=518, root=f'{datasets_folder}/{name}', new_root=f'images/size-518/{name}')

    # Metadata should be the same
    # metadata_256 = pd.read_csv(f'../data/images/size-256/{name}/annotations.csv', index_col=0)
    # metadata_518 = pd.read_csv(f'images/size-518/{name}/annotations.csv', index_col=0)
    # assert metadata_256.equals(metadata_518)

# 2. Create split metadata for each dataset
**Split datasets:**
- Closed split, images with unknown identities are discarded
- Store the metadata for each dataset as CSV.
- Test set for each dataset is used for evaluation.

In [8]:
from wildlife_datasets import splits

In [8]:
# Re-load the fca_base annotations so this cell does not rely on earlier kernel state.
# The path is built from `new_root` instead of a machine-specific absolute path.
fca_base_metadata_256 = pd.read_csv(f'{new_root}/fca_base/annotations.csv', index_col=0)

# Closed-set split with a fixed seed so the split is reproducible.
# Presumably 0.8 is the training fraction and rows whose identity is 'unknown'
# are left out of both sets -- confirm against the wildlife_datasets docs.
splitter = splits.ClosedSetSplit(0.8, identity_skip='unknown', seed=666)
idx_train, idx_test = splitter.split(fca_base_metadata_256)[0]

# NOTE(review): this treats idx_train / idx_test as positional indices into the
# frame; verify that matches the installed wildlife_datasets version.
# Rows that fall in neither set keep a missing value in 'split'.
fca_base_metadata_256.loc[fca_base_metadata_256.index[idx_train], 'split'] = 'train'
fca_base_metadata_256.loc[fca_base_metadata_256.index[idx_test], 'split'] = 'test'

# Store the annotated metadata next to the other per-dataset metadata files.
os.makedirs('../metadata/datasets/fca_base/', exist_ok=True)
fca_base_metadata_256.to_csv('../metadata/datasets/fca_base/metadata.csv')

In [9]:
# Create and store a train/test split for every dataset in `prepare_functions`.
for name in prepare_functions:
    # Read from `new_root` (where the resized images and annotations were written)
    # rather than repeating the literal path, so the two cannot drift apart.
    metadata = pd.read_csv(f'{new_root}/{name}/annotations.csv', index_col=0)

    # A fresh splitter per dataset, with the same fixed seed for reproducibility.
    splitter = splits.ClosedSetSplit(0.8, identity_skip='unknown', seed=666)
    idx_train, idx_test = splitter.split(metadata)[0]

    # NOTE(review): this treats idx_train / idx_test as positional indices into the
    # frame; verify that matches the installed wildlife_datasets version.
    # Rows that fall in neither set keep a missing value in 'split'.
    metadata.loc[metadata.index[idx_train], 'split'] = 'train'
    metadata.loc[metadata.index[idx_test], 'split'] = 'test'

    os.makedirs(f'../metadata/datasets/{name}/', exist_ok=True)
    metadata.to_csv(f'../metadata/datasets/{name}/metadata.csv')

In [5]:
# # Create dataframe with training / test set splits
# from wildlife_datasets import splits
# for name in prepare_functions:
#     metadata = pd.read_csv(f'images/size-518/{name}/annotations.csv', index_col=0)
#     splitter = splits.ClosedSetSplit(0.8, identity_skip='unknown', seed=666)
#     idx_train, idx_test = splitter.split(metadata)[0]

#     metadata.loc[metadata.index[idx_train], 'split'] = 'train'
#     metadata.loc[metadata.index[idx_test], 'split'] = 'test'

#     os.makedirs(f'metadata/datasets/{name}/', exist_ok=True)
#     metadata.to_csv(f'metadata/datasets/{name}/metadata.csv')


# 3. Create metadata for aggregated training dataset
- Combine training sets from metadata of all datasets to single aggregated metadata
- The aggregated training set is used for training MegaDescriptors.
    - Adds dataset name to identity to prevent identity name collisions
    - Adds dataset name to the image path to enable loading the aggregated dataset using `WildlifeDataset`

#### *Note:* We don't really need to aggregate any datasets in this case, since we're treating each dataset independently

Nevertheless, we follow along with the process.

In [9]:
# Collect the training rows of each dataset; only fca_base is aggregated here.
results = []
dataset_name = 'fca_base'

# Read the split metadata via a relative path (same location the split step
# writes to) instead of a machine-specific absolute path.
fca_from_metadata = pd.read_csv(f'../metadata/datasets/{dataset_name}/metadata.csv', index_col=0)

# Keep only training rows; .copy() so the column edits below do not touch the loaded frame.
df = fca_from_metadata.query("split == 'train'").copy()
df['dataset'] = dataset_name
# Prefix identity and path with the dataset name so they stay unique and
# resolvable once several datasets are concatenated.
df['identity'] = dataset_name + '_' + df['identity'].astype(str)
df['path'] = dataset_name + '/' + df['path']
results.append(df)

In [4]:
# import pandas as pd

# results = []
# for name in prepare_functions:
#     metadata = pd.read_csv(f'metadata/datasets/{name}/metadata.csv', index_col=0)

#     df = metadata.query("split == 'train'").copy()
#     df['dataset'] = name
#     df['identity'] = name + '_' + df['identity'].astype(str)
#     df['path'] = name + '/' + df['path']
#     results.append(df)

In [10]:
# Stack the per-dataset training frames into one aggregated metadata table.
combined_all = pd.concat(results)

# Write it next to the per-dataset metadata, using a relative path instead of a
# machine-specific absolute one.
combined_dir = '../metadata/combined'
os.makedirs(combined_dir, exist_ok=True)
combined_all.to_csv(f'{combined_dir}/combined_all.csv')