### Download full dataset
https://www.kaggle.com/datasets/veeralakrishna/200-bird-species-with-11788-images

In [None]:
# Download the dataset bundle to ./archive.zip.
# NOTE(review): this is a pre-signed URL -- its query string carries
# X-Goog-Date=20250519T102854Z and X-Goog-Expires=259200 (seconds), so it presumably
# stops working once that window has passed. If the download fails, copy a fresh link
# from the Kaggle dataset page referenced above.
!wget -O archive.zip "https://storage.googleapis.com/kaggle-data-sets/471346/883439/bundle/archive.zip?X-Goog-Algorithm=GOOG4-RSA-SHA256&X-Goog-Credential=gcp-kaggle-com%40kaggle-161607.iam.gserviceaccount.com%2F20250519%2Fauto%2Fstorage%2Fgoog4_request&X-Goog-Date=20250519T102854Z&X-Goog-Expires=259200&X-Goog-SignedHeaders=host&X-Goog-Signature=3ade82c31904ee9b6190fec9b09951a4a7c6d7b34959acdb7aef10fe64506c9899510c239127a5ead655dc85fefd211336f145229d3cff5d62dbfa4a1439a8fc4ff34f67eeac52127abe7243ed1d92a5dc1c285f2873f2f9ad667d0114eb572a6b0178403924fc5a6231acb83023f3cad8c4127c8a35166832c713904a0ac3225aacf17b74e5674316c33664ae0ec80da44ca8404fe895d686cff6b3422a5226c5ab487e54028df7e42d4a05c9fa41356920ef68498541b40c1397e23644ea9029be4e14602d90306537608a26401f89c964fff79fd2f8ad7e4abc4dd66e27bc73c537c33ba2b3f78e6ab21b6c75906aefe8720aeba98fa7d60a10d13bf0b3fa"

In [None]:
# Move the downloaded bundle into downloads/birds-200-species, unpack it, unpack the
# inner CUB_200_2011 tarball, then delete the files that are not used afterwards.
!mkdir -p downloads/birds-200-species
!mv archive.zip downloads/birds-200-species
# -q and dropping tar's -v keep the per-file listing (one line per extracted file)
# out of the notebook output; -o overwrites existing files so a re-run cannot stall
# on unzip's interactive "replace?" prompt.
!cd downloads/birds-200-species && unzip -q -o archive.zip
!cd downloads/birds-200-species && tar -zxf CUB_200_2011.tgz
# Plain files only, so -f is enough (no recursive delete needed).
!rm -f downloads/birds-200-species/archive.zip \
       downloads/birds-200-species/attributes.txt \
       downloads/birds-200-species/CUB_200_2011.tgz \
       downloads/birds-200-species/segmentations.tgz

### Split each class evenly between train and val sets

In [30]:
import os
import glob
import shutil
import cv2
import pandas as pd
import numpy as np
from tqdm import tqdm
from shared import resize_and_pad_image_cv2

In [95]:
# Split every class folder under images/ into two halves: the first half of the
# (sorted) file list goes to train/<class>/, the rest to val/<class>/. Each image is
# passed through resize_and_pad_image_cv2 before being written.
images_root = "./downloads/birds-200-species/CUB_200_2011/images"
dataset_root = os.path.dirname(images_root)  # train/ and val/ are created next to images/

# glob returns entries in arbitrary order; sorting makes the split reproducible.
for image_dir in tqdm(sorted(glob.glob(os.path.join(images_root, "*")))):
    images = sorted(glob.glob(f"{image_dir}/*"))
    idx = len(images) // 2
    cls = os.path.basename(image_dir)

    for split, subset in (("train", images[:idx]), ("val", images[idx:])):
        if not subset:
            # Do not create an empty class folder for a split that gets no images.
            continue

        split_dir = os.path.join(dataset_root, split, cls)
        os.makedirs(split_dir, exist_ok=True)

        for src in subset:
            image = cv2.imread(src)
            if image is None:
                # cv2.imread signals an unreadable/undecodable file by returning None.
                raise OSError(f"cv2.imread could not read {src!r}")

            dst = os.path.join(split_dir, os.path.basename(src))
            if not cv2.imwrite(dst, resize_and_pad_image_cv2(image)):
                # cv2.imwrite reports failure through its boolean return value.
                raise OSError(f"cv2.imwrite could not write {dst!r}")

100%|██████████| 200/200 [00:51<00:00,  3.87it/s]


### Prepare CSV for training and validation

In [96]:
class LabelEncoder:
    """Two-way lookup between class names and integer ids.

    Ids are the positions of the names in ``classes`` (0-based). If a name occurs
    more than once, its last position wins and ``id2class`` only contains the ids
    that survive in ``class2id``.

    Attributes:
        class2id: dict mapping class name -> integer id.
        id2class: dict mapping integer id -> class name (inverse of ``class2id``).
    """

    def __init__(self, classes):
        self.class2id = {}
        for index, name in enumerate(classes):
            self.class2id[name] = index

        # Invert the forward mapping; keys() and values() iterate in the same order.
        self.id2class = dict(zip(self.class2id.values(), self.class2id.keys()))

    def __repr__(self):
        # Show the forward mapping exactly as a dict would print.
        return repr(self.class2id)

DATASET_ROOT = "./downloads/birds-200-species/CUB_200_2011"
SHUFFLE_SEED = 42  # fixed seed so the shuffled CSV is identical across runs


def _collect_rows(encoder, split_dir, split_name, is_query, is_gallery):
    """Build one CSV row per image found under ``DATASET_ROOT/<split_dir>/<class>/``.

    Args:
        encoder: object exposing ``class2id`` (class folder name -> integer label).
        split_dir: folder name under ``DATASET_ROOT`` to scan ("train" or "val").
        split_name: value written to the "split" column.
        is_query: value written to the "is_query" column of every row.
        is_gallery: value written to the "is_gallery" column of every row.

    Returns:
        List of dicts with keys "label", "path", "split", "is_query", "is_gallery".
        "path" is relative to ``DATASET_ROOT`` (``<split_dir>/<class>/<file>``).

    Raises:
        KeyError: if an image sits in a class folder unknown to ``encoder``.
    """
    pattern = os.path.join(DATASET_ROOT, split_dir, "*", "*")
    rows = []
    # sorted(): glob order is arbitrary; a stable order keeps the seeded shuffle reproducible.
    for image_path in tqdm(sorted(glob.glob(pattern))):
        cls = os.path.basename(os.path.dirname(image_path))
        rows.append({
            "label": encoder.class2id[cls],
            "path": os.path.join(split_dir, cls, os.path.basename(image_path)),
            "split": split_name,
            "is_query": is_query,
            "is_gallery": is_gallery,
        })
    return rows


# Label ids come from the class folders under train/. os.listdir returns names in
# arbitrary order, so sort them to keep ids stable between runs, and ignore stray
# non-directory entries.
train_dir = os.path.join(DATASET_ROOT, "train")
le = LabelEncoder(sorted(
    name for name in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, name))
))

# Prepare Train CSV: query/gallery flags are left empty (NaN) for training rows.
train_df = pd.DataFrame(_collect_rows(le, "train", "train", np.nan, np.nan))

# Prepare Val CSV: every validation image is flagged as both query and gallery.
# NOTE(review): if only one query image per class is wanted, the flag has to be set
# per row instead of this constant -- confirm against the consumer of df.csv.
val_df = pd.DataFrame(_collect_rows(le, "val", "validation", "TRUE", "TRUE"))

df = pd.concat([train_df, val_df])
df = df.sample(frac=1.0, random_state=SHUFFLE_SEED)  # random shuffle, reproducible
df.to_csv(os.path.join(DATASET_ROOT, "df.csv"), index=False)

100%|██████████| 9977/9977 [00:00<00:00, 200062.97it/s]
100%|██████████| 1811/1811 [00:00<00:00, 132491.75it/s]
