### Hierarchical classification training demo
Based on my previous work.
TODO: Switch to Guillaume's

In [32]:
from urllib.request import urlretrieve
from urllib.parse import urljoin

from tempfile import NamedTemporaryFile

from tqdm.notebook import tqdm as TQDM

# Base URL of the shared image folder; index entries are resolved against it with urljoin.
repo = "https://anon.erda.au.dk/share_redirect/btK3WIh4Md/rebalanced75_without_larvae/"

# Fetch the folder index into a scratch file that is removed when the `with` block exits.
# The suffix carries its own leading dot: tempfile does not insert one between the
# random name and the suffix.
# NOTE(review): reopening a NamedTemporaryFile by name while it is still open is
# documented as unsupported on Windows with the default delete behaviour — confirm
# the platforms this notebook has to run on.
with NamedTemporaryFile(suffix=".txt") as tmpfile:
    urlretrieve(urljoin(repo, "folder_index.txt"), tmpfile.name)
    with open(tmpfile.name, "r") as f:
        # One entry per line of the index, with surrounding whitespace removed.
        # Iterating the file object avoids materialising a second list via readlines().
        file_index = [line.strip() for line in f]


### Extract the class hierarchy from the directory structure

In [33]:
# Each index entry is split on "/" and its last component dropped, leaving the tuple
# of directories that contain the entry. Only tuples that are exactly three levels
# deep are kept; the set removes duplicates and sorted() fixes a stable order.
hierarchy = sorted(
    {
        directory_parts
        for directory_parts in (
            tuple(path.split("/")[:-1])
            for path in TQDM(file_index, desc="Finding unique classes...")
        )
        if len(directory_parts) == 3
    }
)

Finding unique classes...:   0%|          | 0/1405516 [00:00<?, ?it/s]

Count the number of images per class (species)

In [34]:
from collections import defaultdict

# Number of index entries whose containing directory tuple is one of the classes
# listed in `hierarchy`; classes never seen default to 0.
counts = defaultdict(lambda : 0)

# `hierarchy` is a list, so testing membership against it directly would scan the
# whole list once per index entry. A set built once gives constant-time lookups
# and produces exactly the same counts.
known_classes = set(hierarchy)

for path in TQDM(file_index, desc="Counting images per class..."):
    # Directory tuple of this entry (everything except the final path component).
    path_from_root = tuple(path.split("/")[:-1])
    if path_from_root in known_classes:
        counts[path_from_root] += 1

Counting images per class...:   0%|          | 0/1405516 [00:00<?, ?it/s]

Keep only species with at least 1000 images and, for each of them, select the first 200 images in the index as training images and the next 25 as test images

In [35]:
# Classes that have at least 1000 counted images.
more_than_1000 = {cls for cls in hierarchy if counts[cls] >= 1000}
print(f'Number of classes with more than 1000 images: {len(more_than_1000)}')
print(f'Number of total images for selected classes: {sum(counts[cls] for cls in more_than_1000)}')

# Per-class tallies of how many images have been assigned to each split so far.
train_counts = defaultdict(int)
test_counts = defaultdict(int)
# Selected index entries, in the order they appear in `file_index`.
train_images = []
test_images = []

for path in TQDM(file_index, desc="Allocating train/test images..."):
    path_from_root = tuple(path.split("/")[:-1])
    if path_from_root in more_than_1000:
        # Fill the training quota (200 per class) first, then the test quota
        # (25 per class); any further entries of the class are left unused.
        if train_counts[path_from_root] < 200:
            train_images.append(path)
            train_counts[path_from_root] += 1
        elif test_counts[path_from_root] < 25:
            test_images.append(path)
            test_counts[path_from_root] += 1

print(f'Number of training images: {len(train_images)}')
print(f'Number of test images: {len(test_images)}')

Number of classes with more than 1000 images: 420
Number of total images for selected classes: 1174143


Allocating train/test images...:   0%|          | 0/1405516 [00:00<?, ?it/s]

Number of training images: 84000
Number of test images: 10500


### Download train/test images

In [43]:
import os

# Local roots for the two splits; the remote directory layout is mirrored below them.
train_dir = os.path.join("hierarchical", "train")
test_dir = os.path.join("hierarchical", "test")

os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)


def _download_images(paths, dest_dir, desc):
    """Download every entry of `paths` from `repo` into `dest_dir`.

    Each entry is joined onto `repo` to form the URL and onto `dest_dir` to form
    the local file name, creating intermediate directories as needed.

    Files that already exist locally are skipped, so re-running the cell after an
    interruption resumes instead of starting over. Each file is first written to
    a ``.part`` sibling and only renamed to its final name once the transfer has
    finished, so an interrupted transfer never leaves a truncated file under the
    final name.

    NOTE(review): files left behind by an interrupted run of an earlier version
    of this cell may be truncated and would be skipped here — clear the target
    directories once if that applies.

    Parameters
    ----------
    paths : iterable of str
        Index entries (paths relative to `repo`) to download.
    dest_dir : str
        Local directory under which the entries are stored.
    desc : str
        Label shown on the progress bar.
    """
    for path in TQDM(paths, desc=desc):
        lpath = os.path.join(dest_dir, path)
        if os.path.exists(lpath):
            continue
        os.makedirs(os.path.dirname(lpath), exist_ok=True)
        partial_path = lpath + ".part"
        urlretrieve(urljoin(repo, path), partial_path)
        # Rename only after the transfer has completed.
        os.replace(partial_path, lpath)


_download_images(test_images, test_dir, "Downloading test images...")
_download_images(train_images, train_dir, "Downloading train images...")

Downloading test images...:   0%|          | 0/10500 [00:00<?, ?it/s]

KeyboardInterrupt: 