In [28]:
import polars as pl
import json
from pathlib import Path
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
from functools import reduce

In [8]:
path_dataset = Path('../../datasets/aicrowd_food_recognition_benchmark_2022').resolve()
path_labels = path_dataset / 'labels'

In [4]:
annotations = json.loads((path_labels / 'train' / 'annotations.json').read_text())
print('Summary:')
for key, values in annotations.items():
    print(f'{key}: {len(values)}')

Summary:
categories: 498
info: 2
images: 39962
annotations: 76491


In [17]:
categories = annotations['categories']
category_names = [c['name'] for c in categories]
category_id_to_name = {c['id']: c['name'] for c in categories}

In [9]:
if False:
    with (path_dataset / 'categories.txt').open('w') as file:
        file.write('\n'.join(map(lambda x: f'{x[0]}: {x[1]}', enumerate(category_names))))

### Top 10 categories with the most images

In [23]:
category_images = defaultdict(lambda: set())
for anno in annotations['annotations']:
    category_images[anno['category_id']].add(anno['image_id'])
df = pl.DataFrame({
    'id': category_images.keys(),
    'name': [category_id_to_name[id_] for id_ in category_images.keys()],
    'count': [len(category_images[id_]) for id_ in category_images.keys()],
})
df.sort('count', descending=True).head(10)

id,name,count
i64,str,i64
2578,"""water""",2928
1040,"""salad-leaf-salad-green""",2002
1566,"""bread-white""",1891
1069,"""tomato-raw""",1865
2053,"""butter""",1601
1078,"""carrot-raw""",1482
1565,"""bread-wholemeal""",1452
2512,"""coffee-with-caffeine""",1406
1468,"""rice""",1024
2022,"""egg""",1015


In [34]:
categories_filtered = list(df.sort('count', descending=True).head(10)['id'])
images_filtered = reduce(lambda a, b: a.union(b), [images for k, images in category_images.items() if k in categories_filtered])

In [35]:
annotations_filtered = {
    'categories': [c for c in categories if c['id'] in categories_filtered],
    'images': [im for im in annotations['images'] if im['id'] in images_filtered],
    'info': {},
    'annotations': [anno for anno in annotations['annotations'] if anno['image_id'] in images_filtered],
}
print('Summary:')
for key, values in annotations_filtered.items():
    print(f'{key}: {len(values)}')

Summary:
categories: 10
images: 14055
info: 0
annotations: 34702


In [37]:
if False:
    with (path_dataset.parent / 'aicrowd_food_recognition_benchmark_2022_reduced' / 'labels' / 'train' / 'annotations.json').open('w') as file:
        json.dump(annotations_filtered, file)

In [None]:
# TODO:
# annotations of validation
# copy images
# test training