In [33]:
from collections import defaultdict
import os
import requests
import sys

sys.path.append('..')

import pandas as pd
from tqdm import tqdm

# Project-local helpers; resolved through the sys.path.append('..') above.
from pixel_accuracy import (
    read_annotations_gt,
    convert_pixel_images,
    SUBSTRATE_LIST,
    SUBSTRATE_TO_IDX,
)

## 1. Pixel distribution for training data

In [14]:
# Section 1 works on the *training* annotations, even though the variable is
# called `gt_test`; the name is kept because the next cell reads it.
# NOTE(review): absolute, user-specific path - will not resolve on another machine.
gt_test = read_annotations_gt(file='/Users/campea/Downloads/training_set_task_2.csv', task=2)

In [15]:
# Build the per-image arrays consumed by calculate_distribution
# (a mapping keyed by image name, judging by the lookup in the next cell).
pixel_images = convert_pixel_images(gt_test)

100%|██████████| 879/879 [00:33<00:00, 26.56it/s]


In [9]:
# Sanity check: dimensions of one converted image.
pixel_images['2018_0714_112608_061'].shape

(4032, 3024)

In [93]:
def calculate_distribution(pixel_images):
    """Accumulate per-class pixel counts and image counts over label images.

    Parameters
    ----------
    pixel_images : mapping
        Maps an image identifier to a NumPy-like label array (must support
        ``image == idx`` and ``.size``) whose entries are compared against
        the class indices in ``SUBSTRATE_TO_IDX``.

    Returns
    -------
    pixel_distribution : defaultdict(int)
        Total number of pixels per substrate name, plus a ``'background'``
        entry holding every pixel that matched none of the known indices.
    number_objects : defaultdict(int)
        For each substrate name, the number of *images* containing at least
        one pixel of that class (not the number of annotated instances).
        Every substrate gets an entry, possibly 0, so both dictionaries list
        the substrates in the same order.
    """
    pixel_distribution = defaultdict(int)
    number_objects = defaultdict(int)

    for image in tqdm(pixel_images.values()):
        non_background = 0
        for name, idx in SUBSTRATE_TO_IDX.items():
            pixels_substrate = (image == idx).sum()
            pixel_distribution[name] += pixels_substrate
            non_background += pixels_substrate
            # Always touch the key (adding 0 or 1) so absent classes still
            # appear in the result.
            number_objects[name] += 1*(pixels_substrate>0)
        # Use the image's own pixel count instead of assuming 4032x3024, so
        # images of any resolution get a correct background total.
        pixel_distribution['background'] += (image.size-non_background)

    return pixel_distribution, number_objects

In [11]:
# Per-class pixel totals and per-class image counts over the training arrays.
pixel_distribution, number_objects = calculate_distribution(pixel_images)

100%|██████████| 879/879 [03:57<00:00,  3.70it/s]


In [15]:
# Fraction of all pixels that carry no substrate label. The denominator is
# taken from the arrays themselves rather than assuming every image is
# 4032x3024.
total_pixels = sum(image.size for image in pixel_images.values())
print(pixel_distribution['background']/total_pixels)
# Drop the background entry so the normalisation that follows only covers
# substrate classes. NOTE: this makes the cell non-idempotent on re-run.
del pixel_distribution['background']

8558178890


In [None]:
# NOTE(review): verbatim repeat of the preceding cell, never executed (In [None]).
# On a top-to-bottom run 'background' has already been deleted, so the
# defaultdict lookup yields 0, this prints 0.0, and the `del` removes the key
# the lookup just re-created. Candidate for deletion.
print(pixel_distribution['background']/(len(pixel_images)*4032*3024))
del pixel_distribution['background']

In [16]:
# Turn the raw counts into shares of the total over all substrate classes
# ('background' was removed from pixel_distribution in the cell above).
s_pixels = sum(pixel_distribution.values())
normalised_pixels = dict(
    (substrate, count / s_pixels)
    for substrate, count in pixel_distribution.items()
)

s_objects = sum(number_objects.values())
normalised_objects = dict(
    (substrate, count / s_objects)
    for substrate, count in number_objects.items()
)

In [17]:
# Share of labelled pixels per substrate class, as a percentage.
for key, value in normalised_pixels.items():
    print(f'{key} {100*value:.2f}%')

c_algae_macro_or_leaves 5.18%
c_fire_coral_millepora 0.09%
c_hard_coral_boulder 20.45%
c_hard_coral_branching 17.34%
c_hard_coral_encrusting 5.79%
c_hard_coral_foliose 0.94%
c_hard_coral_mushroom 0.30%
c_hard_coral_submassive 8.50%
c_hard_coral_table 3.16%
c_soft_coral 27.84%
c_soft_coral_gorgonian 0.71%
c_sponge 7.68%
c_sponge_barrel 2.02%


In [14]:
# Share of per-class image occurrences per substrate class, as a percentage.
for key, value in normalised_objects.items():
    print(f'{key} {100*value:.2f}%')

c_algae_macro_or_leaves 4.47%
c_fire_coral_millepora 0.27%
c_hard_coral_boulder 18.01%
c_hard_coral_branching 14.25%
c_hard_coral_encrusting 13.08%
c_hard_coral_foliose 3.17%
c_hard_coral_mushroom 3.76%
c_hard_coral_submassive 5.95%
c_hard_coral_table 2.68%
c_soft_coral 13.45%
c_soft_coral_gorgonian 1.85%
c_sponge 14.18%
c_sponge_barrel 4.88%


## 2. Pixel distribution on test data

In [134]:
# Test-set annotations; overwrites the `gt_test` assigned in section 1.
gt_test = read_annotations_gt(file='../../yam/plugins/test_set_2021_05_11/test_set_2021_05_11_task_2.csv', task=2)

In [97]:
# Rebinds `pixel_images`: the training arrays from section 1 are no longer reachable.
pixel_images = convert_pixel_images(gt_test)

100%|██████████| 485/485 [00:13<00:00, 37.13it/s] 


In [98]:
# Per-class pixel totals and per-class image counts over the test arrays.
pixel_distribution, number_objects = calculate_distribution(pixel_images)

100%|██████████| 485/485 [02:52<00:00,  2.81it/s]


In [99]:
# Fraction of all test-set pixels that carry no substrate label. The
# denominator is taken from the arrays themselves rather than assuming every
# image is 4032x3024.
total_pixels = sum(image.size for image in pixel_images.values())
print(pixel_distribution['background']/total_pixels)
# Drop the background entry so the normalisation that follows only covers
# substrate classes. NOTE: this makes the cell non-idempotent on re-run.
del pixel_distribution['background']

0.775919872650282


In [100]:
# Turn the raw test-set counts into shares of the total over all substrate
# classes ('background' was removed from pixel_distribution in the cell above).
s_pixels = sum(pixel_distribution.values())
normalised_pixels = dict(
    (substrate, count / s_pixels)
    for substrate, count in pixel_distribution.items()
)

s_objects = sum(number_objects.values())
normalised_objects = dict(
    (substrate, count / s_objects)
    for substrate, count in number_objects.items()
)

In [101]:
# Share of labelled test-set pixels per substrate class, as a percentage.
for key, value in normalised_pixels.items():
    print(f'{key} {100*value:.2f}%')

c_algae_macro_or_leaves 12.28%
c_fire_coral_millepora 0.03%
c_hard_coral_boulder 10.83%
c_hard_coral_branching 24.28%
c_hard_coral_encrusting 4.30%
c_hard_coral_foliose 0.03%
c_hard_coral_mushroom 0.05%
c_hard_coral_submassive 19.57%
c_hard_coral_table 9.81%
c_soft_coral 2.48%
c_soft_coral_gorgonian 0.14%
c_sponge 14.89%
c_sponge_barrel 1.31%


In [102]:
# Share of per-class image occurrences in the test set, as a percentage.
for key, value in normalised_objects.items():
    print(f'{key} {100*value:.2f}%')

c_algae_macro_or_leaves 9.96%
c_fire_coral_millepora 0.48%
c_hard_coral_boulder 19.62%
c_hard_coral_branching 11.84%
c_hard_coral_encrusting 14.89%
c_hard_coral_foliose 0.43%
c_hard_coral_mushroom 1.30%
c_hard_coral_submassive 10.05%
c_hard_coral_table 7.01%
c_soft_coral 4.30%
c_soft_coral_gorgonian 0.58%
c_sponge 14.21%
c_sponge_barrel 5.32%


## 3. Per location (train)

In [86]:
# Training annotations again (same CSV as section 1), this time under a matching name.
# NOTE(review): absolute, user-specific path - will not resolve on another machine.
gt_train = read_annotations_gt(file='/Users/campea/Downloads/training_set_task_2.csv', task=2)

In [87]:
# Lower-cased '<annotation key>.jpg' names, used to match the annotator export
# case-insensitively against the training set.
train_images = {(name + '.jpg').lower() for name in gt_train.keys()}

In [75]:
# Download the image metadata export. A timeout keeps the cell from hanging
# forever on a stalled connection, and raise_for_status() surfaces HTTP errors
# directly instead of as a confusing JSON/KeyError further down.
r = requests.get('https://annotator.uk/dataset/1/images/export?reviewed=True',
                 timeout=60)
r.raise_for_status()

annotations = r.json()

# Subset 1: reviewed images whose file name starts with '2018_0712_' and that
# belong to the training set (names compared in lower case).
subset_1 = [
    image['original_file'].lower()
    for image in annotations['image_list']
    if image['original_file'].startswith('2018_0712_')
    and image['reviewed']
    and image['original_file'].lower() in train_images
]

In [77]:
df = pd.read_excel('../../yam/plugins/imageCLEFcoral datasets.xlsx',
                   sheet_name='training set 2021 files',
                   engine='openpyxl')

# File names listed in the 'PK-20180714-01' column, lower-cased for matching.
# dropna() skips empty cells (which would otherwise make str.lower fail on a
# float NaN); astype(str) keeps the .str accessor usable on an empty column.
images_in_subset_2 = set(df['PK-20180714-01'].dropna().astype(str).str.lower())

# Every subset is restricted to the training set, so apply that filter once
# instead of repeating it inside each comprehension.
train_entries = [image for image in annotations['image_list']
                 if image['original_file'].lower() in train_images]

subset_2 = [image['original_file'].lower() for image in train_entries
            if image['original_file'].lower() in images_in_subset_2]

subset_3 = [image['original_file'].lower() for image in train_entries
            if image['original_file'].startswith('2018_0729')
            and 'PK-20180729-02' in image['path_to_image']
            and image['reviewed']]

# Subsets 4-6 are selected by folder and keep the original file-name case
# here; they are lower-cased when the name -> files mapping is built.
subset_4 = [image['original_file'] for image in train_entries
            if image['folder'] == '20170803-dominica-cabrits']

subset_5 = [image['original_file'] for image in train_entries
            if image['folder'] == '20180406-spermonde-keke']

subset_6 = [image['original_file'] for image in train_entries
            if image['folder'] == '20190417-seychelles-BL']

In [111]:
# Location label -> lower-cased file names of the images in that subset.
subset_name_to_files = {
    label: [file_name.lower() for file_name in files]
    for label, files in (
        ('K1-20180712-01', subset_1),
        ('PK-20180714-01', subset_2),
        ('PK-20180729-02', subset_3),
        ('20170803-dominica-cabrits', subset_4),
        ('20180406-spermonde-keke', subset_5),
        ('20190417-seychelles-BL', subset_6),
    )
}

In [104]:
# Location labels, in insertion order.
list(subset_name_to_files.keys())

['K1-20180712-01',
 'PK-20180714-01',
 'PK-20180729-02',
 '20170803-dominica-cabrits',
 '20180406-spermonde-keke',
 '20190417-seychelles-BL']

In [81]:
# Number of training images matched to each location.
print({key: len(value) for key, value in subset_name_to_files.items()})

{'K1-20180712-01': 173, 'PK-20180714-01': 234, 'PK-20180729-02': 172, '20170803-dominica-cabrits': 100, '20180406-spermonde-keke': 100, '20190417-seychelles-BL': 100}


In [83]:
# All matched training files in subset order. Entries from subsets 4-6 are not
# lower-cased here, so only the length of this list is meaningful as-is.
training_set_2021 = [
    *subset_1,
    *subset_2,
    *subset_3,
    *subset_4,
    *subset_5,
    *subset_6,
]

In [84]:
# Should equal the number of images converted in section 1.
len(training_set_2021)

879

In [122]:
# Build a '#'-separated text report with the pixel and image-count
# distributions of every training location.
result_string = ""

for name, images in subset_name_to_files.items():
    # Set membership instead of scanning the file list once per annotation.
    image_names = set(images)
    gt_set = {image_name: value for image_name, value in gt_train.items()
              if (image_name+'.jpg').lower() in image_names}

    # A location without matching images has nothing to report; without this
    # guard the `del` below raises KeyError because 'background' was never
    # created. Mirrors the guard used in the test-set loop.
    if not gt_set:
        continue

    # NOTE: rebinds the notebook-level pixel_images / pixel_distribution /
    # number_objects / normalised_* names on every iteration.
    pixel_images = convert_pixel_images(gt_set)
    pixel_distribution, number_objects = calculate_distribution(pixel_images)

    # Shares are computed over substrate classes only.
    del pixel_distribution['background']

    s_pixels = sum(pixel_distribution.values())
    s_objects = sum(number_objects.values())

    normalised_pixels = {key:value/s_pixels for key, value in pixel_distribution.items()}
    normalised_objects = {key:value/s_objects for key, value in number_objects.items()}

    result_string += f"\n\n{name} ({len(gt_set)} images)\n\n"
    result_string += "\nPixel distribution###Number of objects distribution\n"
    # Both dictionaries list the substrates in the same order, so zip pairs
    # the two figures of one class on each row.
    for (key_1, value_1), (key_2, value_2) in zip(normalised_pixels.items(), normalised_objects.items()):
        result_string += f'{key_1}#{100*value_1:.2f}%##{key_2}#{100*value_2:.2f}%\n'

100%|██████████| 173/173 [00:06<00:00, 28.38it/s]
100%|██████████| 173/173 [01:26<00:00,  2.01it/s]
100%|██████████| 234/234 [00:16<00:00, 14.49it/s]
100%|██████████| 234/234 [02:19<00:00,  1.68it/s]
100%|██████████| 172/172 [00:04<00:00, 39.23it/s]
100%|██████████| 172/172 [02:17<00:00,  1.25it/s]
100%|██████████| 100/100 [00:04<00:00, 21.11it/s]
100%|██████████| 100/100 [01:20<00:00,  1.24it/s]
100%|██████████| 100/100 [00:05<00:00, 18.69it/s]
100%|██████████| 100/100 [00:30<00:00,  3.28it/s]
100%|██████████| 100/100 [00:02<00:00, 48.66it/s]
100%|██████████| 100/100 [00:25<00:00,  3.92it/s]


In [123]:
# Show the per-location training report built in the previous cell.
print(result_string)



K1-20180712-01 (173 images)


Pixel distribution###Number of objects distribution
c_algae_macro_or_leaves#1.51%##c_algae_macro_or_leaves#3.64%
c_fire_coral_millepora#0.12%##c_fire_coral_millepora#0.22%
c_hard_coral_boulder#5.64%##c_hard_coral_boulder#13.01%
c_hard_coral_branching#10.03%##c_hard_coral_branching#10.36%
c_hard_coral_encrusting#5.46%##c_hard_coral_encrusting#13.34%
c_hard_coral_foliose#2.37%##c_hard_coral_foliose#6.95%
c_hard_coral_mushroom#0.11%##c_hard_coral_mushroom#1.32%
c_hard_coral_submassive#5.23%##c_hard_coral_submassive#5.95%
c_hard_coral_table#0.64%##c_hard_coral_table#1.87%
c_soft_coral#54.38%##c_soft_coral#19.07%
c_soft_coral_gorgonian#3.06%##c_soft_coral_gorgonian#5.29%
c_sponge#10.28%##c_sponge#15.44%
c_sponge_barrel#1.18%##c_sponge_barrel#3.53%


PK-20180714-01 (234 images)


Pixel distribution###Number of objects distribution
c_algae_macro_or_leaves#0.16%##c_algae_macro_or_leaves#1.95%
c_fire_coral_millepora#0.16%##c_fire_coral_millepora#0.39%
c_hard_cora

## 4. Per location (test)

In [124]:
# Test-set annotations (same CSV that section 2 loads), reloaded for the per-location breakdown.
gt_test = read_annotations_gt(file='../../yam/plugins/test_set_2021_05_11/test_set_2021_05_11_task_2.csv', task=2)

In [125]:
# Lower-cased '<annotation key>.jpg' names, used to match the annotator export
# case-insensitively against the test set.
test_images = {(name + '.jpg').lower() for name in gt_test.keys()}

In [127]:
# Download the image metadata export (same URL as in section 3). A timeout
# keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors directly instead of as a confusing
# JSON/KeyError further down.
r = requests.get('https://annotator.uk/dataset/1/images/export?reviewed=True',
                 timeout=60)
r.raise_for_status()

annotations = r.json()

# Subset 1: reviewed images whose file name starts with '2018_0712_' and that
# belong to the test set (names compared in lower case).
subset_1 = [
    image['original_file'].lower()
    for image in annotations['image_list']
    if image['original_file'].startswith('2018_0712_')
    and image['reviewed']
    and image['original_file'].lower() in test_images
]

In [143]:
# NOTE(review): this reads the 'training set 2021 files' sheet although the
# section analyses the test set - confirm this is the intended sheet.
df = pd.read_excel('../../yam/plugins/imageCLEFcoral datasets.xlsx',
                   sheet_name='training set 2021 files',
                   engine='openpyxl')

# File names listed in the 'PK-20180714-01' column, lower-cased for matching.
# dropna() skips empty cells (which would otherwise make str.lower fail on a
# float NaN); astype(str) keeps the .str accessor usable on an empty column.
images_in_subset_2 = set(df['PK-20180714-01'].dropna().astype(str).str.lower())

# Every subset is restricted to the test set, so apply that filter once
# instead of repeating it inside each comprehension.
test_entries = [image for image in annotations['image_list']
                if image['original_file'].lower() in test_images]

subset_2 = [image['original_file'].lower() for image in test_entries
            if image['original_file'].lower() in images_in_subset_2]

# Folder names are compared with '_' normalised to '-'.
subset_3 = [image['original_file'].lower() for image in test_entries
            if image['folder'].replace('_', '-') == 'PK-20180729-02']

# Subsets 4-6 are selected by folder and keep the original file-name case
# here; they are lower-cased when the name -> files mapping is built.
subset_4 = [image['original_file'] for image in test_entries
            if image['folder'] == '20170803-dominica-cabrits']

subset_5 = [image['original_file'] for image in test_entries
            if image['folder'] == '20180406-spermonde-keke']

subset_6 = [image['original_file'] for image in test_entries
            if image['folder'] == '20190417-seychelles-BL']

In [144]:
# Location label -> lower-cased file names of the test images in that subset.
subset_name_to_files = {
    label: [file_name.lower() for file_name in files]
    for label, files in (
        ('K1-20180712-01', subset_1),
        ('PK-20180714-01', subset_2),
        ('PK-20180729-02', subset_3),
        ('20170803-dominica-cabrits', subset_4),
        ('20180406-spermonde-keke', subset_5),
        ('20190417-seychelles-BL', subset_6),
    )
}

In [148]:
# Number of test images matched to each location.
print({key: len(value) for key, value in subset_name_to_files.items()})

{'K1-20180712-01': 0, 'PK-20180714-01': 0, 'PK-20180729-02': 98, '20170803-dominica-cabrits': 201, '20180406-spermonde-keke': 166, '20190417-seychelles-BL': 20}


In [149]:
# All matched test files in subset order. Entries from subsets 4-6 are not
# lower-cased here, so only the length of this list is meaningful as-is.
test_set_2021 = [
    *subset_1,
    *subset_2,
    *subset_3,
    *subset_4,
    *subset_5,
    *subset_6,
]

In [150]:
# Should equal the number of images converted in section 2.
len(test_set_2021)

485

In [151]:
# Build a '#'-separated text report with the pixel and image-count
# distributions of every test location.
result_string = ""

for name, images in subset_name_to_files.items():
    # Set membership instead of scanning the file list once per annotation.
    image_names = set(images)
    gt_set = {image_name: value for image_name, value in gt_test.items()
              if (image_name+'.jpg').lower() in image_names}

    # Locations with no test images are left out of the report.
    if not gt_set:
        continue

    # NOTE: rebinds the notebook-level pixel_images / pixel_distribution /
    # number_objects / normalised_* names on every iteration.
    pixel_images = convert_pixel_images(gt_set)
    pixel_distribution, number_objects = calculate_distribution(pixel_images)

    # Shares are computed over substrate classes only.
    del pixel_distribution['background']

    s_pixels = sum(pixel_distribution.values())
    s_objects = sum(number_objects.values())

    normalised_pixels = {key:value/s_pixels for key, value in pixel_distribution.items()}
    normalised_objects = {key:value/s_objects for key, value in number_objects.items()}

    result_string += f"\n\n{name} ({len(gt_set)} images)\n\n"
    result_string += "\nPixel distribution###Number of objects distribution\n"
    # Both dictionaries list the substrates in the same order, so zip pairs
    # the two figures of one class on each row.
    for (key_1, value_1), (key_2, value_2) in zip(normalised_pixels.items(), normalised_objects.items()):
        result_string += f'{key_1}#{100*value_1:.2f}%##{key_2}#{100*value_2:.2f}%\n'

100%|██████████| 98/98 [00:02<00:00, 48.85it/s]
100%|██████████| 98/98 [00:39<00:00,  2.45it/s]
100%|██████████| 201/201 [00:06<00:00, 32.63it/s]
100%|██████████| 201/201 [01:18<00:00,  2.55it/s]
100%|██████████| 166/166 [00:06<00:00, 27.64it/s]
100%|██████████| 166/166 [00:59<00:00,  2.78it/s]
100%|██████████| 20/20 [00:00<00:00, 82.97it/s]
100%|██████████| 20/20 [00:09<00:00,  2.01it/s]


In [156]:
# Show the per-location test report built in the previous cell.
print(result_string)



PK-20180729-02 (98 images)


Pixel distribution###Number of objects distribution
c_algae_macro_or_leaves#0.18%##c_algae_macro_or_leaves#0.82%
c_fire_coral_millepora#0.35%##c_fire_coral_millepora#1.03%
c_hard_coral_boulder#21.31%##c_hard_coral_boulder#18.52%
c_hard_coral_branching#13.04%##c_hard_coral_branching#13.37%
c_hard_coral_encrusting#10.30%##c_hard_coral_encrusting#13.99%
c_hard_coral_foliose#0.31%##c_hard_coral_foliose#1.44%
c_hard_coral_mushroom#0.84%##c_hard_coral_mushroom#5.56%
c_hard_coral_submassive#1.67%##c_hard_coral_submassive#8.23%
c_hard_coral_table#0.05%##c_hard_coral_table#0.21%
c_soft_coral#37.90%##c_soft_coral#18.31%
c_soft_coral_gorgonian#2.09%##c_soft_coral_gorgonian#2.47%
c_sponge#11.74%##c_sponge#15.23%
c_sponge_barrel#0.23%##c_sponge_barrel#0.82%


20170803-dominica-cabrits (201 images)


Pixel distribution###Number of objects distribution
c_algae_macro_or_leaves#37.88%##c_algae_macro_or_leaves#24.47%
c_fire_coral_millepora#0.04%##c_fire_coral_millepora#0.6

In [159]:
# Copy the report to the clipboard via the shell.
# NOTE(review): `pbcopy` is a macOS utility, so this cell is not portable. The
# text is interpolated into a single-quoted shell argument, so a "'" inside
# result_string would break the command.
!echo '{result_string}' | pbcopy