In [1]:
import itertools
import json
import os
import shutil
from pathlib import Path
from typing import Dict, List, Optional

import cv2
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
# Root folder of the competition data; every input path is derived from it
DATA_DIR = Path('..') / 'input' / 'data'

In [3]:
# Parse polygons.jsonl (one JSON object per line) into a list of dicts.
# Each dict holds a tile 'id' and its 'annotations' (class name plus polygon coordinates).
# Adapted from https://www.kaggle.com/code/leonidkulyk/eda-hubmap-hhv-interactive-annotations
with open(DATA_DIR / 'polygons.jsonl', 'r') as json_file:
    # Stream the file line by line instead of keeping a second copy of the raw
    # text, and skip blank lines (e.g. a trailing newline) that json.loads
    # would reject.
    tiles_dicts = [json.loads(json_str) for json_str in json_file if json_str.strip()]

In [9]:
# Peek at the first parsed tile record: a dict with an 'id' and a list of 'annotations'
tiles_dicts[0]

{'id': '0006ff2aa7cd',
 'annotations': [{'type': 'glomerulus',
   'coordinates': [[[167, 249],
     [166, 249],
     [165, 249],
     [164, 249],
     [163, 249],
     [162, 249],
     [161, 249],
     [160, 249],
     [159, 249],
     [158, 249],
     [157, 249],
     [156, 249],
     [155, 249],
     [154, 249],
     [153, 249],
     [152, 249],
     [151, 249],
     [150, 249],
     [149, 249],
     [148, 249],
     [147, 249],
     [146, 249],
     [145, 249],
     [144, 249],
     [143, 249],
     [142, 249],
     [141, 249],
     [140, 249],
     [139, 249],
     [138, 249],
     [137, 249],
     [136, 249],
     [135, 249],
     [134, 249],
     [133, 249],
     [132, 249],
     [131, 249],
     [130, 249],
     [129, 249],
     [128, 249],
     [127, 249],
     [126, 249],
     [125, 249],
     [124, 249],
     [123, 249],
     [122, 249],
     [122, 248],
     [121, 248],
     [120, 248],
     [119, 248],
     [118, 248],
     [117, 248],
     [117, 247],
     [116, 247],
    

In [10]:
# Map every annotation class name to the integer class index used in the label files
id_dict = {
    class_name: class_index
    for class_index, class_name in enumerate(('blood_vessel', 'glomerulus', 'unsure'))
}

In [44]:
# First five coordinate pairs of the first annotation's first coordinate list
tiles_dicts[0]["annotations"][0]["coordinates"][0][:5]

[[167, 249], [166, 249], [165, 249], [164, 249], [163, 249]]

In [40]:
# Flatten the coordinate pairs into one flat list and show its first five values
list(
    itertools.chain.from_iterable(
        tiles_dicts[0]["annotations"][0]["coordinates"][0]
    )
)[:5]

[167, 249, 166, 249, 165]

In [46]:
# Flatten the first polygon and rescale its pixel indices by the 512 px tile size
array = np.array(
    list(itertools.chain.from_iterable(tiles_dicts[0]["annotations"][0]["coordinates"][0]))
) / 512.

In [48]:
# Space-separated text form of the rescaled values (first 20 characters only)
" ".join(str(value) for value in array)[:20]

'0.326171875 0.486328'

In [49]:
# Function to copy a tile image and transform its labels into a
# one-line-per-object .txt label file
def tile_to_coco(
    tile: Dict,
    output_folder: Path,
    tile_size: int = 512,
    data_dir: Optional[Path] = None,
    class_ids: Optional[Dict[str, int]] = None,
):
    """Export one tile: copy its image and write its polygon labels as text.

    Despite the name, the label file is plain text, not COCO JSON: each line
    is ``<class index> <v1> <v2> ...`` where the values are the flattened
    polygon coordinates divided by ``tile_size``.

    Args:
        tile: One parsed record with an ``'id'`` and a list of
            ``'annotations'``; every annotation has a ``'type'`` (class name)
            and ``'coordinates'``. Only the first coordinate list of each
            annotation (``coordinates[0]``) is exported.
        output_folder: Destination folder for ``<id>.tif`` and ``<id>.txt``.
            It is created (including parents) when it does not exist yet.
        tile_size: Edge length in pixels used to normalize pixel indices.
            Defaults to 512, the value previously hard-coded.
        data_dir: Folder containing ``train/<id>.tif``. Defaults to the
            notebook-level ``DATA_DIR``.
        class_ids: Mapping from class name to class index. Defaults to the
            notebook-level ``id_dict``.

    Raises:
        FileNotFoundError: If the source image does not exist.
        KeyError: If an annotation type is missing from ``class_ids``.
    """
    # Resolve the notebook-level defaults lazily so existing two-argument
    # calls keep working unchanged.
    if data_dir is None:
        data_dir = DATA_DIR
    if class_ids is None:
        class_ids = id_dict

    tile_id = tile['id']
    output_folder = Path(output_folder)
    # shutil.copyfile and open() do not create missing directories, so a
    # fresh run would otherwise fail on the very first tile.
    output_folder.mkdir(parents=True, exist_ok=True)

    # Copy image
    shutil.copyfile(Path(data_dir) / f'train/{tile_id}.tif', output_folder / f'{tile_id}.tif')

    # Create text file and write formatted labels to it
    with open(output_folder / f'{tile_id}.txt', 'w') as text_file:
        for annotation in tile['annotations']:
            class_id = class_ids[annotation['type']]
            flat_mask_polygon = list(itertools.chain.from_iterable(annotation['coordinates'][0]))
            # Labels expect positions between 0 and 1, not pixel indices
            normalized = np.array(flat_mask_polygon) / float(tile_size)
            text_file.write(f'{class_id} {" ".join(map(str, normalized))}\n')
            

In [17]:
# Two small strings used to demonstrate how itertools.chain flattens iterables
iterables = "ABC", "DEF"

In [20]:
# Lazily chain the characters of every string in `iterables` into one iterator
element = itertools.chain.from_iterable(iterables)

In [19]:
# Show the tuple of input strings used by the chain demo
iterables

('ABC', 'DEF')

In [22]:
# Materialize the chain: the characters of each input string, in order.
# NOTE: a chain object is a one-shot iterator, so re-running this cell without
# re-creating `element` first yields an empty list.
list(element)

['A', 'B', 'C', 'D', 'E', 'F']

In [50]:
# Hold out 20% of the tiles for validation; the fixed seed keeps the split reproducible
train_dicts, valid_dicts = train_test_split(
    tiles_dicts,
    test_size=0.2,
    random_state=42,
)

In [53]:
# Export every training tile (image copy + label file) into the train folder.
train_output_dir = Path('../input/coco/train/')
# Create the destination up front so the export also works on a fresh
# checkout where the folder does not exist yet.
train_output_dir.mkdir(parents=True, exist_ok=True)
for train_dict in train_dicts:
    tile_to_coco(train_dict, train_output_dir)

In [55]:
# Export every validation tile (image copy + label file) into the valid folder.
valid_output_dir = Path('../input/coco/valid/')
# Create the destination up front so the export also works on a fresh
# checkout where the folder does not exist yet.
valid_output_dir.mkdir(parents=True, exist_ok=True)
for valid_dict in valid_dicts:
    tile_to_coco(valid_dict, valid_output_dir)

In [56]:
# Text of the dataset config file as expected by YOLOv7 (and others): it names
# the train/val folders and maps each class index to its class name.
# The class indices listed here must stay in sync with id_dict.
# NOTE(review): stock YOLOv7 data configs also declare `nc: <number of classes>` --
# confirm whether the trainer used with this file requires it.
# NOTE(review): the relative train/val paths are presumably resolved from the
# training script's working directory -- verify before training.
yaml_text = """
# HuBMAP - Hacking the Human Vasculature dataset 
# https://www.kaggle.com/competitions/hubmap-hacking-the-human-vasculature


# train and val data as 1) directory: path/images/, 2) file: path/images.txt, or 3) list: [path1/images/, path2/images/]
train: ../input/coco/train/
val: ../input/coco/valid/

# class names
names: 
  0: blood_vessel
  1: glomerulus
  2: unsure
"""

In [58]:
# Persist the dataset config next to the exported train/valid folders
with Path('../input/coco/hubmap-coco.yaml').open('w') as yaml_file:
    yaml_file.write(yaml_text)