# **[HuBMAP 2023] K-fold CV COCO Dataset Generator**

### Versions
* V7 - 2 Categories:  **{1: 'blood_vessel',2: 'glomerulus'}**
* V6 - 1 Category:  **{1: 'blood_vessel'}**
* V5 - Ignore this version; it was published by mistake.
* V4 - 3 Categories: **{1: 'glomerulus', 2: 'blood_vessel', 3: 'unsure'}**

In [1]:
!pip install pycocotools -Uqq

### Importing libraries

In [2]:
import json, cv2, numpy as np, itertools, random, pandas as pd
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils
from skimage import io
import matplotlib.pyplot as plt
from pathlib import Path
from tqdm.auto import tqdm
from sklearn import model_selection

import matplotlib.pyplot as plt
from skimage import io
from pycocotools.coco import COCO
import matplotlib.patches as mpatches

### Functions

In [3]:
def coordinates_to_masks(coordinates, shape):
    """Rasterise every polygon in `coordinates` into its own binary mask.

    coordinates: iterable of polygons; each entry is converted with `np.array`
                 and passed to `cv2.fillPoly` as a single polygon.
    shape: shape of each mask array to allocate.
    Returns a list of uint8 arrays (one per polygon), filled with 1 inside the
    polygon and 0 elsewhere.
    """
    def _rasterise(polygon):
        # Fresh canvas per polygon so masks never share pixels.
        canvas = np.zeros(shape, dtype=np.uint8)
        cv2.fillPoly(canvas, [np.array(polygon)], 1)
        return canvas

    return [_rasterise(polygon) for polygon in coordinates]

def rle_encoding(x):
    """Encode the pixels of `x` equal to 1 as a "start length" run-length string.

    The array is flattened in row-major order; starts are 1-based. Returns an
    empty string when no pixel equals 1.
    """
    hits = np.flatnonzero(x.flatten() == 1)
    if hits.size == 0:
        return ''

    # A new run begins wherever two consecutive hit positions are not adjacent.
    gap_after = np.flatnonzero(np.diff(hits) > 1)
    first_of_run = np.concatenate(([0], gap_after + 1))
    last_of_run = np.concatenate((gap_after, [hits.size - 1]))

    run_starts = hits[first_of_run] + 1          # 1-based start positions
    run_sizes = last_of_run - first_of_run + 1

    tokens = []
    for start, size in zip(run_starts, run_sizes):
        tokens.append(str(start))
        tokens.append(str(size))
    return ' '.join(tokens)

def binary_mask_to_rle(binary_mask):
    """Convert a mask into an uncompressed RLE dict with `counts` and `size`.

    The mask is traversed in column-major (Fortran) order. `counts` alternates
    run lengths starting with a run of zeros, so a leading 0 is inserted when
    the very first pixel equals 1. `size` is the mask shape as a list.
    """
    column_major = binary_mask.ravel(order='F')
    runs = [(pixel, sum(1 for _ in group))
            for pixel, group in itertools.groupby(column_major)]

    run_lengths = [length for _, length in runs]
    if runs and runs[0][0] == 1:
        # The first count must describe background, even if it is empty.
        run_lengths.insert(0, 0)

    return {'counts': run_lengths, 'size': list(binary_mask.shape)}

def rle_to_binary_mask(mask_rle, shape=(512, 512)):
    '''
    Decode a "start length start length ..." run-length string into a mask.

    mask_rle: whitespace-separated string of (start, length) pairs; starts are
              1-based positions in the row-major flattened image
    shape: (height, width) of the array to return
    Returns a uint8 numpy array, 1 - mask, 0 - background
    '''
    tokens = mask_rle.split()
    starts = np.asarray(tokens[0::2], dtype=int)
    lengths = np.asarray(tokens[1::2], dtype=int)

    starts -= 1  # convert 1-based starts to 0-based offsets
    ends = starts + lengths

    flat = np.zeros(shape[0] * shape[1], dtype=np.uint8)
    for begin, stop in zip(starts, ends):
        flat[begin:stop] = 1

    # Row-major reshape, matching the flattening direction of the encoding.
    return flat.reshape(shape)

### Loading Dataset

In [4]:
# Load the tile metadata, drop every tile whose `dataset` is 3, and renumber
# the remaining rows from 0 so positional and label indices coincide.
df = (
    pd.read_csv('/kaggle/input/hubmap-hacking-the-human-vasculature/tile_meta.csv')
    .query('dataset != 3')
    # .head(50)
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,id,source_wsi,dataset,i,j
0,0006ff2aa7cd,2,2,16896,16420
1,00168d1b7522,2,2,14848,14884
2,0033bbc76b6b,1,1,10240,43008
3,003504460b3a,3,2,8192,11776
4,004daf1cbe75,3,2,6144,11264


### Splitting into training & validation folds

In [5]:
from sklearn.model_selection import StratifiedKFold

# Assign every tile to one of `n_splits` validation folds, stratified by the
# whole-slide image (`source_wsi`) the tile was cut from.
n_splits=5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# `split()` yields *positional* row indices, so the fold labels are collected in
# a plain array and attached afterwards. This stays correct whatever index `df`
# carries and avoids the intermediate float/NaN column a label-based `.loc`
# assignment would create.
fold_of_row = np.zeros(len(df), dtype=np.uint8)
for fold, (_, val_idx) in enumerate(skf.split(X=df, y=df['source_wsi']), 1):
    fold_of_row[val_idx] = fold

df['fold'] = fold_of_row
df.groupby('fold').size()

fold
1    327
2    327
3    327
4    326
5    326
dtype: int64

In [6]:
df

Unnamed: 0,id,source_wsi,dataset,i,j,fold
0,0006ff2aa7cd,2,2,16896,16420,4
1,00168d1b7522,2,2,14848,14884,5
2,0033bbc76b6b,1,1,10240,43008,2
3,003504460b3a,3,2,8192,11776,4
4,004daf1cbe75,3,2,6144,11264,1
...,...,...,...,...,...,...
1628,ff434af74304,4,2,3072,22528,2
1629,ff4897b3eda6,4,2,11776,20992,2
1630,ff66dec71c4c,3,2,5120,10752,3
1631,ff99cdef0f2a,4,2,5120,24064,4


### Reading polygons.jsonl

In [7]:
# polygons.jsonl stores one JSON document per line; parse each line into `data`.
jsonl_file_path = "/kaggle/input/hubmap-hacking-the-human-vasculature/polygons.jsonl"
with open(jsonl_file_path, "r") as file:
    data = [json.loads(line) for line in file]

### Categories

In [8]:
# Category names to export; ids are assigned 1..N in list order.
categories_list = ['blood_vessel']  # ,'glomerulus','unsure']
#------------------------------------------------------------------------------
categories_ids = {label: number for number, label in enumerate(categories_list, start=1)}  # name -> id
ids_categories = dict(enumerate(categories_list, start=1))                                 # id -> name
categories = [{'id': number, 'name': label} for label, number in categories_ids.items()]   # COCO "categories"

for lookup in (categories_ids, ids_categories, categories):
    print(lookup)

{'blood_vessel': 1}
{1: 'blood_vessel'}
[{'id': 1, 'name': 'blood_vessel'}]


### Creating COCO

In [9]:
def coco_structure(images_ids, image_size=(512, 512)):
    """Build a COCO-style dictionary for the tiles listed in `images_ids`.

    Reads the module-level `data` (parsed polygons.jsonl records),
    `categories_ids` (name -> id) and `categories` (COCO category list).

    images_ids: iterable of tile ids to include.
    image_size: (height, width) recorded for every image and used to rasterise
                the polygons; defaults to 512x512.

    Returns a dict with "info", "licenses", "categories", "images" and
    "annotations". Every annotation carries an uncompressed RLE segmentation,
    an [x, y, width, height] bbox, its pixel area and iscrowd=0. Annotation ids
    start at 1 and increase in the order annotations are emitted.

    Annotations whose type is not in `categories_ids`, that have no coordinates,
    or that rasterise to an empty mask are skipped.
    """
    wanted_ids = set(images_ids)  # O(1) membership instead of scanning a list
    height, width = image_size

    # Filter first so the progress bar counts exactly the records processed.
    selected_items = [item for item in data if item["id"] in wanted_ids]

    next_annotation_id = 1
    annotations = []
    images = []
    for item in tqdm(selected_items, total=len(selected_items)):
        image_id = item["id"]
        images.append({"id": image_id, "file_name": image_id + ".tif", "height": height, "width": width})
        #-----------------------------
        for an in item["annotations"]:
            category_type = an["type"]
            # Export only the categories configured in `categories_list`.
            if category_type not in categories_ids:
                continue
            category_id = categories_ids[category_type]

            masks = coordinates_to_masks(an["coordinates"], (height, width))
            if not masks:
                continue
            mask_img = masks[0]  # only the first polygon of the annotation is used

            ys, xs = np.where(mask_img)
            if xs.size == 0:
                # Nothing was drawn, so no bbox can be derived.
                continue
            x1, x2 = xs.min(), xs.max()
            y1, y2 = ys.min(), ys.max()

            annotations.append({
                "id": next_annotation_id,
                "image_id": image_id,
                "category_id": category_id,
                "segmentation": binary_mask_to_rle(mask_img),
                "bbox": [int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1)],
                "area": int(np.sum(mask_img)),
                "iscrowd": 0,
            })
            next_annotation_id += 1

    return {"info": {}, "licenses": [], "categories": categories, "images": images, "annotations": annotations}

### Saving COCO

In [10]:
# Export the train/valid COCO annotation files for each selected fold
# (currently only fold 5).
for selected_fold in range(5, 6):
#     df = df[df.dataset==1].reset_index(drop=True)
    in_valid_fold = df['fold'] == selected_fold
    train_ids = df.loc[~in_valid_fold, 'id'].tolist()
    valid_ids = df.loc[in_valid_fold, 'id'].tolist()
    print(len(train_ids), len(valid_ids))

    # Build both structures before writing anything to disk.
    train_coco_data = coco_structure(train_ids)
    valid_coco_data = coco_structure(valid_ids)

    for split_name, coco_data in (("train", train_coco_data), ("valid", valid_coco_data)):
        output_file_path = f"coco_annotations_PURE_{split_name}_class_{len(categories_list)}_folds{n_splits}_fold{selected_fold}.json"
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            json.dump(coco_data, output_file, ensure_ascii=True, indent=4)

1307 326


  0%|          | 0/1307 [00:00<?, ?it/s]

  0%|          | 0/326 [00:00<?, ?it/s]

### Creating DataFrame

In [11]:
def dataframe_structure():
    """Flatten the polygon annotations into a list of per-annotation records.

    Reads the module-level `data` (parsed polygons.jsonl records), `df` (tile
    metadata), `categories_ids` (name -> id) and `ids_categories` (id -> name).

    Returns a list of dicts, one per (annotation, matching `df` row), holding the
    image id, `source_wsi`, `dataset`, category id and name, the "start length"
    RLE string of the mask, an (x, y, width, height) bbox, the pixel area and
    iscrowd=0.

    Images without a row in `df`, annotations whose type is not in
    `categories_ids`, and annotations that rasterise to nothing are skipped.
    """
    annotations = []
    for item in tqdm(data):
        image_id = item["id"]
        # The metadata lookup depends only on the image, so do it once per image
        # rather than once per annotation.
        meta_rows = df[df["id"] == image_id]
        if meta_rows.empty:
            continue

        for an in item["annotations"]:
            category_type = an["type"]
            # Types outside `categories_list` have no id; skip them rather than
            # failing with KeyError.
            if category_type not in categories_ids:
                continue
            category_id = categories_ids[category_type]

            masks = coordinates_to_masks(an["coordinates"], (512, 512))
            if not masks:
                continue
            mask_img = masks[0]  # only the first polygon of the annotation is used

            ys, xs = np.where(mask_img)
            if xs.size == 0:
                # Nothing was drawn, so no bbox can be derived.
                continue
            x1, x2 = xs.min(), xs.max()
            y1, y2 = ys.min(), ys.max()

            # These values are identical for every matching metadata row.
            encoded_mask = rle_encoding(mask_img)
            bbox = (int(x1), int(y1), int(x2 - x1 + 1), int(y2 - y1 + 1))
            area = int(np.sum(mask_img))

            for _, row in meta_rows.iterrows():
                annotations.append({
                    "image_id": image_id,
                    "source_wsi": row["source_wsi"],
                    "dataset": row["dataset"],
                    "category_id": int(category_id),
                    "category_name": ids_categories[category_id],
                    "annotations": encoded_mask,
                    "bbox": bbox,
                    "area": area,
                    "iscrowd": 0,
                })

    return annotations

In [12]:
#data_dict = dataframe_structure()
#df_train = pd.DataFrame(data_dict)
#df_train.head(10)

In [13]:
#df_train.to_csv("df_train.csv")

In [14]:
#!zip -r file.zip /kaggle/working
#from IPython.display import FileLink
#FileLink(r'file.zip')

### Visualization

In [15]:
# dataDir = Path("/kaggle/input/hubmap-hacking-the-human-vasculature/train")
# annFile = Path(f"coco_annotations_PURE_train_class_{len(categories_list)}_folds{n_splits}_fold{selected_fold}.json")

# colors = ['Set1','Set3','Set3_r'] 
# legend = ids_categories #{1: 'blood_vessel',2:glomerulus....}

# coco = COCO(annFile)
# imgIds = coco.getImgIds()
# imgs = coco.loadImgs(imgIds[0:4])

# fig, axs = plt.subplots(len(imgs), 2, figsize=(10, 5*len(imgs)))
# for img, ax_row in zip(imgs, axs):
#     ax = ax_row[0]  # Access the first axis in each row
#     I = io.imread(dataDir / img["file_name"])
#     annIds = coco.getAnnIds(imgIds=[img["id"]])
#     anns = coco.loadAnns(annIds)
#     ax.imshow(I)
#     ax = ax_row[1]  # Access the second axis in each row
#     ax.imshow(I)
#     plt.sca(ax)
#     for i, ann in enumerate(anns):
#         category_id = ann['category_id']
#         color = colors[category_id-1]
#         #-----------------------------------------
#         mask = coco.annToMask(ann)
#         mask = np.ma.masked_where(mask == 0, mask)
#         ax.imshow(mask, cmap=color, alpha=0.8)
#         #-----------------------------------------
#         handles = []
#         for category_id in legend:
#             color = colors[category_id - 1]
#             handles.append(mpatches.Patch(color=plt.colormaps.get_cmap(color)(0)))
#         ax.legend(handles, legend.values(), bbox_to_anchor=(1.05, 1), loc='upper left')

# plt.axis('off')
# plt.show()