# Creating a Dataset out of Real EM Images

The annotations for the real EM images are crudely traced skeletons of the fibres. They need to be converted into masks, and then into the dataset formats required for fine-tuning.

In [25]:
import os
import cv2 as cv
import numpy as np
from tqdm import tqdm

from src.data_utils import save_label_sam, save_label_yolo

In [26]:
import matplotlib.pyplot as plt

In [27]:
# Root folder holding the hand-annotated real EM image sets.
_REAL_DATA_ROOT = '../data/real datasets'

# Each set contains a 'png files' folder (images) and an 'output' folder
# (traced skeletons stored as .npy files).
DATASET_1 = f'{_REAL_DATA_ROOT}/set 1'
DATASET_2 = f'{_REAL_DATA_ROOT}/set 2'

# Destination root of the converted dataset (images/train, labels/train).
NEW_DATASET = '../data/datasets/real_dataset'

In [28]:
# Convert the traced fibre skeletons of every real EM image into
#   * one binary mask per fibre (SA1B-style labels, see save_label_sam), and
#   * simplified polygon outlines (see save_label_yolo),
# then overlay the skeletons on the image as a visual sanity check.

# Raw label coordinates are divided by this factor because the labels are not
# the same size as the image.
# NOTE(review): empirical value -- confirm where it comes from.
LABEL_SCALE = 1.765
# Stroke width (pixels) used when rasterising a skeleton into a fibre mask.
FIBRE_THICKNESS = 20
# Polygon simplification tolerance, as a fraction of the contour perimeter.
EPSILON_FRACTION = 0.003
# Offset added to the per-directory file index in the output names.
INDEX_OFFSET = 2000
# Writing to NEW_DATASET is disabled by default; set to True to export.
SAVE_OUTPUTS = False

# Output locations do not depend on the image, so compute them once.
new_imdir = os.path.join(NEW_DATASET, 'images/train')
new_lbldir = os.path.join(NEW_DATASET, 'labels/train')

IMAGES = []
LABELS = []
for ds in [DATASET_1, DATASET_2]:
    IMAGE_DIR = os.path.join(ds, 'png files')

    # os.listdir returns entries in arbitrary order; sorting keeps the index
    # that ends up in the output names reproducible between runs.
    for i, imname in enumerate(sorted(os.listdir(IMAGE_DIR))):
        if not imname.endswith('.png'):
            continue
        stem = os.path.splitext(imname)[0]
        raw_label = os.path.join(ds, 'output/{}.npy'.format(stem))
        impath = os.path.join(IMAGE_DIR, imname)

        IMAGES.append(impath)
        LABELS.append(raw_label)

        img = cv.imread(impath)
        if img is None:
            # cv.imread signals failure by returning None instead of raising.
            raise OSError(f'Could not read image: {impath}')

        # NOTE: allow_pickle=True executes pickled content on load; only use
        # it on label files from a trusted source.
        lbl = np.array(np.load(raw_label, allow_pickle=True)) / LABEL_SCALE

        # Invert the Y axis (y -> |y - image height|) so the skeletons line up
        # with the image; presumably the annotations use a bottom-left origin.
        height = img.shape[0]
        for fibre in lbl:
            fibre[:, 1] = np.abs(fibre[:, 1] - height)

        # Creating labels
        contours = []
        masks = []
        for fibre in tqdm(lbl):
            # Rasterise the skeleton as a thick open polyline on an empty mask.
            mask = np.zeros(img.shape[:2], np.uint8)
            cv.polylines(mask, [fibre.astype(np.int32)], isClosed=False,
                         thickness=FIBRE_THICKNESS, color=255)
            # Binary mask per fibre, for the SA1B format.
            masks.append(mask)

            # Outer outline(s) of the mask, simplified to polygons.
            contours_data, _ = cv.findContours(mask, mode=cv.RETR_EXTERNAL,
                                               method=cv.CHAIN_APPROX_NONE)
            for contour in contours_data:
                epsilon = EPSILON_FRACTION * cv.arcLength(contour, closed=True)
                approx = cv.approxPolyDP(contour, epsilon=epsilon, closed=True)
                # A polygon needs at least three vertices.
                if len(approx) > 2:
                    contours.append(approx)

        # Saving labels: once per image, after all fibres have been processed.
        # The name is built from the stem so it does not contain '.png' twice.
        new_imname = f'{stem}_{i + INDEX_OFFSET}'
        if SAVE_OUTPUTS:
            os.makedirs(new_imdir, exist_ok=True)
            os.makedirs(new_lbldir, exist_ok=True)
            cv.imwrite(os.path.join(new_imdir, f'{new_imname}.png'), img)
            save_label_yolo(new_imname, contours, img.shape[:2], new_lbldir)
            save_label_sam(new_imname, masks, new_imdir)

        # PLOTTING: overlay every skeleton on its image.
        plt.imshow(img, cmap='gray')
        for fibre in lbl:
            plt.plot(fibre[:, 0], fibre[:, 1], linewidth=2, marker='o', ms=3, alpha=0.5)
        plt.show()
        


# The End.