In [1]:
import os
# os.environ['HF_HUB_DISABLE_PROGRESS_BARS'] = '1'

In [2]:
import openslide
import huggingface_hub as hfh
import numpy as np
import cv2
import pandas as pd
from PIL import Image
from tqdm.notebook import tqdm
import shutil
from xml.etree import ElementTree as ET
from PIL import Image, ImageDraw
import os

In [3]:
hfh.login("INSERT TOKEN HERE")

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [4]:
hfh.utils.disable_progress_bars()

In [5]:
external_test_repo_id = "ain3007-project/raw-external-test-dataset"
internal_test_repo_id = "ain3007-project/raw-internal-test-dataset"
output_test_repo_id = "ain3007-project/test-dataset-thumbnails-2000"

In [6]:
hfh.create_repo(output_test_repo_id, private=True, repo_type="dataset", exist_ok=True)


RepoUrl('https://huggingface.co/datasets/ain3007-project/test-dataset-thumbnails-2000', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ain3007-project/test-dataset-thumbnails-2000')

In [7]:
# get all the filenames
all_files_external = list(hfh.list_files_info(repo_id=external_test_repo_id, repo_type="dataset"))
all_files_internal = list(hfh.list_files_info(repo_id=internal_test_repo_id, repo_type="dataset"))
all_files_external[:5]

[RepoFile(path='.gitattributes', size=5257, blob_id='9408dffb7f14703972c362793536d1d614974372', lfs=None, last_commit=None, security=None),
 RepoFile(path='breast__he__10068.ndpi', size=162377221, blob_id='6998801e706dcfd60bfecaab5fbbfb9e03255cf5', lfs={'size': 162377221, 'sha256': 'a2495145aca18cff0c70e13ae51e571f368873607ca5929eeffc5263edde5021', 'pointer_size': 134}, last_commit=None, security=None),
 RepoFile(path='breast__he__10068.xml', size=180740, blob_id='c6988351db7e295fb9db996b86815c005b068b2b', lfs=None, last_commit=None, security=None),
 RepoFile(path='breast__he__12008.svs', size=431400056, blob_id='d28fdb9fa59fb56327500576026b35235403bec2', lfs={'size': 431400056, 'sha256': 'ee210197ef65486331edf911c17a2f91c6b55cb8a47883ea352a3ca3d135363a', 'pointer_size': 134}, last_commit=None, security=None),
 RepoFile(path='breast__he__12008.xml', size=851663, blob_id='d7208080afb08d83082e1e51289af0a4bddaf13a', lfs=None, last_commit=None, security=None)]

In [8]:
file_names_external = list(map(lambda x: x.rfilename, all_files_external))
file_names_internal = list(map(lambda x: x.rfilename, all_files_internal))
file_names_external[:5]

['.gitattributes',
 'breast__he__10068.ndpi',
 'breast__he__10068.xml',
 'breast__he__12008.svs',
 'breast__he__12008.xml']

In [9]:
file_names_external.remove(".gitattributes")
file_names_internal.remove(".gitattributes")

file_names_external[:5]

['breast__he__10068.ndpi',
 'breast__he__10068.xml',
 'breast__he__12008.svs',
 'breast__he__12008.xml',
 'breast__he__12066.svs']

In [10]:
file_extensions_external = list(set(list(map(lambda x: x.split(".")[-1], file_names_external))))
file_extensions_external

['ndpi', 'svs', 'xml', 'tiff']

In [11]:
file_extensions_internal = filter(lambda x: ("/" not in x), file_names_internal)
file_extensions_internal = list(set(list(map(lambda x: x.split(".")[-1], file_extensions_internal))))
file_extensions_internal

['xml', 'tif', 'svs', 'tiff', 'mrxs', 'ndpi']

In [12]:
file_extensions_external.remove("xml")
file_extensions_internal.remove("xml")

In [13]:
!mkdir -p images masks

In [14]:
def str2float(x):
    return float(x.replace(",", "."))
    
def get_mask_from_xml(xml_path, image_size, image_shrinking_factor):
    tree = ET.parse(xml_path)
    root = tree.getroot()
    image = Image.new("L", image_size, "white")
    draw = ImageDraw.Draw(image)
    draw.fill = True
    label2grayscale_color = {"bg": 0, "tissue": 1, "tisuue": 1}
    for i in root[0]:
        annotation_type = i.attrib["Type"]
        annotation_label = i.attrib["PartOfGroup"]
        # there is roi rectangle
        if annotation_type not in ["Spline", "Polygon", "Rectangle"]:
            print(f"Annotation type must be either Spline, Rectangle or Polygon but found: {annotation_type}")
            continue
            
        if annotation_label not in label2grayscale_color:
            print(f"Annotation label must be either tissue or bg but found: {annotation_label}")
            continue
        
        coordinates = [(i.attrib["X"], i.attrib["Y"]) for i in i[0]]
        coordinates = [(str2float(x), str2float(y)) for x, y in coordinates]
        coordinates = [(x*image_shrinking_factor, y*image_shrinking_factor) for x, y in coordinates]
        
        if annotation_type in ["Spline", "Polygon"]:
            draw.polygon(coordinates, fill=label2grayscale_color[annotation_label])
        elif annotation_type == "Rectangle":
            # ^
            # |         point 1 is bigger than point 3
            # | 0 1
            # | 3 2 
            # |------->
            draw.rectangle([coordinates[3], coordinates[1]], fill=label2grayscale_color[annotation_label])
#         if annotation_type == "Spline":
#             draw.line(coordinates, fill=label2grayscale_color[annotation_label], width=1)
#         elif annotation_type == "Polygon":
#             draw.polygon(coordinates, fill=label2grayscale_color[annotation_label])
    return image

In [15]:
def process(file_list, image_idx, metadata_df, thumbnail_size=(2000, 2000), source_repo_id = None):
    for allow_pattern, image_path, xml_path in file_list:
        path = hfh.snapshot_download(
            repo_id=source_repo_id,
            repo_type="dataset", 
            allow_patterns=allow_pattern,
            cache_dir="hf_folder",
        )

        try:
            slide_file = openslide.OpenSlide(os.path.join(path, image_path))
        except openslide.OpenSlideUnsupportedFormatError:
            print(f"image {image_path} didnt get recognized by OpenSlide. OpenSlideUnsupportedFormatError")
            image_idx += 1
            shutil.rmtree('hf_folder')
            continue
        except openslide.OpenSlideError:
            print(f"image {image_path} didnt get recognized by OpenSlide. OpenSlideError")
            image_idx += 1
            shutil.rmtree('hf_folder')
            continue


        thumbnail = slide_file.get_thumbnail(thumbnail_size)
        thumbnail.save(f"images/{image_idx}.png")

        image_shrinking_factor = min(thumbnail.size) / min(slide_file.dimensions)
        
        if xml_path is not None:
            mask = get_mask_from_xml(
                os.path.join(path, xml_path),
                thumbnail.size,
                image_shrinking_factor
            )
            mask.save(f"masks/{image_idx}.png")
            
        metadata_df.loc[len(metadata_df)] = [
            image_idx,
            image_path,
            source_repo_id,
            slide_file.dimensions[0],
            slide_file.dimensions[1],
            slide_file.level_count,
            slide_file.properties["openslide.vendor"],
            image_shrinking_factor,
        ]

            
        shutil.rmtree('hf_folder')
        print(f"finished {image_idx} image of {source_repo_id}")
        image_idx += 1
        
    return image_idx, metadata_df

In [16]:
def get_allow_list(file_id, file_extension):
    if file_extension == "mrxs":
        return (file_id+"/*", file_id+".*"), file_id+".mrxs", file_id+".xml"
    else:
        return (file_id+"*", file_id+f".{file_extension}", file_id+".xml")


In [17]:
# generate allow patterns for processing
external_file_list = []
for file_extension in file_extensions_external:
    external_unique_ids = [f.replace(f".{file_extension}", "") for f in file_names_external if f.endswith(f".{file_extension}")]
    external_file_list.extend([get_allow_list(file_id, file_extension) for file_id in external_unique_ids])
external_file_list[:5]
    

[('breast__he__10068*', 'breast__he__10068.ndpi', 'breast__he__10068.xml'),
 ('breast__he__13131*', 'breast__he__13131.ndpi', 'breast__he__13131.xml'),
 ('breast__he__13997*', 'breast__he__13997.ndpi', 'breast__he__13997.xml'),
 ('breast__he__14128*', 'breast__he__14128.ndpi', 'breast__he__14128.xml'),
 ('breast__he__14327*', 'breast__he__14327.ndpi', 'breast__he__14327.xml')]

In [18]:
internal_file_list = []
for file_extension in file_extensions_internal:
    internal_unique_ids = [f.replace(f".{file_extension}", "") for f in file_names_internal if f.endswith(f".{file_extension}")]
    internal_file_list.extend([get_allow_list(file_id, file_extension) for file_id in internal_unique_ids])
internal_file_list[:5]

[('lymphnode__he__11849*',
  'lymphnode__he__11849.tif',
  'lymphnode__he__11849.xml'),
 ('lymphnode__he__12526*',
  'lymphnode__he__12526.tif',
  'lymphnode__he__12526.xml'),
 ('lymphnode__he__12883*',
  'lymphnode__he__12883.tif',
  'lymphnode__he__12883.xml'),
 ('lymphnode__he__13209*',
  'lymphnode__he__13209.tif',
  'lymphnode__he__13209.xml'),
 ('lymphnode__he__14637*',
  'lymphnode__he__14637.tif',
  'lymphnode__he__14637.xml')]

In [19]:
# external_unique_ids = [f.replace(".xml", "") for f in file_names_external if f.endswith("xml")]
# internal_unique_ids = [f.replace(".xml", "") for f in file_names_internal if f.endswith("xml")]

# now lest make empty metadata dataframe
metadata_df = pd.DataFrame(columns=["image_id", "file_name", "source_dataset", "image-dimension_x", "image-dimension_y", "image-levels", "image-format", "mask_shrink_factor"])
metadata_df

Unnamed: 0,image_id,file_name,source_dataset,image-dimension_x,image-dimension_y,image-levels,image-format,mask_shrink_factor


In [20]:
hfh.utils.are_progress_bars_disabled()

True

In [21]:
# process and save things to a folder
image_idx = 0
image_idx, metadata_df = process(external_file_list, image_idx, metadata_df, thumbnail_size=(2000, 2000), source_repo_id=external_test_repo_id)

finished 0 image of ain3007-project/raw-external-test-dataset
finished 1 image of ain3007-project/raw-external-test-dataset
finished 2 image of ain3007-project/raw-external-test-dataset
finished 3 image of ain3007-project/raw-external-test-dataset
finished 4 image of ain3007-project/raw-external-test-dataset
finished 5 image of ain3007-project/raw-external-test-dataset
finished 6 image of ain3007-project/raw-external-test-dataset
finished 7 image of ain3007-project/raw-external-test-dataset
Annotation label must be either tissue or bg but found: None
finished 8 image of ain3007-project/raw-external-test-dataset
finished 9 image of ain3007-project/raw-external-test-dataset
finished 10 image of ain3007-project/raw-external-test-dataset
finished 11 image of ain3007-project/raw-external-test-dataset
finished 12 image of ain3007-project/raw-external-test-dataset
finished 13 image of ain3007-project/raw-external-test-dataset
finished 14 image of ain3007-project/raw-external-test-dataset
fini

In [22]:
metadata_df

Unnamed: 0,image_id,file_name,source_dataset,image-dimension_x,image-dimension_y,image-levels,image-format,mask_shrink_factor
0,0,breast__he__10068.ndpi,ain3007-project/raw-external-test-dataset,65536,46080,10,hamamatsu,0.030512
1,1,breast__he__13131.ndpi,ain3007-project/raw-external-test-dataset,45056,44544,10,hamamatsu,0.044383
2,2,breast__he__13997.ndpi,ain3007-project/raw-external-test-dataset,49152,55808,10,hamamatsu,0.035828
3,3,breast__he__14128.ndpi,ain3007-project/raw-external-test-dataset,53248,45568,10,hamamatsu,0.03757
4,4,breast__he__14327.ndpi,ain3007-project/raw-external-test-dataset,40960,80128,9,hamamatsu,0.024951
5,5,breast__he__15344.ndpi,ain3007-project/raw-external-test-dataset,57344,56064,9,hamamatsu,0.034871
6,6,breast__he__17472.ndpi,ain3007-project/raw-external-test-dataset,57344,52480,9,hamamatsu,0.03487
7,7,breast__he__18267.ndpi,ain3007-project/raw-external-test-dataset,57344,42496,10,hamamatsu,0.034874
8,8,breast__he__19546.ndpi,ain3007-project/raw-external-test-dataset,36864,39424,10,hamamatsu,0.050727
9,9,breast__he__19951.ndpi,ain3007-project/raw-external-test-dataset,49152,46592,10,hamamatsu,0.040694


In [23]:
image_idx, metadata_df = process(internal_file_list, image_idx, metadata_df, thumbnail_size=(2000, 2000), source_repo_id=internal_test_repo_id)
image_idx

Annotation label must be either tissue or bg but found: roi
finished 60 image of ain3007-project/raw-internal-test-dataset
Annotation label must be either tissue or bg but found: roi
finished 61 image of ain3007-project/raw-internal-test-dataset
Annotation label must be either tissue or bg but found: roi
finished 62 image of ain3007-project/raw-internal-test-dataset
Annotation label must be either tissue or bg but found: roi
finished 63 image of ain3007-project/raw-internal-test-dataset
Annotation label must be either tissue or bg but found: roi
finished 64 image of ain3007-project/raw-internal-test-dataset
Annotation label must be either tissue or bg but found: roi
finished 65 image of ain3007-project/raw-internal-test-dataset
Annotation label must be either tissue or bg but found: roi
finished 66 image of ain3007-project/raw-internal-test-dataset
Annotation label must be either tissue or bg but found: roi
finished 67 image of ain3007-project/raw-internal-test-dataset
Annotation label

110

In [24]:
metadata_df.to_csv("metadata.csv", index=False)

In [25]:
api = hfh.HfApi()

# upload zipped folder
!zip -r images.zip images
!zip -r masks.zip masks

api.upload_file(
    path_or_fileobj="images.zip",
    path_in_repo="images.zip",
    repo_id=output_test_repo_id,
    repo_type="dataset",
)

api.upload_file(
    path_or_fileobj="masks.zip",
    path_in_repo="masks.zip",
    repo_id=output_test_repo_id,
    repo_type="dataset",
)

api.upload_file(
    path_or_fileobj="metadata.csv",
    path_in_repo="metadata.csv",
    repo_id=output_test_repo_id,
    repo_type="dataset",
)

  adding: images/ (stored 0%)
  adding: images/104.png (deflated 0%)
  adding: images/98.png (deflated 6%)
  adding: images/6.png (deflated 0%)
  adding: images/103.png (deflated 1%)
  adding: images/53.png (deflated 0%)
  adding: images/99.png (deflated 6%)
  adding: images/75.png (deflated 0%)
  adding: images/44.png (deflated 0%)
  adding: images/71.png (deflated 0%)
  adding: images/87.png (deflated 2%)
  adding: images/9.png (deflated 0%)
  adding: images/96.png (deflated 9%)
  adding: images/29.png (deflated 0%)
  adding: images/105.png (deflated 1%)
  adding: images/78.png (deflated 0%)
  adding: images/54.png (deflated 1%)
  adding: images/80.png (deflated 6%)
  adding: images/83.png (deflated 2%)
  adding: images/109.png (deflated 0%)
  adding: images/69.png (deflated 1%)
  adding: images/38.png (deflated 1%)
  adding: images/40.png (deflated 0%)
  adding: images/50.png (deflated 1%)
  adding: images/77.png (deflated 0%)
  adding: images/61.png (deflat

'https://huggingface.co/datasets/ain3007-project/test-dataset-thumbnails-2000/blob/main/metadata.csv'