In [1]:
import requests
import re
import json
import subprocess
import pandas as pd
import os
import h5py
import openslide
import numpy as np
import random


In [2]:
def get_case(file_id, dir):
    """Download one file from the GDC data endpoint and store it inside ``dir``.

    The local file name is taken from the ``Content-Disposition`` response
    header. The payload is streamed to a temporary ``.part`` file and only
    renamed to its final name once the transfer finished, so an interrupted
    download never leaves a truncated file under the final name.

    Args:
        file_id: Identifier appended to ``https://api.gdc.cancer.gov/data/``.
        dir: Existing directory that receives the downloaded file.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        ValueError: If no file name can be read from the response headers.
    """
    data_endpt = "https://api.gdc.cancer.gov/data/{}".format(file_id)

    # stream=True avoids holding the whole payload in memory; the timeout is
    # (connect, read) seconds so a stalled connection cannot hang forever.
    with requests.get(
        data_endpt,
        headers={"Content-Type": "application/json"},
        stream=True,
        timeout=(10, 300),
    ) as response:
        # Fail loudly instead of saving an error body as if it were the file.
        response.raise_for_status()

        # The file name can be found in the header within the Content-Disposition key.
        response_head_cd = response.headers.get("Content-Disposition", "")
        match = re.search(r'filename="?([^";]+)"?', response_head_cd)
        if match is None:
            raise ValueError(
                "No filename in Content-Disposition header for file_id {}".format(file_id)
            )

        # basename() keeps a server-supplied name from escaping ``dir``.
        file_name = os.path.basename(match.group(1).strip())
        file_dir = os.path.join(dir, file_name)
        partial_path = file_dir + ".part"

        try:
            with open(partial_path, "wb") as output_file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    output_file.write(chunk)
            # Atomic rename: the final name only ever refers to a complete file.
            os.replace(partial_path, file_dir)
        finally:
            # Still present only when the transfer failed or was interrupted.
            if os.path.exists(partial_path):
                os.remove(partial_path)

def get_data(dataset_type):
    """Download every file listed in a manifest that is not yet on disk.

    Reads ``manifest_{dataset_type}_filtered.csv`` from the working directory
    and, for each row, downloads the file via ``get_case`` into
    ``{dataset_type}_dataset`` unless a file with the manifest's ``filename``
    already exists there.

    Args:
        dataset_type: Name used to build both the manifest file name and the
            output directory name.

    Raises:
        ValueError: If the manifest lacks the ``id`` or ``filename`` column.
    """
    manifest_df = pd.read_csv(f'manifest_{dataset_type}_filtered.csv', sep=',')

    # Check that every column read below is present
    if 'id' not in manifest_df.columns:
        raise ValueError("Manifest file must contain an 'id' column for file_id")
    if 'filename' not in manifest_df.columns:
        raise ValueError("Manifest file must contain a 'filename' column for the local file name")

    directory = f'{dataset_type}_dataset'
    os.makedirs(directory, exist_ok=True)

    for file_id, filename in zip(manifest_df['id'], manifest_df['filename']):
        # NOTE(review): the skip check uses the manifest's filename while
        # get_case saves under the server-provided name - presumably these
        # agree; confirm.
        file_path = os.path.join(directory, filename)

        # Skip anything that has already been downloaded
        if os.path.isfile(file_path):
            print("we skip: ", filename)
            continue

        print("we obtain: ", filename)
        print("with id: ", file_id)
        get_case(file_id, directory)

In [3]:
# Fetch every slide listed in the LUAD manifest; files already on disk are skipped.
dataset = 'luad'
get_data(dataset_type=dataset)

we obtain:  TCGA-49-6745-01Z-00-DX3.40cd3c60-889c-4eaa-be55-36ab5d8b2400.svs
with id:  94a35fa3-7b14-4393-9288-ebc3a37716f1
we obtain:  TCGA-55-8091-01Z-00-DX1.0996c58a-6e93-4092-8cb8-014d548fe60c.svs
with id:  0f1cfc85-c918-4730-b062-b818847b1df7


KeyboardInterrupt: 

In [51]:
# Inspect the pyramid geometry and pixel spacing of one randomly chosen slide.

# Notebook-level settings; none of them is read inside this cell.
target_magnification = 20
target_resolution = 0.5
output_dir = "example_dataset_adj"

# Only consider .svs entries so stray files in the directory are never
# handed to OpenSlide.
dir_files = [name for name in os.listdir("luad_dataset") if name.lower().endswith(".svs")]
random.shuffle(dir_files)

for file in dir_files:
    slide_path = os.path.join('luad_dataset', file)

    # The context manager closes the slide handle once the metadata is read.
    with openslide.OpenSlide(slide_path) as slide:
        # Prints the slide's repr (the path it was opened from).
        print(slide)

        # Full size of the image --> nr of pixels in x and y
        prop0 = slide.dimensions
        # Pixel dimensions of every pyramid level
        prop1 = slide.level_dimensions
        # Downsample factor of every pyramid level
        prop2 = slide.level_downsamples
        # Microns-per-pixel properties; .get() yields None when a key is absent
        mpp_x = slide.properties.get(openslide.PROPERTY_NAME_MPP_X)
        mpp_y = slide.properties.get(openslide.PROPERTY_NAME_MPP_Y)

    print(prop0)
    print(prop1)
    print(prop2)
    print(mpp_x)
    print(mpp_y)
    # Only the first slide of the shuffled list is inspected.
    break

    



OpenSlide('luad_dataset/TCGA-86-A4P8-01Z-00-DX1.D65E8855-F7A1-4584-A501-BBFBC61C2DEB.svs')
(63784, 53264)
((63784, 53264), (15946, 13316), (3986, 3329), (1993, 1664))
(1.0, 4.0, 16.001003512293025, 32.00681471689374)
0.2465
0.2465


In [28]:
def inspect_h5_file(file_path):
    """Print a summary of the top-level datasets in an HDF5 file.

    For every top-level dataset the shape, dtype and attributes are printed.
    Top-level groups are listed by key but not descended into.

    Args:
        file_path: Path of the HDF5 file to open read-only.

    Returns:
        Sum of the first-axis lengths of all top-level datasets. Scalar or
        empty-dataspace datasets have no first axis and contribute 0.
    """
    shapes = 0
    with h5py.File(file_path, 'r') as h5_file:
        # List all top-level keys (groups or datasets)
        print("Top-level keys:", list(h5_file.keys()))

        for key in h5_file.keys():
            print(f"\nKey: {key}")
            item = h5_file[key]
            if not isinstance(item, h5py.Dataset):
                continue

            # A scalar dataset has shape () and a null-dataspace one has
            # shape None; indexing [0] on either would raise.
            if item.shape:
                shapes += item.shape[0]
            print("Dataset shape:", item.shape)
            print("Dataset dtype:", item.dtype)
            print("Dataset attributes:", dict(item.attrs))

    return shapes


In [29]:
# Average number of dataset rows (first-axis length) per patch file.
patches_dir = "luad_patched/patches"
dir_files = os.listdir(patches_dir)
shapes = 0
for file in dir_files:
    print("-------------------------------------")
    shapes += inspect_h5_file(os.path.join(patches_dir, file))

# Guard the division: an empty directory would otherwise raise ZeroDivisionError.
if dir_files:
    shapes /= len(dir_files)
    print(shapes)
else:
    print(f"No files found in {patches_dir}")

-------------------------------------
Top-level keys: ['coords']

Key: coords
Dataset shape: (3519, 2)
Dataset dtype: int64
Dataset attributes: {'downsample': array([1., 1.]), 'downsampled_level_dim': array([54726, 25547]), 'level_dim': array([54726, 25547]), 'name': 'TCGA-44-8119-01Z-00-DX1.1EBEBFA7-22DB-4365-9DF8-C4E679C11312', 'patch_level': np.int64(0), 'patch_size': np.int64(256), 'save_path': 'luad_patched/patches'}
-------------------------------------
Top-level keys: ['coords']

Key: coords
Dataset shape: (54516, 2)
Dataset dtype: int64
Dataset attributes: {'downsample': array([1., 1.]), 'downsampled_level_dim': array([93296, 85755]), 'level_dim': array([93296, 85755]), 'name': 'TCGA-50-5939-01Z-00-DX1.745D7503-0744-46B1-BC89-EBB8FCE2D55C', 'patch_level': np.int64(0), 'patch_size': np.int64(256), 'save_path': 'luad_patched/patches'}
-------------------------------------
Top-level keys: ['coords']

Key: coords
Dataset shape: (12018, 2)
Dataset dtype: int64
Dataset attributes: {'