Extract metadata from whole-slide images (WSI) using `histolab` and `OpenSlide`.

In [8]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from glob import glob

# Current working directory of the kernel; printed so the reader can see where the notebook ran from
dirpath = Path.cwd()
print(dirpath)

/Users/apartin/work/jdacs/pdx-histo/nbs


In [9]:
from histolab.slide import Slide
# from histolab.tiler import RandomTiler, GridTiler, ScoreTiler
# from histolab.scorer import NucleiScorer

import openslide
# from openslide import OpenSlide

In [10]:
# Input locations, built relative to the working directory
datapath = dirpath/'../data'
imgpath = datapath/'svs_images'  # path to raw WSI data
# outpath = datapath/'processed'        # path to save processed images
# os.makedirs(outpath, exist_ok=True)

In [11]:
# Collect every .svs file in the image directory, in sorted path order
files = sorted(imgpath.glob('*.svs'))
print(f'Total files: {len(files)}')

# Fail early with a readable message instead of a bare IndexError on files[0]
assert files, f'No .svs files found in {imgpath}'
print(files[0].stem)  # file name without its extension

Total files: 146
10394


In [12]:
# Create an instance of class Slide for the first file
#   path:           path where the WSI is saved
#   processed_path: path where thumbnails and scaled images will be saved to
f_name = files[0]
img_inpath = str(f_name)
img_outpath = os.path.join(str(datapath), 'processed', f_name.stem)
pdx_slide = Slide(path=img_inpath, processed_path=img_outpath)

print(f"Slide name:            {pdx_slide.name}")
print(f"Levels:                {pdx_slide.levels}")
print(f"Dimensions at level 0: {pdx_slide.dimensions}")
# Report every additional level the slide actually has, rather than
# assuming that exactly levels 1 and 2 exist.
for lvl in pdx_slide.levels[1:]:
    print(f"Dimensions at level {lvl}: {pdx_slide.level_dimensions(level=lvl)}")

Slide name:            10394
Levels:                [0, 1, 2]
Dimensions at level 0: (21911, 24376)
Dimensions at level 1: (5477, 6094)
Dimensions at level 2: (1369, 1523)


In [13]:
# Look at the object stored in the Slide's `_wsi` attribute and its property map
wsi = pdx_slide._wsi
props = wsi.properties

print('Type:            ', type(props))
print('Total properties:', len(props))
print('AppMag value:    ', props['aperio.AppMag'])  # look up a single property by key
mag = int(props['aperio.AppMag'])

# Level structure reported by the same object
print('Level count:      ', wsi.level_count)
print('Level downsamples:', wsi.level_downsamples)
print('Level dimensions: ', wsi.level_dimensions)

Type:             <class 'openslide._PropertyMap'>
Total properties: 47
AppMag value:     20
Level count:       3
Level downsamples: (1.0, 4.00027387255797, 16.005183005937205)
Level dimensions:  ((21911, 24376), (5477, 6094), (1369, 1523))


## Sampling and Resolution

In [16]:
def calc_eff_mpp(slide, level=0):
    """Return the effective microns-per-pixel (MPP) of `slide` at `level`.

    effective MPP = downsample factor of the level x MPP-x property

    Args:
        slide: object whose `_wsi` attribute provides `level_downsamples`
            (indexable by level) and `properties` (a mapping that contains
            the key `openslide.PROPERTY_NAME_MPP_X`).
        level: index into `level_downsamples`; defaults to 0.

    Returns:
        The level's downsample factor multiplied by the MPP-x property
        converted to float. The downsample, level and result are also
        printed.
    """
    wsi = slide._wsi
    downsample = wsi.level_downsamples[level]
    base_mpp = float(wsi.properties[openslide.PROPERTY_NAME_MPP_X])
    mpp_eff = downsample * base_mpp

    report = (
        ('Downsample:', downsample),
        ('Level:     ', level),
        ('MPP (um):  ', mpp_eff),
    )
    for label, value in report:
        print(label, value)
    return mpp_eff

In [17]:
# Effective MPP at levels 0, 1 and 2; `mpp_eff` ends up holding the level-2 value
for lvl in (0, 1, 2):
    mpp_eff = calc_eff_mpp(slide=pdx_slide, level=lvl)

Downsample: 1.0
Level:      0
MPP (um):   0.5006
Downsample: 4.00027387255797
Level:      1
MPP (um):   2.00253710060252
Downsample: 16.005183005937205
Level:      2
MPP (um):   8.012194612772166


In [18]:
# Physical tile size: tile side in pixels x effective microns-per-pixel
tile_px = 300  # tile side length in pixels
level = 0      # level at which the tile size is evaluated

# Pass `level` through so that editing it above actually changes the result
# (the call previously hard-coded level=0 and ignored the variable).
mpp_eff = calc_eff_mpp(pdx_slide, level=level)

tile_um = mpp_eff * tile_px
print('Tile (um):', tile_um)

Downsample: 1.0
Level:      0
MPP (um):   0.5006
Tile (um): 150.18


## Aggregate metadata from all raw slides

In [8]:
# Aggregate metadata into df from all slides
#
# NOTE: the loop rebinds f_name, img_inpath, img_outpath and pdx_slide, so
# after this cell they refer to the last entry of `files`.

# Properties left out of the table (same list for every slide, so defined once)
ignore_property = ['aperio.User', 'openslide.comment', 'openslide.quickhash-1', 'tiff.ImageDescription']

meta_list = []  # list of dicts, one per slide
for f_name in files:
    # Load slide
    img_inpath = str(f_name)
    img_outpath = os.path.join(str(datapath), 'processed', f_name.stem)
    pdx_slide = Slide(path=img_inpath, processed_path=img_outpath)

    # Dict with the slide metadata (properties), minus the ignored keys
    props = pdx_slide._wsi.properties
    meta = {p_name: props[p_name] for p_name in props if p_name not in ignore_property}

    # Append the slide meta to a list
    meta_list.append(meta)

# Create df with alphabetically sorted columns and the image ID moved to the front
meta_df = pd.DataFrame(meta_list)
assert 'aperio.ImageID' in meta_df.columns, 'No slide provides the aperio.ImageID property'
cols = ['aperio.ImageID'] + [c for c in sorted(meta_df.columns) if c != 'aperio.ImageID']
meta_df = meta_df[cols]
print('Shape', meta_df.shape)

Shape (146, 45)


In [9]:
# Write the aggregated slide metadata to CSV in the data directory (row index omitted)
meta_csv_path = datapath / 'meta_from_wsi_images.csv'
meta_df.to_csv(meta_csv_path, index=False)

In [10]:
# Transposed view: one row per property, one column per slide
display(meta_df.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,136,137,138,139,140,141,142,143,144,145
aperio.ImageID,10394,10545,10547,10803,11055,11107,11153,11411,11520,11524,...,9058,9113,9236,9237,9453,9458,9636,9637,9671,9970
aperio.AppMag,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
aperio.DSR ID,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,...,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR
aperio.Date,03/20/15,04/02/15,04/02/15,04/20/15,05/08/15,05/08/15,05/13/15,05/20/15,05/29/15,05/29/15,...,12/15/14,12/16/14,12/30/14,12/30/14,01/13/15,01/13/15,01/23/15,01/23/15,01/27/15,02/13/15
aperio.DisplayColor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aperio.Exposure Scale,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,...,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001
aperio.Exposure Time,32,32,32,32,32,32,32,32,32,32,...,32,32,32,32,32,32,32,32,32,32
aperio.Filename,10394,10545,10547,10803,11055,11107,11153,11411,11520,11524,...,9058,9113,9236,9237,9453,9458,9636,9637,9671,9970
aperio.Focus Offset,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,...,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500
aperio.ICC Profile,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,...,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1


## Not all slides were scanned with the same resolution

In [11]:
# Number of slides per distinct MPP-x value (values are kept exactly as read from the slide properties)
meta_df['openslide.mpp-x'].value_counts()

0.50270000000000004    104
0.50060000000000004     42
Name: openslide.mpp-x, dtype: int64

In [12]:
# Number of slides per distinct MPP-y value (values are kept exactly as read from the slide properties)
meta_df['openslide.mpp-y'].value_counts()

0.50270000000000004    104
0.50060000000000004     42
Name: openslide.mpp-y, dtype: int64