Extract metadata from WSI images using `Histolab` and `Opneslide`.

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from glob import glob

dirpath = Path.cwd()
print(dirpath)

/Users/apartin/Box/work/jdacs/pdx-histo/nbs


In [2]:
from histolab.slide import Slide
# from histolab.tiler import RandomTiler, GridTiler, ScoreTiler
# from histolab.scorer import NucleiScorer

import openslide
# from openslide import OpenSlide

In [17]:
# Specify path
datapath = dirpath/'../data'
imgpath  = datapath/'doe-globus-pdx-data'  # path to raw WSI data
metapath = datapath/'meta'
# outpath = datapath/'processed'        # path to save processed images
# os.makedirs(outpath, exist_ok=True)

In [4]:
# Glob images
files = sorted(imgpath.glob('*.svs'))
print(f'Total files: {len(files)}')
print(files[0].with_suffix('').name)

Total files: 469
10060


In [5]:
# Create instance of class Slide
# path: path where the WSI is saved
# processed_path: path where thumbnails and scaled images will be saved to
f_name = files[0]
img_inpath = str(f_name)
# img_out_path = os.path.join(str(outpath), files[0].with_suffix('').name)
img_outpath = os.path.join(str(datapath), 'processed', files[0].with_suffix('').name)
pdx_slide = Slide(path=img_inpath, processed_path=img_outpath)

print(f"Slide name:            {pdx_slide.name}")
print(f"Levels:                {pdx_slide.levels}")
print(f"Dimensions at level 0: {pdx_slide.dimensions}")
print(f"Dimensions at level 1: {pdx_slide.level_dimensions(level=1)}")
print(f"Dimensions at level 2: {pdx_slide.level_dimensions(level=2)}")

Slide name:            10060
Levels:                [0, 1, 2]
Dimensions at level 0: (15935, 15220)
Dimensions at level 1: (3983, 3805)
Dimensions at level 2: (1991, 1902)


In [6]:
# Explore the Slide class
print('Type:            ', type(pdx_slide._wsi.properties))
print('Total properties:', len(pdx_slide._wsi.properties))
print('AppMag value:    ', pdx_slide._wsi.properties['aperio.AppMag'])  # access a property
mag = int(pdx_slide._wsi.properties['aperio.AppMag'])

# print(pdx_slide._wsi.properties[openslide.PROPERTY_NAME_MPP_X], '\n')
# print(pdx_slide._wsi.properties, '\n')

print('Level count:      ', pdx_slide._wsi.level_count)  # access a property
print('Level downsamples:', pdx_slide._wsi.level_downsamples)  # access a property
print('Level dimensions: ', pdx_slide._wsi.level_dimensions)  # access a property

Type:             <class 'openslide._PropertyMap'>
Total properties: 47
AppMag value:     20
Level count:       3
Level downsamples: (1.0, 4.000376600552348, 8.00280943530852)
Level dimensions:  ((15935, 15220), (3983, 3805), (1991, 1902))


## Sampling and Resolution

In [7]:
def calc_eff_mpp(slide, level=0):
    """ effective MPP = downsample x MPP """
    mpp_eff = slide._wsi.level_downsamples[level] * float(slide._wsi.properties[openslide.PROPERTY_NAME_MPP_X])  # effective magnification
    print('Downsample:', slide._wsi.level_downsamples[level])
    print('Level:     ', level)
    print('MPP (um):  ', mpp_eff)
    return mpp_eff

In [8]:
for level in range(pdx_slide._wsi.level_count):
    mpp_eff = calc_eff_mpp(slide=pdx_slide, level=level)

Downsample: 1.0
Level:      0
MPP (um):   0.5006
Downsample: 4.000376600552348
Level:      1
MPP (um):   2.0025885262365053
Downsample: 8.00280943530852
Level:      2
MPP (um):   4.006206403315446


In [9]:
# Calc tile size
tile_px = 300
level = 0

mpp_eff = calc_eff_mpp(pdx_slide, level=0)

tile_um = mpp_eff * tile_px
print('Tile (um):', tile_um)

Downsample: 1.0
Level:      0
MPP (um):   0.5006
Tile (um): 150.18


## Aggregate metadata from all raw slides

In [10]:
# Aggregate metadata into df from all slides

meta_list = []  # list of dicts
for i, f_name in enumerate(files):
    # print(f_name.name)
    if i % 100 == 0:
        print(f'slide {i}: {f_name.name}')
    print(f'slide {i}: {f_name.name}')
    
    # Load slide
    img_inpath = str(f_name)
    img_outpath = os.path.join(str(datapath), 'processed', f_name.with_suffix('').name)
    pdx_slide = Slide(path=img_inpath, processed_path=img_outpath)

    # Create dict that contains the slide metadata (properties)
    ignore_property = ['aperio.User', 'openslide.comment', 'openslide.quickhash-1', 'tiff.ImageDescription']
    meta = {}
    for p_name in pdx_slide._wsi.properties:
        # print('{}: {}'.format( p_name, pdx_slide._wsi.properties[p_name] ))
        if p_name in ignore_property:
            continue
        meta[p_name] = pdx_slide._wsi.properties[p_name]
        
    # Append the slide meta to a list
    meta_list.append(meta)
    del pdx_slide
    
# Create df    
meta_df = pd.DataFrame(meta_list)
meta_df = meta_df[[c for c in sorted(meta_df.columns)]]
cols = ['aperio.ImageID'] + [c for c in meta_df.columns if c != 'aperio.ImageID']
meta_df = meta_df[cols]
print('Shape', meta_df.shape)

slide 0
slide 0
slide 1
slide 2
slide 3
slide 4
slide 5
slide 6
slide 7
slide 8
slide 9
slide 10
slide 11
slide 12
slide 13
slide 14
slide 15
slide 16
slide 17
slide 18
slide 19
slide 20
slide 21
slide 22
slide 23
slide 24
slide 25
slide 26
slide 27
slide 28
slide 29
slide 30
slide 31
slide 32
slide 33
slide 34
slide 35
slide 36
slide 37
slide 38
slide 39
slide 40
slide 41
slide 42
slide 43
slide 44
slide 45
slide 46
slide 47
slide 48
slide 49
slide 50
slide 51
slide 52
slide 53
slide 54
slide 55
slide 56
slide 57
slide 58
slide 59
slide 60
slide 61
slide 62
slide 63
slide 64
slide 65
slide 66
slide 67
slide 68
slide 69
slide 70
slide 71
slide 72
slide 73
slide 74
slide 75
slide 76
slide 77
slide 78
slide 79
slide 80
slide 81
slide 82
slide 83
slide 84
slide 85
slide 86
slide 87
slide 88
slide 89
slide 90
slide 91
slide 92
slide 93
slide 94
slide 95
slide 96
slide 97
slide 98
slide 99
slide 100
slide 100
slide 101
slide 102
slide 103
slide 104
slide 105
slide 106
slide 107
slide 108
sl

In [18]:
# Save
meta_df.to_csv(metapath/'meta_from_wsi_images.csv', index=False)

In [12]:
display(meta_df.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,459,460,461,462,463,464,465,466,467,468
aperio.ImageID,10060,10065,10096,10256,10261,10274,10381,10394,10398,10415,...,9636,9637,9640,9643,9671,9712,9892,9894,9926,9970
aperio.AppMag,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
aperio.DSR ID,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,...,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR
aperio.Date,02/24/15,02/24/15,02/26/15,03/06/15,03/06/15,03/06/15,03/20/15,03/20/15,03/20/15,03/20/15,...,01/23/15,01/23/15,01/23/15,01/23/15,01/27/15,01/28/15,02/11/15,02/11/15,02/11/15,02/13/15
aperio.DisplayColor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aperio.Exposure Scale,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,...,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001
aperio.Exposure Time,32,32,32,32,32,32,32,32,32,32,...,32,32,32,32,32,32,32,32,32,32
aperio.Filename,10060,10065,10096,10256,10261,10274,10381,10394,10398,10415,...,9636,9637,9640,9643,9671,9712,9892,9894,9926,9970
aperio.Focus Offset,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,...,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500
aperio.ICC Profile,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,...,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1


## Not all slides were scanned with the same resolution

In [13]:
meta_df['openslide.mpp-x'].value_counts()

0.50270000000000004    319
0.50060000000000004    150
Name: openslide.mpp-x, dtype: int64

In [14]:
meta_df['openslide.mpp-y'].value_counts()

0.50270000000000004    319
0.50060000000000004    150
Name: openslide.mpp-y, dtype: int64

In [15]:
meta_df['openslide.level-count'].value_counts()

3    434
2     35
Name: openslide.level-count, dtype: int64

In [16]:
meta_df['aperio.AppMag'].value_counts()

20    469
Name: aperio.AppMag, dtype: int64