Extract metadata from whole-slide images (WSI) using `histolab` and `OpenSlide`.

In [1]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
from glob import glob

# Working directory of the running notebook; the data paths further down are built relative to it.
dirpath = Path.cwd()
print(dirpath)

/vol/ml/apartin/projects/pdx-histo/nbs


In [2]:
from histolab.slide import Slide
# from histolab.tiler import RandomTiler, GridTiler, ScoreTiler
# from histolab.scorer import NucleiScorer
import openslide
# from openslide import OpenSlide

In [3]:
# Directory layout (every location is derived from the notebook's working directory)
datapath = dirpath.joinpath('..', 'data')
imgpath = datapath.joinpath('doe-globus-pdx-data')  # folder holding the raw WSI (.svs) files
metapath = datapath.joinpath('meta')                # folder holding the metadata tables
# Optional output folder for processed images (currently disabled):
# outpath = datapath/'processed'
# os.makedirs(outpath, exist_ok=True)

In [4]:
# Collect every SVS slide in the raw-image folder, in sorted path order
files = sorted(imgpath.glob('*.svs'))
print(f'Total files: {len(files)}')
print(files[0].stem)  # file name of the first slide without its suffix

Total files: 596
10059


In [5]:
# Cross-check the SVS files on disk against the 'Image ID' column of the cross-reference CSV
s1 = {int(x.name.split('.')[0]) for x in files}  # ids parsed from the file names
df_img = (
    pd.read_csv(metapath/'ImageID_PDMRID_CrossRef.csv')
    .rename(columns={'Image ID': 'image_id'})
)
s2 = set(df_img['image_id'].values)  # ids listed in the table

# Report the ids that occur on one side only
print("SVS image files that are present in the folder but do not appear in the 'Image ID' column: {}".format(s1 - s2))
print("Image ids that appear in the 'Image ID' column but are not present in the main folder:     {}".format(s2 - s1))

SVS image files that are present in the folder but do not appear in the 'Image ID' column: {11008, 83741, 13504}
Image ids that appear in the 'Image ID' column but are not present in the main folder:     set()


In [6]:
# Open the first slide with histolab.
#   path:           path to the WSI file
#   processed_path: path to save thumbnails and scaled images
fname = files[0]
img_inpath = str(fname)
img_outpath = os.path.join(str(datapath), 'processed', fname.with_suffix('').name)
pdx_slide = Slide(path=img_inpath, processed_path=img_outpath)

# Slide properties
print(f"Type:                  {type(pdx_slide)}")
print(f"Slide name:            {pdx_slide.name}")
print(f"Levels:                {pdx_slide.levels}")
print(f"Dimensions at level 0: {pdx_slide.dimensions}")
# Only report the pyramid levels this slide actually has: the level count differs
# between slides, so hard-coding levels 1 and 2 fails on a two-level slide.
for lvl in pdx_slide.levels[1:]:
    print(f"Dimensions at level {lvl}: {pdx_slide.level_dimensions(level=lvl)}")

Slide name:            10059
Levels:                [0, 1, 2]
Dimensions at level 0: (15935, 13745)
Dimensions at level 1: (3983, 3436)
Dimensions at level 2: (1991, 1718)


In [7]:
# The openslide properties are reachable through the Slide object's `_wsi` attribute
wsi_props = pdx_slide._wsi.properties
print(f"Type:             {type(wsi_props)}")
print(f"Total properties: {len(wsi_props)}")
print(f"Property value:   {wsi_props['aperio.AppMag']}")  # look up a single property by key
mag = int(wsi_props['aperio.AppMag'])

# Other lookups that can be enabled when needed:
# print(pdx_slide._wsi.properties[openslide.PROPERTY_NAME_MPP_X], '\n')
# print(pdx_slide._wsi.properties, '\n')

# Pyramid layout of the slide
wsi = pdx_slide._wsi
print(f"Level count:       {wsi.level_count}")
print(f"Level downsamples: {wsi.level_downsamples}")
print(f"Level dimensions:  {wsi.level_dimensions}")

Type:             <class 'openslide._PropertyMap'>
Total properties: 47
AppMag value:     20
Level count:       3
Level downsamples: (1.0, 4.000522118596585, 8.002048946686164)
Level dimensions:  ((15935, 13745), (3983, 3436), (1991, 1718))


## Sampling and Resolution

In [8]:
def calc_eff_mpp(slide, level=0):
    """Return the effective microns-per-pixel (MPP) of a slide at a given level.

    effective MPP = downsample factor of the level x MPP along x stored in the
    slide properties.

    Args:
        slide: object exposing the openslide handle as ``slide._wsi``.
        level: index into ``slide._wsi.level_downsamples`` (default 0).

    Returns:
        The effective MPP as a float. The downsample factor, the level and the
        result are also printed.
    """
    wsi = slide._wsi
    downsample = wsi.level_downsamples[level]
    base_mpp = float(wsi.properties[openslide.PROPERTY_NAME_MPP_X])
    mpp_eff = downsample * base_mpp

    report = (
        ('Downsample:', downsample),
        ('Level:     ', level),
        ('MPP (um):  ', mpp_eff),
    )
    for label, value in report:
        print(label, value)
    return mpp_eff

In [9]:
# Report the effective MPP at every level of the slide
n_levels = pdx_slide._wsi.level_count
for level in range(n_levels):
    mpp_eff = calc_eff_mpp(slide=pdx_slide, level=level)

Downsample: 1.0
Level:      0
MPP (um):   0.5006
Downsample: 4.000522118596585
Level:      1
MPP (um):   2.0026613725694506
Downsample: 8.002048946686164
Level:      2
MPP (um):   4.005825702711094


In [10]:
# Calc tile size: tile side (um) = effective MPP (um/px) x tile side (px)
tile_px = 300  # tile side in pixels
level = 0      # level used for the calculation

# Pass `level` through instead of repeating the literal 0, so that changing it above takes effect
mpp_eff = calc_eff_mpp(pdx_slide, level=level)

tile_um = mpp_eff * tile_px
print('Tile (um):', tile_um)

Downsample: 1.0
Level:      0
MPP (um):   0.5006
Tile (um): 150.18


## Aggregate metadata from all raw slides

In [11]:
# Aggregate metadata into df from all slides

# Properties left out of the table (loop-invariant, so defined once up front)
ignore_property = ['aperio.User', 'openslide.comment', 'openslide.quickhash-1', 'tiff.ImageDescription']

meta_list = []  # list of dicts, one per slide
for i, fname in enumerate(files):
    if i % 100 == 0:
        print(f'slide {i}: {fname.name}')

    # Load slide
    img_inpath = str(fname)
    img_outpath = os.path.join(str(datapath), 'processed', fname.with_suffix('').name)
    pdx_slide = Slide(path=img_inpath, processed_path=img_outpath)

    # Create dict that contains the slide metadata (properties), skipping the ignored ones.
    # One consistent name is used for the key: the previous loop bound `pname` but read
    # `p_name`, which raises NameError on a fresh kernel.
    meta = {
        pname: pvalue
        for pname, pvalue in pdx_slide._wsi.properties.items()
        if pname not in ignore_property
    }

    # Append the slide meta to a list
    meta_list.append(meta)
    del pdx_slide

# Create df: columns in sorted order, with 'aperio.ImageID' moved to the front
meta_df = pd.DataFrame(meta_list)
cols = ['aperio.ImageID'] + [c for c in sorted(meta_df.columns) if c != 'aperio.ImageID']
meta_df = meta_df[cols]
print('Shape', meta_df.shape)

slide 0: 10059.svs
slide 100: 13369.svs
slide 200: 16043.svs
slide 300: 19786.svs
slide 400: 27033.svs
slide 500: 46877.svs
Shape (596, 45)


In [12]:
# Write the aggregated slide metadata to a CSV file (row index omitted)
meta_outpath = metapath/'meta_from_wsi_images.csv'
meta_df.to_csv(meta_outpath, index=False)

In [13]:
# Show the table transposed: one row per property, one column per slide
display(meta_df.T)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,586,587,588,589,590,591,592,593,594,595
aperio.ImageID,10059,10060,10065,10082,10094,10096,10256,10261,10274,10381,...,9636,9637,9640,9643,9671,9712,9892,9894,9926,9970
aperio.AppMag,20,20,20,20,20,20,20,20,20,20,...,20,20,20,20,20,20,20,20,20,20
aperio.DSR ID,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,...,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR,FR-S-DTP-430APR
aperio.Date,02/24/15,02/24/15,02/24/15,02/24/15,02/26/15,02/26/15,03/06/15,03/06/15,03/06/15,03/20/15,...,01/23/15,01/23/15,01/23/15,01/23/15,01/27/15,01/28/15,02/11/15,02/11/15,02/11/15,02/13/15
aperio.DisplayColor,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
aperio.Exposure Scale,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,...,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001,0.000001
aperio.Exposure Time,32,32,32,32,32,32,32,32,32,32,...,32,32,32,32,32,32,32,32,32,32
aperio.Filename,10059,10060,10065,10082,10094,10096,10256,10261,10274,10381,...,9636,9637,9640,9643,9671,9712,9892,9894,9926,9970
aperio.Focus Offset,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,...,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500,-0.000500
aperio.ICC Profile,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,...,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1,ScanScope v1


## Not all slides were scanned with the same resolution

In [14]:
# Number of slides per distinct 'openslide.mpp-x' value
meta_df['openslide.mpp-x'].value_counts()

0.50270000000000004    423
0.50060000000000004    172
0.49680000000000002      1
Name: openslide.mpp-x, dtype: int64

In [15]:
# Number of slides per distinct 'openslide.mpp-y' value
meta_df['openslide.mpp-y'].value_counts()

0.50270000000000004    423
0.50060000000000004    172
0.49680000000000002      1
Name: openslide.mpp-y, dtype: int64

In [16]:
# Number of slides per distinct 'openslide.level-count' value
meta_df['openslide.level-count'].value_counts()

3    548
2     48
Name: openslide.level-count, dtype: int64

In [17]:
# Number of slides per distinct 'aperio.AppMag' value
meta_df['aperio.AppMag'].value_counts()

20    596
Name: aperio.AppMag, dtype: int64