Extract metadata from WSI images using `OpenSlide` (also tried `Histolab`).

In [1]:
import os
import sys
from pathlib import Path
import glob
from pprint import pprint
import pandas as pd
import numpy as np

# Directory the notebook is executed from; every other path is derived from it.
dirpath = Path.cwd()
print(dirpath)

# Put the sibling `src` directory on the import path so that modules stored
# there can be imported by the cells below.
src_dir = dirpath / '../src'
sys.path.append(str(src_dir))
# print(sys.path)  # uncomment to inspect the resulting import path

/vol/ml/apartin/projects/pdx-histo/nbs


In [2]:
from histolab.slide import Slide
# from histolab.tiler import RandomTiler, GridTiler, ScoreTiler
# from histolab.scorer import NucleiScorer
import openslide
# from openslide import OpenSlide

In [3]:
import openslide

# from deephistopath.wsi import filter
from deephistopath.wsi import slide
# from deephistopath.wsi import tiles
from deephistopath.wsi import util

In [4]:
# Specify paths (all relative to the notebook directory `dirpath`)
datapath = dirpath.joinpath('../data')                 # root of the data tree
slidespath = datapath.joinpath('doe-globus-pdx-data')  # path to raw WSI data
metapath = datapath.joinpath('meta')                   # metadata tables are read from / written to here
crossref_meta_fname = '_ImageID_PDMRID_CrossRef.xlsx'  # comes with the svs slides

In [5]:
# Glob slides: collect every .svs file in the raw-data directory as a plain string path.
# NOTE: glob.glob returns matches in arbitrary (filesystem-dependent) order; the list is not sorted.
svs_pattern = os.path.join(slidespath, '*.svs')
slides_path_list = glob.glob(svs_pattern)
print(f'Total slides: {len(slides_path_list)}')

# Show the file name of the first slide that was found
first_slide_path = slides_path_list[0]
print(os.path.basename(first_slide_path))

Total slides: 596
34404.svs


In [6]:
# Confirm that the svs file names match the 'Image ID' column in the excel file

# Slide ids found on disk: the part of each file name before the first dot, cast to int
s1 = {int(os.path.basename(x).split('.')[0]) for x in slides_path_list}

# Cross-reference table shipped with the slides; header=2 uses the third row of the
# sheet (0-based index 2) as the column names
df_img = pd.read_excel(metapath/crossref_meta_fname, engine='openpyxl', header=2)
df_img = df_img.rename(columns={'Image ID': 'image_id'})
df_img = df_img.dropna(axis=0, how='all').reset_index(drop=True)  # drop fully-empty rows

# Cast the ids that are present to int and leave missing entries untouched.
# pd.notna is used instead of `~np.isnan(x)`: it is an explicit boolean test (no reliance
# on bitwise-not of a numpy bool) and it also accepts non-float cells such as None or
# strings, on which np.isnan raises TypeError.
df_img['image_id'] = [int(x) if pd.notna(x) else x for x in df_img['image_id'].values]
s2 = set(df_img['image_id'].values)

print("SVS slides that are present in the folder but not in the 'Image ID' column: {}".format(s1.difference(s2)))
print("Slide ids that are in the 'Image ID' column but not present in the folder:  {}".format(s2.difference(s1)))

SVS slides that are present in the folder but not in the 'Image ID' column: {11008, 83741, 13504}
Slide ids that are in the 'Image ID' column but not present in the folder:  set()


In [7]:
# # ------------------------
# # Explore the Slide object
# # ------------------------
# fname = slides_path_list[0]
# img_inpath = str(fname)
# img_outpath = os.path.join(str(datapath), 'processed', os.path.basename(fname).split('.')[0])
# s = Slide(path=img_inpath, processed_path=img_outpath)

# print(f"Type:                  {type(s)}")
# print(f"Slide name:            {s.name}")
# print(f"Levels:                {s.levels}")
# print(f"Dimensions at level 0: {s.dimensions}")
# print(f"Dimensions at level 1: {s.level_dimensions(level=1)}")
# print(f"Dimensions at level 2: {s.level_dimensions(level=2)}")

# # --------------------------------------------------------
# # Access the openslide properties through the Slide object
# # (histopath inherits from openslide)
# # --------------------------------------------------------
# print(f"Type:             {type(s._wsi.properties)}")
# print(f"Total properties: {len(s._wsi.properties)}")
# print(f"Property value:   {s._wsi.properties['aperio.AppMag']}")  # access a property
# # print(pdx_slide._wsi.properties[openslide.PROPERTY_NAME_MPP_X])
# mag = int(s._wsi.properties['aperio.AppMag'])

# print(f"Level count:       {s._wsi.level_count}")  # access a property
# print(f"Level downsamples: {s._wsi.level_downsamples}")  # access a property
# print(f"Level dimensions:  {s._wsi.level_dimensions}")  # access a property

In [8]:
# Explore the Slide object: open the first slide and look at its property map
s = slide.open_slide(slides_path_list[0])
props = s.properties
app_mag = props['aperio.AppMag']  # access a single property by key

print(f"\nFile type: {type(props)}")
print(f"Properties:  {len(props)}")
print(f"AppMag:      {app_mag}")
mag = int(app_mag)

# Pyramid layout of the slide
print(f"Level count:       {s.level_count}")
print(f"Level downsamples: {s.level_downsamples}")
print(f"Level dimensions:  {s.level_dimensions}\n")


File type: <class 'openslide._PropertyMap'>
Properties:  47
AppMag:      20
Level count:       3
Level downsamples: (1.0, 4.0, 16.001374570446735)
Level dimensions:  ((39840, 46568), (9960, 11642), (2490, 2910))



## Sampling and Resolution

In [9]:
# def calc_eff_mpp(slide, level=0):
#     """ effective MPP = downsample x MPP """
#     mpp_eff = slide._wsi.level_downsamples[level] * float(slide._wsi.properties[openslide.PROPERTY_NAME_MPP_X])  # effective magnification
#     print('Downsample:', slide._wsi.level_downsamples[level])
#     print('Level:     ', level)
#     print('MPP (um):  ', mpp_eff)
#     return mpp_eff

# for level in range(pdx_slide._wsi.level_count):
#     mpp_eff = calc_eff_mpp(slide=pdx_slide, level=level)

# # Calc tile size
# tile_px = 300
# level = 0
# mpp_eff = calc_eff_mpp(pdx_slide, level=0)
# tile_um = mpp_eff * tile_px
# print('Tile (um):', tile_um)

In [10]:
from get_meta_from_slides import calc_eff_mpp

# Report the effective resolution for every level of the slide.
# The return value is not needed here; verbose=True makes the helper print its summary.
for level in range(s.level_count):
    print()
    calc_eff_mpp(s=s, level=level, verbose=True)

# Calc tile size: length in um covered by `tile_px` pixels at level 0
tile_px = 300
level = 0
mpp_eff = calc_eff_mpp(s, level=level)
tile_um = tile_px * mpp_eff
print(f'\nTile (um): {tile_um}\n')


Downsample: 1.0
Level:      0
MPP (um):   0.5027
Mag:        20.0

Downsample: 4.0
Level:      1
MPP (um):   2.0108
Mag:        5.0

Downsample: 16.001374570446735
Level:      2
MPP (um):   8.043890996563574
Mag:        1.2498926209088566

Tile (um): 150.81



## Aggregate metadata from all raw slides

In [11]:
# meta_list = []  # list of dicts
# print_after = 1

# for i, fname in enumerate(slides_path_list):
#     if i % print_after == 0:
#         print(f'slide {i}: {fname.name}')
    
#     # Load slide
#     img_inpath = str(fname)
#     img_outpath = os.path.join(str(datapath), 'processed', fname.with_suffix('').name)
#     pdx_slide = Slide(path=img_inpath, processed_path=img_outpath)

#     # Create dict that contains the slide metadata (properties)
#     ignore_property = ['aperio.User', 'openslide.comment', 'openslide.quickhash-1', 'tiff.ImageDescription']
#     meta = {}
#     for pname in pdx_slide._wsi.properties:
#         # print('{}: {}'.format( pname, pdx_slide._wsi.properties[pname] ))
#         if pname in ignore_property:
#             continue
#         meta[pname] = pdx_slide._wsi.properties[pname]
        
#     # Append the slide meta to a list
#     meta_list.append(meta)
#     del pdx_slide
    
# # Create df    
# meta_df = pd.DataFrame(meta_list)
# meta_df = meta_df[[c for c in sorted(meta_df.columns)]]
# cols = ['aperio.ImageID'] + [c for c in meta_df.columns if c != 'aperio.ImageID']
# meta_df = meta_df[cols]
# print('Shape', meta_df.shape)

In [12]:
# Collect the properties of every raw slide into one table (one row per slide).
t = util.Time()
meta_list = []    # list of dicts, one per slide
print_after = 50  # print a progress line every `print_after` slides

# Properties that are left out of the metadata table.
# Defined once here instead of being rebuilt on every loop iteration.
ignore_property = ['aperio.User', 'openslide.comment',
                   'openslide.quickhash-1', 'tiff.ImageDescription']

for i, sname in enumerate(slides_path_list):
    if i % print_after == 0:
        # os.path.basename keeps this consistent with how file names are taken elsewhere
        print(f'slide {i}: {os.path.basename(sname)}')

    # Create dict to contain slide metadata (properties)
    s = slide.open_slide(sname)
    try:
        props = s.properties
        meta = {pname: props[pname] for pname in props if pname not in ignore_property}
    finally:
        # Release the slide handle explicitly rather than relying on garbage collection
        s.close()
    meta['memory'] = os.path.getsize(sname)  # size of the file on disk, in bytes
    meta_list.append(meta)  # append dict with slide meta to a list
    del s

# Create df: columns in sorted order, with 'aperio.ImageID' moved to the front
meta_df = pd.DataFrame(meta_list)
meta_df = meta_df[sorted(meta_df.columns)]
cols = ['aperio.ImageID'] + [c for c in meta_df.columns if c != 'aperio.ImageID']
meta_df = meta_df[cols]
print('Shape', meta_df.shape)
pprint(meta_df.T.iloc[:4, :7])

t.elapsed_display()

slide 0: 34404.svs
slide 50: 21836.svs
slide 100: 16102.svs
slide 150: 16056.svs
slide 200: 12442.svs
slide 250: 19415.svs
slide 300: 21575.svs
slide 350: 11170.svs
slide 400: 45982.svs
slide 450: 14447.svs
slide 500: 22470.svs
slide 550: 22510.svs
Shape (596, 46)
                              0                1                2  \
aperio.ImageID            34404            23496            12256   
aperio.AppMag                20               20               20   
aperio.DSR ID   FR-S-DTP-433APR  FR-S-DTP-433APR  FR-S-DTP-430APR   
aperio.Date            05/03/17         08/08/16         07/15/15   

                              3                4                5  \
aperio.ImageID            31009            20307            34753   
aperio.AppMag                20               20               20   
aperio.DSR ID   FR-S-DTP-433APR  FR-S-DTP-433APR  FR-S-DTP-433APR   
aperio.Date            02/21/17         04/12/16         05/09/17   

                              6  
aperio.Im

In [47]:
# Save the aggregated slide metadata as a csv file (row index is not written)
print('\nSave slides metadata in csv.')
meta_csv_path = metapath / 'meta_from_wsi_slides_.csv'
meta_df.to_csv(meta_csv_path, index=False)


Save slides metadata in csv.


In [48]:
# Sanity check: an in-memory copy has the same shape and compares equal to the original
df_copy = meta_df.copy()
for frame in (df_copy, meta_df):
    print(frame.shape)
df_copy.equals(meta_df)

(596, 46)
(596, 46)


True

In [70]:
# Check whether the saved file is equal to the in-memory frame --> it's not,
# probably due to the types of values (see the dtypes of the first column printed below)
df = pd.read_csv(metapath / 'meta_from_wsi_slides_.csv')
for frame in (df, meta_df):
    print(frame.shape)
print(df.equals(meta_df))

# dtype of the first column: in-memory frame first, then the frame read back from csv
first_col_in_memory = meta_df.iloc[:, 0]
first_col_from_csv = df.iloc[:, 0]
print(first_col_in_memory.dtypes)
print(first_col_from_csv.dtypes)

(596, 46)
(596, 46)
False
object
int64


In [71]:
# (df == meta_df)[:2]
# meta_df[:2]
# df[:2]

In [66]:
# Preview: properties as rows, the first four slides as columns
meta_preview = meta_df.T.iloc[:, :4]
display(meta_preview)

Unnamed: 0,0,1,2,3,4
aperio.ImageID,34404,23496,12256,31009,20307
aperio.AppMag,20,20,20,20,20
aperio.DSR ID,FR-S-DTP-433APR,FR-S-DTP-433APR,FR-S-DTP-430APR,FR-S-DTP-433APR,FR-S-DTP-433APR
aperio.Date,05/03/17,08/08/16,07/15/15,02/21/17,04/12/16
aperio.DisplayColor,0,0,0,0,0
aperio.Exposure Scale,0.000001,0.000001,0.000001,0.000001,0.000001
aperio.Exposure Time,32,32,32,32,32
aperio.Filename,34404,23496,12256,31009,20307
aperio.Focus Offset,0.000000,0.000000,-0.000500,0.000000,0.000000
aperio.ICC Profile,AT2,AT2,ScanScope v1,AT2,AT2


## Not all slides were scanned with the same resolution

In [21]:
# Number of slides per distinct value of the 'openslide.mpp-x' property
meta_df.loc[:, 'openslide.mpp-x'].value_counts()

0.50270000000000004    423
0.50060000000000004    172
0.49680000000000002      1
Name: openslide.mpp-x, dtype: int64

In [22]:
# Number of slides per distinct value of the 'openslide.mpp-y' property
meta_df.loc[:, 'openslide.mpp-y'].value_counts()

0.50270000000000004    423
0.50060000000000004    172
0.49680000000000002      1
Name: openslide.mpp-y, dtype: int64

In [23]:
# Number of slides per distinct value of the 'openslide.level-count' property
meta_df.loc[:, 'openslide.level-count'].value_counts()

3    548
2     48
Name: openslide.level-count, dtype: int64

In [24]:
# Number of slides per distinct value of the 'aperio.AppMag' property
meta_df.loc[:, 'aperio.AppMag'].value_counts()

20    596
Name: aperio.AppMag, dtype: int64