In [1]:
import glob
from openslide import OpenSlide
import tifffile as tiff
import cv2
import gc
import os
import pandas as pd

# Collect the path of every .tif file in the training directory.
train_pattern = "../input/mayo-clinic-strip-ai/train/*.tif"
files = glob.glob(train_pattern)

## We can inspect an image's pixel dimensions without loading the image into RAM by using OpenSlide

In [2]:
# Gather (path, width, height, pixel count, bytes on disk) for every slide.
# OpenSlide is queried only for slide.dimensions here; no pixel data is requested.
df = []
for file in files:
    # The context manager closes the slide handle when the block exits, so
    # open handles do not accumulate over the whole loop.
    with OpenSlide(file) as slide:
        width, height = slide.dimensions
    filesize = os.path.getsize(file)
    # The fourth field (later named "pixel_size") is the total pixel count.
    df.append((file, width, height, width * height, filesize))

In [3]:
# Turn the collected per-slide tuples into a table, one row per slide.
column_names = ["path", "width", "height", "pixel_size", "file_size"]
df = pd.DataFrame(df, columns=column_names)

In [4]:
# Rank slides from the most pixels to the fewest.
df = df.sort_values(by="pixel_size", ascending=False)
# reset_index() renumbers the rows from 0 and keeps each row's pre-sort label
# in an "index" column.
df = df.reset_index()
df.head(20)

Unnamed: 0,index,path,width,height,pixel_size,file_size
0,157,../input/mayo-clinic-strip-ai/train/6baf51_0.tif,48282,101406,4896084492,1833979850
1,173,../input/mayo-clinic-strip-ai/train/b894f4_0.tif,91723,45045,4131662535,2770328890
2,183,../input/mayo-clinic-strip-ai/train/b07b42_0.tif,83747,47916,4012821252,2795473366
3,315,../input/mayo-clinic-strip-ai/train/3c2c23_0.tif,46014,86558,3982879812,1589098772
4,160,../input/mayo-clinic-strip-ai/train/3b7d81_0.tif,46019,81015,3728229285,925607978
5,253,../input/mayo-clinic-strip-ai/train/0415c3_0.tif,30560,118076,3608402560,930049526
6,282,../input/mayo-clinic-strip-ai/train/2db520_0.tif,41578,86564,3599157992,983264334
7,687,../input/mayo-clinic-strip-ai/train/f9569b_0.tif,46177,77440,3575946880,1566309064
8,293,../input/mayo-clinic-strip-ai/train/9874eb_0.tif,30599,112548,3443856252,941956192
9,87,../input/mayo-clinic-strip-ai/train/3982bf_0.tif,91723,37337,3424661651,1894761500


In [5]:
# Save the ranked size table so it can be reused without re-scanning the slides.
output_csv = "filesize_inspection.csv"
df.to_csv(output_csv)

## ↓These files can be read by tifffile.

In [6]:
import tqdm

scale = 4  # keep every 4th pixel along each of the first two axes

# Smoke test: try to load and downsample every slide except the one at
# position 0 (the largest by pixel_size after the descending sort).
# Each result overwrites the same "test.jpg".
remaining_paths = df["path"].iloc[1:]
for slide_path in tqdm.tqdm(remaining_paths):
    # This downsampling method is borrowed from https://www.kaggle.com/code/tmyok1984/mayo-convert-tif-to-jpg, thanks!
    wsi = tiff.imread(slide_path)
    # Strided slicing subsamples the first two axes; the reversed last axis
    # flips channel order (presumably RGB -> BGR for OpenCV; confirm).
    # The slice stays inline so no extra reference keeps the full array alive.
    cv2.imwrite("test.jpg", wsi[::scale, ::scale, ::-1])
    # Drop the full-resolution array before the next slide is read.
    del wsi
    gc.collect()

100%|██████████| 753/753 [2:48:22<00:00, 13.42s/it]


## ↓This file cannot be read by tifffile!!

In [7]:
# Path in the row labelled 0, i.e. the slide with the largest pixel_size after sorting.
df.loc[0, "path"]

'../input/mayo-clinic-strip-ai/train/6baf51_0.tif'

In [8]:
# image = tiff.imread(df["path"][0])
# cv2.imwrite("test.jpg", image[::scale,::scale,::-1])
# del image
# gc.collect()

## Therefore, it may be possible to prevent OOM at submission time by skipping any image whose pixel_size is larger than `4131662535` (the largest pixel_size that tifffile read successfully above).
## Note: strictly speaking, memory usage and pixel_size are not perfectly correlated, and memory usage also varies with the processing being performed, so `4131662535` is only a guideline.