In [18]:
import os

import glob
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from tqdm import tqdm

from PIL import Image
import h5py
import cv2
from typing import *
from pathlib import Path

import torch
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize

In [19]:
from google.colab import drive
drive.mount('/gdrive')

%cd /gdrive/MyDrive/CLIP Images


Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/MyDrive/CLIP Images


In [20]:
tiff_files = [f for f in os.listdir() if f.endswith('.tiff') or f.endswith('.tif')]

In [21]:
from PIL import Image
import h5py
import numpy as np
from tqdm import tqdm
import os
from typing import Union

def preprocess(image: Image.Image, desired_size: int) -> np.ndarray:
    img = image.copy()
    img.thumbnail((desired_size, desired_size), Image.ANTIALIAS)

    new_img = Image.new("RGB", (desired_size, desired_size))
    new_img.paste(img, ((desired_size - img.width) // 2, (desired_size - img.height) // 2))

    return np.array(new_img)

def img_to_hdf5(directory_path: Union[str, os.PathLike], out_filepath: str, resolution: int = 320):
    tiff_files = [f for f in os.listdir(directory_path) if f.endswith('.tiff') or f.endswith('.tif')]
    dset_size = len(tiff_files)
    failed_images = []

    if dset_size == 0:
        print("No TIFF files found in the directory.")
        return

    print(f"Found {dset_size} TIFF files in the directory.")

    with h5py.File(out_filepath, 'w') as h5f:
        img_dset = h5f.create_dataset('cxr', shape=(dset_size, resolution, resolution, 3), dtype=np.uint8)

        for idx, filename in enumerate(tqdm(tiff_files)):
            try:
                img_path = os.path.join(directory_path, filename)

                # Read image using Pillow
                img = Image.open(img_path).convert('RGB')
                print(f"Processing {img_path}")

                # Preprocess image to desired size
                img_array = preprocess(img, desired_size=resolution)
                print(f"Image shape after preprocessing: {img_array.shape}")

                # Check if the image array has the correct shape
                assert img_array.shape == (resolution, resolution, 3), f"Image shape mismatch for {img_path}"

                # Add image to the dataset
                img_dset[idx] = img_array
                print(f"Added image {idx + 1} to dataset")
            except Exception as e:
                print(f"Failed to process {img_path}: {e}")
                failed_images.append((img_path, e))

    print(f"{len(failed_images)} / {dset_size} images failed to be added to the HDF5 file.")
    if failed_images:
        print("Failed images and errors:", failed_images)

In [22]:
img_to_hdf5('/gdrive/MyDrive/CLIP Images', '/gdrive/MyDrive/CLIP_Images_Dataset.h5')


Found 9 TIFF files in the directory.


  img.thumbnail((desired_size, desired_size), Image.ANTIALIAS)
 11%|█         | 1/9 [00:00<00:07,  1.13it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-03-476_003.tif
Image shape after preprocessing: (320, 320, 3)
Added image 1 to dataset


 22%|██▏       | 2/9 [00:01<00:05,  1.25it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-03-476_002.tif
Image shape after preprocessing: (320, 320, 3)
Added image 2 to dataset


 33%|███▎      | 3/9 [00:02<00:05,  1.16it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-03-476_001.tif
Image shape after preprocessing: (320, 320, 3)
Added image 3 to dataset


 44%|████▍     | 4/9 [00:03<00:04,  1.18it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-05-588-R1_002.tif
Image shape after preprocessing: (320, 320, 3)
Added image 4 to dataset


 56%|█████▌    | 5/9 [00:04<00:03,  1.21it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-05-588-R1_001.tif
Image shape after preprocessing: (320, 320, 3)
Added image 5 to dataset


 67%|██████▋   | 6/9 [00:04<00:02,  1.22it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-05-588-R1_003.tif
Image shape after preprocessing: (320, 320, 3)
Added image 6 to dataset


 78%|███████▊  | 7/9 [00:05<00:01,  1.23it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-04-3077-R2_002.tif
Image shape after preprocessing: (320, 320, 3)
Added image 7 to dataset


 89%|████████▉ | 8/9 [00:06<00:00,  1.29it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-04-3077-R2_003.tif
Image shape after preprocessing: (320, 320, 3)
Added image 8 to dataset


100%|██████████| 9/9 [00:07<00:00,  1.26it/s]

Processing /gdrive/MyDrive/CLIP Images/Copy of sj-04-3077-R2_001.tif
Image shape after preprocessing: (320, 320, 3)
Added image 9 to dataset
0 / 9 images failed to be added to the HDF5 file.



