In [1]:
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [69]:
#!/usr/bin/env python3
"""
Standalone script for downloading, resizing, and retrieving MIMIC-CXR images
via the google-cloud-storage client library, with error-handling to skip bad files.

Before running in Colab, install dependencies:
    !pip install google-cloud-storage pillow matplotlib crcmod tqdm
"""

import argparse
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
from typing import List, Tuple, Optional, Union

from tqdm import tqdm
from PIL import Image
import matplotlib.pyplot as plt

from google.cloud import storage
from google.oauth2 import service_account


class MIMICCXRDataset:
    """
    Handler for downloading, resizing, and accessing MIMIC-CXR images
    via the google-cloud-storage API, with progress bars and error skipping.
    """

    def __init__(
        self,
        source_bucket: str,
        dest_bucket: str,
        dest_prefix: str = "",
        project: Optional[str] = None,
        credentials_path: Optional[str] = None,
        target_width: int = 512,
        max_workers: int = 8,
        local_root: str = "cxr_data",
    ):
        # Colab auth (no-op outside Colab)
        try:
            from google.colab import auth  # type: ignore
            auth.authenticate_user()
            print("✅ Authenticated in Colab via your Google account.")
        except ImportError:
            pass

        # Initialize GCS client
        if credentials_path:
            creds = service_account.Credentials.from_service_account_file(
                credentials_path,
                scopes=["https://www.googleapis.com/auth/cloud-platform"],
            )
            self.client = storage.Client(project=project, credentials=creds)
        else:
            self.client = storage.Client(project=project)

        self.source_bucket_name = source_bucket
        self.dest_bucket_name = dest_bucket
        self.dest_prefix = dest_prefix.strip("/")
        self.target_width = target_width
        self.max_workers = max_workers

        # Prepare local directories
        root = Path(local_root)
        self.local_source = root / "original"
        self.local_resized = root / "resized"
        for d in (self.local_source, self.local_resized):
            d.mkdir(parents=True, exist_ok=True)

    def download_and_resize(self, num_files: Union[int, str] = "full") -> None:
        """Download up to `num_files` JPEGs, resize, and re-upload them."""
        # List blobs
        bucket_kwargs = {"user_project": self.client.project} if self.client.project else {}
        src_bucket = self.client.bucket(self.source_bucket_name, **bucket_kwargs)
        blobs = [b for b in src_bucket.list_blobs() if b.name.lower().endswith(".jpg")]

        # Limit count
        if isinstance(num_files, int) and num_files > 0:
            selected = blobs[:num_files]
            print(f"📥 Downloading first {len(selected)} originals…")
        else:
            selected = blobs
            print(f"📥 Downloading all {len(selected)} originals…")

        # Download
        for blob in tqdm(selected, desc="Downloading", unit="file"):
            try:
                dest = self.local_source / blob.name
                if not dest.exists():
                    dest.parent.mkdir(parents=True, exist_ok=True)
                    blob.download_to_filename(str(dest))
            except Exception as e:
                print(f"⚠️ Skipped download of {blob.name!r}: {e}")

        # Resize
        def _resize(path: Path):
            try:
                rel = path.relative_to(self.local_source)
                out_path = self.local_resized / rel
                out_path.parent.mkdir(parents=True, exist_ok=True)
                img = Image.open(path).convert("RGB")
                w, h = img.width, img.height
                new_h = int(self.target_width * (h / w))
                img.resize((self.target_width, new_h), Image.LANCZOS).save(out_path, "JPEG")
            except Exception as e:
                print(f"⚠️ Skipped resizing {path!r}: {e}")

        jpgs = list(self.local_source.rglob("*.jpg"))
        print(f"🔄 Resizing {len(jpgs)} images to {self.target_width}px…")
        with ThreadPoolExecutor(max_workers=self.max_workers) as pool:
            list(
                tqdm(
                    pool.map(_resize, jpgs),
                    desc="Resizing",
                    total=len(jpgs),
                    unit="img",
                )
            )

        # Upload
        dst_bucket = self.client.bucket(self.dest_bucket_name)
        resized_files = list(self.local_resized.rglob("*.jpg"))
        print(f"📤 Uploading {len(resized_files)} resized images…")
        for fpath in tqdm(resized_files, desc="Uploading", unit="file"):
            try:
                rel = fpath.relative_to(self.local_resized)
                blob_name = f"{self.dest_prefix}/{rel}" if self.dest_prefix else str(rel)
                blob = dst_bucket.blob(blob_name)
                blob.upload_from_filename(str(fpath))
            except Exception as e:
                print(f"⚠️ Skipped upload of {fpath!r}: {e}")

        print("🎉 All done (with any bad files skipped).")

    def get_patient_images(
        self,
        patient_id: str,
        resized: bool = True,
        max_images: Optional[int] = None,
        display: bool = True,
    ) -> Tuple[str, List[Image.Image]]:
        """
        Find and (optionally) display local images for a given patient_id.
        Assumes images are stored under a folder named after the patient_id.
        Falls back to originals if no resized images are found.
        """
        pid = patient_id.lstrip("/")
        base = self.local_resized if resized else self.local_source

        # Look in the subdirectory matching patient_id
        patient_dir = base / pid
        print(f"🔍 Searching for images in {patient_dir}")
        files = list(patient_dir.rglob("*.jpg")) if patient_dir.exists() else []

        # Fallback to originals if none found in resized
        if not files and resized:
            print("⚠️ No resized images found; falling back to originals.")
            patient_dir = self.local_source / pid
            files = list(patient_dir.rglob("*.jpg")) if patient_dir.exists() else []

        if not files:
            print(f"❌ No images found for patient '{pid}'.")
            return pid, []

        total = len(files)
        if max_images:
            files = files[:max_images]
        print(f"✅ Found {total} files; processing {len(files)} of them.")

        images: List[Image.Image] = []
        for f in files:
            try:
                img = Image.open(f).convert("RGB")
                images.append(img)
                if display:
                    plt.figure(figsize=(6, 6 * img.height / img.width))
                    plt.imshow(img)
                    plt.axis("off")
                    plt.title(f.name)
                    plt.show()
            except Exception as e:
                print(f"⚠️ Skipped {f!r}: {e}")

        return pid, images

In [None]:

def main():
    parser = argparse.ArgumentParser(
        description="Download, resize, and re-upload MIMIC-CXR images."
    )
    parser.add_argument(
        "--num-files",
        type=str,
        default="2",
        help="Number of files to download, or 'full' for all (default: 100).",
    )
    args, _ = parser.parse_known_args()
    nf: Union[int, str] = args.num_files.lower() != "full" and int(args.num_files) or "full"

    ds = MIMICCXRDataset(
        source_bucket="mimic-cxr-jpg-2.1.0.physionet.org",
        dest_bucket="medfuse-mimic-cxr",
        dest_prefix="resize",
        project="medfuse-456700",
    )
    ds.download_and_resize(num_files=nf)
    op = ds.get_patient_images('p100000321', display=True)
    print(op)

if __name__ == "__main__":
    main()


✅ Authenticated in Colab via your Google account.
📥 Downloading first 2 originals…


Downloading: 100%|██████████| 2/2 [00:00<00:00, 5829.47file/s]

🔄 Resizing 4504 images to 512px…



Resizing:  46%|████▌     | 2082/4504 [06:04<04:57,  8.15img/s]

⚠️ Skipped resizing PosixPath('cxr_data/original/files/p10/p10128191/s56054806/74f12ae0-ff4098a3-9d028adb-89c2f871-e69a99af.jpg'): image file is truncated (9 bytes not processed)


Resizing:  80%|████████  | 3622/4504 [10:26<02:00,  7.32img/s]

⚠️ Skipped resizing PosixPath('cxr_data/original/files/p10/p10077298/s50389495/a48a71dd-dc7235c3-40824c59-1e19782a-98cf123a.jpg'): cannot identify image file 'cxr_data/original/files/p10/p10077298/s50389495/a48a71dd-dc7235c3-40824c59-1e19782a-98cf123a.jpg'


Resizing: 100%|██████████| 4504/4504 [12:56<00:00,  5.80img/s]


📤 Uploading 4502 resized images…


Uploading: 100%|██████████| 4502/4502 [10:07<00:00,  7.41file/s]

🎉 All done (with any bad files skipped).
🔍 Searching for images in cxr_data/resized/p100000321
⚠️ No resized images found; falling back to originals.
❌ No images found for patient 'p100000321'.
('p100000321', [])





In [61]:
patient_id = 'p100000321'
ds.get_patient_images(patient_id, display=True)



AttributeError: 'MIMICCXRDataset' object has no attribute 'get_patient_images'

In [None]:
!pip install --upgrade --force-reinstall \
    numpy==1.23.5 \
    pandas==1.5.3 \
    pyhealth


Collecting numpy==1.23.5
  Downloading numpy-1.23.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)
Collecting pandas==1.5.3
  Using cached pandas-1.5.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pyhealth
  Using cached pyhealth-1.1.6-py2.py3-none-any.whl.metadata (28 kB)
Collecting python-dateutil>=2.8.1 (from pandas==1.5.3)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas==1.5.3)
  Downloading pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting torch>=1.8.0 (from pyhealth)
  Downloading torch-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (29 kB)
Collecting torchvision>=0.9.0 (from pyhealth)
  Downloading torchvision-0.22.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (6.1 kB)
Collecting rdkit>=2022.03.4 (from pyhealth)
  Using cached rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting scikit-learn

In [None]:
!pip install matplotlib-venn



In [None]:
!apt-get -qq install -y libfluidsynth1

E: Package 'libfluidsynth1' has no installation candidate


In [None]:
!gsutil ls -p medfuse-456700 gs://mimic-cxr-jpg-2.1.0.physionet.org

ServiceException: 401 Anonymous caller does not have storage.objects.list access to the Google Cloud Storage bucket. Permission 'storage.objects.list' denied on resource (or it may not exist).
