<a href="https://colab.research.google.com/github/a2m-dotcom/DLBCL_Pub/blob/main/Mask_and_Crop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Work on a copy so that later changes to `df` do not touch `df_clean`.
# `df_clean` is expected to already exist in the kernel (defined in an earlier cell).
df = df_clean.copy()  # from previous step

# Google Drive folders: slide images are read from images_dir,
# extracted crops are written to crops_dir.
images_dir = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/images"
crops_dir  = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/crops"


In [None]:
import os
# Create the crop output folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(crops_dir, exist_ok=True)


In [None]:
import cv2
import numpy as np
from tqdm import tqdm

CROP_SIZE = 128
HALF = CROP_SIZE // 2

def safe_crop(img, x, y, crop_size=128):
    """Cut a ``crop_size`` x ``crop_size`` window centred on pixel ``(x, y)``.

    The window is allowed to hang over any image border; the part that falls
    outside the image is filled with zeros on the side where it was clipped,
    so ``(x, y)`` always ends up at ``(crop_size // 2, crop_size // 2)`` in the
    result. Padding only at the bottom/right would shift the annotation away
    from the centre for crops near the top/left edge.

    Parameters
    ----------
    img : numpy.ndarray
        Image indexed as ``img[row, col]`` with optional trailing channel axes.
    x, y : int
        Column and row of the window centre, in image pixels.
    crop_size : int, default 128
        Side length of the square output.

    Returns
    -------
    numpy.ndarray
        New array of shape ``(crop_size, crop_size, *img.shape[2:])`` with the
        same dtype as ``img``.
    """
    x, y = int(x), int(y)
    half = crop_size // 2
    h, w = img.shape[:2]

    # Requested window in image coordinates; it may extend past any border.
    left, top = x - half, y - half
    right, bottom = left + crop_size, top + crop_size

    # Portion of that window that actually overlaps the image.
    x1, y1 = max(left, 0), max(top, 0)
    x2, y2 = min(right, w), min(bottom, h)

    # Zero canvas; trailing axes (e.g. colour channels) are carried over as-is.
    crop = np.zeros((crop_size, crop_size) + img.shape[2:], dtype=img.dtype)

    # A window lying completely outside the image has no overlap to copy.
    if x1 < x2 and y1 < y2:
        crop[y1 - top:y2 - top, x1 - left:x2 - left] = img[y1:y2, x1:x2]

    return crop


In [None]:
import os
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

CROP_SIZE = 128
HALF = CROP_SIZE // 2

def safe_crop(img, x, y, crop_size=CROP_SIZE):
    """Return a square crop of ``img`` with pixel ``(x, y)`` at its centre.

    Where the requested window reaches beyond the image, the missing area is
    zero-filled on the side that was clipped. This keeps ``(x, y)`` at
    ``(crop_size // 2, crop_size // 2)`` of the output for every annotation,
    including those close to the top or left border.

    Parameters
    ----------
    img : numpy.ndarray
        Image indexed as ``img[row, col]`` with optional trailing channel axes.
    x, y : int
        Column and row of the crop centre, in image pixels.
    crop_size : int, default CROP_SIZE
        Side length of the output.

    Returns
    -------
    numpy.ndarray
        New array of shape ``(crop_size, crop_size, *img.shape[2:])`` with the
        dtype of ``img``.
    """
    x, y = int(x), int(y)
    half = crop_size // 2
    h, w = img.shape[:2]

    # Window corners in image coordinates (may be negative or exceed w/h).
    left, top = x - half, y - half
    right, bottom = left + crop_size, top + crop_size

    # Clamp to the image to find the region that can really be copied.
    x1, y1 = max(left, 0), max(top, 0)
    x2, y2 = min(right, w), min(bottom, h)

    crop = np.zeros((crop_size, crop_size) + img.shape[2:], dtype=img.dtype)

    # Skip the copy when the window does not touch the image at all.
    if x1 < x2 and y1 < y2:
        # Offsets relative to the window origin place the pixels correctly.
        crop[y1 - top:y2 - top, x1 - left:x2 - left] = img[y1:y2, x1:x2]

    return crop


images_dir = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/images"
crops_dir  = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/crops"
os.makedirs(crops_dir, exist_ok=True)

# GROUP BY SLIDE: each slide image is decoded once and reused for all of its annotations.
grouped = df.groupby("filename")

# Crop files that could not be written; cv2.imwrite signals failure by
# returning False rather than raising, so the result has to be checked.
failed_crop_writes = []

for slide_name, group in tqdm(grouped, total=len(grouped)):
    slide_path = f"{images_dir}/{slide_name}"

    img = cv2.imread(slide_path)
    if img is None:
        # imread returns None for a missing or undecodable file.
        print("Could not open:", slide_path)
        continue

    # Process all annotations belonging to THIS slide
    for _, row in group.iterrows():
        x = int(row["x"])
        y = int(row["y"])
        ann = int(row["annotationID"])

        crop = safe_crop(img, x, y)
        # One PNG per annotation, named after its integer annotation ID.
        crop_path = f"{crops_dir}/{ann}.png"
        if not cv2.imwrite(crop_path, crop):
            failed_crop_writes.append(crop_path)

if failed_crop_writes:
    print(f"Failed to write {len(failed_crop_writes)} crop(s), e.g.:", failed_crop_writes[:5])




In [None]:
# Install MobileSAM directly from its GitHub repository (no version pin, so it tracks the default branch).
# NOTE(review): the cells visible in this notebook only import `segment_anything`; confirm this package is still needed.
!pip install git+https://github.com/ChaoningZhang/MobileSAM.git

In [None]:
# Install the facebookresearch/segment-anything repository from GitHub (unpinned),
# then OpenCV and matplotlib from PyPI.
!pip install git+https://github.com/facebookresearch/segment-anything.git
!pip install opencv-python matplotlib


In [None]:
# Download the SAM ViT-B checkpoint into the working directory as sam_vit_b.pth,
# the file name the model-loading cell passes as `sam_checkpoint`.
!wget -O sam_vit_b.pth https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth


In [None]:
import torch
from segment_anything import sam_model_registry, SamPredictor

# Use the GPU when the runtime has one; otherwise fall back to CPU so that
# moving the model does not fail on a CPU-only session.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint file and the registry key selecting the matching architecture.
sam_checkpoint = "sam_vit_b.pth"
model_type = "vit_b"

# Build the model from the checkpoint and move it to the chosen device.
sam = sam_model_registry[model_type](checkpoint=sam_checkpoint)
sam.to(device)

# Predictor wrapper used by the mask-generation code below.
predictor = SamPredictor(sam)

In [None]:
import numpy as np
import cv2

def get_sam_mask(crop, point=None):
    """Segment the object under a single prompt point with the global SAM ``predictor``.

    Parameters
    ----------
    crop : numpy.ndarray
        BGR image (as returned by ``cv2.imread``); it is converted to RGB
        before being handed to the predictor.
    point : tuple[int, int] | None, default None
        ``(x, y)`` pixel used as the positive prompt. When omitted, the centre
        of ``crop`` is used, which is ``(64, 64)`` for a 128 x 128 crop.

    Returns
    -------
    numpy.ndarray
        ``uint8`` mask built from the first predicted mask, scaled to 0 / 255.
    """
    crop_rgb = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
    predictor.set_image(crop_rgb)

    if point is None:
        # Derive the centre from the actual crop instead of assuming 128 px.
        height, width = crop.shape[:2]
        point = (width // 2, height // 2)

    input_point = np.array([point])
    # Label 1 = positive (foreground) prompt in the SAM predictor API.
    input_label = np.array([1])

    # multimask_output=False asks for a single mask; scores and low-res logits are not needed.
    masks, _, _ = predictor.predict(
        point_coords=input_point,
        point_labels=input_label,
        multimask_output=False
    )

    mask = (masks[0] * 255).astype(np.uint8)
    return mask


In [None]:
# Rebuild each crop's path the same way the extraction loop named the file:
# "<crops_dir>/<int(annotationID)>.png". Going through int() keeps the two in
# sync even if the column is stored as float, where a bare f-string would
# produce names like "12.0.png" that do not exist on disk.
df_clean["crop_path"] = df_clean["annotationID"].apply(
    lambda ann_id: f"{crops_dir}/{int(ann_id)}.png"
)


In [None]:
import os

from tqdm import tqdm

# Mask output folder. It is defined and created here because this loop is the
# first place that writes to it; without this a fresh "Run All" hits a
# NameError, and cv2.imwrite cannot write into a folder that does not exist.
masks_dir = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/masks"
os.makedirs(masks_dir, exist_ok=True)

for _, row in tqdm(df_clean.iterrows(), total=len(df_clean)):
    crop_path = row["crop_path"]
    ann = int(row["annotationID"])

    crop = cv2.imread(crop_path)
    if crop is None:
        # imread returns None for a missing or unreadable file.
        print("Failed to read:", crop_path)
        continue

    mask = get_sam_mask(crop)
    mask_path = f"{masks_dir}/{ann}.png"
    # imwrite reports failure through its return value instead of raising.
    if not cv2.imwrite(mask_path, mask):
        print("Failed to write:", mask_path)

In [None]:
# Google Drive folder layout used by this notebook:
# source slide images, extracted crops, and the masks produced from those crops.
images_dir = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/images"
crops_dir  = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/crops"
masks_dir = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/masks"

In [None]:
import os
import pandas as pd

# Same three Drive folders as declared in earlier cells, repeated here
# (presumably so this cell can be run on its own in a fresh session).
images_dir = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/images"
crops_dir  = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/crops"
masks_dir  = "/content/drive/MyDrive/MIDOGPP/MIDOGpp/masks"

def list_files(path):
    """Return the entry names in ``path``, sorted, leaving out dot-prefixed ones."""
    visible = []
    for entry in os.listdir(path):
        if entry.startswith("."):
            continue
        visible.append(entry)
    visible.sort()
    return visible

crop_files = list_files(crops_dir)
mask_files = list_files(masks_dir)
image_files = list_files(images_dir)

# Membership sets built from the listings above, so every lookup below is an
# in-memory check instead of a separate os.path.exists() call per candidate.
crop_names = set(crop_files)
mask_names = set(mask_files)
image_names = set(image_files)


def _first_match(stem, extensions, available, folder):
    """Return the path of the first ``stem + ext`` found in ``available``, else None."""
    for ext in extensions:
        name = stem + ext
        if name in available:
            return os.path.join(folder, name)
    return None


# Extract annotation_id from crop files (remove extension)
anno_ids = [os.path.splitext(f)[0] for f in crop_files]

rows = []

for anno in anno_ids:
    # Crops and masks are accepted as .png first, then .jpg; anything else is skipped.
    crop_path = _first_match(anno, (".png", ".jpg"), crop_names, crops_dir)
    if crop_path is None:
        continue

    mask_path = _first_match(anno, (".png", ".jpg"), mask_names, masks_dir)
    if mask_path is None:
        continue

    # determine parent image
    # NOTE(review): this takes the text before the first "_" as the slide name.
    # The crop-extraction loop in this notebook names crops "<annotationID>.png"
    # (no underscore), in which case image_id is just the annotation ID and no
    # slide will match unless slides are named after annotation IDs. Verify, or
    # map annotationID -> filename from the annotation table instead.
    image_id = anno.split("_")[0]   # IMPORTANT: adjust if needed

    # find matching image file
    image_path = _first_match(
        image_id, (".png", ".jpg", ".jpeg", ".tif", ".tiff"), image_names, images_dir
    )
    if image_path is None:
        continue

    rows.append([anno, image_path, crop_path, mask_path])

# Note: this rebinds `df` to the per-sample path table.
df = pd.DataFrame(rows, columns=["annotation_id", "image_path", "crop_path", "mask_path"])

print("Total annotations:", len(anno_ids))
print("Usable complete samples:", len(df))
df.head()