In [8]:
import kagglehub
import os
import numpy as np
import cv2
import json
from pycocotools.coco import COCO
from pycocotools import mask as maskUtils
from tqdm import tqdm
from PIL import Image

### 📥 Step 1: Download COCO 2017 Dataset

We downloaded the dataset using KaggleHub. The dataset includes:
- `train2017/` and `val2017/` image folders
- `annotations/` containing segmentation metadata in JSON format

The dataset path and structure were printed to verify successful setup.


In [5]:
# Download the dataset (or reuse the copy KaggleHub already has) and show where it lives.
# `os` and `kagglehub` come from the notebook's import cell, so nothing is re-imported here.
path = kagglehub.dataset_download("awsaf49/coco-2017-dataset")
print("✅ Dataset Path:", path)

# List the top-level contents. os.listdir returns entries in arbitrary order,
# so sort them to keep the printed output stable between runs.
print("📁 Files:")
print(sorted(os.listdir(path)))


✅ Dataset Path: /kaggle/input/coco-2017-dataset
📁 Files:
['coco2017']


### 🗂️ Step 2: Explore Dataset Structure

We walked through the folder to identify:
- Location of validation images: `val2017/`
- Annotations: `annotations/instances_val2017.json`

This helped us configure paths for further processing.


In [None]:
base_path = "/kaggle/input/coco-2017-dataset/coco2017"

# How many example files to show per directory.
MAX_SAMPLE_FILES = 3

# Print one summary line per directory plus a few sample files instead of one
# line per file: listing every image individually floods the cell output and
# hides the folder layout this step is meant to reveal.
for root, dirs, files in os.walk(base_path):
    dirs.sort()  # sorting in place makes os.walk visit sub-directories in a stable order
    print(f"{root}  ({len(files)} files)")
    for file in sorted(files)[:MAX_SAMPLE_FILES]:
        print("   ", os.path.join(root, file))


### 🧪 Step 3: Test Segmentation Mask Generation

We tested the segmentation mask generation pipeline by processing 100 images.

Each image was converted into a pixel-level mask where:
- Each class label (category) was mapped to a unique integer starting at 1; the value 0 marks pixels not covered by any annotation
- Masks were saved as `.png` files with the same base filename as the original image


In [9]:
# Paths
ann_path = "/kaggle/input/coco-2017-dataset/coco2017/annotations/instances_val2017.json"
img_dir = "/kaggle/input/coco-2017-dataset/coco2017/val2017"
output_mask_dir = "/kaggle/working/masks"
os.makedirs(output_mask_dir, exist_ok=True)

# Number of images converted in this trial run; raise it once the output looks right.
N_TEST_IMAGES = 100

# Load the COCO annotation index and build the category-id -> class-index map.
coco = COCO(ann_path)
img_ids = coco.getImgIds()
cat_ids = coco.getCatIds()
# Class indices start at 1 so that 0 stays available for pixels with no annotation.
cat_id_to_idx = {cat_id: idx + 1 for idx, cat_id in enumerate(cat_ids)}

# Masks are stored as uint8, so the largest class index must fit in 0..255.
assert len(cat_id_to_idx) <= np.iinfo(np.uint8).max, "too many categories for a uint8 mask"

# Convert the first N_TEST_IMAGES images into class-index masks.
for img_id in tqdm(img_ids[:N_TEST_IMAGES]):
    img_info = coco.loadImgs(img_id)[0]
    file_name = img_info['file_name']
    width, height = img_info['width'], img_info['height']

    # Crowd annotations are excluded; only individual instances are drawn.
    ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
    anns = coco.loadAnns(ann_ids)

    # Start from an all-zero mask (0 = no annotation).
    mask = np.zeros((height, width), dtype=np.uint8)

    for ann in anns:
        cat_id = ann['category_id']
        cat_idx = cat_id_to_idx[cat_id]
        rle = coco.annToRLE(ann)
        ann_mask = maskUtils.decode(rle)
        # Where annotations overlap, the one drawn later wins.
        mask[ann_mask == 1] = cat_idx

    # Swap only the real file extension for ".png". A plain str.replace would
    # leave names with any other extension untouched, and PIL would then pick a
    # lossy format from that extension and corrupt the class indices.
    mask_name = os.path.splitext(file_name)[0] + ".png"
    mask_path = os.path.join(output_mask_dir, mask_name)
    Image.fromarray(mask).save(mask_path)

loading annotations into memory...
Done (t=0.99s)
creating index...
index created!


100%|██████████| 100/100 [00:01<00:00, 94.40it/s]


### ⚠️ Step 4: Edge Case Handling

To ensure data quality, we handled common issues such as:
1. Missing or null segmentation fields
2. Zero-area annotations
3. Overlapping masks (handled by overwrite)
4. Invalid or unrecognized category IDs

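Cases 1, 2, and 4 were skipped during mask creation to avoid errors and noise. Overlapping masks (case 3) were not skipped: where two annotations overlap, the one processed later overwrites the earlier one.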


In [14]:
    # Illustrative version of the inner annotation loop with the edge-case guards added.
    # It operates on `anns` and `mask` left over from the image loop in the previous
    # code cell, and on `coco` / `cat_id_to_idx` defined there.
    for ann in anns:
        # Edge Case 1: the segmentation field is absent or null.
        # Edge Case 2: the annotation reports zero area.
        if ann.get('segmentation') is None or ann['area'] == 0:
            continue

        # Edge Case 4: the category id has no entry in the index map.
        cat_id = ann['category_id']
        if cat_id not in cat_id_to_idx:
            continue
        cat_idx = cat_id_to_idx[cat_id]

        # Decode the annotation into a binary mask with the image's dimensions.
        rle = coco.annToRLE(ann)
        ann_mask = maskUtils.decode(rle)

        # Edge Case 3: overlaps are resolved by overwrite, so the annotation
        # processed last determines the value of a shared pixel.
        covered = ann_mask == 1
        mask[covered] = cat_idx


### 💾 Step 5: Process Full Dataset and Save Metadata

We processed the first 3,000 validation images and created the corresponding segmentation masks.

In addition:
- A category-to-index map was saved in `category_mapping.json`
- A file-to-mask mapping was saved in `image_mask_map.json`
- All masks were saved in the `masks/` directory under the Kaggle working folder

This completes Task 1 – Dataset Preparation.


In [12]:
# Number of images converted in this step.
N_IMAGES = 3000


def build_label_mask(coco, img_id, img_info, cat_id_to_idx):
    """Rasterize one image's non-crowd annotations into a class-index mask.

    Args:
        coco: Loaded `COCO` annotation index.
        img_id: COCO image id whose annotations are drawn.
        img_info: Image record from `coco.loadImgs`; its 'width' and 'height'
            determine the mask size.
        cat_id_to_idx: Maps COCO category ids to the integer written into the mask.

    Returns:
        A (height, width) uint8 array holding 0 where no annotation was drawn and
        otherwise the class index of the last annotation drawn at that pixel.
    """
    mask = np.zeros((img_info['height'], img_info['width']), dtype=np.uint8)

    ann_ids = coco.getAnnIds(imgIds=img_id, iscrowd=False)
    for ann in coco.loadAnns(ann_ids):
        # Skip annotations with a missing, null, or empty segmentation.
        if not ann.get('segmentation'):
            continue
        # Skip zero-area annotations.
        if ann['area'] == 0:
            continue
        # Skip categories that have no entry in the index map.
        cat_id = ann['category_id']
        if cat_id not in cat_id_to_idx:
            continue

        ann_mask = maskUtils.decode(coco.annToRLE(ann))
        # Overlaps are resolved by overwrite: the later annotation wins.
        mask[ann_mask == 1] = cat_id_to_idx[cat_id]

    return mask


# Save category mapping.
# Note: JSON object keys are strings, so the integer category ids come back as
# strings when this file is loaded again.
with open("/kaggle/working/category_mapping.json", "w") as f:
    json.dump(cat_id_to_idx, f, indent=2)

# Image file name -> mask file name, filled in as masks are written.
image_mask_map = {}

# Process the first N_IMAGES images.
for img_id in tqdm(img_ids[:N_IMAGES]):
    img_info = coco.loadImgs(img_id)[0]
    file_name = img_info['file_name']

    mask = build_label_mask(coco, img_id, img_info, cat_id_to_idx)

    # Swap only the real file extension for ".png"; a plain str.replace would leave
    # names with any other extension untouched and PIL would then choose a lossy
    # format from that extension.
    mask_filename = os.path.splitext(file_name)[0] + ".png"
    Image.fromarray(mask).save(os.path.join(output_mask_dir, mask_filename))
    image_mask_map[file_name] = mask_filename

# Save mapping file
with open("/kaggle/working/image_mask_map.json", "w") as f:
    json.dump(image_mask_map, f, indent=2)

# Report the number of masks actually written rather than a hard-coded figure.
print(f"✅ Step 5 complete: Saved {len(image_mask_map)} masks, mapping files.")

100%|██████████| 3000/3000 [00:31<00:00, 96.08it/s] 

✅ Step 4 complete: Saved 3000 masks, mapping files.



