In [1]:
import os
import glob
import json
import numpy as np
from PIL import Image
from google.cloud import storage
from shapely.geometry import Polygon, MultiPolygon
import shutil

In [2]:
# Start from a clean slate so files written by an earlier run cannot leak into this one
_stale_output_dir = "/kaggle/working/output"
if os.path.exists(_stale_output_dir):
    shutil.rmtree(_stale_output_dir)
    print("Previous output directory deleted")

In [4]:
# Function to download files from Google Cloud Storage
def download_from_gcs(bucket_name, prefix, local_dir, extensions=None):
    """
    Download blobs under a GCS prefix into one flat local directory.

    Every blob whose name starts with ``prefix`` is considered. Each blob is
    saved under its base name only, so blobs that share a file name in
    different sub-folders end up at the same local path.

    Args:
    - bucket_name: Name of the GCS bucket
    - prefix: Prefix path in the bucket
    - local_dir: Local directory to download files to (created if missing)
    - extensions: List of extensions to download (case-sensitive suffix match).
      If None, download all files.

    Returns:
    - Number of files downloaded
    """
    # The destination may not exist yet; downloading into a missing directory fails.
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    storage_client = storage.Client(project='tidy-rig-452705-i9')
    bucket = storage_client.bucket(bucket_name)

    # str.endswith accepts a tuple, which replaces the per-extension any() scan.
    suffixes = None if extensions is None else tuple(extensions)

    downloaded_count = 0
    for blob in bucket.list_blobs(prefix=prefix):
        filename = os.path.basename(blob.name)
        if not filename:
            # "Folder" placeholder objects have names ending in "/" and therefore
            # no base name; the destination would be local_dir itself.
            continue
        if suffixes is not None and not blob.name.endswith(suffixes):
            continue
        blob.download_to_filename(os.path.join(local_dir, filename))
        downloaded_count += 1

    print(f"Downloaded {downloaded_count} files{' with extensions ' + str(extensions) if extensions else ''}")
    return downloaded_count

In [28]:
import os
import json
import numpy as np
import cv2
from PIL import Image
from shapely.geometry import Polygon, MultiPolygon
from google.cloud import storage
import glob

# Label ids of the dataset: a name's position in this tuple is its COCO category id
_CATEGORY_NAMES = (
    "Background",
    "Bolt",
    "Bolt Washer",
    "Busbar",
    "Cable",
    "Connector",
    "Nut",
    "Plastic Film",
    "Plastic Cover",
)
category_ids = {name: index for index, name in enumerate(_CATEGORY_NAMES)}

# Bucket folder names that hold data for each category
category_to_folders = {
    "Bolt": ["bolt1", "bolt2", "bolt3"],
    "Busbar": ["busbar_long", "busbar_mid", "busbar_short"],
    "Cable": ["cable"],
    "Connector": ["connectors"],
    "Nut": ["nut1", "nut2"],
    "Plastic Film": ["plastic_film"],
    "Plastic Cover": ["plastic_cover"],
}

# Reverse lookup: folder name -> category id
folder_to_category_id = {
    folder_name: category_ids[category_name]
    for category_name, folder_names in category_to_folders.items()
    for folder_name in folder_names
}

# Every category except background may consist of several disjoint polygons
multipolygon_ids = sorted(cid for cid in category_ids.values() if cid != 0)

# Function to download files from Google Cloud Storage
def download_from_gcs(bucket_name, prefix, local_dir, extensions=None):
    """
    Download blobs under a GCS prefix into one flat local directory.

    Every blob whose name starts with ``prefix`` is considered. Each blob is
    saved under its base name only, so blobs that share a file name in
    different sub-folders end up at the same local path.

    Args:
    - bucket_name: Name of the GCS bucket
    - prefix: Prefix path in the bucket
    - local_dir: Local directory to download files to (created if missing)
    - extensions: List of extensions to download (case-sensitive suffix match).
      If None, download all files.

    Returns:
    - Number of files downloaded
    """
    # The destination may not exist yet; downloading into a missing directory fails.
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    storage_client = storage.Client(project='tidy-rig-452705-i9')
    bucket = storage_client.bucket(bucket_name)

    # str.endswith accepts a tuple, which replaces the per-extension any() scan.
    suffixes = None if extensions is None else tuple(extensions)

    downloaded_count = 0
    for blob in bucket.list_blobs(prefix=prefix):
        filename = os.path.basename(blob.name)
        if not filename:
            # "Folder" placeholder objects have names ending in "/" and therefore
            # no base name; the destination would be local_dir itself.
            continue
        if suffixes is not None and not blob.name.endswith(suffixes):
            continue
        blob.download_to_filename(os.path.join(local_dir, filename))
        downloaded_count += 1

    print(f"Downloaded {downloaded_count} files{' with extensions ' + str(extensions) if extensions else ''}")
    return downloaded_count

# Get COCO json format
def get_coco_json_format():
    """Return an empty COCO skeleton with a fresh list for each top-level section."""
    skeleton = {}
    for section in ("images", "annotations", "categories"):
        skeleton[section] = []
    return skeleton

# Create categories in COCO format
def create_category_annotation(category_ids):
    """Turn a ``{name: id}`` mapping into a list of COCO category records.

    Records keep the mapping's iteration order and all use the supercategory "none".
    """
    categories = []
    for label, identifier in category_ids.items():
        categories.append({"id": identifier, "name": label, "supercategory": "none"})
    return categories

# Create image entry for COCO format
def create_image_annotation(file_name, width, height, image_id):
    """Build one COCO image record.

    Args:
    - file_name: Value stored under "file_name"
    - width: Image width
    - height: Image height
    - image_id: Value stored under "id"
    """
    record = dict(file_name=file_name, height=height, width=width)
    record["id"] = image_id
    return record

# Create sub-masks from grayscale mask
def create_sub_masks(mask_image, w, h):
    """Split a label mask into one binary image per non-zero pixel value.

    Args:
    - mask_image: Mask to split; converted with ``np.array``
    - w: Mask width, used to size each binary sub-mask
    - h: Mask height, used to size each binary sub-mask

    Returns:
    - Dict mapping each non-zero pixel value (as yielded by ``np.unique``) to a
      PIL image that is 1 where the mask equals that value and 0 elsewhere
    """
    pixels = np.array(mask_image)
    # Value 0 is background and never gets a sub-mask of its own
    foreground_values = [level for level in np.unique(pixels) if level != 0]

    per_value = {}
    for level in foreground_values:
        canvas = np.zeros((h, w), dtype=np.uint8)
        canvas[pixels == level] = 1
        per_value[level] = Image.fromarray(canvas)
    return per_value

# Create polygon from binary mask
def create_sub_mask_annotation(sub_mask):
    """Trace the outer contours of a binary mask and turn them into polygons.

    Contours with fewer than three points, or whose polygon shapely reports as
    invalid, are dropped.

    Returns:
    - polygons: List of shapely ``Polygon`` objects, one per kept contour
    - segmentations: Parallel list of flat ``[x0, y0, x1, y1, ...]`` coordinate lists
    """
    binary = np.array(sub_mask, dtype=np.uint8)
    outlines, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    kept = []
    for outline in outlines:
        # Two values per point, so fewer than 6 values cannot form a polygon
        if outline.size < 6:
            continue
        shape = Polygon(outline.reshape(-1, 2))
        if not shape.is_valid:
            continue
        kept.append((shape, outline.flatten().tolist()))

    polygons = [shape for shape, _ in kept]
    segmentations = [coords for _, coords in kept]
    return polygons, segmentations

# Create annotation in COCO format
def create_annotation_format(polygon, segmentation, image_id, category_id, annotation_id):
    """Build one COCO annotation record from a geometry.

    Args:
    - polygon: Object exposing ``bounds`` as (min_x, min_y, max_x, max_y) and ``area``
    - segmentation: Value stored unchanged under "segmentation"
    - image_id: Id of the image the annotation belongs to
    - category_id: Category id of the object
    - annotation_id: Id of this annotation

    Returns:
    - Dict with "bbox" as [x, y, width, height] and "iscrowd" fixed to 0
    """
    left, top, right, bottom = polygon.bounds
    record = {
        "segmentation": segmentation,
        "iscrowd": 0,
        "area": polygon.area,
        "image_id": image_id,
        "bbox": [left, top, right - left, bottom - top],
        "category_id": category_id,
        "id": annotation_id,
    }
    return record

# Process images and masks to create COCO annotations
def images_annotations_info(image_dir, mask_dir, component_name):
    """
    Convert every ``*.png`` mask in ``mask_dir`` into COCO image/annotation records.

    A mask is matched to its image by file name: the mask's stem with any
    "_semantic_mask" removed, plus ".jpg" or ".png", looked up in ``image_dir``.
    Masks without a matching image, or containing only background (0), are
    reported and skipped.

    Args:
    - image_dir: Directory holding the source images
    - mask_dir: Directory holding the label masks
    - component_name: Folder name of the component. If it is a key of
      ``folder_to_category_id`` every object gets that category; otherwise the
      mask's pixel value is used as the category id.

    Returns:
    - images: List of COCO image records, ids starting at 0
    - annotations: List of COCO annotation records, ids starting at 0
    - annotation_id: Next unused annotation id (number of annotations created)
    - image_id: Next unused image id (number of images registered)
    """
    annotation_id = 0
    image_id = 0
    annotations = []
    images = []

    # glob order depends on the filesystem; sorting keeps ids reproducible across runs
    mask_files = sorted(glob.glob(os.path.join(mask_dir, "*.png")))

    print(f"Found {len(mask_files)} mask files in {mask_dir}")
    skipped_count = 0
    empty_mask_count = 0

    # Try both .jpg and .png extensions
    image_extensions = ['.jpg', '.png']

    for mask_path in mask_files:
        mask_filename = os.path.basename(mask_path)
        # Remove "_semantic_mask" from the filename before changing extension
        base_name = os.path.splitext(mask_filename)[0].replace("_semantic_mask", "")

        image_filename = None
        for ext in image_extensions:
            candidate = base_name + ext
            if os.path.exists(os.path.join(image_dir, candidate)):
                image_filename = candidate
                break

        if image_filename is None:
            print(f"Warning: No matching image for mask {mask_filename}")
            print(f"Checked extensions: {image_extensions}")
            continue

        # convert() returns an independent image, so the file handle can be
        # released right away instead of staying open until garbage collection
        with Image.open(mask_path) as raw_mask:
            mask_image = raw_mask.convert("L")

        # Check if mask has any objects
        unique_values = np.unique(np.array(mask_image))
        if len(unique_values) == 1 and unique_values[0] == 0:
            print(f"Warning: Mask {mask_filename} contains only background")
            empty_mask_count += 1
            continue

        # NOTE(review): width/height come from the mask; this assumes the image
        # has the same size, which is not verified here
        w, h = mask_image.size

        sub_masks = create_sub_masks(mask_image, w, h)
        if len(sub_masks) == 0:
            print(f"Warning: No valid objects found in mask {mask_filename}")
            skipped_count += 1
            continue

        # Register the image only once it is known to be kept, so a skipped mask
        # never leaves behind a record whose id the next image would reuse
        images.append(create_image_annotation(image_filename, w, h, image_id))

        for pixel_value, sub_mask in sub_masks.items():
            # Get the category ID based on the component name
            category_id = folder_to_category_id.get(component_name, int(pixel_value))

            polygons, segmentations = create_sub_mask_annotation(sub_mask)

            if category_id in multipolygon_ids and len(polygons) > 1:
                # One annotation for all parts. The MultiPolygon has the same
                # bounds as its convex hull, but its area is the sum of the
                # parts rather than including the empty space between them.
                multi_poly = MultiPolygon(polygons)
                annotations.append(
                    create_annotation_format(
                        multi_poly, segmentations, image_id, category_id, annotation_id
                    )
                )
                annotation_id += 1
            else:
                for polygon, segmentation in zip(polygons, segmentations):
                    annotations.append(
                        create_annotation_format(
                            polygon, [segmentation], image_id, category_id, annotation_id
                        )
                    )
                    annotation_id += 1

        image_id += 1

    # After the loop
    print(f"Skipped {skipped_count} masks with no valid objects")
    print(f"Found {empty_mask_count} empty masks (only background)")

    return images, annotations, annotation_id, image_id

# Main execution
def main():
    """Download every component from GCS, convert it to COCO and write one JSON file.

    Regular and augmented data of each component are converted separately and
    merged into a single image list and a single annotation list with ids that
    are unique across the whole dataset.
    """
    kaggle_dir = "/kaggle/working/"
    output_dir = os.path.join(kaggle_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    bucket_name = "segmentedimages"

    # Components per top-level bucket folder
    components_by_source = {
        "sreeni": [
            "bolt1", "bolt2", "bolt3",
            "busbar_long", "busbar_mid", "busbar_short",
            "cable", "connectors",
            "mechmind", "multi",
            "nut1", "nut2",
            "plastic_cover",
        ],
        "labelstudio": [
            "busbar_mid", "multi", "plastic_film",
        ],
    }

    coco_format = get_coco_json_format()
    coco_format["categories"] = create_category_annotation(category_ids)

    all_images = []
    all_annotations = []

    for source, components in components_by_source.items():
        for component in components:
            _process_component(
                bucket_name, kaggle_dir, source, component, all_images, all_annotations
            )

    coco_format["images"] = all_images
    coco_format["annotations"] = all_annotations

    output_file = os.path.join(output_dir, "coco_annotations.json")
    with open(output_file, "w") as f:
        json.dump(coco_format, f)

    print(f"Created a total of {len(all_annotations)} annotations for {len(all_images)} images")
    print(f"Saved COCO format annotations to {output_file}")


def _append_with_global_ids(all_images, all_annotations, images, annotations):
    """Shift one batch's ids past everything collected so far, then append it.

    images_annotations_info() numbers images and annotations from 0 on every
    call, so each batch (regular as well as augmented) must be moved into the
    global id space before merging. Otherwise different components share ids
    and annotations point at the wrong images.
    """
    image_offset = len(all_images)
    annotation_offset = len(all_annotations)
    for img in images:
        img["id"] += image_offset
    for ann in annotations:
        ann["image_id"] += image_offset
        ann["id"] += annotation_offset
    all_images.extend(images)
    all_annotations.extend(annotations)


def _process_component(bucket_name, kaggle_dir, source, component, all_images, all_annotations):
    """Download and convert the regular and augmented data of ``source/component``."""
    print(f"Processing {source}/{component}...")

    # 1. Process regular images and masks
    image_dir = os.path.join(kaggle_dir, f"{source}_{component}_images")
    mask_dir = os.path.join(kaggle_dir, f"{source}_{component}_masks")
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(mask_dir, exist_ok=True)

    print(f"Downloading images for {source}/{component}...")
    download_from_gcs(bucket_name, f"{source}/{component}/images", image_dir, extensions=['.jpg', '.png'])

    print(f"Downloading masks for {source}/{component}...")
    download_from_gcs(bucket_name, f"{source}/{component}/segmented_images", mask_dir, extensions=['.png'])

    images, annotations, annotation_count, _ = images_annotations_info(
        image_dir, mask_dir, component
    )
    _append_with_global_ids(all_images, all_annotations, images, annotations)

    print(f"Processed {len(images)} regular images and created {annotation_count} annotations for {source}/{component}")

    # 2. Process augmented images and masks if they exist
    aug_image_dir = os.path.join(kaggle_dir, f"{source}_{component}_augmented_images")
    aug_mask_dir = os.path.join(kaggle_dir, f"{source}_{component}_augmented_masks")
    os.makedirs(aug_image_dir, exist_ok=True)
    os.makedirs(aug_mask_dir, exist_ok=True)

    print(f"Checking for augmented data in {source}/{component}/augment/...")
    aug_images_count = download_from_gcs(
        bucket_name,
        f"{source}/{component}/augment/augmented_images",
        aug_image_dir,
        extensions=['.jpg', '.png']
    )
    if aug_images_count == 0:
        # No augmented images for this component, so the masks are not fetched either
        return

    print(f"Downloading augmented masks for {source}/{component}...")
    download_from_gcs(
        bucket_name,
        f"{source}/{component}/augment/augmented_masks",
        aug_mask_dir,
        extensions=['.png']
    )

    aug_images, aug_annotations, aug_annotation_count, _ = images_annotations_info(
        aug_image_dir, aug_mask_dir, component
    )
    _append_with_global_ids(all_images, all_annotations, aug_images, aug_annotations)

    print(f"Processed {len(aug_images)} augmented images and created {aug_annotation_count} annotations for {source}/{component}")


if __name__ == "__main__":
    main()

Processing sreeni/bolt1...
Downloading images for sreeni/bolt1...
Downloaded 329 files with extensions ['.jpg', '.png']
Downloading masks for sreeni/bolt1...
Downloaded 329 files with extensions ['.png']
Found 329 mask files in /kaggle/working/sreeni_bolt1_masks
Skipped 0 masks with no valid objects
Found 0 empty masks (only background)
Processed 329 regular images and created 327 annotations for sreeni/bolt1
Checking for augmented data in sreeni/bolt1/augment/...
Downloaded 600 files with extensions ['.jpg', '.png']
Downloading augmented masks for sreeni/bolt1...
Downloaded 600 files with extensions ['.png']
Found 600 mask files in /kaggle/working/sreeni_bolt1_augmented_masks
Skipped 0 masks with no valid objects
Found 0 empty masks (only background)
Processed 600 augmented images and created 590 annotations for sreeni/bolt1
Processing sreeni/bolt2...
Downloading images for sreeni/bolt2...
Downloaded 257 files with extensions ['.jpg', '.png']
Downloading masks for sreeni/bolt2...
Down

<h3>COCO JSON with file paths</h3>

In [5]:
import os
import json
import numpy as np
import cv2
from PIL import Image
from shapely.geometry import Polygon, MultiPolygon
from google.cloud import storage
import glob

# Label ids of the dataset: a name's position in this tuple is its COCO category id
_CATEGORY_NAMES = (
    "Background",
    "Bolt",
    "Bolt Washer",
    "Busbar",
    "Cable",
    "Connector",
    "Nut",
    "Plastic Film",
    "Plastic Cover",
)
category_ids = {name: index for index, name in enumerate(_CATEGORY_NAMES)}

# Bucket folder names that hold data for each category
category_to_folders = {
    "Bolt": ["bolt1", "bolt2", "bolt3"],
    "Busbar": ["busbar_long", "busbar_mid", "busbar_short"],
    "Cable": ["cable"],
    "Connector": ["connectors"],
    "Nut": ["nut1", "nut2"],
    "Plastic Film": ["plastic_film"],
    "Plastic Cover": ["plastic_cover"],
}

# Reverse lookup: folder name -> category id
folder_to_category_id = {
    folder_name: category_ids[category_name]
    for category_name, folder_names in category_to_folders.items()
    for folder_name in folder_names
}

# Every category except background may consist of several disjoint polygons
multipolygon_ids = sorted(cid for cid in category_ids.values() if cid != 0)

# Function to download files from Google Cloud Storage
def download_from_gcs(bucket_name, prefix, local_dir, extensions=None):
    """
    Download blobs under a GCS prefix into one flat local directory.

    Every blob whose name starts with ``prefix`` is considered. Each blob is
    saved under its base name only, so blobs that share a file name in
    different sub-folders end up at the same local path.

    Args:
    - bucket_name: Name of the GCS bucket
    - prefix: Prefix path in the bucket
    - local_dir: Local directory to download files to (created if missing)
    - extensions: List of extensions to download (case-sensitive suffix match).
      If None, download all files.

    Returns:
    - Number of files downloaded
    """
    # The destination may not exist yet; downloading into a missing directory fails.
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)

    storage_client = storage.Client(project='tidy-rig-452705-i9')
    bucket = storage_client.bucket(bucket_name)

    # str.endswith accepts a tuple, which replaces the per-extension any() scan.
    suffixes = None if extensions is None else tuple(extensions)

    downloaded_count = 0
    for blob in bucket.list_blobs(prefix=prefix):
        filename = os.path.basename(blob.name)
        if not filename:
            # "Folder" placeholder objects have names ending in "/" and therefore
            # no base name; the destination would be local_dir itself.
            continue
        if suffixes is not None and not blob.name.endswith(suffixes):
            continue
        blob.download_to_filename(os.path.join(local_dir, filename))
        downloaded_count += 1

    print(f"Downloaded {downloaded_count} files{' with extensions ' + str(extensions) if extensions else ''}")
    return downloaded_count

# Get COCO json format
def get_coco_json_format():
    """Return an empty COCO skeleton with a fresh list for each top-level section."""
    skeleton = {}
    for section in ("images", "annotations", "categories"):
        skeleton[section] = []
    return skeleton

# Create categories in COCO format
def create_category_annotation(category_ids):
    """Turn a ``{name: id}`` mapping into a list of COCO category records.

    Records keep the mapping's iteration order and all use the supercategory "none".
    """
    categories = []
    for label, identifier in category_ids.items():
        categories.append({"id": identifier, "name": label, "supercategory": "none"})
    return categories

# Create image entry for COCO format
def create_image_annotation(file_path, width, height, image_id):
    """Build one COCO image record.

    Args:
    - file_path: Value stored under "file_name"
    - width: Image width
    - height: Image height
    - image_id: Value stored under "id"
    """
    record = dict(file_name=file_path, height=height, width=width)
    record["id"] = image_id
    return record

# Create sub-masks from grayscale mask
def create_sub_masks(mask_image, w, h):
    """Split a label mask into one binary image per non-zero pixel value.

    Args:
    - mask_image: Mask to split; converted with ``np.array``
    - w: Mask width, used to size each binary sub-mask
    - h: Mask height, used to size each binary sub-mask

    Returns:
    - Dict mapping each non-zero pixel value (as yielded by ``np.unique``) to a
      PIL image that is 1 where the mask equals that value and 0 elsewhere
    """
    pixels = np.array(mask_image)
    # Value 0 is background and never gets a sub-mask of its own
    foreground_values = [level for level in np.unique(pixels) if level != 0]

    per_value = {}
    for level in foreground_values:
        canvas = np.zeros((h, w), dtype=np.uint8)
        canvas[pixels == level] = 1
        per_value[level] = Image.fromarray(canvas)
    return per_value

# Create polygon from binary mask
def create_sub_mask_annotation(sub_mask):
    """Trace the outer contours of a binary mask and turn them into polygons.

    Contours with fewer than three points, or whose polygon shapely reports as
    invalid, are dropped.

    Returns:
    - polygons: List of shapely ``Polygon`` objects, one per kept contour
    - segmentations: Parallel list of flat ``[x0, y0, x1, y1, ...]`` coordinate lists
    """
    binary = np.array(sub_mask, dtype=np.uint8)
    outlines, _hierarchy = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    kept = []
    for outline in outlines:
        # Two values per point, so fewer than 6 values cannot form a polygon
        if outline.size < 6:
            continue
        shape = Polygon(outline.reshape(-1, 2))
        if not shape.is_valid:
            continue
        kept.append((shape, outline.flatten().tolist()))

    polygons = [shape for shape, _ in kept]
    segmentations = [coords for _, coords in kept]
    return polygons, segmentations

# Create annotation in COCO format
def create_annotation_format(polygon, segmentation, image_id, category_id, annotation_id):
    """Build one COCO annotation record from a geometry.

    Args:
    - polygon: Object exposing ``bounds`` as (min_x, min_y, max_x, max_y) and ``area``
    - segmentation: Value stored unchanged under "segmentation"
    - image_id: Id of the image the annotation belongs to
    - category_id: Category id of the object
    - annotation_id: Id of this annotation

    Returns:
    - Dict with "bbox" as [x, y, width, height] and "iscrowd" fixed to 0
    """
    left, top, right, bottom = polygon.bounds
    record = {
        "segmentation": segmentation,
        "iscrowd": 0,
        "area": polygon.area,
        "image_id": image_id,
        "bbox": [left, top, right - left, bottom - top],
        "category_id": category_id,
        "id": annotation_id,
    }
    return record

# Process images and masks to create COCO annotations
def images_annotations_info(image_dir, mask_dir, component_name):
    """
    Convert every ``*.png`` mask in ``mask_dir`` into COCO image/annotation records.

    A mask is matched to its image by file name: the mask's stem with any
    "_semantic_mask" removed, plus ".jpg" or ".png", looked up in ``image_dir``.
    Masks without a matching image, or containing only background (0), are
    reported and skipped. Image records store the image's path inside
    ``image_dir`` as their "file_name".

    Args:
    - image_dir: Directory holding the source images
    - mask_dir: Directory holding the label masks
    - component_name: Folder name of the component. If it is a key of
      ``folder_to_category_id`` every object gets that category; otherwise the
      mask's pixel value is used as the category id.

    Returns:
    - images: List of COCO image records, ids starting at 0
    - annotations: List of COCO annotation records, ids starting at 0
    - annotation_id: Next unused annotation id (number of annotations created)
    - image_id: Next unused image id (number of images registered)
    """
    annotation_id = 0
    image_id = 0
    annotations = []
    images = []

    # glob order depends on the filesystem; sorting keeps ids reproducible across runs
    mask_files = sorted(glob.glob(os.path.join(mask_dir, "*.png")))

    print(f"Found {len(mask_files)} mask files in {mask_dir}")
    skipped_count = 0
    empty_mask_count = 0

    # Try both .jpg and .png extensions
    image_extensions = ['.jpg', '.png']

    for mask_path in mask_files:
        mask_filename = os.path.basename(mask_path)
        # Remove "_semantic_mask" from the filename before changing extension
        base_name = os.path.splitext(mask_filename)[0].replace("_semantic_mask", "")

        image_path = None
        for ext in image_extensions:
            candidate_path = os.path.join(image_dir, base_name + ext)
            if os.path.exists(candidate_path):
                image_path = candidate_path
                break

        if image_path is None:
            print(f"Warning: No matching image for mask {mask_filename}")
            print(f"Checked extensions: {image_extensions}")
            continue

        # convert() returns an independent image, so the file handle can be
        # released right away instead of staying open until garbage collection
        with Image.open(mask_path) as raw_mask:
            mask_image = raw_mask.convert("L")

        # Check if mask has any objects
        unique_values = np.unique(np.array(mask_image))
        if len(unique_values) == 1 and unique_values[0] == 0:
            print(f"Warning: Mask {mask_filename} contains only background")
            empty_mask_count += 1
            continue

        # NOTE(review): width/height come from the mask; this assumes the image
        # has the same size, which is not verified here
        w, h = mask_image.size

        sub_masks = create_sub_masks(mask_image, w, h)
        if len(sub_masks) == 0:
            print(f"Warning: No valid objects found in mask {mask_filename}")
            skipped_count += 1
            continue

        # Register the image only once it is known to be kept, so a skipped mask
        # never leaves behind a record whose id the next image would reuse
        images.append(create_image_annotation(image_path, w, h, image_id))

        for pixel_value, sub_mask in sub_masks.items():
            # Get the category ID based on the component name
            category_id = folder_to_category_id.get(component_name, int(pixel_value))

            polygons, segmentations = create_sub_mask_annotation(sub_mask)

            if category_id in multipolygon_ids and len(polygons) > 1:
                # One annotation for all parts. The MultiPolygon has the same
                # bounds as its convex hull, but its area is the sum of the
                # parts rather than including the empty space between them.
                multi_poly = MultiPolygon(polygons)
                annotations.append(
                    create_annotation_format(
                        multi_poly, segmentations, image_id, category_id, annotation_id
                    )
                )
                annotation_id += 1
            else:
                for polygon, segmentation in zip(polygons, segmentations):
                    annotations.append(
                        create_annotation_format(
                            polygon, [segmentation], image_id, category_id, annotation_id
                        )
                    )
                    annotation_id += 1

        image_id += 1

    # After the loop
    print(f"Skipped {skipped_count} masks with no valid objects")
    print(f"Found {empty_mask_count} empty masks (only background)")

    return images, annotations, annotation_id, image_id

# Main execution
def main():
    """Download every component from GCS, convert it to COCO and write one JSON file.

    Regular and augmented data of each component are converted separately and
    merged into a single image list and a single annotation list with ids that
    are unique across the whole dataset.
    """
    kaggle_dir = "/kaggle/working/"
    output_dir = os.path.join(kaggle_dir, "output")
    os.makedirs(output_dir, exist_ok=True)

    bucket_name = "segmentedimages"

    # Components per top-level bucket folder
    components_by_source = {
        "sreeni": [
            "bolt1", "bolt2", "bolt3",
            "busbar_long", "busbar_mid", "busbar_short",
            "cable", "connectors",
            "mechmind", "multi",
            "nut1", "nut2",
            "plastic_cover",
        ],
        "labelstudio": [
            "busbar_mid", "multi", "plastic_film",
        ],
    }

    coco_format = get_coco_json_format()
    coco_format["categories"] = create_category_annotation(category_ids)

    all_images = []
    all_annotations = []

    for source, components in components_by_source.items():
        for component in components:
            _process_component(
                bucket_name, kaggle_dir, source, component, all_images, all_annotations
            )

    coco_format["images"] = all_images
    coco_format["annotations"] = all_annotations

    output_file = os.path.join(output_dir, "full_coco_annotations.json")
    with open(output_file, "w") as f:
        json.dump(coco_format, f)

    print(f"Created a total of {len(all_annotations)} annotations for {len(all_images)} images")
    print(f"Saved COCO format annotations to {output_file}")


def _append_with_global_ids(all_images, all_annotations, images, annotations):
    """Shift one batch's ids past everything collected so far, then append it.

    images_annotations_info() numbers images and annotations from 0 on every
    call, so each batch (regular as well as augmented) must be moved into the
    global id space before merging. Otherwise different components share ids
    and annotations point at the wrong images.
    """
    image_offset = len(all_images)
    annotation_offset = len(all_annotations)
    for img in images:
        img["id"] += image_offset
    for ann in annotations:
        ann["image_id"] += image_offset
        ann["id"] += annotation_offset
    all_images.extend(images)
    all_annotations.extend(annotations)


def _process_component(bucket_name, kaggle_dir, source, component, all_images, all_annotations):
    """Download and convert the regular and augmented data of ``source/component``."""
    print(f"Processing {source}/{component}...")

    # 1. Process regular images and masks
    image_dir = os.path.join(kaggle_dir, f"{source}_{component}_images")
    mask_dir = os.path.join(kaggle_dir, f"{source}_{component}_masks")
    os.makedirs(image_dir, exist_ok=True)
    os.makedirs(mask_dir, exist_ok=True)

    print(f"Downloading images for {source}/{component}...")
    download_from_gcs(bucket_name, f"{source}/{component}/images", image_dir, extensions=['.jpg', '.png'])

    print(f"Downloading masks for {source}/{component}...")
    download_from_gcs(bucket_name, f"{source}/{component}/segmented_images", mask_dir, extensions=['.png'])

    images, annotations, annotation_count, _ = images_annotations_info(
        image_dir, mask_dir, component
    )
    _append_with_global_ids(all_images, all_annotations, images, annotations)

    print(f"Processed {len(images)} regular images and created {annotation_count} annotations for {source}/{component}")

    # 2. Process augmented images and masks if they exist
    aug_image_dir = os.path.join(kaggle_dir, f"{source}_{component}_augmented_images")
    aug_mask_dir = os.path.join(kaggle_dir, f"{source}_{component}_augmented_masks")
    os.makedirs(aug_image_dir, exist_ok=True)
    os.makedirs(aug_mask_dir, exist_ok=True)

    print(f"Checking for augmented data in {source}/{component}/augment/...")
    aug_images_count = download_from_gcs(
        bucket_name,
        f"{source}/{component}/augment/augmented_images",
        aug_image_dir,
        extensions=['.jpg', '.png']
    )
    if aug_images_count == 0:
        # No augmented images for this component, so the masks are not fetched either
        return

    print(f"Downloading augmented masks for {source}/{component}...")
    download_from_gcs(
        bucket_name,
        f"{source}/{component}/augment/augmented_masks",
        aug_mask_dir,
        extensions=['.png']
    )

    aug_images, aug_annotations, aug_annotation_count, _ = images_annotations_info(
        aug_image_dir, aug_mask_dir, component
    )
    _append_with_global_ids(all_images, all_annotations, aug_images, aug_annotations)

    print(f"Processed {len(aug_images)} augmented images and created {aug_annotation_count} annotations for {source}/{component}")


if __name__ == "__main__":
    main()

Processing sreeni/bolt1...
Downloading images for sreeni/bolt1...
Downloaded 329 files with extensions ['.jpg', '.png']
Downloading masks for sreeni/bolt1...
Downloaded 329 files with extensions ['.png']
Found 329 mask files in /kaggle/working/sreeni_bolt1_masks
Skipped 0 masks with no valid objects
Found 0 empty masks (only background)
Processed 329 regular images and created 327 annotations for sreeni/bolt1
Checking for augmented data in sreeni/bolt1/augment/...
Downloaded 600 files with extensions ['.jpg', '.png']
Downloading augmented masks for sreeni/bolt1...
Downloaded 600 files with extensions ['.png']
Found 600 mask files in /kaggle/working/sreeni_bolt1_augmented_masks
Skipped 0 masks with no valid objects
Found 0 empty masks (only background)
Processed 600 augmented images and created 590 annotations for sreeni/bolt1
Processing sreeni/bolt2...
Downloading images for sreeni/bolt2...
Downloaded 257 files with extensions ['.jpg', '.png']
Downloading masks for sreeni/bolt2...
Down