In [1]:
import os
import json
import pandas as pd

In [8]:
import os
import json
import pandas as pd

def extract_building_polygons(json_folder):
    rows = []

    for filename in os.listdir(json_folder):
        if filename.endswith(".json"):
            json_path = os.path.join(json_folder, filename)
            with open(json_path, "r") as f:
                data = json.load(f)

            img_name = filename.replace(".json", ".tif")
            features = data["features"]["xy"]
            metadata = data.get("metadata", {})
            disaster_type = metadata.get("disaster_type", None)

            for feat in features:
                props = feat["properties"]
                subtype = props.get("subtype")
                uid = props.get("uid")
                wkt = feat.get("wkt")
                disaster = disaster_type

                rows.append({
                    "uid": uid,
                    "image_id": img_name,
                    "subtype": subtype,
                    "wkt": wkt,
                    "disaster": disaster
                })

    df = pd.DataFrame(rows)
    return df

In [9]:
json_folder = "./tier1/labels"
output_csv = "building_polygons_metadata.csv"
df = extract_building_polygons(json_folder)

df.to_csv(output_csv, index=False)

In [11]:
df = pd.read_csv(output_csv)

print(df.head())
print(df["subtype"].value_counts())
print(df["disaster"].value_counts())
print(df.isna().sum())

                                    uid  \
0  8c42624a-d093-486c-ad1c-4fbd070faf6c   
1  986e8b4d-c2ca-4fe3-946a-3757c1fa4435   
2  a4069f10-166f-4b97-a25c-5af342975d42   
3  ba868904-2116-49d5-a44c-a19dcb2a6361   
4  42b9b7a8-ebce-405a-a888-1c4f4bf9300f   

                                        image_id subtype  \
0  santa-rosa-wildfire_00000138_pre_disaster.tif     NaN   
1  santa-rosa-wildfire_00000138_pre_disaster.tif     NaN   
2  santa-rosa-wildfire_00000138_pre_disaster.tif     NaN   
3  santa-rosa-wildfire_00000138_pre_disaster.tif     NaN   
4  santa-rosa-wildfire_00000138_pre_disaster.tif     NaN   

                                                 wkt disaster  
0  POLYGON ((0 12.47651487918631, 14.511311627017...     fire  
1  POLYGON ((2.881611831691816 49.11659135604885,...     fire  
2  POLYGON ((55.88447421102212 0.4888322292836593...     fire  
3  POLYGON ((78.42987464102379 22.27268595973858,...     fire  
4  POLYGON ((54.64130596559908 127.5597927785371,...     fir

In [None]:
!pip install rasterio

Collecting rasterio
  Using cached rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.1 kB)
Collecting affine (from rasterio)
  Using cached affine-2.4.0-py3-none-any.whl.metadata (4.0 kB)
Collecting click>=4.0 (from rasterio)
  Using cached click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Collecting cligj>=0.5 (from rasterio)
  Using cached cligj-0.7.2-py3-none-any.whl.metadata (5.0 kB)
Collecting click-plugins (from rasterio)
  Using cached click_plugins-1.1.1-py2.py3-none-any.whl.metadata (6.4 kB)
Using cached rasterio-1.4.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (22.3 MB)
Using cached click-8.1.8-py3-none-any.whl (98 kB)
Using cached cligj-0.7.2-py3-none-any.whl (7.1 kB)
Using cached affine-2.4.0-py3-none-any.whl (15 kB)
Using cached click_plugins-1.1.1-py2.py3-none-any.whl (7.5 kB)
Installing collected packages: click, affine, cligj, click-plugins, rasterio


In [3]:
import rasterio
import numpy as np
from shapely import wkt
from PIL import Image
from rasterio.windows import Window
from shapely.geometry import Polygon


In [5]:
def crop_building_to_square(image_path, wkt_str, crop_size=256):
    """
    Crop the building polygon from the image and resize to square.
    
    - image_path (str): Path to the image file
    - wkt_str (str): Well-Known Text (WKT) string for the building polygon
    - crop_size (int): The size of the square crop
    
    Returns:
    - PIL Image: The cropped and resized image
    """
    
    with rasterio.open(image_path) as src:
        polygon = wkt.loads(wkt_str)
        minx, miny, maxx, maxy = polygon.bounds
        
        # Calculate the center and dimensions of the bounding box
        width = maxx - minx
        height = maxy - miny
        center_x = int((minx + maxx) / 2)
        center_y = int((miny + maxy) / 2)
        
        # Resize the shorter side to match the longer side
        if width > height:
            new_height = width  # Match height to width
            half_crop = new_height // 2
            window = Window(
                max(center_x - half_crop, 0),  # Prevent negative coordinates
                max(center_y - new_height // 2, 0),
                width,
                new_height
            )
        else:
            new_width = height  # Match width to height
            half_crop = new_width // 2
            window = Window(
                max(center_x - new_width // 2, 0),
                max(center_y - half_crop, 0),  # Prevent negative coordinates
                new_width,
                height
            )
        
        # Read the window from the image
        crop = src.read(window=window)
        
        # Convert from int16 to uint8
        # First normalize to 0-1 range
        crop = crop.astype(np.float32)
        crop = (crop - crop.min()) / (crop.max() - crop.min())
        # Then scale to 0-255 and convert to uint8
        crop = (crop * 255).astype(np.uint8)
        
        # Convert to PIL Image for resizing
        crop_image = Image.fromarray(crop.transpose(1, 2, 0))  # Convert from CHW to HWC format
        
        # Resize directly to target size
        final_image = crop_image.resize((crop_size, crop_size), Image.Resampling.LANCZOS)
        
        return final_image 
    
def crop_and_save_buildings(csv_path, image_dir, output_dir, crop_size=256, output_format="png"):
    """
    Process the CSV with building metadata, crop buildings from the image, 
    and save them as images in the specified format.

    Args:
    - csv_path (str): Path to the CSV file containing building metadata.
    - image_dir (str): Directory where the original images (.tif) are stored.
    - output_dir (str): Directory where cropped images will be saved.
    - crop_size (int): Desired size of the square crop (default: 256).
    - output_format (str): The format to save the cropped images ("png" by default).
    """
    # Load the CSV file into a DataFrame
    df = pd.read_csv(csv_path)
    
    # Ensure output directory exists
    os.makedirs(output_dir, exist_ok=True)
    
    # Loop through the rows of the dataframe
    for idx, row in df.iterrows():
        uid = row['uid']
        image_id = row['image_id']
        wkt_str = row['wkt']
        disaster_type = row['disaster']
        
        # Skip rows where there is no valid WKT string or missing disaster type
        if pd.isna(wkt_str) or pd.isna(disaster_type):
            continue
        
        # Construct the image path based on the image_id (replace .json with .tif)
        image_path = os.path.join(image_dir, image_id.replace('.json', '.tif'))
        
        # Perform the cropping
        final_image = crop_building_to_square(image_path, wkt_str, crop_size)
        
        # Save the cropped image with a unique name (uid + output format)
        output_image_path = os.path.join(output_dir, f"{uid}.{output_format}")
        
        # Save the image in the specified output format
        final_image.save(output_image_path, format=output_format)
        
        print(f"Saved {output_image_path}")

In [None]:
crop_and_save_buildings(
    csv_path="building_polygons_metadata.csv",
    image_dir="./tier1/images",
    output_dir="./tier1/cropped_square_buildings",
    crop_size=256,
    output_format="png"
)

Saved ./tier1/cropped_square_buildings/8c42624a-d093-486c-ad1c-4fbd070faf6c.png
Saved ./tier1/cropped_square_buildings/986e8b4d-c2ca-4fe3-946a-3757c1fa4435.png
Saved ./tier1/cropped_square_buildings/a4069f10-166f-4b97-a25c-5af342975d42.png
Saved ./tier1/cropped_square_buildings/ba868904-2116-49d5-a44c-a19dcb2a6361.png
Saved ./tier1/cropped_square_buildings/42b9b7a8-ebce-405a-a888-1c4f4bf9300f.png
Saved ./tier1/cropped_square_buildings/29bbf8a9-aa07-49f0-85c4-651f08bcda91.png
Saved ./tier1/cropped_square_buildings/67206056-6357-4708-ada0-49c160098d9c.png
Saved ./tier1/cropped_square_buildings/e02f40d2-9206-4122-9c1c-93bcd501ab21.png
Saved ./tier1/cropped_square_buildings/cc2a8ef3-5299-422f-bd5b-d4b0529b39f1.png
Saved ./tier1/cropped_square_buildings/287f1b39-323d-4d15-98d5-abae8f3b8dff.png
Saved ./tier1/cropped_square_buildings/1d76ad54-b65e-4b2d-a2ec-5fb613456860.png
Saved ./tier1/cropped_square_buildings/aefbfe6b-c73c-4e62-aa6a-9c6c506d7c9b.png
Saved ./tier1/cropped_square_buildings/d