In [1]:
import rasterio as rio

import geopandas as gpd
import shapely
import matplotlib.pyplot as plt
from rasterio.windows import bounds as window_bounds, Window
from rasterio.plot import reshape_as_image
from shapely.geometry import box
from pathlib import Path
import cv2
import numpy as np
from affine import Affine
from shapely.ops import transform
import pandas as pd


In [2]:

class RasterIOUtils:
    """PNG read/write helpers built on OpenCV that work in RGB channel order.

    Both helpers convert between RGB and OpenCV's BGR layout for 3-D arrays
    and pass 2-D arrays through unchanged.
    """

    @staticmethod
    def save_png(arr, arr_path):
        """Write ``arr`` to ``arr_path`` as a PNG file.

        Args:
            arr: 2-D array (written as-is) or 3-D array, which is converted
                from RGB to BGR before being handed to OpenCV.
            arr_path: Destination (``str`` or path-like); must end in ``.png``.

        Raises:
            AssertionError: If ``arr_path`` does not end with ``.png``.
            ValueError: If ``arr`` is neither 2-D nor 3-D.
            OSError: If OpenCV reports that nothing was written.
        """
        arr_path = str(arr_path)
        assert arr_path.endswith(".png")

        if arr.ndim == 2:
            payload = arr
        elif arr.ndim == 3:
            payload = cv2.cvtColor(arr, cv2.COLOR_RGB2BGR)
        else:
            raise ValueError(f"invalid number of dimensions: {arr.ndim=}")

        # cv2.imwrite reports failure through its boolean return value; ignoring
        # it would leave a tile silently missing from the dataset.
        if not cv2.imwrite(arr_path, payload):
            raise OSError(f"cv2.imwrite failed to write: {arr_path}")

    @staticmethod
    def read_png(arr_path):
        """Load a PNG written by :meth:`save_png`.

        Args:
            arr_path: Path of the image to read (``str`` or path-like).

        Returns:
            The decoded array: 2-D arrays are returned unchanged, 3-D arrays
            are converted from BGR to RGB.

        Raises:
            OSError: If OpenCV could not read the file (missing or undecodable).
            ValueError: If the decoded array is neither 2-D nor 3-D.
        """
        arr = cv2.imread(str(arr_path), cv2.IMREAD_ANYCOLOR)
        # cv2.imread returns None instead of raising when the file cannot be
        # read; fail here with a message that names the offending path.
        if arr is None:
            raise OSError(f"cv2.imread could not read image: {arr_path}")

        if arr.ndim == 2:
            return arr
        elif arr.ndim == 3:
            return cv2.cvtColor(arr, cv2.COLOR_BGR2RGB)
        else:
            raise ValueError(f"invalid number of dimensions: {arr.ndim=}")
            
            
        

In [3]:
# Scratch check: open the raster and print the dataset name.
# NOTE(review): `tif_path` is only assigned in later cells, so on a fresh kernel
# this cell fails with the NameError recorded in its output.
with rio.open(tif_path) as src:
    print(src.name)

NameError: name 'tif_path' is not defined

In [4]:
gdf.

SyntaxError: invalid syntax (3305303092.py, line 1)

In [5]:
from pydantic import BaseModel, ConfigDict


class CropConfig(BaseModel):
    """Inputs needed to cut a single tile: a raster, its labels and a window."""

    # The field types below are not pydantic-native, so arbitrary types must
    # be allowed for the model to accept them.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    # Open raster dataset the tile is read from.
    src: rio.DatasetReader
    # Label geometries that get intersected with the window.
    gdf: gpd.GeoDataFrame
    # Pixel window (offsets and size) inside ``src``.
    window: Window



def world_geom_to_window_pixels(geom, world_to_full_pixel, col_off, row_off):
    """Re-express ``geom`` in pixel coordinates relative to a window origin.

    Args:
        geom: Geometry to convert; passed straight to ``transform``.
        world_to_full_pixel: Object whose ``*`` with an ``(x, y)`` pair yields
            the ``(col, row)`` position in the full raster.
        col_off: Column offset of the window, subtracted from every column.
        row_off: Row offset of the window, subtracted from every row.

    Returns:
        Whatever ``transform`` builds from the shifted coordinates.
    """
    def shift_into_window(x, y, z=None):
        # z is accepted so 3-D coordinates do not fail, but it is dropped.
        full_col, full_row = world_to_full_pixel * (x, y)
        window_col = full_col - col_off
        window_row = full_row - row_off
        return window_col, window_row

    return transform(shift_into_window, geom)
        
class CropRasterUtils:
    """Cuts an (image, mask) pair out of a raster for one pixel window."""

    @staticmethod
    def crop_window_x(src, window):
        """Read ``window`` from ``src`` and return its first three bands as an image.

        Args:
            src: Open raster dataset providing ``read(window=...)``.
            window: Pixel window to read.

        Returns:
            The result of ``reshape_as_image`` applied to the first three bands.
        """
        bands = src.read(window=window)
        return reshape_as_image(arr=bands[:3])

    @staticmethod
    def _create_polygons_intersected_with_window(gdf, src, window):
        """Clip the label geometries to ``window`` and express them in window pixels.

        Args:
            gdf: GeoDataFrame of label geometries.
                NOTE(review): assumed to already be in the raster's CRS - confirm at the caller.
            src: Open raster dataset; only ``src.transform`` is used.
            window: Pixel window to clip against.

        Returns:
            A GeoDataFrame holding only the geometries that intersect the
            window, clipped to it, with coordinates relative to the window's
            top-left pixel. Empty when nothing intersects.
        """
        window_geom = box(*window_bounds(window, src.transform))

        # Fast spatial index query on bounding boxes ...
        candidate_idx = list(gdf.sindex.intersection(window_geom.bounds))
        gdf_candidates = gdf.iloc[candidate_idx]

        # ... followed by the precise geometry-vs-window test.
        gdf_clipped = gdf_candidates[gdf_candidates.intersects(window_geom)].copy()
        if gdf_clipped.empty:
            # Nothing to clip or transform; the caller gets an empty frame.
            return gdf_clipped

        gdf_clipped["geometry"] = gdf_clipped.geometry.intersection(window_geom)

        # src.transform maps pixel -> world, so its inverse maps world -> pixel.
        world_to_full_pixel = ~src.transform
        col_off, row_off = window.col_off, window.row_off

        gdf_clipped["geometry"] = gdf_clipped.geometry.apply(
            lambda geom: world_geom_to_window_pixels(
                geom,
                world_to_full_pixel=world_to_full_pixel,
                col_off=col_off,
                row_off=row_off,
            )
        )
        return gdf_clipped

    @staticmethod
    def _iter_polygons(geom):
        """Yield every Polygon contained in ``geom``, ignoring points and lines."""
        if geom is None or geom.is_empty:
            return
        if geom.geom_type == "Polygon":
            yield geom
        elif geom.geom_type in ("MultiPolygon", "GeometryCollection"):
            for part in geom.geoms:
                yield from CropRasterUtils._iter_polygons(part)
        # Any other geometry type (Point, LineString, ...) covers no area.

    @staticmethod
    def _create_mask_from_polygons(gdf, mask_height, mask_width):
        """Rasterize the polygons of ``gdf`` into a binary ``uint8`` mask.

        Clipping a footprint against the window can leave a Point, LineString
        or GeometryCollection when it merely touches the window edge. Those
        are skipped (or unpacked) here instead of failing.

        Args:
            gdf: Object exposing ``.geometry``, with coordinates in mask pixels.
            mask_height: Number of mask rows.
            mask_width: Number of mask columns.

        Returns:
            ``(mask_height, mask_width)`` array with 1 inside polygons, 0 elsewhere.
        """
        mask = np.zeros((int(mask_height), int(mask_width)), dtype=np.uint8)
        for geom in gdf.geometry:
            for polygon in CropRasterUtils._iter_polygons(geom):
                rings = [np.array(polygon.exterior.coords, dtype=np.int32)]
                # Passing the interior rings in the same call leaves holes
                # unfilled and does not erase pixels set by earlier polygons.
                rings.extend(
                    np.array(ring.coords, dtype=np.int32) for ring in polygon.interiors
                )
                mask = cv2.fillPoly(mask, rings, 1)
        return mask

    @staticmethod
    def crop_window_y(gdf, src, window):
        """Build the label mask for ``window``.

        Args:
            gdf: GeoDataFrame of label geometries.
            src: Open raster dataset the window refers to.
            window: Pixel window; its ``height``/``width`` set the mask size.

        Returns:
            Binary ``uint8`` mask of shape ``(window.height, window.width)``.
        """
        gdf_window_px = CropRasterUtils._create_polygons_intersected_with_window(gdf, src, window)
        return CropRasterUtils._create_mask_from_polygons(
            gdf_window_px,
            mask_height=window.height,
            mask_width=window.width,
        )

    @staticmethod
    def process(crop_config: CropConfig):
        """Produce the ``(image, mask)`` pair described by ``crop_config``.

        Args:
            crop_config: Raster, labels and window to crop.

        Returns:
            Tuple ``(x, y)``: the image from :meth:`crop_window_x` and the
            mask from :meth:`crop_window_y`.
        """
        x = CropRasterUtils.crop_window_x(src=crop_config.src,
                                          window=crop_config.window)
        y = CropRasterUtils.crop_window_y(src=crop_config.src,
                                          gdf=crop_config.gdf,
                                          window=crop_config.window)
        return x, y
        
        

In [6]:
from tqdm.notebook import tqdm


In [19]:
import polars as pl
    
# Example chunk directory used while developing.
# NOTE(review): absolute, machine-specific path - move to a configurable DATA_DIR.
data_path = Path("/Users/danylo_kunyk/Desktop/africa/data/ai-challenge/train_tier_1/acc/a42435")

# The raster is named after its chunk directory: <chunk>/<chunk>.tif
tif_path = data_path / f"{data_path.name}.tif"
# The labels are looked up in a sibling "<chunk>-labels" directory as <chunk>.geojson
labels_path = data_path.parent / f"{data_path.name}-labels" / f'{data_path.name}.geojson' 
    
    
class Worker:
    """Tiles one raster chunk into PNG image/mask pairs."""

    @staticmethod
    def process(tif_path, gdf, horizontal_step: int, vertical_step: int, output_dir: str,
                first_tile_only: bool = True):
        """Cut ``tif_path`` into windows and save the annotated ones as PNG pairs.

        Args:
            tif_path: Path of the raster to tile.
            gdf: Labels for the raster: either a ``GeoDataFrame`` or a path
                that ``geopandas.read_file`` can load.
            horizontal_step: Window width in pixels (also the column stride).
            vertical_step: Window height in pixels (also the row stride).
            output_dir: Existing directory that receives the PNG files
                (``str`` or ``Path``).
            first_tile_only: When True (default, the behaviour this notebook
                has been run with) stop after the first saved tile, or after
                the first column if it holds no annotated tile. Pass False to
                tile the whole raster.

        Returns:
            ``pandas.DataFrame`` with one row per saved tile and the columns
            ``window``, ``x_path`` and ``y_path``; empty if nothing was saved.
        """
        # Use the labels that were passed in rather than a notebook-level global.
        if not isinstance(gdf, gpd.GeoDataFrame):
            gdf = gpd.read_file(gdf)
        # Accept plain strings as well as Path objects.
        output_dir = Path(output_dir)

        processed_windows = []
        with rio.open(tif_path) as src:
            gdf = gdf.to_crs(src.crs)
            # Integer division: partial tiles at the right/bottom edge are not produced.
            n_columns = src.meta['width'] // horizontal_step
            n_rows = src.meta['height'] // vertical_step

            for horizontal_step_id in tqdm(range(n_columns)):
                for vertical_step_id in range(n_rows):
                    x_start, y_start = horizontal_step * horizontal_step_id, vertical_step * vertical_step_id
                    x_path = output_dir / f"x_{horizontal_step_id=}_{vertical_step_id=}.png"
                    y_path = output_dir / f"y_{horizontal_step_id=}_{vertical_step_id=}.png"
                    crop_config = CropConfig(
                        src=src,
                        gdf=gdf,
                        window=Window(x_start, y_start, horizontal_step, vertical_step)
                    )
                    x, y = CropRasterUtils.process(crop_config=crop_config)

                    # Tiles without a single labelled pixel are not written.
                    if np.all(y == 0):
                        print("Skip no annot")
                        continue

                    RasterIOUtils.save_png(x, arr_path=x_path)
                    RasterIOUtils.save_png(y, arr_path=y_path)
                    processed_windows.append(dict(
                        window=dict(crop_config.window.todict()),
                        x_path=str(x_path),
                        y_path=str(y_path)
                    ))
                    if first_tile_only:
                        break
                if first_tile_only:
                    break
        return pd.DataFrame(processed_windows)
            

In [20]:
import shutil

# Tile size in pixels; windows are laid out on a non-overlapping grid.
HORIZONTAL_STEP = 1024
VERTICAL_STEP = 1024


output_dir = Path("dataset_processed/")
# NOTE(review): absolute, machine-specific path - move to a configurable DATA_DIR.
split_dir = Path("/Users/danylo_kunyk/Desktop/africa/data/ai-challenge/train_tier_1/")

# Start from an empty output directory so stale tiles never mix with new ones.
shutil.rmtree(output_dir, ignore_errors=True)
output_dir.mkdir(parents=True)

# Per-chunk tile tables, collected so they are not lost after each iteration.
chunk_frames = []

for city_path in split_dir.glob("*"):
    if not city_path.is_dir():
        continue
    for chunk_path in city_path.glob("*"):
        if (not chunk_path.is_dir()) or chunk_path.name.endswith("-labels") or chunk_path.name.startswith(".DS_Store"):
            continue
        tif_path = chunk_path / f"{chunk_path.name}.tif"
        # Sibling "<chunk>-labels" directory. Built from the full name because
        # with_stem would keep a suffix if the chunk name contained a dot.
        labels_path = chunk_path.parent / f"{chunk_path.name}-labels" / f"{chunk_path.name}.geojson"

        chunk_output_dir = output_dir / city_path.name / chunk_path.name
        chunk_output_dir.mkdir(exist_ok=True, parents=True)

        print(tif_path.exists(), labels_path.exists())
        resulting_df = Worker.process(tif_path, labels_path,
                                      horizontal_step=HORIZONTAL_STEP,
                                      vertical_step=VERTICAL_STEP,
                                      output_dir=chunk_output_dir)
        # Stored as strings, like the x_path / y_path columns.
        resulting_df['tif_path'] = str(tif_path)
        resulting_df['labels_path'] = str(labels_path)

        resulting_df['chunk_id'] = tif_path.parent.name
        resulting_df['city_id'] = tif_path.parent.parent.name

        # Chunks that produced no tile contribute nothing to the index.
        if not resulting_df.empty:
            chunk_frames.append(resulting_df)

# One row per saved tile across all chunks; empty if no tile was saved.
dataset_index = pd.concat(chunk_frames, ignore_index=True) if chunk_frames else pd.DataFrame()
        
    


True True


  0%|          | 0/38 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/50 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/51 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/42 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/66 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/43 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/49 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  return ogr_read(


  0%|          | 0/32 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/39 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  return ogr_read(


  0%|          | 0/35 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/49 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/37 [00:00<?, ?it/s]

Skip no annot
Skip no annot
True True


  0%|          | 0/43 [00:00<?, ?it/s]

True True


  0%|          | 0/48 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/38 [00:00<?, ?it/s]

True True


  0%|          | 0/38 [00:00<?, ?it/s]

Skip no annot
True True


  0%|          | 0/39 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/32 [00:00<?, ?it/s]

True True


  0%|          | 0/16 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/36 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/17 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/19 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/21 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/19 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/7 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/6 [00:00<?, ?it/s]

True True


  0%|          | 0/8 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/56 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/39 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
True True


  0%|          | 0/82 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip no annot
Skip n

  0%|          | 0/64 [00:00<?, ?it/s]

Skip no annot
Skip no annot
Skip no annot


In [18]:
# Delete the generated dataset_processed directory (shell command).
!rm -rf dataset_processed

In [201]:
# NOTE(review): `y` is not assigned at notebook scope in any visible cell (it is
# only a local inside Worker.process), so this relies on leftover kernel state.
y.shape

(1024, 1024, 3)