In [30]:
import os
import numpy as np
import pandas as pd
import rasterio
import tensorflow as tf

# ---------------------------------------------------------------
# Configuration: sequence lengths and patch geometry
# ---------------------------------------------------------------
SEQ_LEN = 6                   # number of past timesteps per sample
HORIZONS = 3                  # predict next 3 hours
PATCH_SIZE = 13               # side length of the square patch, in pixels
HALF = PATCH_SIZE // 2        # patch half-width (integer division)

# Columns the sequence-index CSV has to provide (checked in `generator`).
required = (
    ["seq_band_idxs", "target_band_idxs"]
    + [f"era5_{var}_file" for var in ("t2m", "d2m", "tp", "u10", "v10")]
    + ["viirs_file", "dem_file", "lulc_file"]
)


In [31]:
import pandas as pd
import rasterio

# Path to your CSV
csv_path = r"C:\Users\Ankit\Datasets_Forest_fire\sequence_index_hourly.csv"

# Columns that contain raster file paths.
# "lulc_file" is the column `required` lists and `extract_sample` reads, so it
# must be part of this check; the year-specific LULC names are kept in case the
# CSV still carries them (columns absent from the CSV are skipped below).
raster_cols = [
    "era5_t2m_file", "era5_d2m_file", "era5_tp_file",
    "era5_u10_file", "era5_v10_file",
    "viirs_file", "dem_file",
    "lulc_file", "lulc2015_file", "lulc2016_file",
]

# Load CSV
df = pd.read_csv(csv_path)

# Map each distinct raster path to its (width, height), or to an error string
# when the file cannot be opened (best-effort: one bad path must not stop the scan).
sizes = {}

for col in raster_cols:
    if col not in df.columns:
        continue
    for path in df[col].dropna().unique():
        if path in sizes:
            # Same file referenced from another column: already probed.
            continue
        try:
            with rasterio.open(path) as src:
                sizes[path] = (src.width, src.height)
        except Exception as e:
            sizes[path] = f"Error: {e}"

# Show results — note the order is (width, height), i.e. (columns, rows)
for path, size in sizes.items():
    print(f"{path} → {size}")


C:\Users\Ankit\Datasets_Forest_fire\ERA5_fast_tif_stacks\ERA5_t2m_2015_2016_stack.tif → (17, 13)
C:\Users\Ankit\Datasets_Forest_fire\ERA5_fast_tif_stacks\ERA5_d2m_2015_2016_stack.tif → (17, 13)
C:\Users\Ankit\Datasets_Forest_fire\ERA5_fast_tif_stacks\ERA5_tp_2015_2016_stack.tif → (17, 13)
C:\Users\Ankit\Datasets_Forest_fire\ERA5_fast_tif_stacks\ERA5_u10_2015_2016_stack.tif → (17, 13)
C:\Users\Ankit\Datasets_Forest_fire\ERA5_fast_tif_stacks\ERA5_v10_2015_2016_stack.tif → (17, 13)
C:\Users\Ankit\Datasets_Forest_fire\VIIRS_fire_time_stack.tif → (17, 13)
C:\Users\Ankit\Datasets_Forest_fire\merged_DEM_30m_32644_aligned_filled.tif → (13604, 11904)


In [32]:
def get_safe_center(h, w, patch_size=PATCH_SIZE):
    """Return the (row, col) of the grid centre, clamped so a patch fits.

    Args:
        h: raster height in pixels (number of rows).
        w: raster width in pixels (number of columns).
        patch_size: side length of the square patch to be centred there.

    Returns:
        Tuple ``(r, c)``; each coordinate is ``extent // 2`` clipped to the
        range ``[patch_size // 2, extent - patch_size // 2 - 1]``.
    """
    margin = patch_size // 2
    # Same clamp applied independently to the row axis (h) and column axis (w).
    center_row, center_col = (
        np.clip(extent // 2, margin, extent - margin - 1) for extent in (h, w)
    )
    return center_row, center_col


In [33]:
def extract_patch(raster_path, r, c, patch_size=PATCH_SIZE, band=1):
    """Read a square patch centred on pixel (r, c) from one band of a raster.

    Args:
        raster_path: path of a raster that ``rasterio.open`` can read.
        r: centre row, in pixel coordinates of *this* raster.
        c: centre column, in pixel coordinates of *this* raster.
        patch_size: side length of the patch in pixels.
        band: rasterio band index to read (rasterio counts bands from 1).
            Defaults to 1, the band this function always read before.

    Returns:
        ``float32`` array of shape ``(patch_size, patch_size)`` in which pixels
        equal to the raster's nodata value are replaced by NaN.

    Raises:
        ValueError: if the raster is smaller than the patch, or if the
            requested window does not lie completely inside the raster.
    """
    half = patch_size // 2
    row_off = int(r) - half
    col_off = int(c) - half

    with rasterio.open(raster_path) as src:
        H, W = src.height, src.width
        if H < patch_size or W < patch_size:
            raise ValueError(f"Raster {raster_path} smaller than patch size.")
        # Fail loudly instead of returning a truncated/shifted patch when the
        # centre was computed for a grid of a different size.
        if (row_off < 0 or col_off < 0
                or row_off + patch_size > H or col_off + patch_size > W):
            raise ValueError(
                f"Patch of size {patch_size} centred at (r={r}, c={c}) falls "
                f"outside raster {raster_path} of size {H}x{W}."
            )
        window = rasterio.windows.Window(col_off, row_off,
                                         patch_size, patch_size)
        raw = src.read(band, window=window)
        nodata = src.nodata

    patch = raw.astype(np.float32)
    if nodata is not None:
        # Compare in the raster's native dtype: after the float32 cast a
        # nodata sentinel that float32 cannot represent exactly would no
        # longer compare equal and would leak into the patch.
        patch[raw == nodata] = np.nan
    return patch


In [34]:
def _parse_band_idxs(value, expected, column):
    """Convert one CSV cell of band indexes into a list of ``expected`` ints."""
    if isinstance(value, str):
        # Accept "[3, 4, 5]", "3,4,5", "3 4 5", "3;4;5", ...
        cleaned = value.strip().strip("[]()")
        for sep in ",;|":
            cleaned = cleaned.replace(sep, " ")
        idxs = [int(tok) for tok in cleaned.split()]
    else:
        idxs = [int(v) for v in np.atleast_1d(value)]
    if len(idxs) != expected:
        raise ValueError(
            f"Column {column!r} holds {len(idxs)} band indexes, expected {expected}."
        )
    return idxs


def _read_band_patches(raster_path, bands, r, c, patch_size):
    """Read the same patch window from several bands of one raster as float32."""
    half = patch_size // 2
    window = rasterio.windows.Window(int(c) - half, int(r) - half,
                                     patch_size, patch_size)
    with rasterio.open(raster_path) as src:
        raw = src.read(bands, window=window)   # (len(bands), rows, cols)
        nodata = src.nodata
    if raw.shape[1:] != (patch_size, patch_size):
        raise ValueError(
            f"Window at (r={r}, c={c}) does not fit inside raster {raster_path}."
        )
    patches = raw.astype(np.float32)
    if nodata is not None:
        # Mask on the native dtype so the sentinel still compares equal.
        patches[raw == nodata] = np.nan
    return patches


def extract_sample(row, seq_len=SEQ_LEN, horizons=HORIZONS, patch_size=PATCH_SIZE):
    """Build one (X, y) training sample from a row of the sequence-index CSV.

    The ERA5 and VIIRS rasters are multi-band stacks; the bands to use for
    this sample come from the row's ``seq_band_idxs`` (inputs) and
    ``target_band_idxs`` (targets) cells. Previously those columns were
    ignored and band 1 was repeated for every timestep and horizon.

    NOTE(review): the indexes are handed to rasterio unchanged, and rasterio
    numbers bands from 1 — confirm the CSV uses the same convention.

    Args:
        row: mapping/Series with the columns listed in ``required``.
        seq_len: number of input timesteps; must equal the number of indexes
            in ``seq_band_idxs``.
        horizons: number of target steps; must equal the number of indexes in
            ``target_band_idxs``.
        patch_size: side length of the square patch in pixels.

    Returns:
        ``X``: float32 array ``(seq_len, patch_size, patch_size, 7)`` — five
        ERA5 variables followed by DEM and LULC (static, repeated per step).
        ``y``: float32 array ``(horizons, patch_size, patch_size)`` of VIIRS.

    Raises:
        ValueError: if a band-index cell has the wrong number of entries, the
            reference raster is smaller than the patch, or a window does not
            fit inside a raster.
    """
    seq_bands = _parse_band_idxs(row["seq_band_idxs"], seq_len, "seq_band_idxs")
    target_bands = _parse_band_idxs(row["target_band_idxs"], horizons,
                                    "target_band_idxs")

    # Open one raster to get a patch centre that keeps the window in bounds
    with rasterio.open(row["era5_t2m_file"]) as src:
        H, W = src.height, src.width
    if H < patch_size or W < patch_size:
        raise ValueError(f"Raster {row['era5_t2m_file']} smaller than patch size.")
    r, c = get_safe_center(H, W, patch_size)

    # Dynamic ERA5 inputs: one (seq_len, H, W) block per variable,
    # stacked on a trailing channel axis → (seq_len, H, W, 5)
    era5_cols = ["era5_t2m_file", "era5_d2m_file", "era5_tp_file",
                 "era5_u10_file", "era5_v10_file"]
    era5_seq = np.stack(
        [_read_band_patches(row[fcol], seq_bands, r, c, patch_size)
         for fcol in era5_cols],
        axis=-1,
    )

    # Static layers (single band each).
    # NOTE(review): (r, c) are pixel indexes on the ERA5 grid. The raster-size
    # check in this notebook printed a far larger grid for the DEM, so this
    # reads a patch near the DEM's top-left corner rather than the ground area
    # covered by the ERA5 patch — the DEM (and presumably LULC) likely needs
    # resampling onto the ERA5 grid first; confirm.
    dem_patch = extract_patch(row["dem_file"], r, c, patch_size)[..., None]
    lulc_patch = extract_patch(row["lulc_file"], r, c, patch_size)[..., None]

    # Combine static → (H, W, 2), then repeat per timestep → (seq_len, H, W, 2)
    static_stack = np.concatenate([dem_patch, lulc_patch], axis=-1)
    static_seq = np.repeat(static_stack[None, ...], seq_len, axis=0)

    # Final input X = (seq_len, H, W, channels=7)
    X = np.concatenate([era5_seq, static_seq], axis=-1)

    # Target VIIRS, one band per horizon → (horizons, H, W)
    y = _read_band_patches(row["viirs_file"], target_bands, r, c, patch_size)

    return X, y


In [35]:
def generator(csv_path):
    """Yield one ``extract_sample`` result per row of the CSV at ``csv_path``.

    Because this is a generator function, the CSV is read and validated when
    iteration starts, not when ``generator(...)`` is called.

    Raises:
        ValueError: if the CSV lacks any column named in ``required``.
    """
    table = pd.read_csv(csv_path)

    # Refuse to start when a mandatory column is absent.
    missing = [name for name in required if name not in table.columns]
    if missing:
        raise ValueError(f"CSV missing columns: {missing}")

    yield from (extract_sample(record) for _index, record in table.iterrows())


In [36]:
def create_dataset(csv_path, batch_size=4, shuffle=True, shuffle_buffer=100):
    """Wrap ``generator(csv_path)`` in a batched, prefetched ``tf.data.Dataset``.

    Args:
        csv_path: path of the sequence-index CSV passed on to ``generator``.
        batch_size: number of samples per batch.
        shuffle: whether to shuffle samples before batching.
        shuffle_buffer: size of the shuffle buffer used when ``shuffle`` is
            true. Defaults to 100, the value that used to be hard-coded.

    Returns:
        Dataset of ``(X, y)`` batches where each sample has
        ``X``: float32 ``(SEQ_LEN, PATCH_SIZE, PATCH_SIZE, 7)`` and
        ``y``: float32 ``(HORIZONS, PATCH_SIZE, PATCH_SIZE)``.
    """
    output_signature = (
        tf.TensorSpec(shape=(SEQ_LEN, PATCH_SIZE, PATCH_SIZE, 7), dtype=tf.float32),
        tf.TensorSpec(shape=(HORIZONS, PATCH_SIZE, PATCH_SIZE), dtype=tf.float32),
    )
    # The lambda gives from_generator a zero-argument callable, so a fresh
    # generator is created every time the dataset is iterated.
    ds = tf.data.Dataset.from_generator(
        lambda: generator(csv_path),
        output_signature=output_signature
    )
    if shuffle:
        ds = ds.shuffle(shuffle_buffer)
    ds = ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
    return ds


In [None]:
if __name__ == "__main__":
    # Smoke test: build the input pipeline and report the shapes of one batch.
    csv_path = r"C:\Users\Ankit\Datasets_Forest_fire\sequence_index_hourly.csv"   # <-- put your CSV here
    ds = create_dataset(csv_path, batch_size=2)
    first_batch = ds.take(1)
    for X, y in first_batch:
        # Expected: X → (B, SEQ_LEN, PATCH, PATCH, 7)  (7 channels, per the TensorSpec)
        #           y → (B, HORIZONS, PATCH, PATCH)
        for label, tensor in (("X shape:", X), ("y shape:", y)):
            print(label, tensor.shape)
