In [None]:
# Cleanup and create uniform csv files - prepare the formats for training

import pandas as pd
import os

###########################################
# 1. Paths to your original merged CSVs
###########################################

# Raw merged feature tables, one per city. The paths are relative, so they
# resolve against the working directory this code is run from.
sh_raw = "../data/output/shanghai_merged_full.csv"
vg_raw = "../data/output/vegas_merged_full.csv"

###########################################
# 2. Load
###########################################

# Read each table fully into memory as a DataFrame.
sh = pd.read_csv(sh_raw)
vg = pd.read_csv(vg_raw)

# Print the (rows, columns) shape of both frames as a quick sanity check.
print("Loaded:", sh.shape, vg.shape)

###########################################
# 3. Define leakage columns to drop
###########################################

# Columns stripped from both city tables before training. A table does not
# need to contain all of them: the cleaning steps drop only those present.
leak_cols = [
    # night-light statistics that stand in for the target
    "lights_mean",       # target proxy
    "lights_nonzero",    # weak proxy
    # tile identifiers (ID leakage)
    "tile_id",
    "tile_num",
    # remaining non-feature columns
    "geojson_file",
    "tile_file",
    "tile_crs",
    # coordinate leakage (Shanghai)
    "minx",
    "miny",
    "maxx",
    "maxy",
]

###########################################
# 4. SHANGHAI CLEANING
###########################################

print("\n=== Cleaning Shanghai ===")

# The Shanghai table typically has f1…f768 and building features.
# It also needs a 'tile' column. When that column is absent it is derived,
# in order of preference, from tile_id or from the digits that follow "img"
# in image_path; if neither source exists the run is aborted.

if "tile" in sh.columns:
    pass  # already present, nothing to derive
elif "tile_id" in sh.columns:
    sh["tile"] = sh["tile_id"].astype(int)
elif "image_path" in sh.columns:
    # file names look like imgXXXX.tif; keep only the XXXX digits
    sh["tile"] = sh["image_path"].str.extract(r'img(\d+)', expand=False).astype(int)
else:
    raise ValueError("Cannot infer Shanghai tile identifier! Add tile or tile_id.")
# Build consistent image_path
def build_shanghai_path(tile):
    """Return the relative RGB-PanSharpen .tif path for Shanghai tile *tile*.

    The value is formatted into the file name exactly as given, so a float
    such as 12.0 yields "img12.0.tif" rather than "img12.tif".
    """
    folder = "../data/spacenet/AOI_4_Shanghai/AOI_4_Shanghai_Train/RGB-PanSharpen"
    filename = "RGB-PanSharpen_AOI_4_Shanghai_img{}.tif".format(tile)
    return folder + "/" + filename

# Overwrite (or create) image_path so every row points at the path built
# from its tile value, regardless of what the merged CSV contained.
sh["image_path"] = sh["tile"].apply(build_shanghai_path)

# Add city label
sh["city"] = "shanghai"

# Define target (lights_sum): abort if the column is absent, because the
# rename below would otherwise do nothing and leave the table without a target.
if "lights_sum" not in sh.columns:
    raise ValueError("Shanghai file missing target column lights_sum")

sh.rename(columns={"lights_sum": "target"}, inplace=True)

# Remove leakage: drop only the leak_cols entries this table actually has.
# Note that tile_id is dropped here, after 'tile' may have been derived from it.
drop_cols = [c for c in leak_cols if c in sh.columns]
sh.drop(columns=drop_cols, inplace=True, errors="ignore")

###########################################
# 5. VEGAS CLEANING
###########################################

print("\n=== Cleaning Vegas ===")

# Unlike the Shanghai section, no fallback derivation is attempted here:
# the Vegas table must already contain a tile column.
if "tile" not in vg.columns:
    raise ValueError("Vegas must have a tile column, but it does not!")

# Vegas already uses tile correctly
def build_vegas_path(tile):
    """Return the relative PS-RGB .tif path for Vegas tile *tile*.

    The value is formatted into the file name exactly as given, so a float
    such as 12.0 yields "img12.0.tif" rather than "img12.tif".
    """
    folder = "../data/spacenet/AOI_2_Vegas/PS-RGB"
    filename = "SN2_buildings_train_AOI_2_Vegas_PS-RGB_img{}.tif".format(tile)
    return folder + "/" + filename

# Overwrite (or create) image_path so every row points at the path built
# from its tile value.
vg["image_path"] = vg["tile"].apply(build_vegas_path)

# Add city label
vg["city"] = "vegas"

# Define target (lights_sum).
# DataFrame.rename silently ignores a key that is not present, so without
# this guard a table lacking lights_sum would pass through here untouched and
# only fail later with a KeyError when the "target" column is selected.
# A table that already carries a "target" column is still accepted.
if "lights_sum" not in vg.columns and "target" not in vg.columns:
    raise ValueError("Vegas file missing target column lights_sum")

vg.rename(columns={"lights_sum": "target"}, inplace=True)

# Remove leakage: drop only the leak_cols entries this table actually has.
drop_cols = [c for c in leak_cols if c in vg.columns]
vg.drop(columns=drop_cols, inplace=True, errors="ignore")

###########################################
# 6. Sort columns to unify order
###########################################

def reorder(df):
    """Return *df* with the key columns first and the rest sorted by name.

    The result starts with tile, city, image_path and target (in that
    order), followed by every other column in sorted order. All four key
    columns must be present; pandas raises KeyError otherwise.
    """
    leading = ["tile", "city", "image_path", "target"]
    trailing = [name for name in df.columns.tolist() if name not in leading]
    trailing.sort()
    return df[leading + trailing]

# Bring both tables into the same column layout.
sh_clean = reorder(sh)
vg_clean = reorder(vg)

# Report the final (rows, columns) of each cleaned table.
print("\nFinal Shanghai shape:", sh_clean.shape)
print("Final Vegas shape:", vg_clean.shape)

###########################################
# 7. SAVE CLEANED FILES
###########################################

# Write the cleaned tables next to the raw ones, without the index column.
sh_clean.to_csv("../data/output/shanghai_merged_full_clean.csv", index=False)
vg_clean.to_csv("../data/output/vegas_merged_full_clean.csv", index=False)

# One line per item: a header followed by the two output paths.
print(
    "\nSaved:",
    " - ../data/output/shanghai_merged_full_clean.csv",
    " - ../data/output/vegas_merged_full_clean.csv",
    sep="\n",
)

In [None]:
# validate files exist

import os
import pandas as pd

# Reload the cleaned Vegas table from disk.
# NOTE(review): only the Vegas file is checked here; the cleaned Shanghai
# file is not validated in this cell.
df = pd.read_csv("../data/output/vegas_merged_full_clean.csv")

# Collect every image_path that does not exist on disk and report how many.
missing = [p for p in df["image_path"] if not os.path.exists(p)]
print("Missing images:", len(missing))