# OG Solar Datasets — Cleaning Notebook
**What this notebook does**
1) Discover original city files; 2) Inspect null %; 3) Label-encode `Assumed_building_type` (stable mapping);
4) Drop rows that contain nulls (after first showing the percentages); 5) Remove rows with a zero or negative `Estimated_building_height`;
6) Save per-city cleaned Parquet and one combined Parquet.

> Run each cell top-to-bottom. If your files live in another folder or follow a different naming pattern, edit `DATA_DIR` or `FILE_GLOB` below; CSV and Excel (`.xlsx`/`.xls`) files can be mixed freely.

In [None]:

# === Cell 1: Setup & discovery ===
import os, glob
from pathlib import Path
import pandas as pd
import numpy as np

# Where the raw city files live and how they are named — edit to match your layout
DATA_DIR = Path("data/raw")
FILE_GLOB = "*rooftop*solar*"

# Destination folder for the per-city cleaned files (created if absent)
OUT_DIR = Path("data/interim/cleaned")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Discover matching paths in sorted order, keeping only CSV/Excel files
files = [
    candidate
    for candidate in sorted(DATA_DIR.glob(FILE_GLOB))
    if candidate.suffix.lower() in {".csv", ".xlsx", ".xls"}
]

print(f"Found {len(files)} files:")
for f in files:
    print(" -", f.name)

# Columns every dataset is reduced to, in this order
common_columns = [
    "City",
    "Surface_area",
    "Potential_installable_area",
    "Peak_installable_capacity",
    "Energy_potential_per_year",
    "Assumed_building_type",
    "Estimated_tilt",
    "Estimated_building_height",
    "Estimated_capacity_factor",
]

# Fixed text -> integer code table for Assumed_building_type (same for all files)
BT_STR2INT = {
    "commercial": 0,
    "industrial": 1,
    "multifamily residential": 2,
    "public": 3,
    "single family residential": 4,
}
# Reverse lookup (integer code -> text), derived from the table above
BT_INT2STR = dict(zip(BT_STR2INT.values(), BT_STR2INT.keys()))

# Helper to read any supported file and standardize columns
def read_standardize(path: Path, common_cols):
    """Read one CSV/Excel file and return it reduced to ``common_cols``.

    Parameters
    ----------
    path : Path
        File to load. A ``.csv`` suffix (case-insensitive) is read with
        ``pd.read_csv``; anything else is passed to ``pd.read_excel``.
    common_cols : list of str
        Columns to keep, in output order. Must contain ``"City"`` and
        ``"Assumed_building_type"``; every other entry is treated as numeric.

    Returns
    -------
    pandas.DataFrame
        A copy holding exactly ``common_cols``. Columns absent from the file
        are created as all-null. Numeric columns are coerced with
        ``errors="coerce"`` (unparseable values become NaN). ``City`` is a
        trimmed string; a missing column, null cell or blank cell falls back
        to the file stem with ``"_rooftop_solar_potential"`` removed.
        Text building types are trimmed and lower-cased.
    """
    if path.suffix.lower() == ".csv":
        df = pd.read_csv(path, low_memory=False)
    else:
        df = pd.read_excel(path)

    # Standardize column names: trim, then turn whitespace runs into "_"
    df.columns = (df.columns
                    .str.strip()
                    .str.replace(r"\s+", "_", regex=True))

    # Fallback city name derived from the file name
    city_from_name = path.stem.replace("_rooftop_solar_potential", "")
    if "City" not in df.columns:
        df["City"] = city_from_name

    # Keep only the common columns (create all-null ones if missing)
    for c in common_cols:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[common_cols].copy()

    # Coerce everything except the two text columns to numeric
    num_cols = [c for c in common_cols if c not in {"City", "Assumed_building_type"}]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")

    # City as trimmed string. Null or blank cells are filled from the file
    # name so those rows are not lost to the later null-row drop.
    city = df["City"].astype("string").str.strip()
    is_blank = city.fillna("").eq("").astype(bool)
    df["City"] = city.mask(is_blank, city_from_name)

    # Normalize building-type text (trim + lower-case) when the column is textual
    if df["Assumed_building_type"].dtype == "O" or pd.api.types.is_string_dtype(df["Assumed_building_type"]):
        df["Assumed_building_type"] = df["Assumed_building_type"].astype("string").str.strip().str.lower()
    return df

# Load every discovered file, keyed by its stem (file name without extension).
# Two files that differ only by extension share a stem; the later one replaces
# the earlier one, so say so instead of overwriting silently.
dfs = {}
for f in files:
    if f.stem in dfs:
        print(f"WARNING: duplicate dataset name '{f.stem}' - {f.name} replaces an earlier file with the same stem.")
    dfs[f.stem] = read_standardize(f, common_columns)
print("Loaded and standardized.")


## Step 1 — Show % of null rows **per dataset** (before we drop anything)
We compute the percentage of rows that have **any null** within the selected `common_columns`. This lets you decide if dropping is acceptable.

In [None]:

# === Cell 2: Null % per dataset ===
from pandas import DataFrame

# Fixed column list so the summary keeps its schema (and can be sorted)
# even when no datasets were loaded.
summary_columns = ["dataset", "rows", "rows_with_any_null", "pct_with_any_null"]

summary_rows = []
for name, df in dfs.items():
    n = len(df)
    # rows with at least one null across the selected columns
    any_null = int(df[common_columns].isna().any(axis=1).sum())
    # guard against division by zero for an empty dataset
    pct = (any_null / n * 100) if n else 0.0
    summary_rows.append({"dataset": name, "rows": n, "rows_with_any_null": any_null, "pct_with_any_null": round(pct, 2)})

# Worst-affected datasets first
null_summary = (DataFrame(summary_rows, columns=summary_columns)
                .sort_values("pct_with_any_null", ascending=False)
                .reset_index(drop=True))
null_summary


## Step 2 — Label-encode `Assumed_building_type`
Mapping used (stable across all files):

- 0 → commercial
- 1 → industrial
- 2 → multifamily residential
- 3 → public
- 4 → single family residential

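Below we preview the unique values per dataset **before** encoding, then apply the mapping and print the resulting codes as a quick check. Any value that is not in the mapping is encoded as null, so those rows are removed by the null-row drop in Step 3.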

In [None]:

# === Cell 3: Inspect unique values then encode ===
# One entry per dataset: its distinct non-null building types, as sorted strings
preview = {
    dataset: sorted(str(value) for value in frame["Assumed_building_type"].dropna().unique().tolist())
    for dataset, frame in dfs.items()
}
# Datasets become rows; each distinct value occupies one column
preview_df = pd.DataFrame.from_dict(preview, orient="index")
print("Unique 'Assumed_building_type' values BEFORE encoding (per dataset):")
display(preview_df)

def encode_building_type(series: pd.Series) -> pd.Series:
    """Convert a building-type column to nullable integer codes.

    Integer- or float-typed input is treated as already encoded: values that
    are keys of ``BT_INT2STR`` are kept and everything else becomes ``<NA>``.
    Any other input is cast to string, trimmed, lower-cased and looked up in
    ``BT_STR2INT``; labels missing from that table become ``<NA>``.

    Returns a new ``Int64`` Series; the input is not modified.
    """
    values = series.copy()
    already_numeric = (
        pd.api.types.is_integer_dtype(values)
        or pd.api.types.is_float_dtype(values)
    )

    if not already_numeric:
        # Text path: normalize the label, then translate it to its code
        labels = values.astype("string").str.strip().str.lower()
        return labels.map(BT_STR2INT).astype("Int64")

    # Numeric path: keep only recognised codes, null out the rest
    numeric = pd.to_numeric(values, errors="coerce")
    known_codes = list(BT_INT2STR.keys())
    recognised = numeric.isin(known_codes)
    return numeric.where(recognised, other=pd.NA).astype("Int64")

# Replace the building-type column with its integer codes, dataset by dataset
for frame in dfs.values():
    frame["Assumed_building_type"] = encode_building_type(frame["Assumed_building_type"])

print("\nMapping used (int -> name):", BT_INT2STR)
# Post-check: the distinct codes that remain in each dataset, sorted
check = {}
for dataset, frame in dfs.items():
    codes = frame["Assumed_building_type"].dropna().unique().tolist()
    check[dataset] = sorted(codes)
print("Encoded unique codes per dataset:", check)


## Step 3 — Drop rows with **any null** in the selected columns
We first report **how many rows will be dropped** per dataset, then perform the drop.

In [None]:

# === Cell 4: Report and drop null rows ===
# Fixed column list so the report keeps its schema (and can be sorted)
# even when no datasets were loaded.
drop_columns = ["dataset", "before", "drop_null_rows", "after", "pct_dropped"]

drop_report = []
for name, df in dfs.items():
    n0 = len(df)
    # True for rows with no null in any of the selected columns
    mask_keep = ~df[common_columns].isna().any(axis=1)
    dropped = int((~mask_keep).sum())
    kept = int(mask_keep.sum())
    drop_report.append({"dataset": name, "before": n0, "drop_null_rows": dropped, "after": kept, "pct_dropped": round((dropped/n0*100) if n0 else 0.0, 2)})
    # Rebinding the value of an existing key is safe while iterating the dict
    dfs[name] = df.loc[mask_keep].reset_index(drop=True)

# Datasets that lost the largest share of rows first
drop_null_df = (pd.DataFrame(drop_report, columns=drop_columns)
                .sort_values("pct_dropped", ascending=False)
                .reset_index(drop=True))
drop_null_df


## Step 4 — Remove rows where `Estimated_building_height` is 0 or negative
This step is applied **after** the null-row drop. We report how many rows are removed per dataset.

In [None]:

# === Cell 5: Filter non-positive heights ===
# Fixed column list so the report keeps its schema (and can be sorted)
# even when no datasets were loaded.
height_columns = ["dataset", "before", "removed_height_le_zero", "after", "pct_removed"]

height_report = []
for name, df in dfs.items():
    n0 = len(df)
    height = df["Estimated_building_height"]
    # Count of rows whose height is zero or negative
    bad = int(height.le(0).sum())
    # Keep strictly positive heights only. A null height also fails this
    # comparison, but none should remain after the null-row drop.
    dfs[name] = df.loc[height.gt(0)].reset_index(drop=True)
    height_report.append({"dataset": name, "before": n0, "removed_height_le_zero": bad, "after": len(dfs[name]),
                          "pct_removed": round((bad/n0*100) if n0 else 0.0, 2)})
height_le0_df = (pd.DataFrame(height_report, columns=height_columns)
                 .sort_values("pct_removed", ascending=False)
                 .reset_index(drop=True))
height_le0_df


## Step 5 — Save cleaned data (per-city + combined)
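We save each cleaned dataset as Parquet in `data/interim/cleaned/`, plus a single combined file at `data/interim/all_cities_clean.parquet`. The combined file has an extra `filename` column recording which per-city Parquet file each row came from. Change the paths if needed.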

In [None]:

# === Cell 6: Write outputs ===
# NOTE(review): `pl` is not used in this cell; kept in case later cells rely on it.
import polars as pl

# Write one Parquet file per dataset and collect tagged copies for the
# combined file (the `filename` column records each row's source file).
combined = []
for name, df in dfs.items():
    out_path = OUT_DIR / f"{name}.parquet"
    df.to_parquet(out_path, index=False)
    combined.append(df.assign(filename=f"{name}.parquet"))
    print("Wrote:", out_path)

combined_out = Path("data/interim/all_cities_clean.parquet")
if combined:
    combined_df = pd.concat(combined, ignore_index=True)
    # Do not rely on OUT_DIR happening to share this parent folder
    combined_out.parent.mkdir(parents=True, exist_ok=True)
    combined_df.to_parquet(combined_out, index=False)
    print("Wrote combined:", combined_out.resolve())
else:
    # pd.concat raises on an empty list; keep an empty frame with the expected
    # columns so the sanity output below still works.
    combined_df = pd.DataFrame(columns=[*common_columns, "filename"])
    print("No cleaned datasets to combine; skipped writing", combined_out)

# Quick sanity: basic shapes
print("Per-dataset cleaned sizes:")
print({k: len(v) for k,v in dfs.items()})
print("Combined:", combined_df.shape)
