In [1]:

# === Cell 1: Setup & discovery ===
import os, glob
from pathlib import Path
import pandas as pd
import numpy as np

# Adjust if needed
DATA_DIR = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/original_datasets")
FILE_GLOB = "*rooftop*solar*"

# Will save here
OUT_DIR = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/cleaned_datasets")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Supported file types
files = sorted(list(DATA_DIR.glob(FILE_GLOB)))
files = [p for p in files if p.suffix.lower() in {".csv", ".xlsx", ".xls"}]

print(f"Found {len(files)} files:")
for f in files:
    print(" -", f.name)

# Common columns we will enforce
common_columns = [
    "City",
    "Surface_area",
    "Potential_installable_area",
    "Peak_installable_capacity",
    "Energy_potential_per_year",
    "Assumed_building_type",
    "Estimated_tilt",
    "Estimated_building_height",
    "Estimated_capacity_factor"
]

# Stable label mapping (text -> int) for Assumed_building_type
BT_STR2INT = {
    "commercial": 0,
    "industrial": 1,
    "multifamily residential": 2,
    "public": 3,
    "single family residential": 4,
}
BT_INT2STR = {v:k for k,v in BT_STR2INT.items()}

# Helper to read any supported file and standardize columns
def read_standardize(path: Path, common_cols):
    if path.suffix.lower() == ".csv":
        df = pd.read_csv(path, low_memory=False)
    else:
        df = pd.read_excel(path)
    # standardize col names
    df.columns = (df.columns
                    .str.strip()
                    .str.replace(r"\s+", "_", regex=True))
    # if City missing or blank, infer from filename stem
    if "City" not in df.columns:
        df["City"] = path.stem.replace("_rooftop_solar_potential", "")
    # keep only the common columns (create if missing)
    for c in common_cols:
        if c not in df.columns:
            df[c] = pd.NA
    df = df[common_cols].copy()
    # coerce numeric
    num_cols = [c for c in common_cols if c not in {"City", "Assumed_building_type"}]
    for c in num_cols:
        df[c] = pd.to_numeric(df[c], errors="coerce")
    # city as string
    df["City"] = df["City"].astype("string").str.strip()
    # ensure building type consistent casing if string
    if df["Assumed_building_type"].dtype == "O" or pd.api.types.is_string_dtype(df["Assumed_building_type"]):
        df["Assumed_building_type"] = df["Assumed_building_type"].astype("string").str.strip().str.lower()
    return df

dfs = {f.stem: read_standardize(f, common_columns) for f in files}
print("Loaded and standardized.")


Found 25 files:
 - accra_rooftop_solarpotential.csv
 - almaty_rooftop_solarpotential.csv
 - antigua_rooftop_solarpotential.csv
 - beirut_rooftop_solarpotential.csv
 - colombo_rooftop_solarpotential.csv
 - daressalaam_rooftop_solarpotential.csv
 - dhaka_rooftop_solarpotential.csv
 - dominica_rooftop_solarpotential.csv
 - grenada_rooftop_solarpotential.csv
 - izmir_rooftop_solarpotential.csv
 - johannesburg_rooftop_solarpotential.csv
 - karachi_rooftop_solarpotential.csv
 - lagosstate_rooftop_solarpotential.csv
 - maldives_rooftop_solarpotential.csv
 - manila_rooftop_solarpotential.csv
 - mexicocity_rooftop_solarpotential.csv
 - nairobi_rooftop_solarpotential.csv
 - panama8cities_rooftop_solarpotential.csv
 - rustavi_rooftop_solarpotential.csv
 - samarkand_rooftop_solarpotential.csv
 - sanpedrosula_rooftop_solarpotential.csv
 - sintmaarten_rooftop_solarpotential.csv
 - stlucia_rooftop_solarpotential.csv
 - svg_rooftop_solarpotential.csv
 - tegucigalpa_rooftop_solarpotential.csv
Loaded an

In [2]:

# === Cell 2: Null % per dataset ===
from pandas import DataFrame

summary_rows = []
for name, df in dfs.items():
    n = len(df)
    # rows with any null across the selected columns
    any_null = df[common_columns].isna().any(axis=1).sum()
    pct = (any_null / n * 100) if n else 0.0
    summary_rows.append({"dataset": name, "rows": n, "rows_with_any_null": any_null, "pct_with_any_null": round(pct, 2)})

null_summary = DataFrame(summary_rows).sort_values("pct_with_any_null", ascending=False).reset_index(drop=True)
null_summary


Unnamed: 0,dataset,rows,rows_with_any_null,pct_with_any_null
0,izmir_rooftop_solarpotential,287695,97589,33.92
1,svg_rooftop_solarpotential,46707,14209,30.42
2,stlucia_rooftop_solarpotential,3524,878,24.91
3,maldives_rooftop_solarpotential,92074,17980,19.53
4,rustavi_rooftop_solarpotential,10160,1834,18.05
5,sintmaarten_rooftop_solarpotential,15279,2275,14.89
6,lagosstate_rooftop_solarpotential,1542057,220321,14.29
7,mexicocity_rooftop_solarpotential,589629,83020,14.08
8,tegucigalpa_rooftop_solarpotential,165629,19707,11.9
9,dominica_rooftop_solarpotential,37841,4332,11.45


In [3]:
# === New Cell: Per-dataset per-column % NULL ===
import pandas as pd
import numpy as np

# matrix of %null: index = column, columns = dataset
null_pct_mat = {}
null_cnt_mat = {}

for name, df in dfs.items():
    s_pct = df[common_columns].isna().mean().rename(name) * 100.0  # % null per column
    s_cnt = df[common_columns].isna().sum().rename(name)           # count null per column
    null_pct_mat[name] = s_pct
    null_cnt_mat[name] = s_cnt

null_pct_df = pd.concat(null_pct_mat.values(), axis=1)
null_cnt_df = pd.concat(null_cnt_mat.values(), axis=1)

# Pretty view: % null sorted by most-missing columns overall
display(null_pct_df.assign(_overall=null_pct_df.mean(axis=1))
                     .sort_values("_overall", ascending=False)
                     .drop(columns=["_overall"])
                     .style.format("{:.2f}%").background_gradient(axis=None, cmap="Reds"))

# Quick spotlight: top-3 missing columns per dataset
spotlight_rows = []
for name in dfs:
    top3 = null_pct_df[name].sort_values(ascending=False).head(3)
    for col, pct in top3.items():
        spotlight_rows.append({"dataset": name, "column": col, "pct_null": round(pct, 2), "null_count": int(null_cnt_df.at[col, name])})
pd.DataFrame(spotlight_rows).sort_values(["pct_null","dataset"], ascending=False).reset_index(drop=True)

Unnamed: 0,accra_rooftop_solarpotential,almaty_rooftop_solarpotential,antigua_rooftop_solarpotential,beirut_rooftop_solarpotential,colombo_rooftop_solarpotential,daressalaam_rooftop_solarpotential,dhaka_rooftop_solarpotential,dominica_rooftop_solarpotential,grenada_rooftop_solarpotential,izmir_rooftop_solarpotential,johannesburg_rooftop_solarpotential,karachi_rooftop_solarpotential,lagosstate_rooftop_solarpotential,maldives_rooftop_solarpotential,manila_rooftop_solarpotential,mexicocity_rooftop_solarpotential,nairobi_rooftop_solarpotential,panama8cities_rooftop_solarpotential,rustavi_rooftop_solarpotential,samarkand_rooftop_solarpotential,sanpedrosula_rooftop_solarpotential,sintmaarten_rooftop_solarpotential,stlucia_rooftop_solarpotential,svg_rooftop_solarpotential,tegucigalpa_rooftop_solarpotential
Estimated_building_height,0.37%,0.75%,6.01%,0.63%,0.27%,0.74%,0.83%,10.16%,2.88%,0.15%,7.61%,0.33%,14.20%,19.06%,0.47%,0.35%,0.01%,9.40%,15.65%,0.00%,5.28%,13.16%,24.23%,27.80%,9.79%
Estimated_capacity_factor,1.43%,7.75%,1.99%,3.11%,0.95%,3.41%,1.41%,1.70%,1.25%,33.82%,0.71%,11.12%,0.17%,0.71%,1.79%,14.08%,1.28%,0.73%,2.72%,0.78%,1.41%,2.46%,1.25%,4.09%,2.34%
City,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Surface_area,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Potential_installable_area,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Energy_potential_per_year,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Peak_installable_capacity,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Estimated_tilt,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Assumed_building_type,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%


Unnamed: 0,dataset,column,pct_null,null_count
0,izmir_rooftop_solarpotential,Estimated_capacity_factor,33.82,97311
1,svg_rooftop_solarpotential,Estimated_building_height,27.80,12985
2,stlucia_rooftop_solarpotential,Estimated_building_height,24.23,854
3,maldives_rooftop_solarpotential,Estimated_building_height,19.06,17545
4,rustavi_rooftop_solarpotential,Estimated_building_height,15.65,1590
...,...,...,...,...
70,colombo_rooftop_solarpotential,City,0.00,0
71,beirut_rooftop_solarpotential,City,0.00,0
72,antigua_rooftop_solarpotential,City,0.00,0
73,almaty_rooftop_solarpotential,City,0.00,0


In [4]:
# === Drop 'Estimated_building_height' from the workflow ===
if "Estimated_building_height" in common_columns:
    common_columns.remove("Estimated_building_height")
common_columns

['City',
 'Surface_area',
 'Potential_installable_area',
 'Peak_installable_capacity',
 'Energy_potential_per_year',
 'Assumed_building_type',
 'Estimated_tilt',
 'Estimated_capacity_factor']

In [5]:
# === Remove column from each dataset (if present) ===
for name, df in dfs.items():
    if "Estimated_building_height" in df.columns:
        dfs[name] = df.drop(columns=["Estimated_building_height"])


In [6]:
# Recompute %null and counts using *current* common_columns
null_pct_mat, null_cnt_mat = {}, {}

for name, df in dfs.items():
    s_pct = df[common_columns].isna().mean().rename(name) * 100.0
    s_cnt = df[common_columns].isna().sum().rename(name)
    null_pct_mat[name] = s_pct
    null_cnt_mat[name] = s_cnt

null_pct_df = pd.concat(null_pct_mat.values(), axis=1)
null_cnt_df = pd.concat(null_cnt_mat.values(), axis=1)

# Safety: if an old matrix exists, this ensures height row is gone
null_pct_df = null_pct_df.drop(index=["Estimated_building_height"], errors="ignore")
null_cnt_df = null_cnt_df.drop(index=["Estimated_building_height"], errors="ignore")

In [7]:
# Heatmap-style table (columns = datasets, rows = features)
display(null_pct_df.assign(_overall=null_pct_df.mean(axis=1))
                     .sort_values("_overall", ascending=False)
                     .drop(columns=["_overall"])
                     .style.format("{:.2f}%").background_gradient(axis=None, cmap="Reds"))

# Spotlight: top-3 missing columns per dataset (after height removal)
spotlight_rows = []
for name in null_pct_df.columns:
    top3 = null_pct_df[name].sort_values(ascending=False).head(3)
    for col, pct in top3.items():
        spotlight_rows.append({
            "dataset": name, "column": col,
            "pct_null": round(float(pct), 2),
            "null_count": int(null_cnt_df.at[col, name])
        })
pd.DataFrame(spotlight_rows).sort_values(["pct_null","dataset"], ascending=False).reset_index(drop=True)


Unnamed: 0,accra_rooftop_solarpotential,almaty_rooftop_solarpotential,antigua_rooftop_solarpotential,beirut_rooftop_solarpotential,colombo_rooftop_solarpotential,daressalaam_rooftop_solarpotential,dhaka_rooftop_solarpotential,dominica_rooftop_solarpotential,grenada_rooftop_solarpotential,izmir_rooftop_solarpotential,johannesburg_rooftop_solarpotential,karachi_rooftop_solarpotential,lagosstate_rooftop_solarpotential,maldives_rooftop_solarpotential,manila_rooftop_solarpotential,mexicocity_rooftop_solarpotential,nairobi_rooftop_solarpotential,panama8cities_rooftop_solarpotential,rustavi_rooftop_solarpotential,samarkand_rooftop_solarpotential,sanpedrosula_rooftop_solarpotential,sintmaarten_rooftop_solarpotential,stlucia_rooftop_solarpotential,svg_rooftop_solarpotential,tegucigalpa_rooftop_solarpotential
Estimated_capacity_factor,1.43%,7.75%,1.99%,3.11%,0.95%,3.41%,1.41%,1.70%,1.25%,33.82%,0.71%,11.12%,0.17%,0.71%,1.79%,14.08%,1.28%,0.73%,2.72%,0.78%,1.41%,2.46%,1.25%,4.09%,2.34%
City,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Potential_installable_area,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Surface_area,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Peak_installable_capacity,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Energy_potential_per_year,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Assumed_building_type,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Estimated_tilt,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%


Unnamed: 0,dataset,column,pct_null,null_count
0,izmir_rooftop_solarpotential,Estimated_capacity_factor,33.82,97311
1,mexicocity_rooftop_solarpotential,Estimated_capacity_factor,14.08,82998
2,karachi_rooftop_solarpotential,Estimated_capacity_factor,11.12,32984
3,almaty_rooftop_solarpotential,Estimated_capacity_factor,7.75,10725
4,svg_rooftop_solarpotential,Estimated_capacity_factor,4.09,1909
...,...,...,...,...
70,antigua_rooftop_solarpotential,Potential_installable_area,0.00,0
71,almaty_rooftop_solarpotential,City,0.00,0
72,almaty_rooftop_solarpotential,Potential_installable_area,0.00,0
73,accra_rooftop_solarpotential,City,0.00,0


In [9]:
# === Diagnose why CF nulls didn't fill, then fill when possible ===
import numpy as np
import pandas as pd

HOURS_PER_YEAR = 8760
diag_rows = []
fill_rows = []

for name, df in dfs.items():
    # ensure numeric
    for c in ["Energy_potential_per_year", "Peak_installable_capacity", "Estimated_capacity_factor"]:
        if c not in df.columns:
            df[c] = np.nan
        df[c] = pd.to_numeric(df[c], errors="coerce")

    cf_isna = df["Estimated_capacity_factor"].isna()
    e_ok    = df["Energy_potential_per_year"].notna()
    p_ok    = df["Peak_installable_capacity"].notna()
    p_pos   = df["Peak_installable_capacity"].fillna(0) > 0
    denom   = df["Peak_installable_capacity"] * HOURS_PER_YEAR

    # compute candidate CF only where both energy present & peak > 0
    calc_mask = e_ok & p_pos
    cf_calc = np.divide(
        df["Energy_potential_per_year"],
        denom,
        out=np.full(len(df), np.nan, dtype=float),
        where=calc_mask
    )

    # --- diagnostics ---
    n_null_cf = int(cf_isna.sum())
    reasons = {
        "null_CF_rows": n_null_cf,
        "energy_missing": int((cf_isna & ~e_ok).sum()),
        "peak_missing":   int((cf_isna & ~p_ok).sum()),
        "peak_zero":      int((cf_isna & p_ok & ~p_pos).sum()),
        "calc_available": int((cf_isna & calc_mask).sum()),  # rows we *can* compute
    }
    reasons["would_be_nan_even_if_calc"] = int((cf_isna & calc_mask & pd.isna(cf_calc)).sum())
    diag = {"dataset": name, **reasons}
    diag_rows.append(diag)

    # --- fill only where CF is null AND we can compute ---
    before = n_null_cf
    fill_mask = cf_isna & calc_mask & pd.notna(cf_calc)
    df.loc[fill_mask, "Estimated_capacity_factor"] = cf_calc[fill_mask]
    after = int(df["Estimated_capacity_factor"].isna().sum())

    fill_rows.append({
        "dataset": name,
        "rows": len(df),
        "nulls_before": before,
        "filled_from_formula": int(before - after),
        "nulls_after": after
    })

diag_df  = pd.DataFrame(diag_rows).sort_values("null_CF_rows", ascending=False).reset_index(drop=True)
fill_df  = pd.DataFrame(fill_rows).sort_values("filled_from_formula", ascending=False).reset_index(drop=True)

print("=== Why CF couldn't be filled (per dataset) ===")
display(diag_df)

print("=== Fill results (per dataset) ===")
display(fill_df)


=== Why CF couldn't be filled (per dataset) ===


Unnamed: 0,dataset,null_CF_rows,energy_missing,peak_missing,peak_zero,calc_available,would_be_nan_even_if_calc
0,izmir_rooftop_solarpotential,97311,0,0,97311,0,0
1,mexicocity_rooftop_solarpotential,82998,0,0,82998,0,0
2,karachi_rooftop_solarpotential,32984,0,0,32984,0,0
3,daressalaam_rooftop_solarpotential,18186,0,0,18186,0,0
4,almaty_rooftop_solarpotential,10725,0,0,10725,0,0
5,dhaka_rooftop_solarpotential,8936,0,0,8936,0,0
6,manila_rooftop_solarpotential,5397,0,0,5397,0,0
7,tegucigalpa_rooftop_solarpotential,3870,0,0,3870,0,0
8,accra_rooftop_solarpotential,3834,0,0,3834,0,0
9,nairobi_rooftop_solarpotential,3502,0,0,3502,0,0


=== Fill results (per dataset) ===


Unnamed: 0,dataset,rows,nulls_before,filled_from_formula,nulls_after
0,accra_rooftop_solarpotential,268947,3834,0,3834
1,almaty_rooftop_solarpotential,138339,10725,0,10725
2,antigua_rooftop_solarpotential,48904,975,0,975
3,beirut_rooftop_solarpotential,69859,2173,0,2173
4,colombo_rooftop_solarpotential,267989,2543,0,2543
5,daressalaam_rooftop_solarpotential,533855,18186,0,18186
6,dhaka_rooftop_solarpotential,632745,8936,0,8936
7,dominica_rooftop_solarpotential,37841,645,0,645
8,grenada_rooftop_solarpotential,51115,639,0,639
9,izmir_rooftop_solarpotential,287695,97311,0,97311


In [10]:
# === Drop calculated column from the workflow ===
for col in ["Estimated_capacity_factor", "Estimated_building_height"]:
    if col in common_columns:
        common_columns.remove(col)
common_columns

['City',
 'Surface_area',
 'Potential_installable_area',
 'Peak_installable_capacity',
 'Energy_potential_per_year',
 'Assumed_building_type',
 'Estimated_tilt']

In [11]:
# === Physically drop those columns if present ===
for name, df in dfs.items():
    cols_to_drop = [c for c in ["Estimated_capacity_factor", "Estimated_building_height"] if c in df.columns]
    if cols_to_drop:
        dfs[name] = df.drop(columns=cols_to_drop)

In [12]:
# === Recompute % null per column per dataset ===
null_pct_mat, null_cnt_mat = {}, {}
for name, df in dfs.items():
    s_pct = df[common_columns].isna().mean().rename(name) * 100.0
    s_cnt = df[common_columns].isna().sum().rename(name)
    null_pct_mat[name] = s_pct
    null_cnt_mat[name] = s_cnt

null_pct_df = pd.concat(null_pct_mat.values(), axis=1)
null_cnt_df = pd.concat(null_cnt_mat.values(), axis=1)

display(null_pct_df.assign(_overall=null_pct_df.mean(axis=1))
                     .sort_values("_overall", ascending=False)
                     .drop(columns=["_overall"])
                     .style.format("{:.2f}%").background_gradient(axis=None, cmap="Reds"))

Unnamed: 0,accra_rooftop_solarpotential,almaty_rooftop_solarpotential,antigua_rooftop_solarpotential,beirut_rooftop_solarpotential,colombo_rooftop_solarpotential,daressalaam_rooftop_solarpotential,dhaka_rooftop_solarpotential,dominica_rooftop_solarpotential,grenada_rooftop_solarpotential,izmir_rooftop_solarpotential,johannesburg_rooftop_solarpotential,karachi_rooftop_solarpotential,lagosstate_rooftop_solarpotential,maldives_rooftop_solarpotential,manila_rooftop_solarpotential,mexicocity_rooftop_solarpotential,nairobi_rooftop_solarpotential,panama8cities_rooftop_solarpotential,rustavi_rooftop_solarpotential,samarkand_rooftop_solarpotential,sanpedrosula_rooftop_solarpotential,sintmaarten_rooftop_solarpotential,stlucia_rooftop_solarpotential,svg_rooftop_solarpotential,tegucigalpa_rooftop_solarpotential
City,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Surface_area,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Potential_installable_area,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Peak_installable_capacity,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Energy_potential_per_year,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Assumed_building_type,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%
Estimated_tilt,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%,0.00%


In [13]:
# === Label-encode Assumed_building_type (stable mapping) ===
import pandas as pd

BT_STR2INT = {
    "commercial": 0,
    "industrial": 1,
    "multifamily residential": 2,
    "public": 3,
    "single family residential": 4,
}
BT_INT2STR = {v:k for k,v in BT_STR2INT.items()}
print("Mapping (int -> name):", BT_INT2STR)

def encode_bt(series: pd.Series) -> pd.Series:
    s = series.copy()
    # numeric-like: accept only 0..4, else NA
    if pd.api.types.is_integer_dtype(s) or pd.api.types.is_float_dtype(s):
        s = pd.to_numeric(s, errors="coerce")
        s = s.where(s.isin(list(BT_INT2STR.keys())), other=pd.NA).astype("Int64")
        return s
    # text-like: map using stable dict
    s = s.astype("string").str.strip().str.lower().map(BT_STR2INT)
    return s.astype("Int64")

# apply to all datasets
previews = []
for name, df in dfs.items():
    before = df["Assumed_building_type"].dropna().unique().tolist()
    df["Assumed_building_type"] = encode_bt(df["Assumed_building_type"])
    after = df["Assumed_building_type"].dropna().unique().tolist()
    previews.append({
        "dataset": name,
        "unique_before": sorted(map(str, before)),
        "unique_after_codes": sorted(map(int, after)) if len(after) else []
    })

pd.DataFrame(previews)


Mapping (int -> name): {0: 'commercial', 1: 'industrial', 2: 'multifamily residential', 3: 'public', 4: 'single family residential'}


Unnamed: 0,dataset,unique_before,unique_after_codes
0,accra_rooftop_solarpotential,"[commercial, industrial, multi-family resident...","[0, 1, 3, 4]"
1,almaty_rooftop_solarpotential,"[commercial, industrial, multifamily residenti...","[0, 1, 2, 3, 4]"
2,antigua_rooftop_solarpotential,"[commercial, hotels, industrial, multi-family ...","[0, 1, 3]"
3,beirut_rooftop_solarpotential,"[commercial, industrial, multi-family resident...","[0, 1, 3, 4]"
4,colombo_rooftop_solarpotential,"[commercial, industrial, multi-family resident...","[0, 1, 3, 4]"
5,daressalaam_rooftop_solarpotential,"[commercial, industrial, multi-family resident...","[0, 1, 3, 4]"
6,dhaka_rooftop_solarpotential,"[commercial, industrial, multi-family resident...","[0, 1, 3, 4]"
7,dominica_rooftop_solarpotential,"[commercial, hotels, industrial, multi-family ...","[0, 1, 3]"
8,grenada_rooftop_solarpotential,"[commercial, hotels, industrial, multi-family ...","[0, 1, 3]"
9,izmir_rooftop_solarpotential,"[commercial, industrial, multifamily residenti...","[0, 1, 2, 3, 4]"


In [14]:
# === Save final cleaned datasets (per-city + combined) ===
import pandas as pd
from pathlib import Path
import json

OUT_DIR.mkdir(parents=True, exist_ok=True)

# 1) Save stable label mapping alongside outputs (for provenance)
BT_STR2INT = {
    "commercial": 0,
    "industrial": 1,
    "multifamily residential": 2,
    "public": 3,
    "single family residential": 4,
}
(BT_INT2STR := {v: k for k, v in BT_STR2INT.items()})

with open(OUT_DIR / "building_type_mapping.json", "w", encoding="utf-8") as f:
    json.dump({"str2int": BT_STR2INT, "int2str": BT_INT2STR}, f, indent=2)

# 2) Write each dataset (Parquet for speed; CSV optional)
manifest = []
for name, df in dfs.items():
    p_parquet = OUT_DIR / f"{name}.parquet"
    p_csv     = OUT_DIR / f"{name}.csv"          # comment out if you don't want CSVs

    df.to_parquet(p_parquet, index=False)        # requires pyarrow
    df.to_csv(p_csv, index=False)                # remove if not needed

    manifest.append({"dataset": name, "rows": int(len(df)), "parquet": str(p_parquet), "csv": str(p_csv)})

# 3) Combined file
combined_df = pd.concat(dfs.values(), ignore_index=True)
combined_parquet = OUT_DIR / "all_cities_clean.parquet"
combined_csv     = OUT_DIR / "all_cities_clean.csv"

combined_df.to_parquet(combined_parquet, index=False)
combined_df.to_csv(combined_csv, index=False)

# 4) Save a manifest for quick reference
manifest_path = OUT_DIR / "_manifest.json"
with open(manifest_path, "w", encoding="utf-8") as f:
    json.dump({
        "columns": list(combined_df.columns),
        "total_rows": int(len(combined_df)),
        "per_dataset": manifest,
        "combined": {"parquet": str(combined_parquet), "csv": str(combined_csv)}
    }, f, indent=2)

# 5) Quick confirmation table
pd.DataFrame(manifest + [{
    "dataset": "__ALL__",
    "rows": int(len(combined_df)),
    "parquet": str(combined_parquet),
    "csv": str(combined_csv)
}]).sort_values("dataset").reset_index(drop=True)


Unnamed: 0,dataset,rows,parquet,csv
0,__ALL__,6530761,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
1,accra_rooftop_solarpotential,268947,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
2,almaty_rooftop_solarpotential,138339,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
3,antigua_rooftop_solarpotential,48904,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
4,beirut_rooftop_solarpotential,69859,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
5,colombo_rooftop_solarpotential,267989,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
6,daressalaam_rooftop_solarpotential,533855,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
7,dhaka_rooftop_solarpotential,632745,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
8,dominica_rooftop_solarpotential,37841,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
9,grenada_rooftop_solarpotential,51115,C:\Users\User\Desktop\ML\Project\solar-potenti...,C:\Users\User\Desktop\ML\Project\solar-potenti...
