In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# Single place to repoint the notebook when the project lives elsewhere:
# every input/output location below is derived from this root.
PROJECT_ROOT = Path("C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach")

COMBINED = PROJECT_ROOT / "dataset" / "reference" / "all_cities_clean.parquet"   # your big combined file
WEATHER  = PROJECT_ROOT / "weather_api_integration" / "city_weather.csv"         # the NASA file we made
OUT_DIR  = PROJECT_ROOT / "dataset" / "cleaned_datasets"                         # where the cleaned files are written

# Make sure the output folder exists (no error if it is already there).
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [2]:
# Load both inputs: the combined table (parquet) and the weather table (CSV).
df = pd.read_parquet(COMBINED)
weather = pd.read_csv(WEATHER)

# Report (rows, columns) of each frame as a quick sanity check on the load.
print("Loaded:", *(frame.shape for frame in (df, weather)))

Loaded: (6530761, 7) (25, 5)


In [3]:
# Map dataset city names → weather city names, so both tables share one join key.
# City labels that are not listed here are kept unchanged by .replace().
city_fix_map = {
    "DarEsSalaam": "Dar es Salaam",
    "GreatDhakaRegion": "Dhaka",
    "Honduras": "San Pedro Sula",
    "Lagos": "Lagos State",
    "LagosState": "Lagos State",
    "Panama": "Panama City",
    "SVG": "Saint Vincent and the Grenadines",
    "StLucia": "Saint Lucia",
    "StMaarten": "Sint Maarten",
    "SouthAfrica": "Johannesburg"
}
df["City_clean"] = df["City"].replace(city_fix_map)
# Strip stray leading/trailing whitespace from the weather-side names before joining.
weather["City_clean"] = weather["City"].str.strip()

# Left join keeps every row of df. validate="many_to_one" makes pandas raise a
# MergeError if a city appears more than once in the weather table; without it
# such a duplicate would silently multiply the matching rows of df.
# Overlapping weather columns get the "_w" suffix (e.g. "City" → "City_w").
dfm = df.merge(
    weather,
    on="City_clean",
    how="left",
    suffixes=("", "_w"),
    validate="many_to_one",
)
print("After merge:", dfm.shape)

# quick check (should be all zeros): a non-zero count means some city found no weather row
print(dfm[["avg_GHI_kWhm2_day","avg_temp_C","clearness_index","precip_mm_day"]].isna().sum())

After merge: (6530761, 13)
avg_GHI_kWhm2_day    0
avg_temp_C           0
clearness_index      0
precip_mm_day        0
dtype: int64


In [4]:
# Rename to clean names. The mapping is split by origin only for readability;
# both groups are applied in a single rename call.
building_column_names = {
    "Surface_area": "RoofSurface_m2",
    "Potential_installable_area": "InstallableArea_m2",
    "Peak_installable_capacity": "PeakCapacity_kWp",
    "Energy_potential_per_year": "EnergyPotential_kWh",
    "Assumed_building_type": "BuildingTypeEncoded",
    "Estimated_tilt": "Tilt_deg",
}
weather_column_names = {
    "avg_GHI_kWhm2_day": "GHI_kWh_per_m2_day",
    "avg_temp_C": "AvgTemp_C",
    "clearness_index": "ClearnessIndex",
    "precip_mm_day": "Precip_mm_per_day",
}
dfm = dfm.rename(columns={**building_column_names, **weather_column_names})

# Decode building type labels: the position in this list is the integer code (0-9).
building_labels = [
    "single family residential",
    "multifamily residential",
    "commercial",
    "small commercial",
    "industrial",
    "public sector",
    "peri-urban settlement",
    "schools",
    "public health facilities",
    "hotels",
]
building_mapping = dict(enumerate(building_labels))

# Codes that are not keys of building_mapping become NaN in "BuildingType".
dfm["BuildingType"] = dfm["BuildingTypeEncoded"].map(building_mapping)

In [5]:
# Keep only rows whose installable area is present and strictly positive;
# the area-normalized target below is undefined otherwise.
eps = 1e-9
installable_area = dfm["InstallableArea_m2"]
mask = installable_area.notna() & installable_area.gt(0)
dfm = dfm.loc[mask].copy()

# Tilt in radians, shared by the sine and cosine features.
rad = np.deg2rad(dfm["Tilt_deg"].astype(float))

# Derived columns, attached in this order: the new target first, then the tilt variants.
# NOTE(review): every remaining area is > 0 after the filter above, so eps only
# nudges the denominator by 1e-9.
derived_columns = {
    "kWh_per_m2": dfm["EnergyPotential_kWh"] / (dfm["InstallableArea_m2"] + eps),
    "tilt": dfm["Tilt_deg"],
    "tilt2": dfm["Tilt_deg"] ** 2,
    "tilt_sin": np.sin(rad),
    "tilt_cos": np.cos(rad),
}
for column_name, column_values in derived_columns.items():
    dfm[column_name] = column_values

# Columns we don't want in the training file (leakage/helpers/encoded).
drop_cols = [
    # merge helpers
    "City_clean", "City_w",
    # we’ll encode from the string later
    "BuildingTypeEncoded",
    # leakage risks
    "EnergyPotential_kWh", "InstallableArea_m2", "RoofSurface_m2", "PeakCapacity_kWp",
]

# Final layout: id, labels, features, target.
cols_order = [
    "City", "BuildingType",
    "tilt", "tilt2", "tilt_sin", "tilt_cos",
    "GHI_kWh_per_m2_day", "AvgTemp_C", "ClearnessIndex", "Precip_mm_per_day",
    "kWh_per_m2",
]

# errors="ignore" skips any drop_cols entry that is not present in dfm;
# the selection then fixes both the column set and the column order.
df_final = dfm.drop(columns=drop_cols, errors="ignore")[cols_order]

print("Final ready-for-training shape/cols:", df_final.shape)
df_final.head()

Final ready-for-training shape/cols: (6243501, 11)


Unnamed: 0,City,BuildingType,tilt,tilt2,tilt_sin,tilt_cos,GHI_kWh_per_m2_day,AvgTemp_C,ClearnessIndex,Precip_mm_per_day,kWh_per_m2
0,Accra,multifamily residential,9.12,83.174398,0.158503,0.987359,4.824408,27.070833,0.486667,3.935833,268.290184
2,Accra,multifamily residential,21.690001,470.456123,0.369585,0.929197,4.824408,27.070833,0.486667,3.935833,254.086636
3,Accra,multifamily residential,3.53,12.4609,0.061571,0.998103,4.824408,27.070833,0.486667,3.935833,274.120557
4,Accra,multifamily residential,18.360001,337.089622,0.314987,0.949096,4.824408,27.070833,0.486667,3.935833,273.618893
5,Accra,multifamily residential,10.68,114.062407,0.185324,0.982678,4.824408,27.070833,0.486667,3.935833,264.811711


In [6]:
# Write the training table twice into OUT_DIR: once as CSV, once as parquet.
csv_path = OUT_DIR.joinpath("all_cities_weather_ready_train.csv")
parq_path = OUT_DIR.joinpath("all_cities_weather_ready_train.parquet")

# index=False: the DataFrame index is not written to either file.
df_final.to_csv(csv_path, index=False)
df_final.to_parquet(parq_path, index=False)

# Echo both destinations with forward slashes.
print("Saved:\n -", csv_path.as_posix(), "\n -", parq_path.as_posix())

Saved:
 - C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/all_cities_weather_ready_train.csv 
 - C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/New_approach/dataset/cleaned_datasets/all_cities_weather_ready_train.parquet


In [7]:
# Final check of the training table: index range, column dtypes and memory usage.
df_final.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6243501 entries, 0 to 6530760
Data columns (total 11 columns):
 #   Column              Dtype  
---  ------              -----  
 0   City                string 
 1   BuildingType        object 
 2   tilt                float64
 3   tilt2               float64
 4   tilt_sin            float64
 5   tilt_cos            float64
 6   GHI_kWh_per_m2_day  float64
 7   AvgTemp_C           float64
 8   ClearnessIndex      float64
 9   Precip_mm_per_day   float64
 10  kWh_per_m2          float64
dtypes: float64(9), object(1), string(1)
memory usage: 571.6+ MB
