In [1]:
# lr_stratified_deleaked.py
import os
from pathlib import Path

import pandas as pd, numpy as np
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Folder holding the stratified train/test CSVs. The default is the original
# workstation location; set the SOLAR_SPLITS_DIR environment variable to run
# the notebook on another machine without editing this cell.
DEFAULT_SPLITS_DIR = "C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/splits/combined_stratified_city_type"
SPLITS_DIR = Path(os.environ.get("SOLAR_SPLITS_DIR") or DEFAULT_SPLITS_DIR)
TRAIN_CSV = SPLITS_DIR / "train_combined.csv"
TEST_CSV = SPLITS_DIR / "test_combined.csv"

# Regression target, and the columns that pick_features must never select.
TARGET = "Energy_potential_per_year"
LEAKY = {"Peak_installable_capacity", "Potential_installable_area"}

# Every artifact this notebook writes goes here (relative to the working directory).
OUT_DIR = Path("outputs_lr_stratified")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# ====== LOAD ======
# Read both splits and keep only the rows that actually carry a label.
train, test = (
    pd.read_csv(csv_path).dropna(subset=[TARGET]).copy()
    for csv_path in (TRAIN_CSV, TEST_CSV)
)

In [11]:
# Print the training frame's summary: row count, column dtypes, memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5224608 entries, 0 to 5224607
Data columns (total 7 columns):
 #   Column                      Dtype  
---  ------                      -----  
 0   City                        object 
 1   Surface_area                float64
 2   Potential_installable_area  float64
 3   Peak_installable_capacity   float64
 4   Energy_potential_per_year   float64
 5   Assumed_building_type       int64  
 6   Estimated_tilt              float64
dtypes: float64(5), int64(1), object(1)
memory usage: 279.0+ MB


In [4]:
# ====== FEATURE PICKER (OG numeric minus TARGET minus LEAKY) ======
def pick_features(frame, target=None, leaky=None):
    """Return the numeric columns of ``frame`` that are safe to use as features.

    Parameters
    ----------
    frame : pandas.DataFrame
        Table to pick columns from; only numeric dtypes are considered.
    target : label, optional
        Column to exclude as the regression target. Defaults to the
        notebook-level ``TARGET``.
    leaky : collection of labels, optional
        Columns to exclude because they leak the target. Defaults to the
        notebook-level ``LEAKY``. Pass an empty set to exclude nothing.

    Returns
    -------
    list
        Column labels in the sorted order produced by ``Index.difference``,
        without the target, the leaky columns, or identifier-like columns
        (``id`` / ``idx`` / ``index``, case-insensitive).
    """
    # Resolve defaults at call time so later edits to the config cell are honoured.
    if target is None:
        target = TARGET
    if leaky is None:
        leaky = LEAKY

    id_like = {"id", "idx", "index"}
    numeric_cols = frame.select_dtypes(include=[np.number]).columns
    candidates = numeric_cols.difference([target])
    # str() keeps the identifier check from failing on non-string column labels.
    return [
        col for col in candidates
        if col not in leaky and str(col).lower() not in id_like
    ]

In [5]:
# Choose the baseline feature set from the training split and list it.
features = pick_features(train)
excluded = sorted(LEAKY) if LEAKY else 'none'
print(f"[Stratified] Using {len(features)} features (excluding {excluded}):")
print(" - " + "\n - ".join(features))

[Stratified] Using 3 features (excluding ['Peak_installable_capacity', 'Potential_installable_area']):
 - Assumed_building_type
 - Estimated_tilt
 - Surface_area


In [6]:
# quick correlation peek on TRAIN only
# Pearson correlation of every selected feature with the target, strongest first.
corr_matrix = train[[*features, TARGET]].corr()
corr = (
    corr_matrix[TARGET]
    .drop(labels=[TARGET])
    .sort_values(key=np.abs, ascending=False)
)
top_corr = corr.head(10).to_frame("pearson_corr_to_target")
top_corr.to_csv(OUT_DIR / "corr_top10_train.csv")


In [7]:
# Drop rows missing any feature (clean baseline)
train, test = (split.dropna(subset=features) for split in (train, test))

# Feature matrices and float targets as NumPy arrays for the baseline model.
Xtr = train[features].to_numpy()
ytr = train[TARGET].astype(float).to_numpy()
Xte = test[features].to_numpy()
yte = test[TARGET].astype(float).to_numpy()

In [8]:
# ====== MODEL PIPELINE ======
# Standardise the features, then fit ordinary least squares on the training split.
pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("lr", LinearRegression()),
])
pipe.fit(Xtr, ytr)

# Score on the fixed test split.
yhat = pipe.predict(Xte)
mae = mean_absolute_error(yte, yhat)
mse = mean_squared_error(yte, yhat)
r2  = r2_score(yte, yhat)

print("\n=== Stratified fixed split (de-leaked LR) ===")
print(f"MAE = {mae:.6f}")
print(f"MSE = {mse:.6f}")
print(f"R2  = {r2:.6f}")

# Coefficients refer to the standardised features; save them largest magnitude first.
fitted_lr = pipe.named_steps["lr"]
coef_df = pd.DataFrame({"feature": features, "coef": fitted_lr.coef_})
coef_df = coef_df.sort_values("coef", key=np.abs, ascending=False)
coef_df.to_csv(OUT_DIR / "coefficients.csv", index=False)


=== Stratified fixed split (de-leaked LR) ===
MAE = 5662.715477
MSE = 328448908.915259
R2  = 0.936754


In [9]:
# Persist the hold-out metrics as a one-row CSV and the fitted pipeline as a joblib file.
metrics_table = pd.DataFrame({"MAE": [mae], "MSE": [mse], "R2": [r2]})
metrics_table.to_csv(OUT_DIR / "metrics.csv", index=False)

model_path = OUT_DIR / "lr_stratified.joblib"
dump(pipe, model_path)

print(f"\nArtifacts -> {OUT_DIR.resolve()}")


Artifacts -> C:\Users\User\Desktop\ML\Project\solar-potential-analysis-github-setup\scripts\outputs_lr_stratified


In [13]:
# === Stratified with proper one-hot for Assumed_building_type (int labels) ===
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# One-hot experiment: building type as a category plus standardised tilt.
# It uses the notebook-level TARGET and its own *_ohe names, so the baseline's
# pipe / Xtr / ytr / Xte / yte / yhat stay intact and re-running the save cell
# still persists the baseline model.
# NOTE(review): Surface_area is not in this feature set, so these metrics are
# not directly comparable with the baseline, whose features include it.
CAT_COLS = ["Assumed_building_type"]       # FORCE categorical
NUM_COLS = ["Estimated_tilt"]              # numeric

pre_ohe = ColumnTransformer(
    transformers=[
        # Categories unseen during fit are encoded as all zeros instead of raising.
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT_COLS),
        ("num", StandardScaler(), NUM_COLS),
    ],
    remainder="drop",
)

pipe_ohe = Pipeline([("pre", pre_ohe), ("lr", LinearRegression())])

Xtr_ohe, ytr_ohe = train[CAT_COLS + NUM_COLS], train[TARGET].astype(float).values
Xte_ohe, yte_ohe = test[CAT_COLS + NUM_COLS], test[TARGET].astype(float).values

pipe_ohe.fit(Xtr_ohe, ytr_ohe)
yhat_ohe = pipe_ohe.predict(Xte_ohe)

print("MAE =", mean_absolute_error(yte_ohe, yhat_ohe))
print("MSE =", mean_squared_error(yte_ohe, yhat_ohe))
print("R2  =", r2_score(yte_ohe, yhat_ohe))


MAE = 20843.727901880135
MSE = 4973511067.991931
R2  = 0.04229751618505084


In [15]:
# === Stratified: Compare (A) EPY with a weak size proxy vs (B) Normalized target ===
import pandas as pd, numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.linear_model import HuberRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

TARGET_EPY = "Energy_potential_per_year"
CAT_COLS = ["Assumed_building_type"]

# Added to the installable area so the per-area division never divides by zero.
# NOTE(review): a row with zero area and non-zero energy would get a target of
# roughly EPY * 1e9; consider excluding zero-area rows. Confirm whether any exist.
eps = 1e-9


def _ab_models(num_cols):
    """Build the Linear / Ridge / Huber pipelines for one experiment.

    Each pipeline gets its own unfitted ColumnTransformer. Pipeline does not
    clone its steps, so sharing one instance would leave all three pipelines
    pointing at the same fitted preprocessor.
    """
    def preprocessor():
        return ColumnTransformer(
            [("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT_COLS),
             ("num", StandardScaler(), num_cols)],
            remainder="drop",
        )

    return {
        "Linear": Pipeline([("pre", preprocessor()), ("m", LinearRegression())]),
        "Ridge":  Pipeline([("pre", preprocessor()), ("m", Ridge(alpha=1.0, random_state=42))]),
        "Huber":  Pipeline([("pre", preprocessor()), ("m", HuberRegressor())]),
    }


def _ab_fit_and_report(models, X_fit, y_fit, X_eval, y_eval, decimals):
    """Fit each pipeline, then print MAE / MSE / R2 on the evaluation split.

    ``decimals`` sets the precision of MAE and MSE; R2 always prints with 4.
    The loop variables are local, so the notebook-level ``pipe`` and ``yhat``
    are left untouched.
    """
    for name, model in models.items():
        model.fit(X_fit, y_fit)
        pred = model.predict(X_eval)
        print(f"{name:>6s}  | MAE={mean_absolute_error(y_eval, pred):,.{decimals}f} "
              f"| MSE={mean_squared_error(y_eval, pred):,.{decimals}f} | R2={r2_score(y_eval, pred):.4f}")


# ---------- A) EPY with weak size proxy (Surface_area only) ----------
NUM_COLS_A = [c for c in ["Estimated_tilt","Surface_area"] if c in train.columns]

# Drop rows missing any of these features
train_A = train.dropna(subset=CAT_COLS + NUM_COLS_A)
test_A  = test.dropna(subset=CAT_COLS + NUM_COLS_A)

models_A = _ab_models(NUM_COLS_A)

Xtr_A, ytr_A = train_A[CAT_COLS + NUM_COLS_A], train_A[TARGET_EPY].astype(float).values
Xte_A, yte_A = test_A[CAT_COLS + NUM_COLS_A], test_A[TARGET_EPY].astype(float).values

print("=== A) Target = EPY; Features = {type, tilt, Surface_area} ===")
_ab_fit_and_report(models_A, Xtr_A, ytr_A, Xte_A, yte_A, decimals=2)

# ---------- B) Normalized target: kWh_per_m2 ----------
if "Potential_installable_area" in train.columns:
    # Work on copies so the shared train/test frames never gain a target-derived column.
    train_B = train.dropna(subset=["Potential_installable_area"]).copy()
    test_B  = test.dropna(subset=["Potential_installable_area"]).copy()
    train_B["kWh_per_m2"] = train_B[TARGET_EPY] / (train_B["Potential_installable_area"] + eps)
    test_B["kWh_per_m2"]  = test_B[TARGET_EPY]  / (test_B["Potential_installable_area"]  + eps)

    NUM_COLS_B = [c for c in ["Estimated_tilt"] if c in train_B.columns]  # no size features
    train_B = train_B.dropna(subset=CAT_COLS + NUM_COLS_B + ["kWh_per_m2"])
    test_B  = test_B.dropna(subset=CAT_COLS + NUM_COLS_B + ["kWh_per_m2"])

    models_B = _ab_models(NUM_COLS_B)

    Xtr_B, ytr_B = train_B[CAT_COLS + NUM_COLS_B], train_B["kWh_per_m2"].astype(float).values
    Xte_B, yte_B = test_B[CAT_COLS + NUM_COLS_B], test_B["kWh_per_m2"].astype(float).values

    print("\n=== B) Target = kWh_per_m2; Features = {type, tilt} (no size) ===")
    _ab_fit_and_report(models_B, Xtr_B, ytr_B, Xte_B, yte_B, decimals=4)
else:
    print("\n[Skipped B] Potential_installable_area not available, so kWh_per_m2 cannot be computed.")


=== A) Target = EPY; Features = {type, tilt, Surface_area} ===
Linear  | MAE=5,621.47 | MSE=326,555,228.38 | R2=0.9371
 Ridge  | MAE=5,621.46 | MSE=326,555,261.75 | R2=0.9371
 Huber  | MAE=4,571.72 | MSE=412,432,113.71 | R2=0.9206

=== B) Target = kWh_per_m2; Features = {type, tilt} (no size) ===
Linear  | MAE=35.8043 | MSE=4,042.3284 | R2=0.0185
 Ridge  | MAE=35.8043 | MSE=4,042.3284 | R2=0.0185
 Huber  | MAE=34.6246 | MSE=4,176.0292 | R2=-0.0140


In [16]:
# === Stratified: kWh_per_m2 with City + non-linear tilt features (no weather API yet) ===
import pandas as pd, numpy as np
from pathlib import Path

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

TARGET_EPY = "Energy_potential_per_year"
AREA_COL = "Potential_installable_area"

# Added to the installable area so the per-area division never divides by zero.
# NOTE(review): zero-area rows with non-zero energy would become extreme
# targets; confirm whether any exist.
eps = 1e-9

CAT_COLS_CITY = ["City", "Assumed_building_type"]   # City allowed in stratified; NOT for LOCO
NUM_COLS_TILT = ["tilt","tilt2","tilt_sin","tilt_cos"]


def _with_yield_and_tilt(frame):
    """Return a new frame holding only the columns this experiment needs.

    The result contains the categorical columns, the normalised target
    ``kWh_per_m2`` and the tilt-derived features. ``frame`` itself is not
    modified, so the shared train/test frames never gain a target-derived
    numeric column that ``pick_features`` could later select as a feature.
    """
    tilt_deg = frame["Estimated_tilt"]
    tilt_rad = np.deg2rad(tilt_deg)
    return frame[CAT_COLS_CITY].assign(
        kWh_per_m2=frame[TARGET_EPY] / (frame[AREA_COL] + eps),
        tilt=tilt_deg,
        tilt2=tilt_deg ** 2,
        tilt_sin=np.sin(tilt_rad),
        tilt_cos=np.cos(tilt_rad),
    )


def _city_preprocessor():
    """Return a fresh, unfitted one-hot + scaling transformer (one per pipeline)."""
    return ColumnTransformer(
        transformers=[
            ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), CAT_COLS_CITY),
            ("num", StandardScaler(), NUM_COLS_TILT),
        ],
        remainder="drop",
    )


# Rows whose target or tilt features are missing cannot be fitted or scored;
# drop them here, as experiment B does, instead of failing inside fit().
train_city = _with_yield_and_tilt(train).dropna(subset=NUM_COLS_TILT + ["kWh_per_m2"])
test_city  = _with_yield_and_tilt(test).dropna(subset=NUM_COLS_TILT + ["kWh_per_m2"])

models_city = {
    "Linear": Pipeline([("pre", _city_preprocessor()), ("m", LinearRegression())]),
    "Ridge":  Pipeline([("pre", _city_preprocessor()), ("m", Ridge(alpha=1.0, random_state=42))]),
    "RF":     Pipeline([("pre", _city_preprocessor()), ("m", RandomForestRegressor(n_estimators=400, max_depth=None, n_jobs=-1, random_state=42))]),
}

Xtr_city, ytr_city = train_city[CAT_COLS_CITY + NUM_COLS_TILT], train_city["kWh_per_m2"].astype(float).values
Xte_city, yte_city = test_city[CAT_COLS_CITY + NUM_COLS_TILT], test_city["kWh_per_m2"].astype(float).values

print("=== Stratified: Target = kWh_per_m2; Features = City + type + tilt nonlinear ===")
# Distinct loop names keep the baseline's pipe / yhat from being overwritten.
for model_name, city_model in models_city.items():
    city_model.fit(Xtr_city, ytr_city)
    pred_city = city_model.predict(Xte_city)
    print(f"{model_name:>6s} | MAE={mean_absolute_error(yte_city,pred_city):,.4f} "
          f"| MSE={mean_squared_error(yte_city,pred_city):,.4f} | R2={r2_score(yte_city,pred_city):.4f}")


=== Stratified: Target = kWh_per_m2; Features = City + type + tilt nonlinear ===
Linear | MAE=28.0455 | MSE=3,163.3046 | R2=0.2319
 Ridge | MAE=27.8699 | MSE=3,172.9300 | R2=0.2296
    RF | MAE=23.1751 | MSE=2,854.0478 | R2=0.3070
