In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
from joblib import dump
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# ---- configuration ----
# Directory that holds the stratified train/test split CSVs.
# The default is the original author-local location; set the SOLAR_SPLITS_DIR
# environment variable to run the notebook against a different checkout
# without editing this cell. An empty value falls back to the default.
SPLITS_DIR = Path(
    os.environ.get("SOLAR_SPLITS_DIR")
    or "C:/Users/User/Desktop/ML/Project/solar-potential-analysis-github-setup/splits/combined_stratified_city_type"
)
TRAIN_CSV = SPLITS_DIR / "train_combined.csv"
TEST_CSV  = SPLITS_DIR / "test_combined.csv"

# Column used as the regression target.
TARGET = "Energy_potential_per_year"

# Output directory for saved artifacts (relative to the working directory);
# created up front so later writes cannot fail on a missing folder.
OUT_DIR = Path("outputs_lr_stratified")
OUT_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# ---- load ----
# Read the train split first, then the test split; rows without a target
# value are discarded and each result is an independent copy.
train, test = (
    pd.read_csv(csv_path).dropna(subset=[TARGET]).copy()
    for csv_path in (TRAIN_CSV, TEST_CSV)
)

In [4]:
# OG features: numeric training columns, excluding the target and any
# identifier-like column named "id"/"idx" (compared case-insensitively).
num_cols_tr = train.select_dtypes(include=[np.number]).columns.to_list()
features = [
    col
    for col in num_cols_tr
    if not (col == TARGET or col.lower() in {"id", "idx"})
]

In [5]:
# Keep only rows in which every selected feature has a value
# (a row is removed as soon as any one of the feature columns is missing).
train = train.dropna(axis="index", how="any", subset=features)
test = test.dropna(axis="index", how="any", subset=features)

In [6]:
# Report how many features were detected, followed by one bullet per feature.
print(
    f"[Stratified] Detected {len(features)} features:"
    + "\n - "
    + "\n - ".join(features)
)

[Stratified] Detected 5 features:
 - Surface_area
 - Potential_installable_area
 - Peak_installable_capacity
 - Assumed_building_type
 - Estimated_tilt


In [7]:
# Training arrays: feature matrix and float-typed target vector.
X_tr = train[features].to_numpy()
y_tr = train[TARGET].astype(float).to_numpy()

# Test arrays, built the same way from the held-out split.
X_te = test[features].to_numpy()
y_te = test[TARGET].astype(float).to_numpy()

In [8]:
# Two-step estimator: standardize the inputs, then fit ordinary least squares.
# The step names ("scaler", "lr") are looked up later via named_steps.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("lr", LinearRegression()),
    ]
)

In [9]:
# Fit on the training arrays, then predict the held-out test rows.
# Pipeline.fit returns the fitted pipeline itself, so the calls can be chained.
y_hat = pipe.fit(X_tr, y_tr).predict(X_te)

In [11]:
# Held-out error metrics for the fitted pipeline.
mae = mean_absolute_error(y_te, y_hat)
mse = mean_squared_error(y_te, y_hat)
r2 = r2_score(y_te, y_hat)

# One print call, one line per argument; output matches four separate prints.
print(
    "\n=== Stratified fixed split (LR on OG features) ===",
    f"MAE = {mae:.6f}",
    f"MSE = {mse:.6f}",
    f"R2  = {r2:.6f}",
    sep="\n",
)


=== Stratified fixed split (LR on OG features) ===
MAE = 3046.449561
MSE = 110259300.978159
R2  = 0.978768


In [None]:
# Coefficients of the linear step (fitted on scaler output, i.e. scaled space),
# ordered by absolute magnitude, largest first.
coef_df = (
    pd.DataFrame({"feature": features, "coef": pipe.named_steps["lr"].coef_})
    .sort_values(by="coef", key=np.abs, ascending=False)
)

# Persist artifacts: coefficient table, fitted pipeline, one-row metrics table.
coef_df.to_csv(OUT_DIR / "coefficients.csv", index=False)
dump(pipe, OUT_DIR / "lr_stratified.joblib")
pd.DataFrame([{"MAE": mae, "MSE": mse, "R2": r2}]).to_csv(
    OUT_DIR / "metrics.csv", index=False
)

display(coef_df.head(20))
print(f"\nArtifacts saved to: {OUT_DIR.resolve()}")