In [26]:
import numpy as np, pandas as pd, warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Modeller
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [41]:
try:
    from xgboost import XGBRegressor
    HAS_XGB = True
except Exception:
    HAS_XGB = False

# Veriyi okuma, yapısını inceleme , değerleri düzenleme

In [27]:
df = pd.read_csv("test_energy_data.csv")
df.head()

Unnamed: 0,Building Type,Square Footage,Number of Occupants,Appliances Used,Average Temperature,Day of Week,Energy Consumption
0,Residential,24563,15,4,28.52,Weekday,2865.57
1,Commercial,27583,56,23,23.07,Weekend,4283.8
2,Commercial,45313,4,44,33.56,Weekday,5067.83
3,Residential,41625,84,17,27.39,Weekend,4624.3
4,Residential,36720,58,47,17.08,Weekday,4820.59


In [None]:
df.info()

In [29]:
df.isnull().sum()

Building Type          0
Square Footage         0
Number of Occupants    0
Appliances Used        0
Average Temperature    0
Day of Week            0
Energy Consumption     0
dtype: int64

In [None]:
df.describe().T

# Encoding işlemi

In [52]:
target_col = "Energy Consumption" 
cat_cols   = ["Building Type", "Day of Week"]
num_cols = ["Square Footage", "Number of Occupants", "Appliances Used", "Average Temperature"]

In [53]:
X = df[num_cols].copy()
y = df[target_col].astype(float).copy()

In [54]:
# Not: CAT_COLS içindekiler OneHot; NUM_COLS Z-score; geri kalanlar (varsa) drop edilir
preprocess = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), [c for c in num_cols if c in X.columns]),
        ("cat", OneHotEncoder(drop="first", handle_unknown="ignore"), [c for c in cat_cols if c in X.columns]),
    ],
    remainder="drop"
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, shuffle=True
)

# Model eğitimi için hazırlık ve model eğitimleri

In [55]:
cv = KFold(n_splits=5, shuffle=True, random_state=42)

In [56]:
model_spaces = {
    # Doğrusal aile
    "LinearRegression": (
        Pipeline([("prep", preprocess), ("model", LinearRegression())]),
        {}
    ),
    "Ridge": (
        Pipeline([("prep", preprocess), ("model", Ridge(random_state=42))]),
        {"model__alpha": np.logspace(-3, 3, 13)}
    ),
    "Lasso": (
        Pipeline([("prep", preprocess), ("model", Lasso(random_state=42, max_iter=20000))]),
        {"model__alpha": np.logspace(-3, 1, 9)}
    ),
    "ElasticNet": (
        Pipeline([("prep", preprocess), ("model", ElasticNet(random_state=42, max_iter=20000))]),
        {"model__alpha": np.logspace(-3, 1, 9),
         "model__l1_ratio": [0.1, 0.3, 0.5, 0.7, 0.9]}
    ),
    "Huber": (
        Pipeline([("prep", preprocess), ("model", HuberRegressor())]),
        {"model__epsilon": [1.1, 1.35, 1.5]}
    ),

    # Kernel tabanlı
    "SVR": (
        Pipeline([("prep", preprocess), ("model", SVR())]),
        {"model__kernel": ["rbf", "linear"],
         "model__C": [0.1, 1, 10, 100],
         "model__gamma": ["scale", 0.1, 0.01]}
    ),

    # Ağaç/ensemble
    "DecisionTree": (
        Pipeline([("prep", preprocess), ("model", DecisionTreeRegressor(random_state=42))]),
        {"model__max_depth": [None, 3, 5, 7, 10],
         "model__min_samples_split": [2, 5, 10],
         "model__min_samples_leaf": [1, 2, 5]}
    ),
    "RandomForest": (
        Pipeline([("prep", preprocess), ("model", RandomForestRegressor(random_state=42, n_jobs=-1))]),
        {"model__n_estimators": [200, 400, 800],
         "model__max_depth": [None, 5, 10, 15],
         "model__min_samples_leaf": [1, 2, 5],
         "model__max_features": ["sqrt", 0.7, None]}
    ),
    "GradientBoosting": (
        Pipeline([("prep", preprocess), ("model", GradientBoostingRegressor(random_state=42))]),
        {"model__n_estimators": [200, 400],
         "model__learning_rate": [0.05, 0.1, 0.2],
         "model__max_depth": [2, 3, 4],
         "model__subsample": [1.0, 0.8]}
    ),
}

if HAS_XGB:
    model_spaces["XGBoost"] = (
        Pipeline([("prep", preprocess), ("model", XGBRegressor(
            random_state=42, objective="reg:squarederror", n_estimators=400, n_jobs=-1))]),
        {"model__max_depth": [3, 5, 7],
         "model__learning_rate": [0.05, 0.1, 0.2],
         "model__subsample": [0.8, 1.0],
         "model__colsample_bytree": [0.8, 1.0]}
    )

In [57]:
def RMSE(y_true, y_pred):
    try:
        return mean_squared_error(y_true, y_pred, squared=False)
    except TypeError:
        return np.sqrt(mean_squared_error(y_true, y_pred))

In [58]:
rows = []
for name, (pipe, grid) in model_spaces.items():
    gs = GridSearchCV(pipe, grid, scoring="r2", cv=cv, n_jobs=-1)
    gs.fit(X_train, y_train)
    yhat = gs.best_estimator_.predict(X_test)
    rows.append({
        "model": name,
        "cv_best_r2": gs.best_score_,
        "test_r2": r2_score(y_test, yhat),
        "test_mae": mean_absolute_error(y_test, yhat),
        "test_rmse": RMSE(y_test, yhat),
        "best_params": gs.best_params_
    })

In [59]:
results = pd.DataFrame(rows).sort_values("test_r2", ascending=False).reset_index(drop=True)
results

Unnamed: 0,model,cv_best_r2,test_r2,test_mae,test_rmse,best_params
0,Huber,0.590733,0.695126,415.05406,446.103245,{'model__epsilon': 1.5}
1,Lasso,0.598798,0.695122,416.66838,446.106412,{'model__alpha': 10.0}
2,LinearRegression,0.591647,0.695109,415.057026,446.115656,{}
3,SVR,0.538571,0.688552,418.623915,450.887414,"{'model__C': 100, 'model__gamma': 'scale', 'mo..."
4,Ridge,0.614775,0.680406,423.556354,456.745863,{'model__alpha': 10.0}
5,ElasticNet,0.61554,0.677212,424.417105,459.022373,"{'model__alpha': 0.31622776601683794, 'model__..."
6,XGBoost,0.485706,0.605042,449.146358,507.751238,"{'model__colsample_bytree': 0.8, 'model__learn..."
7,GradientBoosting,0.471567,0.60387,455.180212,508.504057,"{'model__learning_rate': 0.1, 'model__max_dept..."
8,RandomForest,0.53781,0.600484,444.661858,510.672136,"{'model__max_depth': 5, 'model__max_features':..."
9,DecisionTree,0.343412,0.161377,600.863071,739.875428,"{'model__max_depth': 5, 'model__min_samples_le..."


# hiperparametre optimizasyonu

In [70]:
final_models = {
    "Huber": HuberRegressor(epsilon=1.5),
    "Lasso": Lasso(alpha=10.0, random_state=42),
    "LinearRegression": LinearRegression(),
    "SVR": SVR(C=100, gamma='scale', kernel='rbf'),
    "Ridge": Ridge(alpha=10.0, random_state=42)
}

In [71]:
final_results = []

In [72]:
for name, model in final_models.items():
    pipe = Pipeline([
        ("scaler", StandardScaler()),
        ("model", model)
    ])
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    final_results.append({
        "model": name,
        "test_r2": r2_score(y_test, y_pred),
        "test_mae": mean_absolute_error(y_test, y_pred),
        "test_rmse": np.sqrt(mean_squared_error(y_test, y_pred))
    })

In [73]:
pd.DataFrame(final_results)

Unnamed: 0,model,test_r2,test_mae,test_rmse
0,Huber,0.695126,415.05406,446.103245
1,Lasso,0.695122,416.66838,446.106412
2,LinearRegression,0.695109,415.057026,446.115656
3,SVR,0.33425,556.728032,659.220744
4,Ridge,0.680406,423.556354,456.745863
