In [1]:
import pandas as pd

df = pd.read_csv("data/surabaya-house-prices-cleaned.csv")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5397 entries, 0 to 5396
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   price            5397 non-null   float64
 1   num_bedroom      5397 non-null   int64  
 2   num_bathroom     5397 non-null   int64  
 3   garage_capacity  5397 non-null   int64  
 4   land_area        5397 non-null   int64  
 5   building_area    5397 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 253.1 KB


In [4]:
from sklearn.preprocessing import StandardScaler

X = df.drop("price", axis=1)
y = df["price"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=0
)

In [6]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
)


def evaluation_metric(y_test, y_pred, no_print=False):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    if no_print:
        return [f"{mae:.5f}", f"{mse:.5f}", f"{rmse:.5f}", f"{r2:.5f}"]

    print(f"MAE : {mae}")
    print(f"MSE : {mse}")
    print(f"RMSE : {rmse}")
    print(f"R-squared : {r2}")

In [12]:
from sklearn.ensemble import RandomForestRegressor

# Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
# Best RMSE: 4872031149.238477
rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
evaluation_metric(y_test, y_pred)

MAE : 1625534713.5223165
MSE : 1.6339932098596667e+19
RMSE : 4042268187.3666754
R-squared : 0.7325593881694958


In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

parameters = {
    "n_estimators": [100, 150, 200, 250, 300],
    "max_depth": [1, 2, 3, 4],
}

# Grid Search
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_grid=parameters,
    cv=5,
    scoring=(
        "neg_mean_absolute_error",
        "neg_root_mean_squared_error",
        "r2",
    ),
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", grid_search.best_score_)

ValueError: For multi-metric scoring, the parameter refit must be set to a scorer key or a callable to refit an estimator with the best parameter setting on the whole data and make the best_* attributes available for that metric. If this is not needed, refit should be set to False explicitly. True was passed.