In [24]:
import pandas as pd

df = pd.read_csv("data/surabaya-house-prices-cleaned.csv")

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   price            1668 non-null   float64
 1   num_bedroom      1668 non-null   int64  
 2   num_bathroom     1668 non-null   int64  
 3   garage_capacity  1668 non-null   int64  
 4   land_area        1668 non-null   int64  
 5   building_area    1668 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 78.3 KB


In [26]:
from sklearn.preprocessing import StandardScaler

X = df.drop("price", axis=1)
y = df["price"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [27]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=0
)

In [28]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
)


def evaluation_metric(y_test, y_pred, no_print=False):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    if no_print:
        return [f"{mae:.5f}", f"{mse:.5f}", f"{rmse:.5f}", f"{r2:.5f}"]

    print(f"MAE : {mae}")
    print(f"MSE : {mse}")
    print(f"RMSE : {rmse}")
    print(f"R-squared : {r2}")

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
evaluation_metric(y_test, y_pred)

MAE : 1743063887.502786
MSE : 1.622347483492939e+19
RMSE : 4027837488.64442
R-squared : 0.8199081977066628


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

parameters = {
    "max_depth": [3, 5, 7, 10],
    "n_estimators": [100, 200, 300, 400, 500],
    "max_features": [10, 20, 30, 40],
    "min_samples_leaf": [1, 2, 4],
}

# Grid Search
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_grid=parameters,
    cv=5,
    scoring="r2",
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best RMSE:", grid_search.best_score_)

Fitting 5 folds for each of 240 candidates, totalling 1200 fits
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=100; total time=   0.2s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=100; total time=   0.2s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=100; total time=   0.2s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=100; total time=   0.2s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=100; total time=   0.2s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=200; total time=   0.4s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=200; total time=   0.4s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=200; total time=   0.5s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=200; total time=   0.5s
[CV] END max_depth=3, max_features=10, min_samples_leaf=1, n_estimators=

  _data = np.array(data, dtype=dtype, copy=copy,


Best Parameters: {'max_depth': 10, 'max_features': 10, 'min_samples_leaf': 1, 'n_estimators': 300}
Best RMSE: 0.621960589579488
