# Modeling

Baca data yang sudah dibersihkan

In [29]:
import pandas as pd

df = pd.read_csv("data/surabaya-house-prices-cleaned.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1668 entries, 0 to 1667
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   price            1668 non-null   float64
 1   num_bedroom      1668 non-null   int64  
 2   num_bathroom     1668 non-null   int64  
 3   garage_capacity  1668 non-null   int64  
 4   land_area        1668 non-null   int64  
 5   building_area    1668 non-null   int64  
dtypes: float64(1), int64(5)
memory usage: 78.3 KB


Scaling datasetnya

In [17]:
from sklearn.preprocessing import StandardScaler

X = df.drop("price", axis=1)
y = df["price"]

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

Bagi menjadi data train dan test

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=0
)

Buat fungsi untuk melakukan matriks evaluasi

In [19]:
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    root_mean_squared_error,
    r2_score,
)


def evaluation_metric(y_test, y_pred, no_print=False):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = root_mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    if no_print:
        return [f"{mae:.5f}", f"{mse:.5f}", f"{rmse:.5f}", f"{r2:.5f}"]

    print(f"MAE : {mae}")
    print(f"MSE : {mse}")
    print(f"RMSE : {rmse}")
    print(f"R-squared : {r2}")

Coba bandingkan model random forest dengan model gradient boosting

In [20]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

models = {
    "rf": RandomForestRegressor(random_state=0),
    "gb": GradientBoostingRegressor(random_state=0),
}
results = []

for model in models.values():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    results.append(evaluation_metric(y_test, y_pred, no_print=True))


evaluation_df = pd.DataFrame(
    data=results,
    columns=["MAE", "MSE", "RMSE", "R-squared"],
)

evaluation_df["Method"] = [name for name in models.keys()]
evaluation_df.set_index("Method", inplace=True)
evaluation_df.sort_values("R-squared", ascending=False)

Unnamed: 0_level_0,MAE,MSE,RMSE,R-squared
Method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
rf,1743063887.50279,1.6223474834929388e+19,4027837488.64442,0.81991
gb,1829571564.88534,1.7980304145322715e+19,4240318873.07107,0.80041


Buat model random forest dengan default value

In [21]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=0)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
evaluation_metric(y_test, y_pred)

MAE : 1743063887.502786
MSE : 1.622347483492939e+19
RMSE : 4027837488.64442
R-squared : 0.8199081977066628


Lakukan tuning parameter menggunakan grid search

In [None]:
from sklearn.model_selection import GridSearchCV

parameters = {
    "n_estimators": [50, 100, 200, 500],
    "max_features": [1.0, "sqrt", "log2", None],
    "max_depth": [10, 20, 50, None],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
    "bootstrap": [True, False],
}

grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=0),
    param_grid=parameters,
    cv=5,
    scoring="r2",
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

y_pred = best_rf.predict(X_test)
evaluation_metric(y_test, y_pred)

Fitting 5 folds for each of 1152 candidates, totalling 5760 fits
Best Parameters: {'bootstrap': True, 'max_depth': 50, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
MAE : 1782172306.9981408
MSE : 1.5772145597849993e+19
RMSE : 3971416069.5965858
R-squared : 0.8249182646966461
