# Load the Data

In [1]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the California housing dataset as a (features, target) pair
X, y = fetch_california_housing(return_X_y=True)

# Hold out 30% of the samples for testing; the fixed seed keeps the split repeatable
split = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split



## Standardization with StandardScaler

In [3]:
# Learn the scaling statistics from the training split only, so that nothing
# about the test split leaks into preprocessing.
scaler = StandardScaler().fit(X_train)

# Apply the same fitted transform to both splits.
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [4]:
# Sanity check: report (n_samples, n_features) for each split
for label, split_data in (
    ("Shape of training data:", X_train),
    ("Shape of testing data:", X_test),
):
    print(label, split_data.shape)

Shape of training data: (14448, 8)
Shape of testing data: (6192, 8)


## Model Development with Random Forest Regressor

In [5]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error



In [6]:
# Baseline: a random forest with default hyperparameters (seeded for repeatability).
# `fit` returns the estimator itself, so construction and training can be chained.
rf = RandomForestRegressor(random_state=42).fit(X_train, y_train)

# Score the untuned model on the held-out test split; this is the reference
# value the tuned models below are compared against.
y_pred = rf.predict(X_test)
initial_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

print(f"Initial MSE on test set (without tuning): {initial_mse:.2f}")


Initial MSE on test set (without tuning): 0.26


# Hyperparameter Tuning

## 1. Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
import time


In [20]:
# Time the whole cell (search, refit and test-set evaluation).
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Hyperparameter grid for the exhaustive search:
# 3 * 3 * 3 * 3 * 2 = 162 candidate combinations.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Rank candidates by cross-validated (negated) MSE so that model selection uses
# the same metric that is reported below and that the Optuna objective optimizes.
# Without an explicit `scoring`, GridSearchCV falls back to the estimator's
# default scorer, which is R^2 for regressors.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train, y_train)

print(f"Best parameters (Grid Search): {grid_search.best_params_}")
# best_estimator_ is the winning configuration refit on the full training split
best_rf_grid = grid_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred_grid = best_rf_grid.predict(X_test)
grid_search_mse = mean_squared_error(y_test, y_pred_grid)
print(f"MSE after Grid Search: {grid_search_mse:.2f}")

end_time = time.perf_counter()
execution_time = end_time - start_time

print(f"Execution time: {execution_time:.4f} seconds")

Fitting 3 folds for each of 162 candidates, totalling 486 fits
Best parameters (Grid Search): {'bootstrap': True, 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
MSE after Grid Search: 0.25
Execution time: 860.5205 seconds


## 2. Random Search

In [22]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

In [24]:
# Time the whole cell (search, refit and test-set evaluation).
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Candidate values to sample from; only `n_iter` random combinations are tried.
param_dist = {
    'n_estimators': np.arange(100, 500, 100),
    'max_depth': [None] + list(np.arange(10, 50, 10)),
    'min_samples_split': np.arange(2, 11, 2),
    'min_samples_leaf': np.arange(1, 5),
    'bootstrap': [True, False]
}

# Rank candidates by cross-validated (negated) MSE so that model selection uses
# the same metric that is reported below and that the Optuna objective optimizes.
# Without an explicit `scoring`, RandomizedSearchCV falls back to the estimator's
# default scorer, which is R^2 for regressors.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=5,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=42,  # fixes which combinations get sampled
)
random_search.fit(X_train, y_train)

print(f"Best parameters (Random Search): {random_search.best_params_}")
# best_estimator_ is the winning configuration refit on the full training split
best_rf_random = random_search.best_estimator_

# Evaluate the tuned model on the held-out test split
y_pred_random = best_rf_random.predict(X_test)
random_search_mse = mean_squared_error(y_test, y_pred_random)
print(f"MSE after Random Search: {random_search_mse:.2f}")

end_time = time.perf_counter()
execution_time = end_time - start_time

print(f"Execution time: {execution_time:.4f} seconds")

Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best parameters (Random Search): {'n_estimators': np.int64(300), 'min_samples_split': np.int64(6), 'min_samples_leaf': np.int64(2), 'max_depth': np.int64(30), 'bootstrap': True}
MSE after Random Search: 0.25
Execution time: 74.9553 seconds


## 3. Optuna Search

In [33]:
import optuna
from optuna.integration import OptunaSearchCV
from sklearn.model_selection import cross_val_score


In [34]:
def objective(trial):
    """Optuna objective: mean 3-fold cross-validated negative MSE of a random forest.

    Parameters
    ----------
    trial : optuna trial object
        Used to sample one value for each tuned hyperparameter.

    Returns
    -------
    float
        Mean of the per-fold ``neg_mean_squared_error`` scores on the training
        split. Higher is better, because the score is the negated MSE.
    """
    # Sample one candidate configuration (same ranges and order for every trial)
    sampled = dict(
        n_estimators=trial.suggest_int('n_estimators', 100, 500),
        max_depth=trial.suggest_int('max_depth', 10, 50),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        min_samples_leaf=trial.suggest_int('min_samples_leaf', 1, 4),
        bootstrap=trial.suggest_categorical('bootstrap', [True, False]),
    )

    candidate = RandomForestRegressor(random_state=42, n_jobs=-1, **sampled)
    fold_scores = cross_val_score(
        candidate,
        X_train,
        y_train,
        cv=3,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    return fold_scores.mean()

# Time the whole cell (optimization, retraining and test-set evaluation).
# perf_counter() is a monotonic clock, so the measured duration cannot be
# distorted by system clock adjustments the way time.time() can.
start_time = time.perf_counter()

# Seed the sampler so the sequence of suggested hyperparameters, and therefore
# the best trial, is repeatable across runs. TPESampler is Optuna's default
# sampler, so only the seeding changes, not the search algorithm.
# The objective returns negative MSE, hence direction='maximize'.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=30)

print("Best trial:")
print(study.best_trial)

# Retrain on the full training split with the best hyperparameters found
best_params = study.best_trial.params
best_rf = RandomForestRegressor(**best_params, random_state=42, n_jobs=-1)
best_rf.fit(X_train, y_train)

# Evaluate the tuned model on the held-out test split
y_pred = best_rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"MSE on test set: {mse:.2f}")

end_time = time.perf_counter()
print(f"Execution time: {end_time - start_time:.2f} seconds")

[I 2025-07-24 15:26:55,681] A new study created in memory with name: no-name-c18a9b03-efa8-4bfe-bd3e-180e2e78ba7a
[I 2025-07-24 15:27:07,093] Trial 0 finished with value: -0.5040954662553717 and parameters: {'n_estimators': 312, 'max_depth': 34, 'min_samples_split': 6, 'min_samples_leaf': 2, 'bootstrap': False}. Best is trial 0 with value: -0.5040954662553717.
[I 2025-07-24 15:27:10,939] Trial 1 finished with value: -0.28140935176481807 and parameters: {'n_estimators': 132, 'max_depth': 20, 'min_samples_split': 10, 'min_samples_leaf': 2, 'bootstrap': True}. Best is trial 1 with value: -0.28140935176481807.
[I 2025-07-24 15:27:14,701] Trial 2 finished with value: -0.28176098500979885 and parameters: {'n_estimators': 150, 'max_depth': 15, 'min_samples_split': 3, 'min_samples_leaf': 3, 'bootstrap': True}. Best is trial 1 with value: -0.28140935176481807.
[I 2025-07-24 15:27:20,777] Trial 3 finished with value: -0.471311648635568 and parameters: {'n_estimators': 181, 'max_depth': 22, 'min_

Best trial:
FrozenTrial(number=5, state=TrialState.COMPLETE, values=[-0.27691679891353266], datetime_start=datetime.datetime(2025, 7, 24, 15, 27, 25, 352804), datetime_complete=datetime.datetime(2025, 7, 24, 15, 27, 38, 784801), params={'n_estimators': 453, 'max_depth': 33, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': True}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'n_estimators': IntDistribution(high=500, log=False, low=100, step=1), 'max_depth': IntDistribution(high=50, log=False, low=10, step=1), 'min_samples_split': IntDistribution(high=10, log=False, low=2, step=1), 'min_samples_leaf': IntDistribution(high=4, log=False, low=1, step=1), 'bootstrap': CategoricalDistribution(choices=(True, False))}, trial_id=5, value=None)
MSE on test set: 0.25
Execution time: 261.43 seconds
