In [36]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV

In [21]:
# Fetch the California housing dataset and combine its feature matrix and
# target values into a single DataFrame; the target is stored in the
# 'MedHouseVal' column, placed after the feature columns.
dataset = sklearn.datasets.fetch_california_housing()
housing_df_target = pd.DataFrame(dataset.target, columns=['MedHouseVal'])
housing_df = pd.concat(
    [pd.DataFrame(dataset.data, columns=dataset.feature_names), housing_df_target],
    axis=1,
)
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [9]:
# Count the missing values in each column (isnull is pandas' alias of isna).
housing_df.isnull().sum()

Unnamed: 0,0
MedInc,0
HouseAge,0
AveRooms,0
AveBedrms,0
Population,0
AveOccup,0
Latitude,0
Longitude,0


In [10]:
# Summary statistics per numeric column; the quartiles listed here are the
# ones describe() reports by default, written out explicitly.
housing_df.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31


In [12]:
# Report the frame's dimensions as a (rows, columns) tuple.
# NOTE(review): the saved output (20640, 8) appears to predate the
# 'MedHouseVal' column being appended in the loading cell (whose output shows
# 9 columns) -- re-run from a fresh kernel to refresh it.
housing_df.shape

(20640, 8)

In [33]:
# Preview the first five rows of the combined feature + target frame.
housing_df.head(n=5)

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [22]:
# Separate the regression target from the predictors: y is the 'MedHouseVal'
# column, x is every other column of the frame.
y = housing_df['MedHouseVal']
x = housing_df.drop(columns=['MedHouseVal'])

In [23]:
# Two-stage split with fixed seeds: first hold out 20% of the rows as the
# test set, then take 25% of the remaining 80% as the validation set,
# which yields roughly 60% train / 20% validation / 20% test.
x_valtrain, x_test, y_valtrain, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=2,
)
x_train, x_val, y_train, y_val = train_test_split(
    x_valtrain,
    y_valtrain,
    test_size=0.25,
    random_state=1,
)


Training Model dengan XGBRegressor yang telah di-tuning

In [37]:
# Tune an XGBRegressor with an exhaustive grid search over
# 3 x 3 x 3 = 27 hyperparameter combinations, scored by negated MAE.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
)
model = XGBRegressor(random_state=42)
grid_search = GridSearchCV(
    model,
    param_grid,
    scoring='neg_mean_absolute_error',
    cv=5,  # example: 5-fold cross-validation on x_train
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(x_train, y_train)

# The scorer is the negated MAE, so flip the sign of best_score_ to report
# a positive error value.
print(f"Hiperparameter Terbaik (dari validasi silang pada data training): {grid_search.best_params_}")
print(f"MAE Terbaik dari CV: {-grid_search.best_score_:.4f}")

# Keep the estimator selected by the search for the prediction cells below.
best_model = grid_search.best_estimator_

Fitting 5 folds for each of 27 candidates, totalling 135 fits
Hiperparameter Terbaik (dari validasi silang pada data training): {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 300}
MAE Terbaik dari CV: 0.3081


Membuat Prediksi dan Error dari Model Tuning

In [43]:
# Predict with the tuned model on the training, validation and test features
# (in that order).
trainTuningPredict, valTuningPredict, testTuningPredict = (
    best_model.predict(features) for features in (x_train, x_val, x_test)
)

In [48]:
# Mean absolute error of the tuned model on each split, pairing the true
# targets with the matching predictions.
tuningErrorTrain, tuningErrorVal, tuningErrorTest = (
    mean_absolute_error(actual, predicted)
    for actual, predicted in (
        (y_train, trainTuningPredict),
        (y_val, valTuningPredict),
        (y_test, testTuningPredict),
    )
)

print('Error data training: ', tuningErrorTrain)
print('Error data validasi: ', tuningErrorVal)
print('Error data testing: ', tuningErrorTest)

Error data training:  0.12031881491661263
Error data validasi:  0.30359563457538685
Error data testing:  0.30216011359650957


Training Model dengan XGBRegressor Default

In [40]:
# Baseline for comparison: an XGBRegressor built with no arguments, trained on
# the same training split as the tuned model.
# NOTE(review): unlike the tuned model, no random_state is passed here.
model2 = XGBRegressor().fit(x_train, y_train)
model2

Membuat Prediksi dan Error dari Model Default

In [41]:
# Predict with the default (untuned) model on the training, validation and
# test features (in that order).
trainPredict, valPredict, testPredict = (
    model2.predict(features) for features in (x_train, x_val, x_test)
)

In [46]:
# Mean absolute error of the default (untuned) model on each split.
errorTrain = mean_absolute_error(y_train, trainPredict)
errorVal = mean_absolute_error(y_val, valPredict)
errorTest = mean_absolute_error(y_test, testPredict)

# Report the training error from `errorTrain`, the value computed above.
# The previous code printed `error`, a name that is never defined in this
# notebook, so it raised NameError on a fresh kernel (or echoed a stale
# leftover variable).
print('Error data training: ', errorTrain)
print('Error data validasi: ', errorVal)
print('Error data testing: ', errorTest)


Error data training:  0.17376493763374573
Error data validasi:  0.3174580870890037
Error data testing:  0.32022186605934777


In [49]:
# Absolute gap between the test-set MAE of the default model and that of the
# tuned model (the sign, i.e. which model is better, is discarded by abs).
Perbedaan_peforma = abs(tuningErrorTest - errorTest)
print(f"Perbedaan peforma model deafult dan tuning : {Perbedaan_peforma}")

Perbedaan peforma model deafult dan tuning : 0.0180617524628382
