In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

In [28]:
# Load the dataset. Replace the path below with the location of your own copy of the file.
csv_path = "2021socio_economic_indonesia.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows.
df.head(n=5)

Unnamed: 0,province,cities_reg,poorpeople_percentage,reg_gdp,life_exp,avg_schooltime,exp_percap
0,Aceh,Simeulue,18.98,2.275,65.24,9.48,7148
1,Aceh,Aceh Singkil,20.36,2.425,67.355,8.68,8776
2,Aceh,Aceh Selatan,13.18,5.531,64.36,8.88,8180
3,Aceh,Aceh Tenggara,13.41,5.063,68.155,9.67,8030
4,Aceh,Aceh Timur,14.45,10.616,68.705,8.21,8577


In [29]:
# Count missing values per column.
print(df.isnull().sum())

# Drop 'province' (not needed for the numeric model).
# errors='ignore' keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df = df.drop(columns=['province'], errors='ignore')

# Report the number of fully duplicated rows, then remove them.
print(f"Jumlah duplikat: {df.duplicated().sum()}")
df = df.drop_duplicates()

# Check whether any row consists entirely of 0 or NaN values
# ('province' has already been dropped at this point).
def is_all_zero_or_nan(row):
    """Return True when every value in `row` is either equal to 0 or missing.

    Parameters
    ----------
    row : pandas.Series
        One row of the DataFrame.

    Returns
    -------
    bool-like
        True only if each element is 0 or NaN. Values that are neither
        (including non-numeric values such as strings) make the result False.
    """
    zero_mask = row.eq(0)
    missing_mask = row.isna()
    return (zero_mask | missing_mask).all()

# Flag the rows in which every value is either 0 or NaN.
# Built with vectorized DataFrame operations instead of a row-wise
# df.apply(is_all_zero_or_nan, axis=1): same per-row test, but no Python-level
# call per row, and the result is always a boolean mask.
invalid_rows = (df.eq(0) | df.isna()).all(axis=1)

print(f"Jumlah baris yang seluruh nilainya kosong (0 atau NaN): {invalid_rows.sum()}")

# Keep only the rows that are not flagged and renumber the index from 0.
df = df[~invalid_rows].reset_index(drop=True)

# Summary statistics of the cleaned data.
df.describe()

province                 0
cities_reg               0
poorpeople_percentage    0
reg_gdp                  0
life_exp                 0
avg_schooltime           0
exp_percap               0
dtype: int64
Jumlah duplikat: 0
Jumlah baris yang seluruh nilainya kosong (0 atau NaN): 0


Unnamed: 0,poorpeople_percentage,reg_gdp,life_exp,avg_schooltime,exp_percap
count,514.0,514.0,514.0,514.0,514.0
mean,12.273152,34.798333,69.619076,8.436615,10324.787938
std,7.458703,84.155498,3.455911,1.630842,2717.144186
min,2.38,1.042,55.37,1.42,3976.0
25%,7.15,5.5875,67.33625,7.51,8574.0
50%,10.455,13.0685,69.9225,8.305,10196.5
75%,14.8875,28.8495,72.01875,9.3375,11719.0
max,41.66,819.0,77.855,12.83,23888.0


In [30]:
# Separate the target ('poorpeople_percentage') from the features (all other columns).
y = df['poorpeople_percentage']
X = df.drop(columns='poorpeople_percentage')

# Hold out 20% of the rows as the test set; the seed is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# 'cities_reg' is removed from both splits only after splitting,
# so X itself still contains that column.
X_train, X_test = (split.drop(columns=['cities_reg']) for split in (X_train, X_test))


In [31]:
# Baseline model: XGBoost regressor with default hyperparameters (no tuning) and a fixed seed.
# fit() returns the fitted estimator, so construction and training are chained.
xgb_model = XGBRegressor(random_state=42).fit(X_train, y_train)

# Predict on the held-out test features.
y_pred = xgb_model.predict(X_test)


In [32]:
# Evaluate the baseline predictions against the held-out targets.
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = square root of the MSE
mae = mean_absolute_error(y_test, y_pred)

# One metric per line, in the order MAE, RMSE, R².
print(f"MAE: {mae}", f"RMSE: {rmse}", f"R² Score: {r2}", sep="\n")


MAE: 3.955571806824323
RMSE: 5.283116259946732
R² Score: 0.6032307027386334


In [None]:
# Hyperparameter grid: 3 x 3 x 3 x 2 = 54 candidate combinations.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [3, 4, 5],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
}

# Exhaustive search over the grid with 3-fold cross-validation, scored by
# negative mean squared error; n_jobs=-1 requests all available CPU cores.
grid_search = GridSearchCV(
    XGBRegressor(random_state=42),
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

# Best estimator found by the search.
best_model = grid_search.best_estimator_

# Predict again on the test set, this time with the tuned model.
y_pred_best = best_model.predict(X_test)

# Re-compute the same metrics used for the baseline model.
r2_best = r2_score(y_test, y_pred_best)
rmse_best = np.sqrt(mean_squared_error(y_test, y_pred_best))
mae_best = mean_absolute_error(y_test, y_pred_best)

print(
    f"[Best] MAE: {mae_best}",
    f"[Best] RMSE: {rmse_best}",
    f"[Best] R² Score: {r2_best}",
    sep="\n",
)


In [None]:
# Feature importance of the tuned model.
importances = best_model.feature_importances_
# Take the labels from X_train, the frame the model was actually fitted on.
# X still contains 'cities_reg' (it was only dropped from X_train / X_test),
# so X.columns has one label more than there are importances and the bars
# could not be matched to the right feature names.
features = X_train.columns

plt.figure(figsize=(8, 6))
sns.barplot(x=importances, y=features)
plt.title('Feature Importance - XGBoost')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()
