In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Read the statistics CSV into a DataFrame, then preview its first ten rows.
data = pd.read_csv(filepath_or_buffer='Data Official Statistics.csv')
data.head(n=10)

Unnamed: 0,PROVINSI,TPT,TPP_SD,TPP_SMP,TPP_SMA,UPAH,PENGELUARAN
0,ACEH,6.03,99.08,94.55,74.46,2828145.778,874702.5
1,SUMATERA UTARA,5.89,98.75,94.35,74.43,2826233.443,951545.5
2,SUMATERA BARAT,5.94,95.81,90.65,68.64,2980044.498,1038088.5
3,RIAU,4.23,98.09,90.52,67.79,3284759.902,1139477.0
4,JAMBI,4.53,97.76,89.35,66.62,2890582.041,1063650.5
5,SUMATERA SELATAN,4.11,97.58,87.95,64.81,3261016.731,886096.0
6,BENGKULU,3.42,97.1,89.25,63.41,2910804.247,992975.0
7,LAMPUNG,4.23,98.67,87.67,64.54,2737913.743,889077.5
8,KEP. BANGKA BELITUNG,4.56,96.01,87.11,68.96,3419400.003,1287651.5
9,KEP. RIAU,6.8,97.92,95.51,78.97,4760712.898,1557937.5


TPT: Tingkat Pengangguran Terbuka

TPP_SD / TPP_SMP / TPP_SMA: Tingkat Penyelesaian Pendidikan pada jenjang SD, SMP, dan SMA

UPAH: Rata-rata upah sebulan karyawan

PENGELUARAN: Rata-rata pengeluaran per kapita sebulan untuk makanan dan bukan makanan

In [None]:
# Print a summary of the DataFrame: index range, columns, non-null counts, dtypes and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34 entries, 0 to 33
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PROVINSI     34 non-null     object 
 1   TPT          34 non-null     float64
 2   TPP_SD       34 non-null     float64
 3   TPP_SMP      34 non-null     float64
 4   TPP_SMA      34 non-null     float64
 5   UPAH         34 non-null     float64
 6   PENGELUARAN  34 non-null     float64
dtypes: float64(6), object(1)
memory usage: 2.0+ KB


# Train test split

In [None]:
# Predictor columns (features) and the response column (target)
feature_columns = ['TPT', 'TPP_SD', 'TPP_SMP', 'TPP_SMA', 'UPAH']
target_column = 'PENGELUARAN'

X = data[feature_columns]
y = data[target_column]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=30,
)


# Modelling

In [None]:
# LinearRegression and GridSearchCV are already imported in the notebook's
# first cell, so they are not re-imported here.

# Parameter grid.
# Only options that change the fitted model are searched:
#   - fit_intercept: whether an intercept term is estimated
#   - positive:      whether coefficients are constrained to be non-negative
# `copy_X` (copy vs. overwrite the input matrix) and `n_jobs` (parallelism) do
# not change the fitted coefficients, so searching over them only multiplies
# the number of identical fits (16 candidates instead of 4).
param_grid = {
    'fit_intercept': [True, False],
    'positive': [True, False],
}

# Base estimator: ordinary least squares
model = LinearRegression()

# 5-fold cross-validated grid search, candidates ranked by R^2
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='r2', verbose=1)
grid_search.fit(X_train, y_train)

# Best model: with GridSearchCV's default refit=True this estimator has been
# refit on all of X_train / y_train using the best parameter combination.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)

Fitting 5 folds for each of 16 candidates, totalling 80 fits
Best Parameters: {'copy_X': True, 'fit_intercept': False, 'n_jobs': -1, 'positive': False}


# Evaluasi

In [None]:
# Pull the fitted parameters out of the best model
intercept = best_model.intercept_
coefficients = best_model.coef_

# Report the intercept, then one coefficient per feature column
print("Intercept:", intercept)
for position, feature in enumerate(X_train.columns):
    print(f"Koefisien untuk {feature}: {coefficients[position]}")

Intercept: 0.0
Koefisien untuk TPT: -44312.83386287643
Koefisien untuk TPP_SD: -20896.963138932893
Koefisien untuk TPP_SMP: 23920.179971892216
Koefisien untuk TPP_SMA: 4154.637272322874
Koefisien untuk UPAH: 0.26702411101723555


In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predictions for the held-out test split
y_pred = best_model.predict(X_test)

# Error metrics comparing the actual test targets with the predictions
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)  # RMSE is expressed in the same units as the target
r2 = r2_score(y_test, y_pred)

# Print one "<label>: <value>" line per metric
metric_report = (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R-squared (R²)", r2),
)
for label, value in metric_report:
    print(f"{label}: {value}")



Mean Absolute Error (MAE): 180143.1311508012
Mean Squared Error (MSE): 51798204901.92779
Root Mean Squared Error (RMSE): 227592.18989659505
R-squared (R²): 0.7734250701295733
