In [59]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from linearmodels.panel import PanelOLS

In [67]:
data = pd.read_csv(r'E:\COOLYEAH\smt_5\EKT\Ekonometrika-DataPanel\Data\Indonesian Salary Panel.csv')

data.head()

Unnamed: 0,REGION,YEAR,POVERTY INDEX (%),HUMAN DEVELOPMENT INDEX,POPULATION DENSITY (KM2),OPEN UNEMPLOYMENT RATE (%),HIGH SCHOOL EDUCATION LEVEL (%),GRDP (RP),SALARY (RP)
0,ACEH,2016,3.27,70.0,88,7.85,74.46,116374.299885,2118500
1,ACEH,2017,2.95,70.6,90,6.98,70.64,121240.978718,2500000
2,ACEH,2018,2.825,71.19,91,6.44,70.68,126824.365236,2700000
3,ACEH,2019,2.61,71.9,93,5.825,69.96,132069.620798,2916810
4,ACEH,2020,2.785,71.99,91,5.995,70.07,131580.967158,3165031


#### Pre-Processing

In [68]:
print(data.isnull().sum())  # Melihat jumlah missing values di setiap kolom

REGION                             0
YEAR                               0
POVERTY INDEX (%)                  0
HUMAN DEVELOPMENT INDEX            0
POPULATION DENSITY (KM2)           0
OPEN UNEMPLOYMENT RATE (%)         0
HIGH SCHOOL EDUCATION LEVEL (%)    0
GRDP (RP)                          7
SALARY (RP)                        0
dtype: int64


In [69]:
data.fillna(data.mean(numeric_only=True), inplace=True)
  # Mengisi missing values dengan rata-rata

In [70]:
scaler = StandardScaler()
columns_to_scale = ['POVERTY INDEX (%)', 'HUMAN DEVELOPMENT INDEX', 'POPULATION DENSITY (KM2)', 
                    'OPEN UNEMPLOYMENT RATE (%)', 'HIGH SCHOOL EDUCATION LEVEL (%)', 'GRDP (RP)', 'SALARY (RP)']
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

In [71]:
# Menghapus baris dengan REGION == 'INDONESIA'
data = data[data['REGION'] != 'INDONESIA']

# Menampilkan daftar REGION untuk memastikan "INDONESIA" tidak ada
print(data['REGION'].unique())  

['ACEH' 'BALI' 'BANTEN' 'BENGKULU' 'DI YOGYAKARTA' 'DKI JAKARTA'
 'GORONTALO' 'JAMBI' 'JAWA BARAT' 'JAWA TENGAH' 'JAWA TIMUR'
 'KALIMANTAN BARAT' 'KALIMANTAN SELATAN' 'KALIMANTAN TENGAH'
 'KALIMANTAN TIMUR' 'KALIMANTAN UTARA' 'KEP. BANGKA BELITUNG' 'KEP. RIAU'
 'LAMPUNG' 'MALUKU' 'MALUKU UTARA' 'NUSA TENGGARA BARAT'
 'NUSA TENGGARA TIMUR' 'PAPUA' 'PAPUA BARAT' 'RIAU' 'SULAWESI BARAT'
 'SULAWESI SELATAN' 'SULAWESI TENGAH' 'SULAWESI TENGGARA' 'SULAWESI UTARA'
 'SUMATERA BARAT' 'SUMATERA SELATAN' 'SUMATERA UTARA']


In [72]:
data = data.set_index(['REGION', 'YEAR'])

In [74]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Menghitung VIF untuk setiap variabel independen
X = sm.add_constant(data[['POVERTY INDEX (%)', 'HUMAN DEVELOPMENT INDEX', 'POPULATION DENSITY (KM2)', 
                          'OPEN UNEMPLOYMENT RATE (%)', 'HIGH SCHOOL EDUCATION LEVEL (%)', 'GRDP (RP)']])
vif = pd.DataFrame()
vif["Variable"] = X.columns
vif["VIF"] = [variance_inflation_factor(X.values, i) for i in range(X.shape[1])]
print(vif)

                          Variable       VIF
0                            const  1.000587
1                POVERTY INDEX (%)  2.054844
2          HUMAN DEVELOPMENT INDEX  6.092083
3         POPULATION DENSITY (KM2)  1.983853
4       OPEN UNEMPLOYMENT RATE (%)  1.382359
5  HIGH SCHOOL EDUCATION LEVEL (%)  3.881158
6                        GRDP (RP)  2.124508


#### UJI HETEROSKEDASTISITAS 
(before transform)

In [53]:
# Pisahkan variabel independen (X) dan dependen (y)
X = data[['POVERTY INDEX (%)', 'HUMAN DEVELOPMENT INDEX', 'POPULATION DENSITY (KM2)', 
          'OPEN UNEMPLOYMENT RATE (%)', 'HIGH SCHOOL EDUCATION LEVEL (%)', 'GRDP (RP)']]
y = data['SALARY (RP)']

# Tambahkan konstanta untuk model regresi
X = sm.add_constant(X)

# Lakukan regresi OLS
model = sm.OLS(y, X).fit()

# Hitung residual
residuals = model.resid

# Breusch-Pagan Test
bp_test = het_breuschpagan(residuals, model.model.exog)

# Hasil
labels = ['Lagrange multiplier statistic', 'p-value', 
          'f-value', 'f p-value']
for name, value in zip(labels, bp_test):
    print(f"{name}: {value}")

Lagrange multiplier statistic: 18.72992436281417
p-value: 0.004644915742334965
f-value: 3.2886479647205173
f p-value: 0.004003560878673151


Berdasarkan hasil Breusch-Pagan Test, kita melihat adanya heteroskedastisitas dalam model Pooled OLS karena p-value < 0.05 (baik dari Lagrange multiplier maupun f-statistik). Artinya, asumsi homoskedastisitas (kesamaan variansi residual) tidak terpenuhi.

In [54]:
data.head()

Unnamed: 0,REGION,YEAR,POVERTY INDEX (%),HUMAN DEVELOPMENT INDEX,POPULATION DENSITY (KM2),OPEN UNEMPLOYMENT RATE (%),HIGH SCHOOL EDUCATION LEVEL (%),GRDP (RP),SALARY (RP)
0,ACEH,2016,0.948117,-0.177595,-0.241121,1.673422,1.122892,-0.453009,-0.462115
1,ACEH,2017,0.721815,-0.026595,-0.240358,1.159026,0.771971,-0.441919,0.187377
2,ACEH,2018,0.633415,0.121889,-0.239976,0.839746,0.775645,-0.429196,0.52787
3,ACEH,2019,0.481369,0.300573,-0.239213,0.476121,0.709503,-0.417244,0.896982
4,ACEH,2020,0.605128,0.323223,-0.239976,0.576635,0.719608,-0.418357,1.319571


In [57]:
# Definisi model Pooled OLS
pooled_model = PanelOLS.from_formula(
    'Q("SALARY (RP)") ~ 1 + Q("POVERTY INDEX (%)") + Q("HUMAN DEVELOPMENT INDEX") + Q("POPULATION DENSITY (KM2)") + Q("OPEN UNEMPLOYMENT RATE (%)") + Q("HIGH SCHOOL EDUCATION LEVEL (%)") + Q("GRDP (RP)")',
    data=data
)

# Menjalankan estimasi model
pooled_result = pooled_model.fit()
print(pooled_result.summary)

                          PanelOLS Estimation Summary                           
Dep. Variable:       Q('SALARY (RP)')   R-squared:                        0.2821
Estimator:                   PanelOLS   R-squared (Between):              0.4650
No. Observations:                 238   R-squared (Within):              -0.0494
Date:                Fri, Nov 22 2024   R-squared (Overall):              0.2821
Time:                        05:21:02   Log-likelihood                   -301.00
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      15.132
Entities:                          34   P-value                           0.0000
Avg Obs:                       7.0000   Distribution:                   F(6,231)
Min Obs:                       7.0000                                           
Max Obs:                       7.0000   F-statistic (robust):             15.132
                            

In [63]:
# Mengambil residu dari model Pooled OLS
residuals = pooled_result.resids
X = data[['POVERTY INDEX (%)', 'HUMAN DEVELOPMENT INDEX', 'POPULATION DENSITY (KM2)', 
          'OPEN UNEMPLOYMENT RATE (%)', 'HIGH SCHOOL EDUCATION LEVEL (%)', 'GRDP (RP)']]

# Menambahkan konstanta
X = sm.add_constant(X)

# Uji heteroskedastisitas (Breusch-Pagan)
bp_test = het_breuschpagan(residuals, X)
print(f'Lagrange multiplier statistic: {bp_test[0]}')
print(f'p-value: {bp_test[1]}')
print(f'F-statistic: {bp_test[2]}')
print(f'F p-value: {bp_test[3]}')

Lagrange multiplier statistic: 18.729924362814145
p-value: 0.00464491574233501
F-statistic: 3.288647964720517
F p-value: 0.004003560878673151


In [66]:
# Menggunakan robust standard errors untuk Pooled OLS
pooled_model_robust = pooled_model.fit(cov_type='kernel', kernel='bartlett')

# Menampilkan hasil dengan robust standard errors
print(pooled_model_robust)


                          PanelOLS Estimation Summary                           
Dep. Variable:       Q('SALARY (RP)')   R-squared:                        0.2821
Estimator:                   PanelOLS   R-squared (Between):              0.4650
No. Observations:                 238   R-squared (Within):              -0.0494
Date:                Fri, Nov 22 2024   R-squared (Overall):              0.2821
Time:                        05:24:23   Log-likelihood                   -301.00
Cov. Estimator:        Driscoll-Kraay                                           
                                        F-statistic:                      15.132
Entities:                          34   P-value                           0.0000
Avg Obs:                       7.0000   Distribution:                   F(6,231)
Min Obs:                       7.0000                                           
Max Obs:                       7.0000   F-statistic (robust):             2064.0
                            

In [45]:
print(data.columns)

Index(['REGION', 'YEAR', 'POVERTY_INDEX', 'HUMAN_DEVELOPMENT_INDEX',
       'POPULATION_DENSITY', 'OPEN_UNEMPLOYMENT_RATE',
       'HIGH_SCHOOL_EDUCATION_LEVEL', 'GRDP', 'SALARY'],
      dtype='object')
