In [2]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_absolute_error

# 1) Load the data
df = pd.read_excel('01_Belgium_1.xlsx')

# 2) Rename or select the relevant columns
#    Replace these with your actual column names if different
df = df.rename(columns={
    'T_column_name': 'T',
    'X1_column_name': 'X1',
    'X2_column_name': 'X2',
    'Y_column_name': 'Y'
})

# 3) Sort by T (assuming T is a continuous “time” or index variable)
df = df.sort_values('T')

# 4) Define an 80%‐quantile cutoff to separate interpolation vs. extrapolation
T_cut = df['T'].quantile(0.8)
df_in_range   = df[df['T'] <= T_cut].copy()   # data within the training range
df_extrapolate = df[df['T'] >  T_cut].copy()   # data beyond that range

# 5) From the in‐range data, hold out 20% as the interpolation test set
train_in, test_interp = (
    df_in_range
    .sample(frac=0.8, random_state=42),
    df_in_range.drop(df_in_range.sample(frac=0.8, random_state=42).index)
)

# 6) Prepare matrices for Statsmodels
X_train = sm.add_constant(train_in[['T','X1','X2']])
y_train = train_in['Y']

X_interp_test = sm.add_constant(test_interp[['T','X1','X2']])
y_interp_test = test_interp['Y']

X_extrap = sm.add_constant(df_extrapolate[['T','X1','X2']])
y_extrap = df_extrapolate['Y']

# 7) Fit the OLS model
model = sm.OLS(y_train, X_train).fit()

# 8) Predict and compute MAE
y_pred_interp = model.predict(X_interp_test)
y_pred_extrap = model.predict(X_extrap)

mae_interp = mean_absolute_error(y_interp_test, y_pred_interp)
mae_extrap = mean_absolute_error(y_extrap, y_pred_extrap)

print(f"MAE (interpolation) : {mae_interp:.4f}")
print(f"MAE (extrapolation): {mae_extrap:.4f}")


KeyError: "['X1', 'X2'] not in index"