In [43]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------------------------------------------------------------------------
# Baseline model: ordinary least squares on a chronological train/test split.
# Every name assigned at top level here (model_df, X, y, train_size,
# X_train/X_test, y_train/y_test, the *_l metrics, ...) is read by later cells.
# ---------------------------------------------------------------------------

# Load the saved data
model_df = pd.read_csv("data_wpk_few_predictors.csv")

model_df["date"] = pd.to_datetime(model_df["date"])

# The split further down is purely positional, so it is only a genuine
# "time-based" split when the rows are in chronological order. Enforce that
# here instead of trusting the file's row order. mergesort is stable (rows
# with equal dates keep their file order), and on an already-sorted file the
# sort + index reset leaves the frame unchanged.
model_df = model_df.sort_values("date", kind="mergesort").reset_index(drop=True)

# Define y and X
y = model_df["gdp_growth"]

X = model_df[[
    "gdp_lag1",
    "indprod_g",
    "retail_sales_g",
    "pers_income_g",
    "unemploy_claims_g",
    "sentiment_ch",
    "housing_g",
    "cap_uti_g",
    "ppi_g",
    "cpi_g"
]]

# Time-based split (no shuffle): the first 80% of rows train the model,
# the most recent 20% are held out for testing.
n = len(model_df)
train_size = int(n * 0.8)

X_train = X.iloc[:train_size]
y_train = y.iloc[:train_size]

X_test  = X.iloc[train_size:]
y_test  = y.iloc[train_size:]

# Fit model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# In-sample and out-of-sample predictions
y_pred_train = lin_reg.predict(X_train)
y_pred_test  = lin_reg.predict(X_test)

# Attach the predictions to a copy of the test rows so they can be read
# next to the dates; model_df itself is not modified by this step.
results_test = model_df.iloc[train_size:].copy()

results_test["gdp_true"]  = y_test.values
results_test["gdp_pred"]  = y_pred_test

# See a few rows
print(results_test[["date", "gdp_true", "gdp_pred"]].tail(20))

# Error metrics; the "_l" suffix marks the linear model's scores.
rmse_train_l = np.sqrt(mean_squared_error(y_train, y_pred_train))
rmse_test_l  = np.sqrt(mean_squared_error(y_test,  y_pred_test))

mae_train_l = mean_absolute_error(y_train, y_pred_train)
mae_test_l  = mean_absolute_error(y_test,  y_pred_test)

r2_train_l = r2_score(y_train, y_pred_train)
r2_test_l  = r2_score(y_test,  y_pred_test)

print("Train RMSE:", rmse_train_l, "MAE:", mae_train_l, "R2:", r2_train_l)
print("Test  RMSE:", rmse_test_l,  "MAE:", mae_test_l, "R2:", r2_test_l)

# Coefficients, one row per predictor in the same order as X's columns
coef_df = pd.DataFrame({
    "variable": X.columns,
    "coefficient": lin_reg.coef_
})
print(coef_df)


          date  gdp_true  gdp_pred
113 2020-09-30  0.083870  0.043086
114 2020-12-31  0.017483  0.018801
115 2021-03-31  0.026516  0.018516
116 2021-06-30  0.032328  0.022364
117 2021-09-30  0.023478  0.012440
118 2021-12-31  0.034072  0.020483
119 2022-03-31  0.017446  0.018967
120 2022-06-30  0.023910  0.017291
121 2022-09-30  0.018201  0.012648
122 2022-12-31  0.016352  0.009745
123 2023-03-31  0.016519  0.013355
124 2023-06-30  0.011460  0.009569
125 2023-09-30  0.019592  0.011449
126 2023-12-31  0.012386  0.010455
127 2024-03-31  0.009924  0.008715
128 2024-06-30  0.015169  0.010876
129 2024-09-30  0.012435  0.009408
130 2024-12-31  0.010567  0.011592
131 2025-03-31  0.007246  0.011867
132 2025-06-30  0.014658  0.009035
Train RMSE: 0.003998082089740733 MAE: 0.0031166001018656667 R2: 0.6034331792080831
Test  RMSE: 0.013532662512789216 MAE: 0.007712346421253461 R2: 0.7047917291476467
            variable  coefficient
0           gdp_lag1     0.052332
1          indprod_g     0.26139

Interpretation: 
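The linear model predicts GDP growth reasonably well on the held-out quarters. The test R² is about 0.70, which means it explains around 70% of the variance in GDP growth over the test period; this is higher than the training R² (about 0.60), so the figure depends on which quarters fall into the test set. Industrial production, retail sales and CPI have the biggest positive coefficients: when these grow, GDP growth also tends to grow. PPI has a small negative coefficient, so higher producer prices are linked to slightly weaker growth. The test errors (RMSE ≈ 0.0135 and MAE ≈ 0.0077) are small in absolute terms but clearly larger than the training errors (RMSE ≈ 0.0040 and MAE ≈ 0.0031): the model’s predictions are usually close to the real GDP numbers, but it underestimates large swings such as the 2020-09-30 quarter (0.084 actual vs. 0.043 predicted).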

In [44]:
from sklearn.tree import DecisionTreeRegressor
# Fit Decision Tree Regressor on the same train/test split as the linear model
# max_depth keeps the tree simple (avoid overfitting)
tree_reg = DecisionTreeRegressor(
    max_depth=3,
    min_samples_leaf=5, # avoid tiny leaves
    random_state=777
)

tree_reg.fit(X_train, y_train)

# Predictions and metrics.
# The predictions get their own "_t" names (matching the "_t" metric names
# below) so that this cell does not overwrite y_pred_train / y_pred_test,
# which hold the linear-regression predictions from the earlier cell.
y_pred_train_t = tree_reg.predict(X_train)
y_pred_test_t  = tree_reg.predict(X_test)

rmse_train_t = np.sqrt(mean_squared_error(y_train, y_pred_train_t))
rmse_test_t  = np.sqrt(mean_squared_error(y_test,  y_pred_test_t))

mae_train_t = mean_absolute_error(y_train, y_pred_train_t)
mae_test_t  = mean_absolute_error(y_test,  y_pred_test_t)

r2_train_t = r2_score(y_train, y_pred_train_t)
r2_test_t  = r2_score(y_test,  y_pred_test_t)

print("=== Decision Tree Regression Results ===")
print(f"Train RMSE: {rmse_train_t:.4f}, MAE: {mae_train_t:.4f}, R²: {r2_train_t:.4f}")
print(f"Test  RMSE: {rmse_test_t:.4f}, MAE: {mae_test_t:.4f}, R²: {r2_test_t:.4f}")

# Attach the tree's predictions to a copy of the test rows for inspection
results_test_tree = model_df.iloc[train_size:].copy()
results_test_tree["gdp_true"] = y_test.values
results_test_tree["gdp_pred_tree"] = y_pred_test_t

print(results_test_tree[["date", "gdp_true", "gdp_pred_tree"]].tail(20))

# Nowcast for the last available quarter.
# X.iloc[[-1]] (list indexer) keeps the row as a one-row DataFrame, which is
# the 2-D input predict() expects.
X_latest = X.iloc[[-1]]
date_latest = model_df["date"].iloc[-1]

gdp_nowcast_tree = tree_reg.predict(X_latest)[0]
print(f"Decision Tree nowcast for {date_latest}: {gdp_nowcast_tree:.4f}")


=== Decision Tree Regression Results ===
Train RMSE: 0.0042, MAE: 0.0032, R²: 0.5711
Test  RMSE: 0.0221, MAE: 0.0125, R²: 0.2104
          date  gdp_true  gdp_pred_tree
113 2020-09-30  0.083870       0.015394
114 2020-12-31  0.017483       0.012231
115 2021-03-31  0.026516       0.009843
116 2021-06-30  0.032328       0.015394
117 2021-09-30  0.023478       0.012231
118 2021-12-31  0.034072       0.015394
119 2022-03-31  0.017446       0.015394
120 2022-06-30  0.023910       0.015394
121 2022-09-30  0.018201       0.002876
122 2022-12-31  0.016352       0.002876
123 2023-03-31  0.016519       0.009843
124 2023-06-30  0.011460       0.009843
125 2023-09-30  0.019592       0.009843
126 2023-12-31  0.012386       0.002876
127 2024-03-31  0.009924       0.002876
128 2024-06-30  0.015169       0.012231
129 2024-09-30  0.012435       0.009843
130 2024-12-31  0.010567       0.009843
131 2025-03-31  0.007246       0.012231
132 2025-06-30  0.014658       0.009843
Decision Tree nowcast for 2025-

Interpretation:
The Decision Tree fits the training data about as well as the linear model (train R² ≈ 0.57), but it performs much worse on the test data.
Its test R² is only 0.21, which means it explains little of the variance in GDP growth over the test period.
The test errors (RMSE ≈ 0.0221 and MAE ≈ 0.0125) are higher than those of the linear model, so the predictions are less accurate.
A tree of depth 3 has at most 8 leaves and predicts one constant value per leaf, so many quarters receive exactly the same prediction, which makes the model unable to follow real movements in GDP growth.

In [45]:
from sklearn.ensemble import RandomForestRegressor

# --- Random Forest: an ensemble of shallow trees on the same split ---------
# The depth cap and the minimum leaf size are there to limit overfitting;
# random_state pins the bootstrap / feature sampling so reruns match.
rf_reg = RandomForestRegressor(n_estimators=300, max_depth=4, min_samples_leaf=3, random_state=42)
rf_reg.fit(X_train, y_train)

# In-sample and out-of-sample predictions
y_pred_train_rf, y_pred_test_rf = rf_reg.predict(X_train), rf_reg.predict(X_test)

# Error metrics, grouped by split ("_rf" marks the random-forest scores)
rmse_train_rf = np.sqrt(mean_squared_error(y_train, y_pred_train_rf))
mae_train_rf = mean_absolute_error(y_train, y_pred_train_rf)
r2_train_rf = r2_score(y_train, y_pred_train_rf)

rmse_test_rf = np.sqrt(mean_squared_error(y_test, y_pred_test_rf))
mae_test_rf = mean_absolute_error(y_test, y_pred_test_rf)
r2_test_rf = r2_score(y_test, y_pred_test_rf)

print("=== Random Forest Regression Results ===")
print(f"Train RMSE: {rmse_train_rf:.4f}, MAE: {mae_train_rf:.4f}, R²: {r2_train_rf:.4f}")
print(f"Test  RMSE: {rmse_test_rf:.4f}, MAE: {mae_test_rf:.4f}, R²: {r2_test_rf:.4f}")

# Test rows with the true value and the forest's prediction side by side.
# assign() returns a new frame, so model_df is left untouched.
results_test_rf = model_df.iloc[train_size:].assign(
    gdp_true=y_test.values,
    gdp_pred_rf=y_pred_test_rf,
)

print(results_test_rf.tail(20)[["date", "gdp_true", "gdp_pred_rf"]])

# Nowcast: score the most recent row of X (kept 2-D via the list indexer)
X_latest = X.iloc[[-1]]
date_latest = model_df["date"].iat[-1]
gdp_nowcast_rf = rf_reg.predict(X_latest)[0]
print(f"Random Forest nowcast for {date_latest}: {gdp_nowcast_rf:.4f}")


=== Random Forest Regression Results ===
Train RMSE: 0.0034, MAE: 0.0025, R²: 0.7120
Test  RMSE: 0.0229, MAE: 0.0126, R²: 0.1534
          date  gdp_true  gdp_pred_rf
113 2020-09-30  0.083870     0.011233
114 2020-12-31  0.017483     0.010959
115 2021-03-31  0.026516     0.009933
116 2021-06-30  0.032328     0.011857
117 2021-09-30  0.023478     0.008322
118 2021-12-31  0.034072     0.012605
119 2022-03-31  0.017446     0.012915
120 2022-06-30  0.023910     0.013954
121 2022-09-30  0.018201     0.007421
122 2022-12-31  0.016352     0.006348
123 2023-03-31  0.016519     0.009864
124 2023-06-30  0.011460     0.008811
125 2023-09-30  0.019592     0.010206
126 2023-12-31  0.012386     0.008191
127 2024-03-31  0.009924     0.006728
128 2024-06-30  0.015169     0.011424
129 2024-09-30  0.012435     0.010132
130 2024-12-31  0.010567     0.011145
131 2025-03-31  0.007246     0.013650
132 2025-06-30  0.014658     0.010156
Random Forest nowcast for 2025-06-30 00:00:00: 0.0102


Interpretation:
The Random Forest model fits the training data better than the other two models (train R² ≈ 0.71), but it does not generalize to new quarters.
The test R² is very low (0.15), which means the model explains only a small part of the variance in GDP growth over the test period.
Its test errors (RMSE ≈ 0.0229 and MAE ≈ 0.0126) are higher than those of the linear model, so the predictions are less accurate.
Even with many trees, the model still produces smoothed values, because its prediction is an average over the trees, and it misses bigger changes in GDP growth.

In [46]:
# Side-by-side test-set scores for the three fitted models.
# Each entry maps a display name to its (RMSE, MAE, R²) on the test split.
test_scores_by_model = {
    "Linear": (rmse_test_l, mae_test_l, r2_test_l),
    "Decision Tree": (rmse_test_t, mae_test_t, r2_test_t),
    "Random Forest": (rmse_test_rf, mae_test_rf, r2_test_rf),
}

# Transpose the per-model tuples into one list per metric column.
rmse_column, mae_column, r2_column = (
    list(metric_values) for metric_values in zip(*test_scores_by_model.values())
)

comparison = pd.DataFrame({
    "model": list(test_scores_by_model),
    "rmse_test": rmse_column,
    "mae_test": mae_column,
    "r2_test": r2_column,
})

print(comparison)


           model  rmse_test  mae_test   r2_test
0         Linear   0.013533  0.007712  0.704792
1  Decision Tree   0.022133  0.012506  0.210368
2  Random Forest   0.022917  0.012582  0.153402


The linear model gives the best predictions on the test set.
It has the lowest errors (RMSE ≈ 0.0135, MAE ≈ 0.0077) and the highest R² (≈ 0.70), so it explains most of the variance in GDP growth over the test period.
The Decision Tree and the Random Forest have higher errors and much lower R² values (≈ 0.21 and ≈ 0.15), which means they do not follow GDP movements as well as the linear model.