# Predictive Modeling
## Philippine Health Indicators

**Purpose**
Develop predictive models to estimate future health outcomes and
classify health system performance using regression, machine learning,
and time-series forecasting techniques.

**Dataset Source**
https://www.kaggle.com/datasets/thedevastator/philippine-health-indicators

**Models Used**
- Linear / Ridge / Lasso Regression
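- Random Forest (regression, plus a median-threshold classification of the target)
- ARIMA time-series forecasting (a SARIMAX extension is listed for reference but is not fitted in this notebook)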


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    r2_score,
    accuracy_score,
    classification_report
)

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from statsmodels.tsa.arima.model import ARIMA

# Notebook-wide display settings: show up to 100 columns when a DataFrame is
# rendered, and use seaborn's "whitegrid" theme for every plot.
pd.options.display.max_columns = 100
sns.set(style="whitegrid")

# Read the pre-cleaned indicators table (the path lives under /content,
# presumably a Colab session -- adjust when running elsewhere).
df = pd.read_csv(
    filepath_or_buffer="/content/cleaned_philippine_health_indicators.csv"
)

# Preview the first rows as the cell output.
df.head()


In [None]:
# Candidate modeling columns: every numeric column of the dataset except
# "Year", which this notebook treats as a non-feature column.
numeric_cols = [
    column
    for column in df.select_dtypes(include=np.number).columns
    if column != "Year"
]

numeric_cols


In [None]:
# Choose target health outcome (adjust as needed). By default the first
# numeric column is the target and every other numeric column is a predictor.
if not numeric_cols:
    raise ValueError("No numeric columns available to use as target/predictors.")

target = numeric_cols[0]

# Predictors
features = [col for col in numeric_cols if col != target]

# Rows with a missing target carry no supervised signal. Drop them rather
# than filling the target with its median, which would fabricate labels and
# distort both training and the evaluation metrics.
has_target = df[target].notna()
X = df.loc[has_target, features]
y = df.loc[has_target, target]

# Train-test split comes BEFORE imputation so that test rows cannot influence
# the statistics used to fill the training data (avoids data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# Handle missing predictor values with medians learned from the training
# split only, then reuse those same medians for the test split.
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

X_train.shape, X_test.shape


In [None]:
# Standardize the predictors for the linear models. The scaler is fitted on
# the training split only and then applied unchanged to both splits.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# Ordinary least-squares baseline, trained on the standardized predictors.
lr = LinearRegression().fit(X_train_scaled, y_train)

# Predictions for the held-out rows.
y_pred_lr = lr.predict(X_test_scaled)


In [None]:
# Regularized linear models on the standardized predictors:
# Ridge with alpha=1.0 and Lasso with alpha=0.01.
ridge = Ridge(alpha=1.0).fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)

lasso = Lasso(alpha=0.01).fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)


In [None]:
# Random forest of 200 trees with a fixed seed. It is fitted on the unscaled
# predictors (X_train / X_test), not on the standardized arrays.
rf = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)


In [None]:
def evaluate_regression(y_true, y_pred, model_name):
    """Summarize regression accuracy for one model.

    Args:
        y_true: Observed target values.
        y_pred: Predicted target values, aligned with ``y_true``.
        model_name: Label stored under the ``"Model"`` key of the result.

    Returns:
        dict with the keys ``"Model"``, ``"RMSE"``, ``"MAE"`` and ``"R2"``.
    """
    # RMSE is computed as sqrt(MSE) instead of passing ``squared=False``:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so
    # the keyword form raises TypeError on current releases. sqrt(MSE) gives
    # the same value on every scikit-learn version.
    rmse = float(np.sqrt(mean_squared_error(y_true, y_pred)))

    return {
        "Model": model_name,
        "RMSE": rmse,
        "MAE": mean_absolute_error(y_true, y_pred),
        "R2": r2_score(y_true, y_pred)
    }

# Score every fitted regressor against the same held-out targets. The rows of
# the comparison table follow the order in which the models are listed here.
predictions_by_model = {
    "Linear Regression": y_pred_lr,
    "Ridge Regression": y_pred_ridge,
    "Lasso Regression": y_pred_lasso,
    "Random Forest": y_pred_rf,
}

results = pd.DataFrame(
    [
        evaluate_regression(y_test, predictions, label)
        for label, predictions in predictions_by_model.items()
    ]
)

results


In [None]:
# Imported here so this cell runs on its own; scikit-learn is already a
# dependency of this notebook.
from sklearn.ensemble import RandomForestClassifier

# Define binary outcome using median threshold: 1 when the target is above
# its median, 0 otherwise.
df["binary_outcome"] = (df[target] > df[target].median()).astype(int)

# Model only rows whose target is actually observed. A missing target
# compares as "not above the median" and would otherwise be silently
# labelled 0.
has_target = df[target].notna()
X_cls = df.loc[has_target, features]
y_cls = df.loc[has_target, "binary_outcome"]

# Dedicated *_cls names keep the regression split (X_train, y_test, ...)
# intact, so the regression evaluation cell can be re-run after this one.
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls, y_cls,
    test_size=0.2,
    random_state=42
)

# Impute with medians learned from the training split only, so that test rows
# do not leak into the fill values.
cls_train_medians = X_train_cls.median()
X_train_cls = X_train_cls.fillna(cls_train_medians)
X_test_cls = X_test_cls.fillna(cls_train_medians)

# A classifier predicts the class labels directly; previously a regressor was
# fitted on the 0/1 labels and its output thresholded at 0.5.
rf_cls = RandomForestClassifier(random_state=42)
rf_cls.fit(X_train_cls, y_train_cls)

y_pred_cls = rf_cls.predict(X_test_cls)

print(classification_report(y_test_cls, y_pred_cls))


In [None]:
# Collapse the data to one observation per year (the mean of the target over
# all rows sharing that year), ordered by year.
yearly_mean = df.groupby("Year")[target].mean()
ts_df = yearly_mean.sort_index()

ts_df.head()


In [None]:
# Fit an ARIMA model with order (p=1, d=1, q=1) to the yearly series.
model = ARIMA(endog=ts_df, order=(1, 1, 1))
model_fit = model.fit()

# Show the fitted model's summary table as the cell output.
model_fit.summary()


In [None]:
# Number of future yearly periods to predict.
forecast_steps = 5
forecast = model_fit.forecast(steps=forecast_steps)

# int(...) lets range() accept the last observed year even when the "Year"
# column was parsed as float (range() rejects floats with a TypeError).
last_year = int(ts_df.index.max())
forecast_years = range(last_year + 1, last_year + 1 + forecast_steps)

# np.asarray drops whatever index statsmodels attached to the forecast, so
# the table gets a plain 0..n-1 index and values are paired with years purely
# by position.
forecast_df = pd.DataFrame({
    "Year": forecast_years,
    "Forecasted Value": np.asarray(forecast)
})

forecast_df


In [None]:
# Plot historical + forecast on a single set of axes: the observed yearly
# series as a solid line, the forecast as a dashed line with point markers.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(ts_df.index, ts_df.values, label="Historical")
ax.plot(
    forecast_df["Year"],
    forecast_df["Forecasted Value"],
    linestyle="--",
    marker="o",
    label="Forecast",
)

ax.set_title(f"Forecast of {target}")
ax.set_xlabel("Year")
ax.set_ylabel("Value")
ax.legend()
plt.show()


In [None]:
# Persist the model comparison table and the forecast table as CSV files,
# leaving the DataFrame index out of both.
for table, destination in (
    (results, "/content/predictive_model_results.csv"),
    (forecast_df, "/content/health_indicator_forecast.csv"),
):
    table.to_csv(destination, index=False)


## Key Findings from Predictive Modeling

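- Machine learning models (Random Forest) can outperform linear baselines when relationships are complex; confirm this against the RMSE / MAE / R² comparison table above
- Feature importance (e.g. `rf.feature_importances_`, not computed in this notebook) can indicate which structural health determinants drive outcomes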
- Time-series forecasting provides forward-looking policy insight
- Classification enables health system performance stratification

