## Funding Gap Prediction - 2025

In [1]:
# Library imports
import pandas as pd
import numpy as np
import os
from pathlib import Path

In [2]:
# Set the base path for input data files
CURRENT_DIR = Path().resolve()
DATA_BASE_PATH = CURRENT_DIR.parent / "outputs" / "data_output"

In [3]:
# set the path for the modeling data file
output_dir = os.path.join("..", "outputs", "model_output")
os.makedirs(output_dir, exist_ok=True)

In [4]:
# 1. Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import ExtraTreesRegressor
from catboost import CatBoostRegressor

# 2. Load data
df = pd.read_csv("../outputs/data_output/Financial_Cleaned.csv")
df_2025 = df.dropna(subset=["2025 Required", "2025 Available", "2025 Expenditure"]).copy()

# 3. Feature Engineering
df_2025["Required_per_Available"] = df_2025["2025 Required"] / (df_2025["2025 Available"] + 1)
df_2025["Expenditure_per_Required"] = df_2025["2025 Expenditure"] / (df_2025["2025 Required"] + 1)

# Separate features and target
y = df_2025[['2025 Required', '2025 Available', '2025 Expenditure']]
y_log = np.log1p(y)

drop_cols = ['2025 Required', '2025 Available', '2025 Expenditure', '2026 Required', '2026 Available', '2026 Expenditure']
X = df_2025.drop(columns=[col for col in drop_cols if col in df_2025.columns])

# Identify columns
categorical_cols = [col for col in ['Country', 'Region', 'Agencies', 'Theme', 'Plan name', 'SP_Label'] if col in X.columns]
numeric_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# 4. Preprocessing Pipelines
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('log', FunctionTransformer(np.log1p, validate=False)),
    ('scaler', MinMaxScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# 5. Model definitions
models = {
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=42),
    "GradientBoosting": GradientBoostingRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=100, random_state=42),
    "CatBoost": CatBoostRegressor(n_estimators=100, verbose=0, random_state=42, allow_writing_files=False)
}

# 6. Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_log, test_size=0.2, random_state=42)

# 7. Training and evaluation
results = {}

for name, base_model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', MultiOutputRegressor(base_model))
    ])
    
    pipeline.fit(X_train, y_train)
    preds_log = pipeline.predict(X_test)
    preds = np.expm1(preds_log)
    y_true = np.expm1(y_test)

    print(f"\n📊 Model: {name}")
    results[name] = {}
    for i, col in enumerate(y.columns):
        mae = mean_absolute_error(y_true.iloc[:, i], preds[:, i])
        r2 = r2_score(y_true.iloc[:, i], preds[:, i])
        results[name][col] = {"MAE": mae, "R2": r2}
        print(f"{col}: MAE = {mae:,.0f}, R² = {r2:.3f}")


📊 Model: RandomForest
2025 Required: MAE = 39,741,781, R² = 0.374
2025 Available: MAE = 26,810,767, R² = 0.325
2025 Expenditure: MAE = 26,698, R² = 0.780

📊 Model: GradientBoosting
2025 Required: MAE = 44,534,263, R² = 0.315
2025 Available: MAE = 29,714,000, R² = 0.164
2025 Expenditure: MAE = 25,706, R² = 0.852

📊 Model: XGBoost
2025 Required: MAE = 38,667,090, R² = 0.514
2025 Available: MAE = 23,656,554, R² = 0.512
2025 Expenditure: MAE = 38,324, R² = 0.712

📊 Model: ExtraTrees
2025 Required: MAE = 40,439,636, R² = 0.239
2025 Available: MAE = 27,428,303, R² = 0.098
2025 Expenditure: MAE = 31,064, R² = 0.814

📊 Model: CatBoost
2025 Required: MAE = 45,726,682, R² = 0.314
2025 Available: MAE = 28,859,468, R² = 0.257
2025 Expenditure: MAE = 37,566, R² = 0.710


In [5]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
from scipy.stats import randint

# 5. Hyperparameter Tuning
param_dist = {
    "estimator__n_estimators": randint(100, 500),
    "estimator__max_depth": randint(5, 50),
    "estimator__min_samples_split": randint(2, 10),
    "estimator__min_samples_leaf": randint(1, 5)
}

rf_base = MultiOutputRegressor(RandomForestRegressor(random_state=42))
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', rf_base)
])

search = RandomizedSearchCV(
    pipeline, 
    param_distributions={"model__" + k: v for k, v in param_dist.items()},
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=3,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

search.fit(X_train, y_train)

# 6. Evaluation on Test Set
best_model = search.best_estimator_
preds_log = best_model.predict(X_test)
preds = np.expm1(preds_log)
y_true = np.expm1(y_test)

print("\n📊 Tuned RandomForest Model Performance on 2025 Test Set")
for i, col in enumerate(y.columns):
    mae = mean_absolute_error(y_true.iloc[:, i], preds[:, i])
    r2 = r2_score(y_true.iloc[:, i], preds[:, i])
    print(f"{col}: MAE = {mae:,.0f}, R² = {r2:.3f}")

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] END model__estimator__max_depth=47, model__estimator__min_samples_leaf=4, model__estimator__min_samples_split=6, model__estimator__n_estimators=120; total time=   2.8s
[CV] END model__estimator__max_depth=47, model__estimator__min_samples_leaf=4, model__estimator__min_samples_split=6, model__estimator__n_estimators=120; total time=   2.9s
[CV] END model__estimator__max_depth=47, model__estimator__min_samples_leaf=4, model__estimator__min_samples_split=6, model__estimator__n_estimators=120; total time=   2.9s
[CV] END model__estimator__max_depth=43, model__estimator__min_samples_leaf=4, model__estimator__min_samples_split=6, model__estimator__n_estimators=370; total time=   8.1s
[CV] END model__estimator__max_depth=43, model__estimator__min_samples_leaf=4, model__estimator__min_samples_split=6, model__estimator__n_estimators=370; total time=   8.3s
[CV] END model__estimator__max_depth=43, model__estimator__min_samples_lea

In [6]:
# 7. Predict for 2026
df_2026 = df[
    (df["2026 Required"] == 0) &
    (df["2026 Available"] == 0) &
    (df["2026 Expenditure"] == 0)
].copy()

df_2026["Required_per_Available"] = df_2026["2025 Required"] / (df_2026["2025 Available"] + 1)
df_2026["Expenditure_per_Required"] = df_2026["2025 Expenditure"] / (df_2026["2025 Required"] + 1)

X_2026 = df_2026.drop(columns=[col for col in drop_cols if col in df_2026.columns])

# Predict using tuned model
X_2026_preds_log = best_model.predict(X_2026)
X_2026_preds = np.expm1(X_2026_preds_log)

X_2026_preds = np.round(X_2026_preds).astype(int)

df_2026_predictions = df_2026.reset_index(drop=True).copy()
df_2026_predictions[['2026 Required', '2026 Available', '2026 Expenditure']] = X_2026_preds
df_2026_predictions.to_csv("../outputs/model_output/funding_prediction.csv", index=False)