# 1) Config

In [9]:

# === CONFIG ===
DATA_PATH   = "housing.csv"      # <- change me
TARGET      = "Overall_Homeless_Per_Capita"            # <- change me
OUTDIR      = "outputs"

FOLDS       = 5
N_SELECT    = 5
TUNE        = False                    # toggle tuning of best model
BLEND       = False                    # toggle blending of top models
IGNORE      = [ 'Overall Homeless',
                'Overall Homeless Individuals',
                'Overall Homeless People in Families',
                'Unsheltered Homeless',
                'Sheltered Total Homeless',
                'Overall_Homeless_Individuals_Per_Capita',
                'Overall_Homeless_People_in_Families_Per_Capita',
                'Unsheltered_Homeless_Per_Capita',
                'Sheltered_Homeless_Per_Capita']
SESSION_ID  = 222                     # random seed

# Optional: PyCaret setup knobs (kept same as script defaults)
TRAIN_SIZE                  = 0.8
NORMALIZE                   = True
TRANSFORM_TARGET            = False
REMOVE_MULTICOLLINEARITY    = True
MULTICOLLINEARITY_THRESHOLD = 0.95
VERBOSE_SETUP               = True
LOG_EXPERIMENT              = False


# 2) Imports & setup

In [10]:
# If needed:
# %pip install pycaret lightgbm catboost -q

import os
import pandas as pd

from pycaret.regression import (
    setup, compare_models, pull, tune_model, blend_models,
    finalize_model, predict_model, save_model, save_experiment
)

os.makedirs(OUTDIR, exist_ok=True)
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 200)

# 3) Load data

In [12]:
df = pd.read_csv(DATA_PATH)
df = df.dropna()
display(df.head())
print(df.shape, "rows x columns")


Unnamed: 0,Total Population,Median Gross Rent,Median Household Income,Poverty_Rate,Vacancy_Rate,Renter_Household_Rate,Cost_Burdened_Rate,Unemployment_Rate,"Total Year-Round Beds (ES, TH, SH)",Average Temperature,Overall Homeless,Overall Homeless Individuals,Overall Homeless People in Families,Unsheltered Homeless,Sheltered Total Homeless,Overall_Homeless_Per_Capita,Overall_Homeless_Individuals_Per_Capita,Overall_Homeless_People_in_Families_Per_Capita,Unsheltered_Homeless_Per_Capita,Sheltered_Homeless_Per_Capita
0,263206,59532,3931531,8.837944,7.921203,39.721806,43.348727,7.143623,988.0,18.4,1023.0,736.0,287.0,53.0,970.0,0.003887,0.002796,0.00109,0.000201,0.003685
1,294038,69646,4276757,8.08637,7.885972,40.56539,45.259545,6.718333,1019.0,7.1,1208.0,837.0,371.0,179.0,1029.0,0.004108,0.002847,0.001262,0.000609,0.0035
2,289982,72098,4293890,8.044292,8.396699,40.757004,45.706091,6.087872,1007.0,17.2,1105.0,905.0,200.0,240.0,865.0,0.003811,0.003121,0.00069,0.000828,0.002983
3,289010,73387,4437615,8.016332,8.576839,40.898085,44.809796,5.806139,1023.0,4.5,1128.0,848.0,280.0,155.0,973.0,0.003903,0.002934,0.000969,0.000536,0.003367
4,287529,75827,4527440,9.103777,8.65776,40.044081,45.93231,5.663429,1028.0,6.7,1094.0,814.0,280.0,94.0,1000.0,0.003805,0.002831,0.000974,0.000327,0.003478


(3719, 20) rows x columns


# 4) PyCaret setup

In [13]:
_ = setup(
    data=df,
    target=TARGET,
    session_id=SESSION_ID,
    fold=FOLDS,
    train_size=TRAIN_SIZE,
    normalize=NORMALIZE,
    transform_target=TRANSFORM_TARGET,
    remove_multicollinearity=REMOVE_MULTICOLLINEARITY,
    multicollinearity_threshold=MULTICOLLINEARITY_THRESHOLD,
    ignore_features=IGNORE,
    verbose=VERBOSE_SETUP,
    log_experiment=LOG_EXPERIMENT,
)


Unnamed: 0,Description,Value
0,Session id,222
1,Target,Overall_Homeless_Per_Capita
2,Target type,Regression
3,Original data shape,"(3719, 20)"
4,Transformed data shape,"(3719, 9)"
5,Transformed train set shape,"(2975, 9)"
6,Transformed test set shape,"(744, 9)"
7,Ignore features,9
8,Numeric features,10
9,Preprocess,True


# 5) Compare models & capture leaderboard

In [14]:
top_models = compare_models(n_select=N_SELECT, sort="R2")
leaderboard = pull()
leaderboard.to_csv(os.path.join(OUTDIR, "leaderboard.csv"), index=False)
display(leaderboard.head(20))

# Get the single best (first) model whether list or estimator
best_model = top_models[0] if isinstance(top_models, list) else top_models


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.0005,0.0,0.0008,0.7869,0.0008,0.3289,1.39
et,Extra Trees Regressor,0.0004,0.0,0.0008,0.7798,0.0008,0.3294,0.172
lightgbm,Light Gradient Boosting Machine,0.0005,0.0,0.0009,0.7468,0.0009,0.3464,0.11
rf,Random Forest Regressor,0.0005,0.0,0.0009,0.7315,0.0009,0.3481,0.434
gbr,Gradient Boosting Regressor,0.0006,0.0,0.001,0.6686,0.001,0.4305,0.184
knn,K Neighbors Regressor,0.0006,0.0,0.001,0.6583,0.001,0.4306,0.044
dt,Decision Tree Regressor,0.0006,0.0,0.0013,0.4422,0.0013,0.4105,0.032
br,Bayesian Ridge,0.0009,0.0,0.0015,0.3382,0.0014,0.7461,0.028
lar,Least Angle Regression,0.0009,0.0,0.0015,0.3381,0.0014,0.7462,0.026
ridge,Ridge Regression,0.0009,0.0,0.0015,0.3381,0.0014,0.7462,0.038


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,0.0005,0.0,0.0008,0.7869,0.0008,0.3289,1.39
et,Extra Trees Regressor,0.0004,0.0,0.0008,0.7798,0.0008,0.3294,0.172
lightgbm,Light Gradient Boosting Machine,0.0005,0.0,0.0009,0.7468,0.0009,0.3464,0.11
rf,Random Forest Regressor,0.0005,0.0,0.0009,0.7315,0.0009,0.3481,0.434
gbr,Gradient Boosting Regressor,0.0006,0.0,0.001,0.6686,0.001,0.4305,0.184
knn,K Neighbors Regressor,0.0006,0.0,0.001,0.6583,0.001,0.4306,0.044
dt,Decision Tree Regressor,0.0006,0.0,0.0013,0.4422,0.0013,0.4105,0.032
br,Bayesian Ridge,0.0009,0.0,0.0015,0.3382,0.0014,0.7461,0.028
lar,Least Angle Regression,0.0009,0.0,0.0015,0.3381,0.0014,0.7462,0.026
ridge,Ridge Regression,0.0009,0.0,0.0015,0.3381,0.0014,0.7462,0.038


# 6) Optional: Tune best model

In [16]:
if TUNE:
    best_model = tune_model(best_model, optimize="R2")
    tuned_results = pull()
    display(tuned_results.head())


# 7) Optional: Blend top models (graceful if incompatible)

In [17]:
if BLEND and isinstance(top_models, list) and len(top_models) > 1:
    try:
        # blend up to the top 3 models by R2
        blended = blend_models(top_models[:min(3, len(top_models))], optimize="R2")
        blended_results = pull()
        display(blended_results.head())
        # Simple policy: keep the last trained model (blended) as "best"
        best_model = blended
    except Exception as e:
        print("Blending skipped due to:", repr(e))


# 8) Finalize on full training partition & evaluate holdout

In [18]:
final_model = finalize_model(best_model)

# Evaluate on hold-out set held inside PyCaret
holdout_preds = predict_model(final_model)   # returns a dataframe with predictions
holdout_path  = os.path.join(OUTDIR, "holdout_predictions.csv")
holdout_preds.to_csv(holdout_path, index=False)
display(holdout_preds.head())
print("Saved:", holdout_path)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.0003,0.0,0.0005,0.9357,0.0005,0.219


Unnamed: 0,Total Population,Median Gross Rent,Median Household Income,Poverty_Rate,Vacancy_Rate,Renter_Household_Rate,Cost_Burdened_Rate,Unemployment_Rate,"Total Year-Round Beds (ES, TH, SH)",Average Temperature,Overall_Homeless_Per_Capita,prediction_label
2705,1235371,346174,19394874,18.513466,13.938038,40.52145,48.248928,11.560264,1936.0,19.799999,0.001702,0.001683
1217,101744,29490,2081070,15.024964,12.575393,30.953615,36.398903,7.315742,88.0,22.200001,0.000914,0.000751
3412,277956,52806,3334185,13.451769,9.225744,33.123146,42.988457,5.303014,490.0,40.099998,0.000961,0.00149
2653,2034094,956963,54818010,6.552794,8.097219,21.629889,53.473621,4.754526,3206.0,15.8,0.001492,0.001434
390,62798,12867,686097,18.239435,20.464594,36.373993,44.720032,11.098045,81.0,48.599998,0.002166,0.003933


Saved: outputs\holdout_predictions.csv


# 9) Save artifacts

In [19]:
# Save pipeline (includes preprocessing + model)
pipeline_path_no_ext = os.path.join(OUTDIR, "best_pipeline")
save_model(final_model, pipeline_path_no_ext)  # creates best_pipeline.pkl (+ JSON)

# Save full experiment (version-dependent in PyCaret; wrap in try)
try:
    experiment_path = os.path.join(OUTDIR, "experiment.pkl")
    save_experiment(experiment_path)
    print("Saved experiment to:", experiment_path)
except Exception as e:
    print("save_experiment not available in this PyCaret version:", repr(e))


Transformation Pipeline and Model Successfully Saved
Saved experiment to: outputs\experiment.pkl


# 10) (Optional) Predict on the full input CSV

In [20]:
full_preds = predict_model(final_model, data=df)
full_preds_path = os.path.join(OUTDIR, "full_data_with_predictions.csv")
full_preds.to_csv(full_preds_path, index=False)
display(full_preds.head())
print("Saved:", full_preds_path)


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,CatBoost Regressor,0.0002,0.0,0.0004,0.9614,0.0004,0.1993


Unnamed: 0,Total Population,Median Gross Rent,Median Household Income,Poverty_Rate,Vacancy_Rate,Renter_Household_Rate,Cost_Burdened_Rate,Unemployment_Rate,"Total Year-Round Beds (ES, TH, SH)",Average Temperature,...,Overall Homeless Individuals,Overall Homeless People in Families,Unsheltered Homeless,Sheltered Total Homeless,Overall_Homeless_Individuals_Per_Capita,Overall_Homeless_People_in_Families_Per_Capita,Unsheltered_Homeless_Per_Capita,Sheltered_Homeless_Per_Capita,Overall_Homeless_Per_Capita,prediction_label
0,263206,59532,3931531,8.837944,7.921203,39.721806,43.348728,7.143623,988.0,18.4,...,736.0,287.0,53.0,970.0,0.002796,0.00109,0.000201,0.003685,0.003887,0.003797
1,294038,69646,4276757,8.08637,7.885972,40.565392,45.259544,6.718333,1019.0,7.1,...,837.0,371.0,179.0,1029.0,0.002847,0.001262,0.000609,0.0035,0.004108,0.004054
2,289982,72098,4293890,8.044292,8.396699,40.757004,45.706089,6.087873,1007.0,17.200001,...,905.0,200.0,240.0,865.0,0.003121,0.00069,0.000828,0.002983,0.003811,0.00393
3,289010,73387,4437615,8.016332,8.576838,40.898087,44.809795,5.806139,1023.0,4.5,...,848.0,280.0,155.0,973.0,0.002934,0.000969,0.000536,0.003367,0.003903,0.004012
4,287529,75827,4527440,9.103777,8.657761,40.044083,45.932308,5.663429,1028.0,6.7,...,814.0,280.0,94.0,1000.0,0.002831,0.000974,0.000327,0.003478,0.003805,0.00409


Saved: outputs\full_data_with_predictions.csv


# 11) Quick summary

In [21]:
print("\n=== Done ===")
print(f"Artifacts saved in: {OUTDIR}")
print(" - leaderboard.csv")
print(" - holdout_predictions.csv")
print(" - full_data_with_predictions.csv")
print(" - best_pipeline.pkl (via save_model)")
print(" - experiment.pkl (via save_experiment, if supported)")


=== Done ===
Artifacts saved in: outputs
 - leaderboard.csv
 - holdout_predictions.csv
 - full_data_with_predictions.csv
 - best_pipeline.pkl (via save_model)
 - experiment.pkl (via save_experiment, if supported)
