In [1]:
import pandas as pd
from catboost import CatBoostRegressor, Pool
import optuna
from sklearn.metrics import mean_squared_error
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load your feature-engineered dataset
df = pd.read_csv('D:\waste_management\data\processed\waste_data_feature_engineered.csv')

In [3]:
target = 'Recycling Rate (%)'
categorical_cols = ['City/District', 'Waste Type', 'Disposal Method', 'Landfill Name']

In [4]:
# Prepare train and test splits by year
train_df = df[df['Year'] < 2023].reset_index(drop=True)
test_df = df[df['Year'] == 2023].reset_index(drop=True)

y_train = train_df[target]
y_test = test_df[target]


In [5]:
# Ensure categorical columns are of 'category' dtype, necessary for CatBoost native handling
for col in categorical_cols:
    train_df[col] = train_df[col].astype('category')
    test_df[col] = test_df[col].astype('category')

In [6]:
# Feature columns excluding target and non-feature columns
exclude_cols = [target, 'Year', 'City_WasteType']
feature_cols = [col for col in df.columns if col not in exclude_cols]

train_pool = Pool(data=train_df[feature_cols], label=y_train, cat_features=[feature_cols.index(c) for c in categorical_cols])
test_pool = Pool(data=test_df[feature_cols], label=y_test, cat_features=[feature_cols.index(c) for c in categorical_cols])

In [7]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'random_seed': 42,
        'eval_metric': 'RMSE',
        'verbose': False,
        'early_stopping_rounds': 50
    }

    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=test_pool, use_best_model=True)

    preds = model.predict(test_df[feature_cols])
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    return rmse

In [None]:
# Create Optuna study to minimize RMSE
study = optuna.create_study(direction='minimize', study_name="CatBoost_Hyperparam_Tuning")
study.optimize(objective, n_trials=50, timeout=1800)  # Run for 50 trials or 30 minutes

print(f"Best Trial RMSE: {study.best_value:.4f}")
print("Best Hyperparameters:")
for key, value in study.best_params.items():
    print(f"  {key}: {value}")

[32m[I 2025-08-10 19:23:58,870][0m A new study created in memory with name: CatBoost_Hyperparam_Tuning[0m
[32m[I 2025-08-10 19:24:02,504][0m Trial 0 finished with value: 14.46669058270715 and parameters: {'iterations': 867, 'depth': 6, 'learning_rate': 0.08271594247571787, 'l2_leaf_reg': 3.694190531286841, 'border_count': 239}. Best is trial 0 with value: 14.46669058270715.[0m


In [None]:
final_params = {
    'iterations': 1490,
    'depth': 5,
    'learning_rate': 0.07852514854846948,
    'l2_leaf_reg': 6.7065746334612495,
    'border_count': 180,
    'random_seed': 42,
    'eval_metric': 'RMSE',
    'verbose': 100,
    'early_stopping_rounds': 100
}

In [None]:
final_model = CatBoostRegressor(**final_params)
final_model.fit(train_pool, eval_set=test_pool, use_best_model=True)

0:	learn: 16.0109655	test: 15.8467568	best: 15.8467568 (0)	total: 24.9ms	remaining: 37s
100:	learn: 12.5058879	test: 14.0624829	best: 14.0624829 (100)	total: 2.62s	remaining: 36s
200:	learn: 10.7901829	test: 14.3305091	best: 13.9590936 (130)	total: 5.21s	remaining: 33.4s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 13.95909358
bestIteration = 130

Shrink model to first 131 iterations.


<catboost.core.CatBoostRegressor at 0x19366c34c40>

In [None]:
from sklearn.metrics import r2_score
# Evaluate
final_preds = final_model.predict(test_df[feature_cols])
final_rmse = np.sqrt(mean_squared_error(y_test, final_preds))
final_r2 = r2_score(y_test, final_preds)

print(f"Final RMSE: {final_rmse:.4f}")
print(f"Final R²: {final_r2:.4f}")

Final RMSE: 13.9591
Final R²: 0.2336


In [None]:
import joblib
final_model.save_model("D:\waste_management\models\catboost_tuned_model.cbm")
joblib.dump(final_model, "D:\waste_management\models\catboost_tuned_model.pkl")


['D:\\waste_management\\models\\catboost_tuned_model.pkl']