In [1]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Regression Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor

In [2]:
try:
    from xgboost import XGBRegressor
    xgb_available = True
except ImportError:
    xgb_available = False
    print("⚠ XGBoost not installed.")

try:
    import lightgbm as lgb
    lgb_available = True
except ImportError:
    lgb_available = False
    print("⚠ LightGBM not installed.")

try:
    from catboost import CatBoostRegressor, Pool
    catboost_available = True
except ImportError:
    catboost_available = False
    print("⚠ CatBoost not installed.")

 ================== LOAD DATA ==================

In [3]:
df = pd.read_csv('D:\waste_management\data\processed\waste_data_feature_engineered.csv')

target_col = 'Recycling Rate (%)'
exclude_cols = [target_col, 'Year', 'City_WasteType'] 

In [4]:
feature_cols = [col for col in df.columns if col not in exclude_cols]
# Numeric-only features for models that can't handle categoricals
non_numeric_cols = df[feature_cols].select_dtypes(include=['object']).columns.tolist()
feature_cols_non_cat = [col for col in feature_cols if col not in non_numeric_cols]

In [5]:
# Train/test split (time-based)
train_df = df[df['Year'] < 2023].reset_index(drop=True)
test_df = df[df['Year'] == 2023].reset_index(drop=True)


In [6]:
y_train = train_df[target_col].values
y_test = test_df[target_col].values

In [7]:
# For models that need numeric-only data with no NaNs
imputer = SimpleImputer(strategy='median')
X_train_num = imputer.fit_transform(train_df[feature_cols_non_cat])
X_test_num = imputer.transform(test_df[feature_cols_non_cat])

In [8]:
# Scaled version for SVR and some linear models (optional)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_num)
X_test_scaled = scaler.transform(X_test_num)

In [9]:
results = []

def evaluate_model(name, model, X_train, y_train, X_test, y_test):
    """Fit model, predict, evaluate RMSE & R2, and save results."""
    print(f"\n🔹 Training {name}...")
    start_time = time.time()
    model.fit(X_train, y_train)
    duration = time.time() - start_time
    preds = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"{name} completed in {duration:.2f}s | RMSE: {rmse:.4f} | R²: {r2:.4f}")
    results.append({'Model': name, 'RMSE': rmse, 'R2': r2, 'Train Time (s)': duration})

================== MODEL RUNS ==================

1. Linear Regression

In [10]:
evaluate_model("Linear Regression", LinearRegression(), X_train_num, y_train, X_test_num, y_test)


🔹 Training Linear Regression...
Linear Regression completed in 0.02s | RMSE: 14.5333 | R²: 0.1692


2. Ridge Regression

In [11]:
evaluate_model("Ridge Regression", Ridge(alpha=1.0, random_state=42), X_train_num, y_train, X_test_num, y_test)


🔹 Training Ridge Regression...
Ridge Regression completed in 0.01s | RMSE: 14.5208 | R²: 0.1707


3. Lasso Regression

In [12]:
evaluate_model("Lasso Regression", Lasso(alpha=0.01, random_state=42, max_iter=10000), X_train_num, y_train, X_test_num, y_test)


🔹 Training Lasso Regression...
Lasso Regression completed in 0.00s | RMSE: 14.5147 | R²: 0.1714


4. Support Vector Regression (scaled data)

In [13]:
evaluate_model("Support Vector Regression", SVR(kernel='rbf', C=1.0, epsilon=0.1), X_train_scaled, y_train, X_test_scaled, y_test)


🔹 Training Support Vector Regression...
Support Vector Regression completed in 0.03s | RMSE: 15.1765 | R²: 0.0941


5. Random Forest Regression

In [14]:
evaluate_model("Random Forest Regression", RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42, n_jobs=-1), X_train_num, y_train, X_test_num, y_test)


🔹 Training Random Forest Regression...
Random Forest Regression completed in 0.62s | RMSE: 15.9832 | R²: -0.0048


6. XGBoost Regression

In [15]:
if xgb_available:
    evaluate_model("XGBoost Regression",
                   XGBRegressor(n_estimators=500, learning_rate=0.05, max_depth=6,
                                random_state=42, n_jobs=-1, verbosity=0),
                   X_train_num, y_train, X_test_num, y_test)


🔹 Training XGBoost Regression...
XGBoost Regression completed in 1.67s | RMSE: 17.3724 | R²: -0.1870


7. LightGBM Regression

In [16]:
if lgb_available:
    evaluate_model("LightGBM Regression",
                   lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=10,
                                     random_state=42, n_jobs=-1),
                   X_train_num, y_train, X_test_num, y_test)


🔹 Training LightGBM Regression...
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1931
[LightGBM] [Info] Number of data points in the train set: 680, number of used features: 20
[LightGBM] [Info] Start training from score 57.019118
LightGBM Regression completed in 0.86s | RMSE: 17.9472 | R²: -0.2669


8. CatBoost Regression

In [17]:
if lgb_available:
    evaluate_model("LightGBM Regression",
                   lgb.LGBMRegressor(n_estimators=500, learning_rate=0.05, max_depth=10,
                                     random_state=42, n_jobs=-1),
                   X_train_num, y_train, X_test_num, y_test)

# 8. CatBoost Regression (uses categorical features directly)
if catboost_available:
    categorical_feats = ['City/District', 'Waste Type', 'Disposal Method', 'Landfill Name']
    for cat_col in categorical_feats:
        if cat_col in train_df.columns:
            train_df[cat_col] = train_df[cat_col].astype('category')
            test_df[cat_col] = test_df[cat_col].astype('category')

    train_pool = Pool(data=train_df[feature_cols], label=y_train,
                      cat_features=[feature_cols.index(c) for c in categorical_feats if c in feature_cols])
    test_pool = Pool(data=test_df[feature_cols], label=y_test,
                     cat_features=[feature_cols.index(c) for c in categorical_feats if c in feature_cols])

    cat_model = CatBoostRegressor(iterations=500, learning_rate=0.05, depth=6,
                                  random_seed=42, eval_metric='RMSE',
                                  verbose=100, early_stopping_rounds=30)
    print("\n🔹 Training CatBoost Regression...")
    start_time = time.time()
    cat_model.fit(train_pool, eval_set=test_pool)
    duration = time.time() - start_time
    preds = cat_model.predict(test_df[feature_cols])
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    print(f"CatBoost Regression completed in {duration:.2f}s | RMSE: {rmse:.4f} | R²: {r2:.4f}")
    results.append({'Model': 'CatBoost Regression', 'RMSE': rmse, 'R2': r2, 'Train Time (s)': duration})


🔹 Training LightGBM Regression...
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1931
[LightGBM] [Info] Number of data points in the train set: 680, number of used features: 20
[LightGBM] [Info] Start training from score 57.019118
LightGBM Regression completed in 0.87s | RMSE: 17.9472 | R²: -0.2669

🔹 Training CatBoost Regression...
0:	learn: 16.0544048	test: 15.8701644	best: 15.8701644 (0)	total: 117ms	remaining: 58.6s
100:	learn: 12.4278773	test: 14.3201637	best: 14.3201637 (100)	total: 3.37s	remaining: 13.3s
Stopped by overfitting detector  (30 iterations wait)

bestTest = 14.28252404
bestIteration = 111

Shrink model to first 112 iterations.
CatBoost Regression completed in 4.89s | RMSE: 14.2825 | R²: 0.1977


 ================== RESULTS ==================

In [18]:
results_df = pd.DataFrame(results).sort_values(by='RMSE').reset_index(drop=True)
print("\n🏆 Model Comparison Results:")
print(results_df)


🏆 Model Comparison Results:
                       Model       RMSE        R2  Train Time (s)
0        CatBoost Regression  14.282524  0.197670        4.894824
1           Lasso Regression  14.514694  0.171373        0.001121
2           Ridge Regression  14.520818  0.170674        0.005998
3          Linear Regression  14.533344  0.169243        0.015052
4  Support Vector Regression  15.176454  0.094093        0.026418
5   Random Forest Regression  15.983208 -0.004780        0.618122
6         XGBoost Regression  17.372442 -0.187039        1.668615
7        LightGBM Regression  17.947198 -0.266883        0.859451
8        LightGBM Regression  17.947198 -0.266883        0.872703
