# 04 — Ablation Study

Assess the incremental impact of each feature group by running all six models
on progressively larger feature sets:

| Config | Features Included |
|---|---|
| **Baseline** | Listing characteristics only |
| **+ Amenities** | Baseline + 30 amenity dummies |
| **+ Neighbourhoods** | Above + neighbourhood one-hot encoding |
| **+ Distance** | Above + Haversine distance to city centre (full BANR+D model) |

In [None]:
import sys
# Put the parent directory first on the import path; the `src.*` imports below
# presumably rely on this (notebook assumed to live one level below the
# project root — confirm). Must run before those imports.
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import warnings
# Suppress every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides convergence/deprecation warnings from the
# model-training cells.
warnings.filterwarnings('ignore')

# Project pipeline stages: load -> clean -> engineer features.
from src.data_loader import load_listings
from src.data_cleaning import clean_listings
from src.feature_engineering import engineer_features
# Project modelling helpers: data splitting, one trainer per model family,
# and evaluation/comparison utilities.
from src.modeling import (
    prepare_features, split_data,
    train_linear_regression, train_ridge, train_lasso,
    train_random_forest, train_xgboost, train_catboost,
    evaluate_model, compare_models,
)
# Mapping whose keys are used as the amenity feature names in this notebook.
from src.config import AMENITY_KEYWORDS
from src.visualization import set_style, plot_model_comparison

# Apply the project's plotting style helper before any figure is drawn.
set_style()

## 1. Build Full Feature Set

In [None]:
# Load the raw listings (load_listings falls back to DEFAULT_CITY from config).
listings_raw = load_listings()

# Clean the raw table, then derive the engineered features in one pass;
# engineer_features likewise uses DEFAULT_CITY from config.
listings = engineer_features(clean_listings(listings_raw))

# Report the (rows, columns) shape of the fully engineered table.
print(f'Full feature set: {listings.shape}')

## 2. Define Feature Groups

In [None]:
# Partition the engineered columns into the feature groups that the ablation
# adds one at a time. Every group is restricted to columns actually present in
# `listings`, so the feature counts printed below match the columns that can
# really be selected later.
amenity_cols = [c for c in AMENITY_KEYWORDS.keys() if c in listings.columns]
neighbourhood_cols = [c for c in listings.columns if c.startswith('neighbourhood_')]
distance_cols = [c for c in ['distance_to_city_center'] if c in listings.columns]

# The identifier, the regression target (`log_price`) and `estimated_revenue`
# are never offered to the models as predictors.
# NOTE(review): any other target-derived or non-numeric column still present in
# `listings` would fall into the baseline group — confirm against
# engineer_features.
non_feature_cols = {'id', 'log_price', 'estimated_revenue'}
all_feature_cols = [c for c in listings.columns if c not in non_feature_cols]

# Baseline = everything left once the three add-on groups are removed.
# The exclusion set is built once instead of re-concatenating three lists and
# scanning them linearly for every column.
addon_cols = set(amenity_cols) | set(neighbourhood_cols) | set(distance_cols)
baseline_cols = [c for c in all_feature_cols if c not in addon_cols]

# Each configuration is a strict superset of the previous one; dict insertion
# order is the order in which the ablation is run and reported.
configs = {
    'Baseline':            baseline_cols,
    '+ Amenities':         baseline_cols + amenity_cols,
    '+ Neighbourhoods':    baseline_cols + amenity_cols + neighbourhood_cols,
    '+ Distance (Full)':   baseline_cols + amenity_cols + neighbourhood_cols + distance_cols,
}

# Quick sanity check of how many features each configuration contributes.
for name, cols in configs.items():
    print(f'{name:25s}: {len(cols)} features')

## 3. Run All Models on Each Configuration

In [None]:
from sklearn.preprocessing import StandardScaler

# Configuration name -> comparison table produced by compare_models.
ablation_results = {}

for config_name, feature_cols in configs.items():
    print(f'\n{"="*60}')
    print(f'Configuration: {config_name} ({len(feature_cols)} features)')
    print(f'{"="*60}')

    # Select this configuration's columns (ignoring any that are absent) and
    # the regression target.
    existing_cols = [c for c in feature_cols if c in listings.columns]
    X = listings[existing_cols]
    y = listings['log_price']

    # Split BEFORE scaling. Fitting the scaler on the full dataset would let
    # test-set means/variances leak into the training features and bias the
    # reported test metrics.
    # NOTE(review): assumes split_data returns pandas objects when given
    # pandas inputs (the `.index` of each split is used below) — confirm.
    X_train_raw, X_test_raw, y_train, y_test = split_data(X, y)

    # Standardise using statistics learned from the training split only, then
    # apply the same transformation to the held-out split.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train_raw),
        columns=X.columns,
        index=X_train_raw.index,
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test_raw),
        columns=X.columns,
        index=X_test_raw.index,
    )

    # Train all six models on the same training split. train_ridge and
    # train_lasso return a sequence whose first element is the fitted model.
    models = {
        'LR':     train_linear_regression(X_train, y_train),
        'Ridge':  train_ridge(X_train, y_train)[0],
        'Lasso':  train_lasso(X_train, y_train)[0],
        'RF':     train_random_forest(X_train, y_train),
        'XGB':    train_xgboost(X_train, y_train),
        'CatB':   train_catboost(X_train, y_train),
    }

    # Evaluate each model and echo its test R² / MSE as a progress line.
    config_results = {}
    for model_name, model in models.items():
        results = evaluate_model(model, X_train, X_test, y_train, y_test)
        config_results[model_name] = results
        print(f'  {model_name:8s} — R²: {results["test_r2"]:.4f}  MSE: {results["test_mse"]:.4f}')

    # Collapse the per-model results into one comparison table per configuration.
    ablation_results[config_name] = compare_models(config_results)

## 4. Comparison Summary

In [None]:
# Flatten the per-configuration comparison tables into long format:
# one record per (configuration, model) pair carrying its test metrics.
summary_rows = [
    {
        'Configuration': config_name,
        'Model': model_name,
        'Test R²': comparison_df.loc[model_name, 'Test R²'],
        'Test MSE': comparison_df.loc[model_name, 'Test MSE'],
    }
    for config_name, comparison_df in ablation_results.items()
    for model_name in comparison_df.index
]
summary_df = pd.DataFrame(summary_rows)

# Reshape to model × configuration (values: Test R²) and put the columns back
# into the order in which the configurations were defined.
config_order = list(configs.keys())
pivot_r2 = summary_df.pivot(
    index='Model', columns='Configuration', values='Test R²'
)[config_order]
display(pivot_r2)

In [None]:
# Grouped bar chart: one cluster per feature configuration, one bar per model,
# showing how Test R² changes as feature groups are added.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(12, 6))

# Transpose so configurations run along the x-axis and models form the series.
r2_by_config = pivot_r2.T
r2_by_config.plot.bar(ax=ax, width=0.8)

# Titles and axis labels are applied after plotting so they override the
# defaults pandas sets.
ax.set(
    title='Ablation Study — Test R² by Feature Configuration',
    ylabel='Test R²',
    xlabel='Feature Configuration',
)

# Slightly rotate the configuration names so they do not overlap.
tick_labels = ax.get_xticklabels()
ax.set_xticklabels(tick_labels, rotation=15, ha='right')

# Place the legend outside the axes so it never covers the bars.
ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

plt.tight_layout()
plt.show()