# 03 — Modelling

Train and evaluate six regression models on the London Airbnb dataset:

| # | Model | Type |
|---|---|---|
| 1 | Linear Regression | Baseline |
| 2 | Ridge Regression | L2 Regularisation |
| 3 | Lasso Regression | L1 Regularisation |
| 4 | Random Forest | Ensemble (bagging) |
| 5 | XGBoost | Gradient Boosting |
| 6 | CatBoost | Gradient Boosting |

In [None]:
# Put the parent directory at the front of the import path. This has to run
# before the `from src...` imports below, which rely on it to locate the
# project-local `src` package (presumably the notebook sits one level below
# the repository root — confirm).
import sys
sys.path.insert(0, '..')

import pandas as pd
import numpy as np
import warnings
# NOTE(review): this filter has no category or module restriction, so every
# warning raised for the rest of the session is hidden, including any emitted
# while the models below are being fitted. Consider narrowing it.
warnings.filterwarnings('ignore')

# Project helpers for loading, cleaning and feature engineering.
# NOTE(review): `load_reviews` is imported but not called in any cell shown.
from src.data_loader import load_listings, load_reviews
from src.data_cleaning import clean_listings
from src.feature_engineering import engineer_features
# Project helpers for feature preparation, the six trainers, and evaluation.
from src.modeling import (
    prepare_features, split_data,
    train_linear_regression, train_ridge, train_lasso,
    train_random_forest, train_xgboost, train_catboost,
    evaluate_model, compare_models, run_kfold_cv,
    get_feature_importance,
)
# Project plotting helpers used by the cells below.
from src.visualization import (
    set_style,
    plot_predictions_vs_actual,
    plot_feature_importance,
    plot_regularization_trace,
    plot_model_comparison,
)

# Apply the project's plotting style once, before any figure is drawn.
set_style()

## 1. Prepare Data

Load the listings, clean them, engineer features, then split the result into train/test sets.

In [None]:
# Load and process
# The raw frame is kept under its own name; the cleaned copy is what every
# later step works on.
listings_raw = load_listings()  # Uses DEFAULT_CITY from config
listings = clean_listings(listings_raw)

# If you have pre-computed sentiment, merge it here.
# Otherwise, run notebook 02 first to generate sentiment scores.
# NOTE(review): no merge is performed in this cell as written, so
# engineer_features() receives exactly what clean_listings() returned —
# confirm that is intended.

listings = engineer_features(listings)  # Uses DEFAULT_CITY from config
# Print the shape of the engineered frame as a quick sanity check.
print(f'Feature matrix: {listings.shape}')

In [None]:
# Turn the engineered listings into model inputs, then hold out a test set.
# `scaler` is returned by prepare_features but is not used again in the cells
# shown here.
# NOTE(review): a scaler coming back from prepare_features, before
# split_data runs, suggests scaling happens ahead of the split — confirm it
# is not fit on rows that later land in the test set.
X, y, scaler = prepare_features(listings)
X_train, X_test, y_train, y_test = split_data(X, y)

# Report the row count of each partition and the number of feature columns.
print(
    f'Training set: {X_train.shape[0]:,} samples',
    f'Test set:     {X_test.shape[0]:,} samples',
    f'Features:     {X_train.shape[1]}',
    sep='\n',
)

## 2. Linear Regression (Baseline)

In [None]:
# Train the baseline on the training split, then evaluate it with both the
# training and the test split passed in.
lr_model = train_linear_regression(X_train, y_train)
lr_results = evaluate_model(lr_model, X_train, X_test, y_train, y_test)

# Headline test metrics first, then the predicted-vs-actual plot built from
# the train and test predictions stored in the results.
print(
    f'Linear Regression — Test R²: {lr_results["test_r2"]}, '
    f'Test MSE: {lr_results["test_mse"]}'
)
plot_predictions_vs_actual(
    y_train, lr_results['train_pred'],
    y_test, lr_results['test_pred'],
    title='Linear Regression',
)

## 3. Ridge Regression

In [None]:
# train_ridge hands back the model together with a trace; the trace is only
# used for the regularisation plot below.
ridge_model, ridge_trace = train_ridge(X_train, y_train)
ridge_results = evaluate_model(ridge_model, X_train, X_test, y_train, y_test)

# Test metrics, then the trace plot, then predicted-vs-actual (kept in this
# order so the notebook output reads top to bottom).
print(
    f'Ridge — Test R²: {ridge_results["test_r2"]}, '
    f'Test MSE: {ridge_results["test_mse"]}'
)
plot_regularization_trace(ridge_trace, title='Ridge Regression Trace')
plot_predictions_vs_actual(
    y_train, ridge_results['train_pred'],
    y_test, ridge_results['test_pred'],
    title='Ridge Regression',
)

## 4. Lasso Regression

In [None]:
# train_lasso hands back the model together with a trace; the trace is only
# used for the regularisation plot below.
lasso_model, lasso_trace = train_lasso(X_train, y_train)
lasso_results = evaluate_model(lasso_model, X_train, X_test, y_train, y_test)

# Test metrics, then the trace plot, then predicted-vs-actual.
print(
    f'Lasso — Test R²: {lasso_results["test_r2"]}, '
    f'Test MSE: {lasso_results["test_mse"]}'
)
plot_regularization_trace(lasso_trace, title='Lasso Regression Trace')
plot_predictions_vs_actual(
    y_train, lasso_results['train_pred'],
    y_test, lasso_results['test_pred'],
    title='Lasso Regression',
)

## 5. Random Forest

In [None]:
# Train the random forest on the training split and evaluate it with both
# splits passed in.
rf_model = train_random_forest(X_train, y_train)
rf_results = evaluate_model(rf_model, X_train, X_test, y_train, y_test)

print(
    f'Random Forest — Test R²: {rf_results["test_r2"]}, '
    f'Test MSE: {rf_results["test_mse"]}'
)

# Importances are requested with the training column labels so the plot can
# be labelled by feature; the predicted-vs-actual plot follows.
rf_importance = get_feature_importance(rf_model, X_train.columns)
plot_feature_importance(
    rf_importance,
    title='Random Forest — Feature Importance',
)
plot_predictions_vs_actual(
    y_train, rf_results['train_pred'],
    y_test, rf_results['test_pred'],
    title='Random Forest',
)

## 6. XGBoost

In [None]:
# Train XGBoost on the training split and evaluate it with both splits
# passed in.
xgb_model = train_xgboost(X_train, y_train)
xgb_results = evaluate_model(xgb_model, X_train, X_test, y_train, y_test)

print(
    f'XGBoost — Test R²: {xgb_results["test_r2"]}, '
    f'Test MSE: {xgb_results["test_mse"]}'
)

# Importances are requested with the training column labels; the
# predicted-vs-actual plot follows the importance plot.
xgb_importance = get_feature_importance(xgb_model, X_train.columns)
plot_feature_importance(
    xgb_importance,
    title='XGBoost — Feature Importance',
)
plot_predictions_vs_actual(
    y_train, xgb_results['train_pred'],
    y_test, xgb_results['test_pred'],
    title='XGBoost',
)

## 7. CatBoost

In [None]:
# Train CatBoost on the training split and evaluate it with both splits
# passed in.
catb_model = train_catboost(X_train, y_train)
catb_results = evaluate_model(catb_model, X_train, X_test, y_train, y_test)

print(
    f'CatBoost — Test R²: {catb_results["test_r2"]}, '
    f'Test MSE: {catb_results["test_mse"]}'
)

# Importances are requested with the training column labels; the
# predicted-vs-actual plot follows the importance plot.
catb_importance = get_feature_importance(catb_model, X_train.columns)
plot_feature_importance(
    catb_importance,
    title='CatBoost — Feature Importance',
)
plot_predictions_vs_actual(
    y_train, catb_results['train_pred'],
    y_test, catb_results['test_pred'],
    title='CatBoost',
)

## 8. Model Comparison

In [None]:
# Gather the per-model evaluation results under display names. Insertion
# order is the order in which the models were trained above.
all_results = {
    'Linear Regression': lr_results,
    'Ridge': ridge_results,
    'Lasso': lasso_results,
    'Random Forest': rf_results,
    'XGBoost': xgb_results,
    'CatBoost': catb_results,
}

# Build the comparison table and render it in the notebook output.
# `display` is the notebook-provided built-in; it is not imported here.
comparison = compare_models(all_results)
display(comparison)

# One comparison chart per metric, R² first and then MSE.
for metric_name in ('Test R²', 'Test MSE'):
    plot_model_comparison(comparison, metric=metric_name)

## 9. K-Fold Cross-Validation (10-Fold)

In [None]:
# The model objects trained in the sections above, keyed by display name.
# NOTE(review): these instances have already been fitted on X_train —
# confirm run_kfold_cv refits (or clones) them per fold instead of reusing
# that state.
cv_models = {
    'Linear Regression': lr_model,
    'Ridge': ridge_model,
    'Lasso': lasso_model,
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'CatBoost': catb_model,
}

# 10-fold cross-validation over the full X / y rather than the training
# split alone, so rows from the earlier hold-out set take part in the folds.
cv_results = run_kfold_cv(
    cv_models,
    X,
    y,
    k=10,
)
display(cv_results)
display(cv_results)