<a href="https://colab.research.google.com/github/arvindsuresh-math/Fall-2025-Team-Big-Data/blob/main/notebooks/fully_connected_baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 0. Setup and Installations

This cell prepares the Google Colab environment by mounting Google Drive, changing the working directory to our project folder to ensure all custom modules can be imported, installing the required Python packages, and handling Hugging Face authentication.

In [1]:
# --- Mount Google Drive ---
from google.colab import drive
drive.mount('/content/drive')

# --- Change Directory to Project Folder ---
# This is a crucial step that makes all local imports work seamlessly
import os
# IMPORTANT: Make sure this path matches the location of your project folder in Google Drive
PROJECT_PATH = '/content/drive/MyDrive/Airbnb_Price_Project'
os.chdir(PROJECT_PATH)
print(f"Current working directory: {os.getcwd()}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Current working directory: /content/drive/MyDrive/Airbnb_Price_Project


In [2]:
# --- Hugging Face Authentication ---
from google.colab import userdata
from huggingface_hub import login
print("\nAttempting Hugging Face login...")
try:
    HF_TOKEN = userdata.get('HF_TOKEN')
    login(token=HF_TOKEN)
    print("Hugging Face login successful.")
except Exception as e:
    print(f"Could not log in. Please ensure 'HF_TOKEN' is a valid secret. Error: {e}")


Attempting Hugging Face login...
Hugging Face login successful.


In [3]:
# --- Install Dependencies ---
!pip install pandas
!pip install pyarrow
!pip install sentence-transformers
!pip install scikit-learn
!pip install torch
!pip install tqdm
!pip install transformers
!pip install matplotlib
!pip install seaborn



In [4]:
import os
import time
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from sentence_transformers import SentenceTransformer

In [5]:
class AirbnbPriceDataset(Dataset):
    """
    PyTorch Dataset to handle feature collation and on-the-fly tokenization.
    """
    def __init__(self, features: dict, tokenizer: AutoTokenizer):
        self.features = features
        self.tokenizer = tokenizer
        self.n_samples = len(features['target_price'])

    def __len__(self) -> int:
        return self.n_samples

    def __getitem__(self, index: int) -> dict:
        item = {
            'loc_geo_position': torch.tensor(self.features['location']['geo_position'][index], dtype=torch.float32),
            'season_cyclical': torch.tensor(self.features['seasonality']['cyclical'][index], dtype=torch.float32),
            'target_price': torch.tensor(self.features['target_price'][index], dtype=torch.float32),
            'target_log_deviation': torch.tensor(self.features['target_log_deviation'][index], dtype=torch.float32),
            'neighborhood_log_mean': torch.tensor(self.features['neighborhood_log_mean'][index], dtype=torch.float32),
        }
        for k, v in self.features['size_capacity'].items():
            dtype = torch.long if k in ['property_type', 'room_type'] else torch.float32
            item[f'size_{k}'] = torch.tensor(v[index], dtype=dtype)
        for k, v in self.features['quality'].items():
            item[f'qual_{k}'] = torch.tensor(v[index], dtype=torch.float32)

        item['amenities_tokens'] = self.tokenizer(
            self.features['amenities_text'][index], padding='max_length', truncation=True,
            max_length=128, return_tensors="pt"
        )
        item['description_tokens'] = self.tokenizer(
            self.features['description_text'][index], padding='max_length', truncation=True,
            max_length=256, return_tensors="pt"
        )
        return item

In [6]:
class BaselineFCNV2(nn.Module):
    """
    A regularized, smaller Fully Connected Network to combat overfitting.

    Changes from the original baseline:
    1. Reduced layer sizes from (256 -> 64) to (128 -> 32) to decrease model capacity.
    2. Added Dropout layers after each main block for regularization.
    3. Dropout rate is configurable via the __init__ method.
    """
    def __init__(self, processor: 'FeatureProcessor', config: dict, dropout_rate: float = 0.4):
        super().__init__()
        self.config = config
        self.device = self.config['DEVICE']

        # --- Embeddings & Encoders (same as original model) ---
        self.embed_property_type = nn.Embedding(len(processor.vocabs['property_type']), 8)
        self.embed_room_type = nn.Embedding(len(processor.vocabs['room_type']), 4)
        self.text_transformer = SentenceTransformer(self.config['TEXT_MODEL_NAME'])
        # Freeze the transformer completely for the baseline
        for param in self.text_transformer.parameters():
            param.requires_grad = False

        # --- Calculate the total input dimension for the MLP ---
        text_embed_dim = self.text_transformer.get_sentence_embedding_dimension()
        total_input_dim = (
            config['GEO_EMBEDDING_DIM'] +      # Location
            8 + 4 + 4 +                       # Size (embeddings + 4 numerical)
            8 +                               # Quality (8 numerical)
            2 +                               # Seasonality
            text_embed_dim +                  # Amenities
            text_embed_dim                    # Description
        )

        # --- Main MLP (Smaller and with Dropout) ---
        self.main_mlp = nn.Sequential(
            nn.Linear(total_input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(p=dropout_rate),

            nn.Linear(128, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(p=dropout_rate),

            nn.Linear(32, 1)
        )
        self.to(self.device)

    def forward(self, batch: dict) -> torch.Tensor:
        """
        Performs a full forward pass, returning only the final prediction.
        """
        # --- Prepare all feature tensors ---
        loc_input = batch['loc_geo_position']
        size_input = torch.cat([
            self.embed_property_type(batch['size_property_type']),
            self.embed_room_type(batch['size_room_type']),
            batch['size_accommodates'].unsqueeze(1),
            batch['size_bedrooms'].unsqueeze(1),
            batch['size_beds'].unsqueeze(1),
            batch['size_bathrooms'].unsqueeze(1)
        ], dim=1)
        qual_cols = [
            "review_scores_rating", "review_scores_cleanliness", "review_scores_checkin",
            "review_scores_communication", "review_scores_location", "review_scores_value",
            "total_reviews", "host_is_superhost"
        ]
        qual_input = torch.cat([batch[f'qual_{c}'].unsqueeze(1) for c in qual_cols], dim=1)
        season_input = batch['season_cyclical']

        # Get text embeddings (on the fly)
        amenities_tokens = {k: v.squeeze(1) for k, v in batch['amenities_tokens'].items()}
        desc_tokens = {k: v.squeeze(1) for k, v in batch['description_tokens'].items()}
        with torch.no_grad(): # Ensure no gradients are computed for the frozen transformer
            amenities_embed = self.text_transformer(amenities_tokens)['sentence_embedding']
            desc_embed = self.text_transformer(desc_tokens)['sentence_embedding']

        # --- Concatenate all features into a single vector ---
        full_input_vector = torch.cat([
            loc_input, size_input, qual_input, season_input, amenities_embed, desc_embed
        ], dim=1)

        # --- Pass through the main MLP ---
        return self.main_mlp(full_input_vector).squeeze(-1)

    def count_parameters(self):
        """Counts and prints the number of trainable and frozen parameters."""
        total_trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
        total_frozen = sum(p.numel() for p in self.parameters() if not p.requires_grad)
        print("-" * 40)
        print(f"{'Baseline V2 Model Parameter Analysis':^40}")
        print("-" * 40)
        print(f"{'Total Trainable Parameters:':<30} {total_trainable:,}")
        print(f"{'Total Frozen Parameters:':<30} {total_frozen:,}")
        print(f"{'Total Parameters:':<30} {(total_trainable + total_frozen):,}")
        print("-" * 40)

In [8]:
def evaluate_model(model, data_loader, device):
    """Calculates validation loss (MSE) and Mean Absolute Percentage Error (MAPE)."""
    model.eval()
    total_loss, total_mape = 0.0, 0.0
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Evaluating", leave=False):
            # Move batch to device
            for k, v in batch.items():
                if isinstance(v, torch.Tensor): batch[k] = v.to(device)
                else: batch[k] = {sk: sv.to(device) for sk, sv in v.items()}

            targets_price = batch['target_price']
            targets_log_dev = batch['target_log_deviation']

            # Forward pass
            with torch.amp.autocast(device_type=device, dtype=torch.float16, enabled=(device=="cuda")):
                preds_log_dev = model(batch)
                loss = torch.mean((preds_log_dev - targets_log_dev).float().pow(2))
                predicted_log_price = preds_log_dev + batch['neighborhood_log_mean']
                price_preds = torch.expm1(predicted_log_price)
                mape = (torch.abs(price_preds - targets_price) / (targets_price + 1e-6)).mean()

            total_loss += loss.item()
            total_mape += mape.item()

    return total_loss / len(data_loader), total_mape / len(data_loader)

def train_model(model, train_loader, val_loader, optimizer, scheduler, config):
    """Main function to train the model, with early stopping."""
    print("\n--- Starting Baseline Model Training ---")
    history, best_val_mape = [], float('inf')
    best_model_state, patience_counter = None, 0
    scaler = torch.amp.GradScaler(enabled=(config['DEVICE'] == "cuda"))
    start_time = time.time()

    header = f"{'Epoch':>5} | {'Time':>8} | {'Train RMSE':>12} | {'Val RMSE':>10} | {'Val MAPE (%)':>12} | {'Patience':>8}"
    print(header); print("-" * len(header))

    for epoch in range(config['N_EPOCHS']):
        model.train()
        train_loss_epoch = 0
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{config['N_EPOCHS']}", leave=False):
            # Move batch to device
            for k, v in batch.items():
                if isinstance(v, torch.Tensor): batch[k] = v.to(config['DEVICE'])
                else: batch[k] = {sk: sv.to(config['DEVICE']) for sk, sv in v.items()}

            with torch.amp.autocast(device_type=config['DEVICE'], dtype=torch.float16, enabled=(config['DEVICE']=="cuda")):
                preds_log_dev = model(batch)
                loss = torch.mean((preds_log_dev - batch["target_log_deviation"]).float().pow(2))

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            train_loss_epoch += loss.item()

        val_mse, val_mape = evaluate_model(model, val_loader, config['DEVICE'])
        train_rmse, val_rmse = np.sqrt(train_loss_epoch / len(train_loader)), np.sqrt(val_mse)
        elapsed_time = time.strftime('%H:%M:%S', time.gmtime(time.time() - start_time))

        if val_mape < best_val_mape - config['EARLY_STOPPING_MIN_DELTA']:
            best_val_mape, patience_counter = val_mape, 0
            best_model_state = model.state_dict()
        else:
            patience_counter += 1

        print(f"{epoch+1:>5} | {elapsed_time:>8} | {train_rmse:>12.4f} | {val_rmse:>10.4f} | {val_mape*100:>12.2f} | {patience_counter:>8}")
        history.append({'epoch': epoch, 'train_rmse': train_rmse, 'val_rmse': val_rmse, 'val_mape': val_mape})
        scheduler.step(val_mape)

        if patience_counter >= config['EARLY_STOPPING_PATIENCE']:
            print(f"--- Early Stopping Triggered (MAPE did not improve for {patience_counter} epochs) ---"); break

    print("\n--- Training Complete ---")
    if best_model_state:
        print(f"Loading best model state with Val MAPE: {best_val_mape*100:.2f}%")
        model.load_state_dict(best_model_state)
    return model, pd.DataFrame(history)

def save_artifacts(model, processor, config, metrics):
    """Saves the trained model, feature processor, config, and metrics."""
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    filename = f"{config['CITY']}_baseline_model_artifacts_{timestamp}.pt"
    save_path = os.path.join(config['DRIVE_SAVE_PATH'], filename)
    os.makedirs(config['DRIVE_SAVE_PATH'], exist_ok=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'feature_processor': processor,
        'config': config,
        'final_metrics': metrics
    }, save_path)

    print(f"\nBaseline artifacts successfully saved to: {save_path}")
    return save_path

In [9]:
# ==============================================================================
# --- MAIN EXECUTION SCRIPT ---
# ==============================================================================

# 1. Imports and Global Configuration
from config import config
from utils import set_seed
from data_processing import load_and_split_data, FeatureProcessor, create_dataloaders

set_seed(config['SEED'])
print("--- Configuration Settings ---")
for key, value in config.items():
    print(f"{key}: {value}")
print(f"\nUsing device: {config['DEVICE']}")


All random seeds set to 42.
--- Configuration Settings ---
CITY: toronto
DEVICE: cuda
DRIVE_SAVE_PATH: /content/drive/MyDrive/Colab_Notebooks/Airbnb_Project/artifacts/
TEXT_MODEL_NAME: BAAI/bge-small-en-v1.5
VAL_SIZE: 0.05
SEED: 42
BATCH_SIZE: 256
VALIDATION_BATCH_SIZE: 512
LEARNING_RATE: 0.001
TRANSFORMER_LEARNING_RATE: 1e-05
N_EPOCHS: 100
HIDDEN_LAYERS_LOCATION: [32, 16]
HIDDEN_LAYERS_SIZE_CAPACITY: [32, 16]
HIDDEN_LAYERS_QUALITY: [32, 16]
HIDDEN_LAYERS_AMENITIES: [64, 32]
HIDDEN_LAYERS_DESCRIPTION: [64, 32]
HIDDEN_LAYERS_SEASONALITY: [16]
GEO_EMBEDDING_DIM: 32
EARLY_STOPPING_PATIENCE: 10
EARLY_STOPPING_MIN_DELTA: 0.001
SCHEDULER_PATIENCE: 2
SCHEDULER_FACTOR: 0.5

Using device: cuda


In [10]:
# 2. Load and Split Data
train_df, val_df, neighborhood_log_means, _, _ = load_and_split_data(config)
print(f"\nTraining DataFrame shape: {train_df.shape}")
print(f"Validation DataFrame shape: {val_df.shape}")

Loading dataset from: ./toronto_dataset_oct_20.parquet
Stratified split complete. Listings in Train: 7,618, Val: 401
Total records in Train: 82,065, Val: 4,327

Training DataFrame shape: (82065, 29)
Validation DataFrame shape: (4327, 29)


In [11]:
# 3. Process Features
processor = FeatureProcessor(config)
processor.fit(train_df)
train_features = processor.transform(train_df, neighborhood_log_means)
val_features = processor.transform(val_df, neighborhood_log_means)

Fitting FeatureProcessor...
Fit complete.


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['neighborhood_log_mean'].fillna(global_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['neighborhood_log_mean'].fillna(global_mean, inplace=True)


In [12]:
# 4. Instantiate Model and DataLoaders
baseline_model = BaselineFCNV2(processor, config)
baseline_model.count_parameters() # Print model summary
train_loader, val_loader = create_dataloaders(train_features, val_features, config)

----------------------------------------
  Baseline V2 Model Parameter Analysis  
----------------------------------------
Total Trainable Parameters:    110,741
Total Frozen Parameters:       33,360,000
Total Parameters:              33,470,741
----------------------------------------
DataLoaders created.


In [13]:
# 5. Define Optimizer and Scheduler
# For the baseline, we use a single learning rate for all trainable parameters
optimizer = optim.AdamW(baseline_model.parameters(), lr=config['LEARNING_RATE'], weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=config['SCHEDULER_FACTOR'],
    patience=config['SCHEDULER_PATIENCE']
)
print("\nOptimizer and Scheduler have been defined.")


Optimizer and Scheduler have been defined.


In [14]:
# 6. Train the Model
trained_baseline_model, history_df = train_model(
    model=baseline_model,
    train_loader=train_loader,
    val_loader=val_loader,
    optimizer=optimizer,
    scheduler=scheduler,
    config=config
)


--- Starting Baseline Model Training ---
Epoch |     Time |   Train RMSE |   Val RMSE | Val MAPE (%) | Patience
----------------------------------------------------------------------




    1 | 00:01:17 |       0.4567 |     0.3398 |        28.26 |        0




    2 | 00:02:33 |       0.3539 |     0.3367 |        27.45 |        0




    3 | 00:03:49 |       0.3442 |     0.3306 |        27.96 |        1




    4 | 00:05:05 |       0.3365 |     0.3298 |        26.99 |        0




    5 | 00:06:20 |       0.3310 |     0.3356 |        28.37 |        1




    6 | 00:07:36 |       0.3259 |     0.3270 |        27.09 |        2




    7 | 00:08:51 |       0.3216 |     0.3325 |        28.23 |        3




    8 | 00:10:07 |       0.3146 |     0.3229 |        26.68 |        0




    9 | 00:11:23 |       0.3098 |     0.3260 |        26.89 |        1




   10 | 00:12:38 |       0.3061 |     0.3248 |        26.31 |        0




   11 | 00:13:54 |       0.3022 |     0.3285 |        27.12 |        1




   12 | 00:15:10 |       0.3006 |     0.3274 |        27.86 |        2




   13 | 00:16:26 |       0.2995 |     0.3262 |        26.83 |        3




   14 | 00:17:41 |       0.2947 |     0.3262 |        26.71 |        4




   15 | 00:18:56 |       0.2922 |     0.3271 |        26.68 |        5




   16 | 00:20:12 |       0.2898 |     0.3248 |        26.47 |        6




   17 | 00:21:27 |       0.2868 |     0.3255 |        26.51 |        7




   18 | 00:22:42 |       0.2872 |     0.3248 |        26.56 |        8




   19 | 00:23:58 |       0.2857 |     0.3239 |        26.31 |        9


                                                         

   20 | 00:25:13 |       0.2853 |     0.3262 |        26.56 |       10
--- Early Stopping Triggered (MAPE did not improve for 10 epochs) ---

--- Training Complete ---
Loading best model state with Val MAPE: 26.31%




In [15]:
# 7. Final Evaluation and Metrics Reporting
print("\n--- Final Model Evaluation ---")
final_train_mse, final_train_mape = evaluate_model(trained_baseline_model, train_loader, config['DEVICE'])
final_val_mse, final_val_mape = evaluate_model(trained_baseline_model, val_loader, config['DEVICE'])

final_metrics = {
    "train_rmse": np.sqrt(final_train_mse),
    "train_mape": final_train_mape,
    "val_rmse": np.sqrt(final_val_mse),
    "val_mape": final_val_mape
}

print("\n" + "="*50)
print(f"{'Final Baseline Performance Metrics':^50}")
print("="*50)
print(f"Train RMSE:      {final_metrics['train_rmse']:.4f}")
print(f"Validation RMSE: {final_metrics['val_rmse']:.4f}")
print("-" * 50)
print(f"Train MAPE:      {final_metrics['train_mape'] * 100:.2f}%")
print(f"Validation MAPE: {final_metrics['val_mape'] * 100:.2f}%")
print("=" * 50)


--- Final Model Evaluation ---


                                                         


        Final Baseline Performance Metrics        
Train RMSE:      0.2267
Validation RMSE: 0.3262
--------------------------------------------------
Train MAPE:      18.08%
Validation MAPE: 26.56%




In [16]:
# 8. Save Model Artifacts
save_artifacts(trained_baseline_model, processor, config, final_metrics)


Baseline artifacts successfully saved to: /content/drive/MyDrive/Colab_Notebooks/Airbnb_Project/artifacts/toronto_baseline_model_artifacts_20251104_125516.pt


'/content/drive/MyDrive/Colab_Notebooks/Airbnb_Project/artifacts/toronto_baseline_model_artifacts_20251104_125516.pt'

In [19]:
# The following functions (train_model, evaluate_model) are assumed to be
# defined in your notebook's environment from the previous run. If not, you
# can uncomment them from the baseline script I provided earlier.

# ==============================================================================
# 1. FLEXIBLE MODEL DEFINITION FOR ABLATION
# ==============================================================================

class AblationFCN(nn.Module):
    """
    A flexible, fully-connected network for ablation studies.

    This model can dynamically exclude one or more feature axes (e.g.,
    'description', 'quality') to measure their impact on performance. The
    input layer size and the forward pass are adjusted automatically based
    on the `exclude_axes` list provided during initialization.
    """
    def __init__(self, processor: 'FeatureProcessor', config: dict,
                 dropout_rate: float = 0.4, exclude_axes: list = None):
        super().__init__()
        self.config = config
        self.device = self.config['DEVICE']
        self.exclude_axes = set(exclude_axes) if exclude_axes else set()

        print(f"Initializing AblationFCN, excluding: {self.exclude_axes}")

        # --- Embeddings & Encoders (defined for all potential axes) ---
        self.embed_property_type = nn.Embedding(len(processor.vocabs['property_type']), 8)
        self.embed_room_type = nn.Embedding(len(processor.vocabs['room_type']), 4)
        self.text_transformer = SentenceTransformer(self.config['TEXT_MODEL_NAME'])
        for param in self.text_transformer.parameters():
            param.requires_grad = False

        # --- Dynamically calculate the total input dimension ---
        text_embed_dim = self.text_transformer.get_sentence_embedding_dimension()
        axis_dims = {
            'location': config['GEO_EMBEDDING_DIM'],
            'size': 8 + 4 + 4,  # prop_embed + room_embed + 4 numerical
            'quality': 8,      # 8 numerical quality features
            'seasonality': 2,
            'amenities': text_embed_dim,
            'description': text_embed_dim
        }

        total_input_dim = 0
        for axis, dim in axis_dims.items():
            if axis not in self.exclude_axes:
                total_input_dim += dim

        print(f"Total input dimension for MLP: {total_input_dim}")

        # --- Main MLP (architecture remains the same) ---
        self.main_mlp = nn.Sequential(
            nn.Linear(total_input_dim, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(p=dropout_rate),
            nn.Linear(128, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(p=dropout_rate),
            nn.Linear(32, 1)
        )
        self.to(self.device)

    def forward(self, batch: dict) -> torch.Tensor:
        """
        Forward pass that dynamically constructs the input vector.
        """
        tensors_to_concat = []

        # --- Conditionally prepare and append feature tensors ---
        if 'location' not in self.exclude_axes:
            tensors_to_concat.append(batch['loc_geo_position'])

        if 'size' not in self.exclude_axes:
            size_input = torch.cat([
                self.embed_property_type(batch['size_property_type']),
                self.embed_room_type(batch['size_room_type']),
                batch['size_accommodates'].unsqueeze(1),
                batch['size_bedrooms'].unsqueeze(1),
                batch['size_beds'].unsqueeze(1),
                batch['size_bathrooms'].unsqueeze(1)
            ], dim=1)
            tensors_to_concat.append(size_input)

        if 'quality' not in self.exclude_axes:
            qual_cols = [
                "review_scores_rating", "review_scores_cleanliness", "review_scores_checkin",
                "review_scores_communication", "review_scores_location", "review_scores_value",
                "total_reviews", "host_is_superhost"
            ]
            qual_input = torch.cat([batch[f'qual_{c}'].unsqueeze(1) for c in qual_cols], dim=1)
            tensors_to_concat.append(qual_input)

        if 'seasonality' not in self.exclude_axes:
            tensors_to_concat.append(batch['season_cyclical'])

        # Text embeddings are handled carefully to avoid unnecessary computation
        with torch.no_grad():
            if 'amenities' not in self.exclude_axes:
                amenities_tokens = {k: v.squeeze(1) for k, v in batch['amenities_tokens'].items()}
                amenities_embed = self.text_transformer(amenities_tokens)['sentence_embedding']
                tensors_to_concat.append(amenities_embed)

            if 'description' not in self.exclude_axes:
                desc_tokens = {k: v.squeeze(1) for k, v in batch['description_tokens'].items()}
                desc_embed = self.text_transformer(desc_tokens)['sentence_embedding']
                tensors_to_concat.append(desc_embed)

        # --- Concatenate all selected features and pass through MLP ---
        full_input_vector = torch.cat(tensors_to_concat, dim=1)
        return self.main_mlp(full_input_vector).squeeze(-1)

    def count_parameters(self):
        total_trainable = sum(p.numel() for p in self.parameters() if p.requires_grad)
        total_frozen = sum(p.numel() for p in self.parameters() if not p.requires_grad)
        print("-" * 50)
        print(f"Ablation Model Parameter Analysis (Excluding: {self.exclude_axes})")
        print(f"{'Total Trainable Parameters:':<30} {total_trainable:,}")
        print(f"{'Total Frozen Parameters:':<30} {total_frozen:,}")
        print("-" * 50)

In [20]:
# ==============================================================================
# 2. WRAPPER FUNCTION TO RUN AN EXPERIMENT
# ==============================================================================

def run_ablation_experiment(
    exclude_axes: list,
    config: dict,
    processor: 'FeatureProcessor',
    train_loader: DataLoader,
    val_loader: DataLoader
):
    """
    Initializes, trains, and evaluates an AblationFCN model, tracking full metrics.

    Args:
        exclude_axes (list): A list of strings naming the axes to remove.
        config (dict): The global configuration dictionary.
        processor (FeatureProcessor): The fitted feature processor.
        train_loader (DataLoader): The training data loader.
        val_loader (DataLoader): The validation data loader.

    Returns:
        dict: A dictionary containing the final performance metrics (RMSE and MAPE
              for both train and validation sets) for this run.
    """
    print("\n" + "="*70)
    print(f"  STARTING ABLATION EXPERIMENT: EXCLUDING {exclude_axes}")
    print("="*70)

    # 1. Instantiate the model with the specified exclusions
    model = AblationFCN(processor, config, exclude_axes=exclude_axes)
    model.count_parameters()

    # 2. Define a new optimizer and scheduler for this specific model
    optimizer = optim.AdamW(model.parameters(), lr=config['LEARNING_RATE'], weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=config['SCHEDULER_FACTOR'],
        patience=config['SCHEDULER_PATIENCE']
    )

    # 3. Train the model using the existing training function
    trained_model, _ = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=scheduler,
        config=config
    )

    # 4. Perform final evaluation on both training and validation sets
    print("\n--- Final Evaluation for this Ablation Run ---")
    # Note: Use tqdm's `disable` parameter if you don't want the progress bar here
    final_train_mse, final_train_mape = evaluate_model(trained_model, train_loader, config['DEVICE'])
    final_val_mse, final_val_mape = evaluate_model(trained_model, val_loader, config['DEVICE'])

    metrics = {
        "excluded_axes": str(exclude_axes), # Use string for better DataFrame display
        "train_rmse": np.sqrt(final_train_mse),
        "val_rmse": np.sqrt(final_val_mse),
        "train_mape": final_train_mape,
        "val_mape": final_val_mape
    }

    print("\n" + "*"*70)
    print(f"  ABLATION RUN COMPLETE: EXCLUDING {exclude_axes}")
    print(f"  Final Train RMSE:      {metrics['train_rmse']:.4f}")
    print(f"  Final Validation RMSE: {metrics['val_rmse']:.4f}")
    print(f"  Final Train MAPE:      {metrics['train_mape'] * 100:.3f}%")
    print(f"  Final Validation MAPE: {metrics['val_mape'] * 100:.3f}%")
    print("*"*70 + "\n")

    return metrics

In [21]:
# ==============================================================================
# 3. EXAMPLE USAGE (RUN THIS IN YOUR NOTEBOOK)
# ==============================================================================

# This assumes you have the following objects loaded in your notebook:
# - config: The configuration dictionary
# - processor: The FITTED FeatureProcessor instance
# - train_loader, val_loader: The DataLoaders
# - final_metrics: The dictionary of metrics from your initial baseline run.
#   If you don't have it, you can create it manually like this:
#   final_metrics = {'train_rmse': 0.25, 'val_rmse': 0.30, 'train_mape': 0.15, 'val_mape': 0.18}

# Store results from all experiments
ablation_results = []

# IMPORTANT: Add the original baseline's results for direct comparison.
# This makes your final summary table much more informative.
baseline_performance = final_metrics.copy() # Use the metrics from your first V2 run
baseline_performance['excluded_axes'] = "['None (Baseline)']"
ablation_results.append(baseline_performance)

In [22]:
# --- Experiment 1: Remove 'description' axis ---
exp1_metrics = run_ablation_experiment(
    exclude_axes=['description'],
    config=config,
    processor=processor,
    train_loader=train_loader,
    val_loader=val_loader
)
ablation_results.append(exp1_metrics)


  STARTING ABLATION EXPERIMENT: EXCLUDING ['description']
Initializing AblationFCN, excluding: {'description'}
Total input dimension for MLP: 442
--------------------------------------------------
Ablation Model Parameter Analysis (Excluding: {'description'})
Total Trainable Parameters:    61,589
Total Frozen Parameters:       33,360,000
--------------------------------------------------

--- Starting Baseline Model Training ---
Epoch |     Time |   Train RMSE |   Val RMSE | Val MAPE (%) | Patience
----------------------------------------------------------------------




    1 | 00:01:15 |       0.4425 |     0.3520 |        29.78 |        0




    2 | 00:02:31 |       0.3625 |     0.3475 |        28.95 |        0




    3 | 00:03:47 |       0.3514 |     0.3395 |        28.03 |        0




    4 | 00:05:03 |       0.3455 |     0.3386 |        28.04 |        1




    5 | 00:06:20 |       0.3410 |     0.3395 |        28.62 |        2




    6 | 00:07:35 |       0.3385 |     0.3452 |        28.75 |        3




    7 | 00:08:51 |       0.3321 |     0.3384 |        27.98 |        4




    8 | 00:10:07 |       0.3292 |     0.3425 |        29.02 |        5




    9 | 00:11:23 |       0.3274 |     0.3395 |        27.23 |        0




   10 | 00:12:39 |       0.3261 |     0.3370 |        28.33 |        1




   11 | 00:13:54 |       0.3224 |     0.3470 |        28.85 |        2




   12 | 00:15:09 |       0.3212 |     0.3415 |        28.12 |        3




   13 | 00:16:23 |       0.3178 |     0.3397 |        28.25 |        4




   14 | 00:17:37 |       0.3161 |     0.3387 |        27.83 |        5




   15 | 00:18:52 |       0.3146 |     0.3406 |        28.33 |        6




   16 | 00:20:06 |       0.3117 |     0.3394 |        27.66 |        7




   17 | 00:21:21 |       0.3120 |     0.3413 |        27.98 |        8




   18 | 00:22:36 |       0.3112 |     0.3403 |        28.01 |        9




   19 | 00:23:50 |       0.3091 |     0.3384 |        27.90 |       10
--- Early Stopping Triggered (MAPE did not improve for 10 epochs) ---

--- Training Complete ---
Loading best model state with Val MAPE: 27.23%

--- Final Evaluation for this Ablation Run ---


                                                         


**********************************************************************
  ABLATION RUN COMPLETE: EXCLUDING ['description']
  Final Train RMSE:      0.2687
  Final Validation RMSE: 0.3384
  Final Train MAPE:      22.114%
  Final Validation MAPE: 27.901%
**********************************************************************





In [23]:
# --- Experiment 2: Remove 'amenities' axis ---
exp2_metrics = run_ablation_experiment(
    exclude_axes=['amenities'],
    config=config,
    processor=processor,
    train_loader=train_loader,
    val_loader=val_loader
)
ablation_results.append(exp2_metrics)


  STARTING ABLATION EXPERIMENT: EXCLUDING ['amenities']
Initializing AblationFCN, excluding: {'amenities'}
Total input dimension for MLP: 442
--------------------------------------------------
Ablation Model Parameter Analysis (Excluding: {'amenities'})
Total Trainable Parameters:    61,589
Total Frozen Parameters:       33,360,000
--------------------------------------------------

--- Starting Baseline Model Training ---
Epoch |     Time |   Train RMSE |   Val RMSE | Val MAPE (%) | Patience
----------------------------------------------------------------------




    1 | 00:01:14 |       0.4471 |     0.3516 |        29.88 |        0




    2 | 00:02:29 |       0.3598 |     0.3348 |        27.33 |        0




    3 | 00:03:44 |       0.3490 |     0.3368 |        27.60 |        1




    4 | 00:04:59 |       0.3412 |     0.3288 |        26.77 |        0




    5 | 00:06:14 |       0.3343 |     0.3346 |        27.16 |        1




    6 | 00:07:29 |       0.3293 |     0.3351 |        26.69 |        2




    7 | 00:08:44 |       0.3274 |     0.3302 |        27.19 |        3




    8 | 00:09:59 |       0.3240 |     0.3386 |        27.55 |        4




    9 | 00:11:14 |       0.3214 |     0.3364 |        27.10 |        5




   10 | 00:12:30 |       0.3126 |     0.3355 |        27.86 |        6




   11 | 00:13:45 |       0.3103 |     0.3407 |        27.17 |        7




   12 | 00:15:00 |       0.3075 |     0.3405 |        27.32 |        8




   13 | 00:16:15 |       0.3029 |     0.3412 |        27.94 |        9




   14 | 00:17:30 |       0.3014 |     0.3379 |        27.51 |       10
--- Early Stopping Triggered (MAPE did not improve for 10 epochs) ---

--- Training Complete ---
Loading best model state with Val MAPE: 26.77%

--- Final Evaluation for this Ablation Run ---


                                                         


**********************************************************************
  ABLATION RUN COMPLETE: EXCLUDING ['amenities']
  Final Train RMSE:      0.2539
  Final Validation RMSE: 0.3379
  Final Train MAPE:      20.701%
  Final Validation MAPE: 27.515%
**********************************************************************





In [24]:
# --- Experiment 3: Remove both 'description' and 'amenities' ---
exp3_metrics = run_ablation_experiment(
    exclude_axes=['description', 'amenities'],
    config=config,
    processor=processor,
    train_loader=train_loader,
    val_loader=val_loader
)
ablation_results.append(exp3_metrics)


  STARTING ABLATION EXPERIMENT: EXCLUDING ['description', 'amenities']
Initializing AblationFCN, excluding: {'amenities', 'description'}
Total input dimension for MLP: 58
--------------------------------------------------
Ablation Model Parameter Analysis (Excluding: {'amenities', 'description'})
Total Trainable Parameters:    12,437
Total Frozen Parameters:       33,360,000
--------------------------------------------------

--- Starting Baseline Model Training ---
Epoch |     Time |   Train RMSE |   Val RMSE | Val MAPE (%) | Patience
----------------------------------------------------------------------




    1 | 00:01:14 |       0.4639 |     0.3675 |        31.28 |        0




    2 | 00:02:29 |       0.3748 |     0.3606 |        29.82 |        0




    3 | 00:03:44 |       0.3647 |     0.3543 |        29.08 |        0




    4 | 00:04:59 |       0.3586 |     0.3472 |        28.75 |        0




    5 | 00:06:13 |       0.3543 |     0.3461 |        28.54 |        0




    6 | 00:07:28 |       0.3508 |     0.3441 |        27.79 |        0




    7 | 00:08:44 |       0.3483 |     0.3465 |        28.85 |        1




    8 | 00:09:58 |       0.3454 |     0.3425 |        28.00 |        2




    9 | 00:11:14 |       0.3435 |     0.3470 |        28.06 |        3




   10 | 00:12:30 |       0.3408 |     0.3456 |        27.96 |        4




   11 | 00:13:46 |       0.3393 |     0.3418 |        27.71 |        5




   12 | 00:15:02 |       0.3381 |     0.3451 |        28.35 |        6




   13 | 00:16:17 |       0.3360 |     0.3422 |        28.45 |        7




   14 | 00:17:33 |       0.3364 |     0.3425 |        28.35 |        8




   15 | 00:18:49 |       0.3340 |     0.3453 |        28.33 |        9




   16 | 00:20:05 |       0.3333 |     0.3449 |        28.33 |       10
--- Early Stopping Triggered (MAPE did not improve for 10 epochs) ---

--- Training Complete ---
Loading best model state with Val MAPE: 27.79%

--- Final Evaluation for this Ablation Run ---


                                                         


**********************************************************************
  ABLATION RUN COMPLETE: EXCLUDING ['description', 'amenities']
  Final Train RMSE:      0.3091
  Final Validation RMSE: 0.3449
  Final Train MAPE:      25.623%
  Final Validation MAPE: 28.328%
**********************************************************************





In [25]:
# ==============================================================================
# FINAL SUMMARY AND SAVING OF ABLATION STUDY RESULTS
# ==============================================================================

print("--- Finalizing Ablation Study ---")

# --- 1. Create and Display Summary Table ---
print("\n\n" + "="*80)
print(f"{'ABLATION STUDY SUMMARY':^80}")
print("="*80)

results_df = pd.DataFrame(ablation_results)

# Format MAPE columns for percentage display
results_df['train_mape_pct'] = results_df['train_mape'] * 100
results_df['val_mape_pct'] = results_df['val_mape'] * 100

# Define columns to display and their order
display_cols = ['excluded_axes', 'train_rmse', 'val_rmse', 'train_mape_pct', 'val_mape_pct']

# Print the final formatted table to the console
print(results_df[display_cols].to_string(index=False, float_format="%.4f"))
print("="*80)


# --- 2. Save the Results DataFrame to a CSV file ---
# Create a timestamp for the filename
timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
filename = f"{config['CITY']}_ablation_results_{timestamp}.csv"

# Construct the full save path using the path from your config
save_path = os.path.join(config['DRIVE_SAVE_PATH'], filename)

# Ensure the directory exists
os.makedirs(config['DRIVE_SAVE_PATH'], exist_ok=True)

# Save the DataFrame (including all columns, not just the display ones)
results_df.to_csv(save_path, index=False, float_format="%.6f")

print(f"\nAblation study results successfully saved to:")
print(save_path)

--- Finalizing Ablation Study ---


                             ABLATION STUDY SUMMARY                             
               excluded_axes  train_rmse  val_rmse  train_mape_pct  val_mape_pct
         ['None (Baseline)']      0.2267    0.3262         18.0824       26.5570
             ['description']      0.2687    0.3384         22.1143       27.9011
               ['amenities']      0.2539    0.3379         20.7007       27.5146
['description', 'amenities']      0.3091    0.3449         25.6233       28.3275

Ablation study results successfully saved to:
/content/drive/MyDrive/Colab_Notebooks/Airbnb_Project/artifacts/toronto_ablation_results_20251104_140805.csv
