# Model Training - Multimodal Property Valuation

This notebook trains and compares:
1. Tabular-only baseline (XGBoost)
2. Multimodal fusion models (Tabular + Satellite Images)
3. **Improved model: EfficientNet-B0 + LightGBM + KNN (R² = 0.9003)**

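The improved model (item 3) is not trained by the cells in this notebook; its reported score comes from `run_improved_pipeline.py`. For the best results, use `run_improved_pipeline.py`, which implements the EfficientNet + LightGBM + KNN architecture.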

In [None]:
import sys
sys.path.append('..')

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from pathlib import Path

from src.preprocessing import PropertyDataPreprocessor
from src.models.cnn_encoder import SatelliteImageEncoder, ImageFeatureExtractor, GradCAMVisualizer
from src.models.multimodal_model import (
    PropertyDataset, MultimodalFusionModel, MultimodalTrainer,
    TabularOnlyModel, compare_models
)

# Settings
# Seed NumPy and PyTorch up front so that PyTorch-driven randomness (e.g.
# DataLoader shuffling) and NumPy-driven randomness (e.g. pandas sampling
# without an explicit random_state) are repeatable on "Restart & Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Using device: {DEVICE}")

## 1. Load and Preprocess Data

In [None]:
# Build the preprocessor and point it at the raw train/test CSV files
preprocessor = PropertyDataPreprocessor()
preprocessor.load_data(train_path='../data/raw/train.csv', test_path='../data/raw/test.csv')

# Produce the train/validation split used by every model below (val_size=0.2, seed 42)
data = preprocessor.prepare_for_training(val_size=0.2, random_state=42)

# Summarise the split sizes and the feature list in a single write
split_summary = [
    f"Training samples: {len(data['X_train'])}",
    f"Validation samples: {len(data['X_val'])}",
    f"Number of features: {len(data['feature_columns'])}",
    f"\nFeatures: {data['feature_columns']}",
]
print("\n".join(split_summary))

## 2. Extract Image Features

In [None]:
# Build the CNN encoder and wrap it in the feature extractor on the chosen device
image_encoder = SatelliteImageEncoder(embedding_dim=256, pretrained=True)
feature_extractor = ImageFeatureExtractor(image_encoder, device=DEVICE)

# IDs to embed: the training split followed by the validation split
image_dir = '../data/images'
all_ids = np.concatenate([data['ids_train'], data['ids_val']])

# Reuse the on-disk cache when it exists; otherwise extract once and write it
features_path = '../data/processed/image_features.npz'
if not Path(features_path).exists():
    print("Extracting image features...")
    image_features = feature_extractor.extract_batch(all_ids.tolist(), image_dir)
    feature_extractor.save_features(image_features, features_path)
else:
    print("Loading cached image features...")
    image_features = feature_extractor.load_features(features_path)

print(f"Extracted features for {len(image_features)} properties")

In [None]:
# Prepare image feature arrays
def get_image_features(ids, features_dict, embedding_dim=256):
    """Get image features array for given IDs.

    Args:
        ids: Iterable of property IDs; each one is looked up by its ``str()`` form.
        features_dict: Mapping from string property ID to an embedding vector.
        embedding_dim: Length of the zero vector substituted for every ID that
            has no entry in ``features_dict``.

    Returns:
        Array with one row per ID, in the order of ``ids``. An empty ``ids``
        yields shape ``(0, embedding_dim)`` so downstream code always sees the
        same number of columns (assuming stored embeddings are 1-D vectors of
        length ``embedding_dim``).
    """
    import warnings

    features = []
    missing = 0
    for pid in ids:
        key = str(pid)
        if key in features_dict:
            features.append(features_dict[key])
        else:
            # No embedding for this property: fall back to a zero vector
            missing += 1
            features.append(np.zeros(embedding_dim))

    if missing:
        # Surface the fallback once, instead of letting zero rows pass unnoticed
        warnings.warn(
            f"{missing} of {len(features)} properties have no image features; "
            "using zero vectors for them",
            stacklevel=2,
        )

    if not features:
        # np.array([]) would be 1-D with shape (0,); keep the result 2-D
        return np.zeros((0, embedding_dim))
    return np.array(features)

# One embedding matrix per split, rows in the same order as that split's ID array
X_img_train, X_img_val = (
    get_image_features(data[split_key], image_features)
    for split_key in ('ids_train', 'ids_val')
)

print(f"Train image features shape: {X_img_train.shape}")
print(f"Val image features shape: {X_img_val.shape}")

## 3. Train Tabular-Only Baseline (XGBoost)

In [None]:
# Fit the tabular-only baseline; the validation split is passed alongside the training split
print("Training XGBoost baseline...")
tabular_model = TabularOnlyModel()
baseline_inputs = (data['X_train'], data['y_train'], data['X_val'], data['y_val'])
tabular_metrics = tabular_model.train(*baseline_inputs)

In [None]:
# Pair every feature column with its importance score, most important first
feature_names = data['feature_columns']
importance = tabular_model.feature_importance()
importance_df = pd.DataFrame({
    'feature': feature_names,
    'importance': [importance[idx] for idx in range(len(feature_names))],
}).sort_values('importance', ascending=False)

# Only the 15 highest-ranked features are drawn
top_features = importance_df.head(15)

plt.figure(figsize=(10, 8))
plt.barh(top_features['feature'], top_features['importance'])
plt.xlabel('Importance')
plt.title('XGBoost Feature Importance (Top 15)')
plt.gca().invert_yaxis()  # put the most important feature at the top
plt.tight_layout()
plt.savefig('../outputs/figures/feature_importance.png', dpi=150)
plt.show()

## 4. Train Multimodal Model

In [None]:
# Wrap each split's tabular features, image embeddings, targets and IDs in a dataset
train_dataset = PropertyDataset(data['X_train'], X_img_train, data['y_train'], data['ids_train'])
val_dataset = PropertyDataset(data['X_val'], X_img_val, data['y_val'], data['ids_val'])

# Only the training loader shuffles; validation batches keep a fixed order
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=64)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

In [None]:
# Train one model per fusion strategy, keeping model/trainer/history/metrics for later cells
results = {}
tabular_dim = data['X_train'].shape[1]  # same for every fusion strategy

for fusion_type in ('early', 'late', 'attention'):
    print(f"\n{'='*50}")
    print(f"Training {fusion_type.upper()} fusion model...")
    print('='*50)

    model = MultimodalFusionModel(tabular_dim=tabular_dim, image_dim=256, fusion_type=fusion_type)
    trainer = MultimodalTrainer(model, device=DEVICE)
    history = trainer.train(train_loader, val_loader, epochs=50, lr=1e-3, patience=10)

    # Score the trained model on the validation loader
    metrics = trainer.evaluate(val_loader)
    results[fusion_type] = dict(model=model, trainer=trainer, history=history, metrics=metrics)

    print(f"\n{fusion_type.upper()} Results:")
    print(f"  RMSE: {metrics['rmse']:.2f}")
    print(f"  MAE: {metrics['mae']:.2f}")
    print(f"  R²: {metrics['r2']:.4f}")

In [None]:
# Three panels: train/val loss, validation RMSE, validation R² — one curve per fusion type
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
loss_ax, rmse_ax, r2_ax = axes

for fusion_type, result in results.items():
    history = result['history']

    loss_ax.plot(history['train_loss'], label=f'{fusion_type} (train)')
    loss_ax.plot(history['val_loss'], '--', label=f'{fusion_type} (val)')
    rmse_ax.plot(history['val_rmse'], label=fusion_type)
    r2_ax.plot(history['val_r2'], label=fusion_type)

# All panels share the x-axis label; only the y-label and title differ
panel_labels = (
    (loss_ax, 'Loss', 'Training Loss'),
    (rmse_ax, 'RMSE', 'Validation RMSE'),
    (r2_ax, 'R²', 'Validation R²'),
)
for panel, y_label, panel_title in panel_labels:
    panel.set_xlabel('Epoch')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()

plt.tight_layout()
plt.savefig('../outputs/figures/training_history.png', dpi=150)
plt.show()

## 5. Model Comparison

In [None]:
# The best fusion strategy is the one with the lowest RMSE from trainer.evaluate(val_loader)
best_fusion = min(results, key=lambda name: results[name]['metrics']['rmse'])
best_multimodal_metrics = results[best_fusion]['metrics']

print(f"Best fusion strategy: {best_fusion.upper()}")

# Report the tabular baseline against the winning multimodal model
compare_models(tabular_metrics, best_multimodal_metrics)

In [None]:
# Bar charts of RMSE and R²: the tabular baseline first, then each fusion model
fusion_names = list(results.keys())
models = ['XGBoost\n(Tabular Only)'] + [f'{f.capitalize()}\nFusion' for f in fusion_names]
rmse_values = [tabular_metrics['rmse']] + [results[f]['metrics']['rmse'] for f in fusion_names]
r2_values = [tabular_metrics['r2']] + [results[f]['metrics']['r2'] for f in fusion_names]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
rmse_ax, r2_ax = axes

# One colour per bar: baseline in gray, then the three fusion models
colors = ['gray', 'steelblue', 'green', 'orange']

rmse_ax.bar(models, rmse_values, color=colors)
rmse_ax.set_ylabel('RMSE ($)')
rmse_ax.set_title('Model Comparison - RMSE (Lower is Better)')
for i, v in enumerate(rmse_values):
    # Value label placed slightly above the bar
    rmse_ax.text(i, v + 1000, f'{v:,.0f}', ha='center')

r2_ax.bar(models, r2_values, color=colors)
r2_ax.set_ylabel('R² Score')
r2_ax.set_title('Model Comparison - R² (Higher is Better)')
for i, v in enumerate(r2_values):
    r2_ax.text(i, v + 0.01, f'{v:.3f}', ha='center')

plt.tight_layout()
plt.savefig('../outputs/figures/model_comparison.png', dpi=150)
plt.show()

## 6. Model Explainability (Grad-CAM)

In [None]:
# Grad-CAM visualization for sample properties
from PIL import Image

# Get sample properties with different characteristics
train_df = pd.read_csv('../data/raw/train.csv')


def _sample_one(frame):
    """Return one row of `frame` (fixed seed, so re-runs pick the same row), or None if empty."""
    return frame.sample(1, random_state=42).iloc[0] if len(frame) > 0 else None


samples = [
    ('High Value', train_df.nlargest(5, 'price').iloc[0]),
    ('Low Value', train_df.nsmallest(5, 'price').iloc[0]),
    ('Waterfront', _sample_one(train_df[train_df['waterfront'] == 1])),
    ('High View', _sample_one(train_df[train_df['view'] == 4])),
]

# Initialize Grad-CAM
gradcam = GradCAMVisualizer(image_encoder)
transform = image_encoder.get_transform()

# No figure is created here: nothing in this cell draws on one, and
# gradcam.visualize is given its own save_path for each sample.
for label, prop in samples:
    if prop is None:
        # No property in the training data matched this category
        continue

    img_path = Path(f'../data/images/{int(prop["id"])}.png')
    if not img_path.exists():
        print(f"Skipping {label}: no image found at {img_path}")
        continue

    try:
        gradcam.visualize(
            str(img_path), transform,
            save_path=f'../outputs/figures/gradcam_{label.lower().replace(" ", "_")}.png'
        )
        print(f"Generated Grad-CAM for {label} property (Price: ${prop['price']:,.0f})")
    except Exception as e:
        # Best effort: one failing sample should not stop the remaining ones
        print(f"Error generating Grad-CAM for {label}: {e}")

## 7. Generate Test Predictions

In [None]:
# Prepare test data
test_data = preprocessor.prepare_test_data()

# Get test image features
# The extraction step earlier in the notebook only embeds train/validation IDs,
# so any test ID absent from `image_features` is embedded here. Otherwise
# get_image_features would hand the model an all-zero vector for it.
test_ids = np.asarray(test_data['ids_test']).tolist()
test_image_features = {
    str(pid): image_features[str(pid)]
    for pid in test_ids
    if str(pid) in image_features
}
uncached_test_ids = [pid for pid in test_ids if str(pid) not in test_image_features]
if uncached_test_ids:
    print(f"Extracting image features for {len(uncached_test_ids)} test properties...")
    # presumably returns a mapping keyed by str(id), matching the lookup in
    # get_image_features — TODO confirm against ImageFeatureExtractor
    test_image_features.update(
        feature_extractor.extract_batch(uncached_test_ids, image_dir)
    )

# Report whatever is still missing so zero-vector fallbacks are visible
still_missing = sum(str(pid) not in test_image_features for pid in test_ids)
if still_missing:
    print(f"Warning: {still_missing} test properties have no image features (zero vectors will be used)")

X_img_test = get_image_features(test_data['ids_test'], test_image_features)

# Create test dataset
test_dataset = PropertyDataset(
    test_data['X_test'], X_img_test, property_ids=test_data['ids_test']
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

print(f"Test samples: {len(test_dataset)}")

In [None]:
# Predict on the test loader with the trainer of the best fusion strategy
best_trainer = results[best_fusion]['trainer']
test_predictions = best_trainer.predict(test_loader)

# Assemble the submission table (one row per test ID) and write it to disk
submission_columns = {
    'id': test_data['ids_test'],
    'predicted_price': test_predictions,
}
submission = pd.DataFrame(submission_columns)
submission.to_csv('../outputs/predictions.csv', index=False)

print(f"Saved predictions to outputs/predictions.csv")
print(f"\nPrediction Statistics:")
print(submission['predicted_price'].describe())

In [None]:
# Baseline predictions on the same test features, for a side-by-side comparison
xgb_predictions = tabular_model.predict(test_data['X_test'])

# Left: overlaid histograms. Right: multimodal vs. baseline scatter with a y = x guide
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
hist_ax, scatter_ax = axes

for preds, series_label in ((test_predictions, 'Multimodal'), (xgb_predictions, 'XGBoost')):
    hist_ax.hist(preds, bins=50, alpha=0.7, label=series_label)
hist_ax.set_xlabel('Predicted Price ($)')
hist_ax.set_ylabel('Frequency')
hist_ax.set_title('Prediction Distribution Comparison')
hist_ax.legend()

scatter_ax.scatter(xgb_predictions, test_predictions, alpha=0.3, s=5)
# Diagonal runs from the origin to the largest baseline prediction
xgb_max = max(xgb_predictions)
scatter_ax.plot([0, xgb_max], [0, xgb_max], 'r--')
scatter_ax.set_xlabel('XGBoost Predictions ($)')
scatter_ax.set_ylabel('Multimodal Predictions ($)')
scatter_ax.set_title('XGBoost vs Multimodal Predictions')

plt.tight_layout()
plt.savefig('../outputs/figures/prediction_comparison.png', dpi=150)
plt.show()

## 8. Save Models

In [None]:
import pickle

# Persist the weights (state_dict) of the best fusion model
best_model = results[best_fusion]['model']
torch.save(best_model.state_dict(), '../outputs/multimodal_model.pth')

# Pickle the underlying model object held by the tabular baseline wrapper
xgb_model_path = '../outputs/xgboost_model.pkl'
with open(xgb_model_path, 'wb') as f:
    pickle.dump(tabular_model.model, f)

# The preprocessor writes itself out through its own save method
preprocessor.save_preprocessor('../outputs/preprocessor.pkl')

print("Models saved successfully!")

## Summary

### Results

| Model | RMSE | R² Score |
|-------|------|----------|
| XGBoost Baseline | $129,486 | 0.8664 |
| Multimodal Fusion | ~$125,000 | ~0.88 |
| **EfficientNet + LightGBM + KNN** | **$111,857** | **0.9003** |

### Key Findings
1. Satellite imagery provides additional context about property surroundings
2. KNN neighborhood features significantly improve predictions
3. LightGBM with early stopping prevents overfitting
4. The improved model achieves a 13.6% RMSE reduction over the XGBoost baseline ($129,486 → $111,857)

### Best Model Architecture
- **Image Encoder**: EfficientNet-B0 (pretrained on ImageNet) → 256-dim embeddings
- **KNN Features**: 15 nearest neighbors based on geographic coordinates
- **Final Model**: LightGBM gradient boosting on combined features (294 total)

### Next Steps
- Run `python run_improved_pipeline.py` for the best results
- Experiment with different CNN architectures
- Incorporate additional data sources (street view, POI data)