# Data Preprocessing - Property Valuation

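This notebook handles:
1. Loading the raw train/test data
2. Data cleaning and missing value handling
3. Feature engineering
4. Geospatial features
5. Satellite image acquisition
6. Image feature extraction
7. Preparing and saving the final training dataset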

In [None]:
import sys
# sys.path.append('..')  # Not needed when running from root

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

from src.preprocessing import PropertyDataPreprocessor, get_geospatial_features
from src.data_fetcher import SatelliteImageFetcher, create_placeholder_images

## 1. Load Raw Data

In [None]:
# Read the raw training and test tables from their CSV files.
train_df = pd.read_csv('data/raw/train.csv')
test_df = pd.read_csv('data/raw/test.csv')

# Report the row count of each split first ...
print(
    f"Training samples: {len(train_df)}",
    f"Test samples: {len(test_df)}",
    sep="\n",
)
# ... then the column names of each split, each preceded by a blank line.
print(
    f"\nTraining columns: {list(train_df.columns)}",
    f"\nTest columns: {list(test_df.columns)}",
    sep="\n",
)

## 2. Data Cleaning

In [None]:
# Count nulls per column for each split.
missing_train = train_df.isna().sum()
missing_test = test_df.isna().sum()

# Show only the columns that actually contain at least one null.
print("Missing values in training data:")
print(missing_train.loc[missing_train.gt(0)])

print("\nMissing values in test data:")
print(missing_test.loc[missing_test.gt(0)])

In [None]:
# Print the extreme values of a few training columns as a quick outlier check.
# A single print call with a newline separator emits one line per entry.
print(
    "Potential outliers in training data:",
    f"  Max bedrooms: {train_df['bedrooms'].max()}",
    f"  Max bathrooms: {train_df['bathrooms'].max()}",
    f"  Max sqft_living: {train_df['sqft_living'].max():,}",
    f"  Max price: ${train_df['price'].max():,}",
    f"  Min price: ${train_df['price'].min():,}",
    sep="\n",
)

In [None]:
# Create one preprocessor and run its cleaning pass over both splits.
preprocessor = PropertyDataPreprocessor()
train_clean = preprocessor.clean_data(train_df)
test_clean = preprocessor.clean_data(test_df)

print("Data cleaned successfully!")

# Columns present in the cleaned training frame but not in the raw one.
added_columns = [
    column for column in train_clean.columns
    if column not in train_df.columns
]
print(f"New columns after date processing: {added_columns}")

## 3. Feature Engineering

In [None]:
# Run feature engineering on both cleaned splits with the shared preprocessor.
train_features = preprocessor.engineer_features(train_clean)
test_features = preprocessor.engineer_features(test_clean)

# Any column that is not in the cleaned training frame was added by this step.
new_features = [
    name for name in train_features.columns
    if name not in train_clean.columns
]
feature_count = len(new_features)

print(f"Engineered features ({feature_count}):")
for feature_name in new_features:
    print(f"  - {feature_name}")

In [None]:
# Visualize engineered features against price in a 2x3 grid:
# five scatter panels plus one boxplot for the renovation flag.
fig, axes = plt.subplots(2, 3, figsize=(15, 10))

# One entry per scatter panel:
# (grid position, feature column, x-axis label, panel title, x-limits or None).
scatter_panels = [
    ((0, 0), 'age', 'Property Age (years)', 'Age vs Price', None),
    ((0, 1), 'living_lot_ratio', 'Living/Lot Ratio', 'Living/Lot Ratio vs Price', (0, 1)),
    ((0, 2), 'living_vs_neighbors', 'Living Size / Neighbor Avg', 'Relative Living Size vs Price', (0, 3)),
    ((1, 0), 'quality_score', 'Quality Score (Grade × Condition)', 'Quality Score vs Price', None),
    ((1, 1), 'sqft_per_room', 'Sqft per Room', 'Space per Room vs Price', (0, 1000)),
]

for position, column, x_label, panel_title, x_limits in scatter_panels:
    ax = axes[position]
    ax.scatter(train_features[column], train_features['price'], alpha=0.3, s=5)
    ax.set_xlabel(x_label)
    ax.set_ylabel('Price ($)')
    ax.set_title(panel_title)
    if x_limits is not None:
        # Restrict the visible x-range; points outside it are simply not shown.
        ax.set_xlim(*x_limits)

# Renovation impact: price distribution grouped by the was_renovated flag.
renovation_ax = axes[1, 2]
train_features.boxplot(column='price', by='was_renovated', ax=renovation_ax)
renovation_ax.set_xlabel('Was Renovated (0=No, 1=Yes)')
renovation_ax.set_ylabel('Price ($)')
renovation_ax.set_title('Renovation Impact on Price')

# DataFrame.boxplot(by=...) adds its own figure-level title; blank it out.
fig.suptitle('')
fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
figure_path = Path('outputs/figures/engineered_features.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=150)
plt.show()

## 4. Geospatial Features

In [None]:
# Add geospatial features to the engineered training frame.
train_geo = get_geospatial_features(train_features)

# Visualize the distance feature: a lat/long map coloured by distance (left)
# and distance plotted against price (right).
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
map_ax, price_ax = axes

scatter1 = map_ax.scatter(
    train_geo['long'], train_geo['lat'],
    c=train_geo['dist_to_downtown'], cmap='viridis',
    alpha=0.5, s=5,
)
fig.colorbar(scatter1, ax=map_ax, label='Distance to Downtown')
map_ax.set_xlabel('Longitude')
map_ax.set_ylabel('Latitude')
map_ax.set_title('Distance to Seattle Downtown')

price_ax.scatter(train_geo['dist_to_downtown'], train_geo['price'], alpha=0.3, s=5)
price_ax.set_xlabel('Distance to Downtown')
price_ax.set_ylabel('Price ($)')
price_ax.set_title('Distance to Downtown vs Price')

fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
figure_path = Path('outputs/figures/geospatial_features.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=150)
plt.show()

## 5. Satellite Image Acquisition

In [None]:
# Check if images already exist: count the PNG files currently in data/images.
image_dir = Path('data/images')
existing_images = list(image_dir.glob('*.png'))
print(f"Existing images: {len(existing_images)}")

# Combine train and test for image fetching.
# ignore_index=True gives the combined frame a fresh 0..N-1 index; without it
# the row labels of both inputs are kept as-is and can repeat.
all_df = pd.concat([train_df, test_df], ignore_index=True)
print(f"Total properties: {len(all_df)}")

In [None]:
# NOTE: both options below are disabled (commented out), so running this cell
# does nothing. The following cells read images from data/images, so enable
# one of the options if that directory has not been populated yet.

# Option 1: Create placeholder images (for testing without API)
# Uncomment to use:
# create_placeholder_images(all_df, 'data/images')

# Option 2: Fetch real satellite images (requires API key)
# Set your API key in environment:
# export MAPBOX_ACCESS_TOKEN="your_token_here"

# fetcher = SatelliteImageFetcher(api_provider='mapbox', output_dir='data/images')
# fetcher.fetch_all_images(all_df)

In [None]:
# Display up to four sample satellite images (if available) with their prices.
from PIL import Image

# Fixed seed so the same properties are shown on every Restart & Run All.
sample_rows = train_df.sample(min(4, len(train_df)), random_state=42)
sample_ids = sample_rows['id'].values
sample_prices = sample_rows['price'].values

fig, axes = plt.subplots(2, 2, figsize=(10, 10))
axes = axes.flatten()

# Hide the frame of every panel up front, so panels left unused when fewer
# than four rows are available do not show empty axes.
for ax in axes:
    ax.axis('off')

for ax, pid, price in zip(axes, sample_ids, sample_prices):
    img_path = image_dir / f"{pid}.png"
    if img_path.exists():
        # Context manager closes the underlying file handle once it is drawn.
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(f'ID: {pid}\nPrice: ${price:,.0f}')
    else:
        ax.text(0.5, 0.5, 'Image not found', ha='center', va='center')
        ax.set_title(f'ID: {pid}')

fig.tight_layout()

# savefig does not create missing directories, so make sure the target exists.
figure_path = Path('outputs/figures/sample_satellite_images.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path, dpi=150)
plt.show()

## 6. Image Feature Extraction

In [None]:
import torch
from src.models.cnn_encoder import SatelliteImageEncoder, ImageFeatureExtractor

# Use the GPU when CUDA reports as available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Build the encoder (embedding_dim=256, pretrained=True) and wrap it in the
# feature extractor bound to the selected device.
encoder = SatelliteImageEncoder(embedding_dim=256, pretrained=True)
extractor = ImageFeatureExtractor(encoder, device=device)

In [None]:
# Extract features for all properties, reusing the on-disk cache when present.
all_ids = all_df['id'].tolist()

features_path = Path('data/processed/image_features.npz')
if features_path.exists():
    print("Loading cached features...")
    image_features = extractor.load_features(str(features_path))
else:
    print("Extracting features...")
    image_features = extractor.extract_batch(all_ids, str(image_dir))
    # Make sure the cache directory exists before writing, so a fresh checkout
    # without data/processed does not lose the extracted features to a failed save.
    features_path.parent.mkdir(parents=True, exist_ok=True)
    extractor.save_features(image_features, str(features_path))

print(f"Extracted features for {len(image_features)} properties")

In [None]:
# Visualize feature distribution for (at most) the first 100 stored embeddings.
if image_features:
    # Index with the keys exactly as the mapping provides them. Wrapping them in
    # str() raises KeyError whenever the keys are not already strings.
    sample_keys = list(image_features.keys())[:100]
    sample_features = np.array([image_features[key] for key in sample_keys])

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    hist_ax, corr_ax = axes

    # Histogram over every value of every sampled embedding.
    hist_ax.hist(sample_features.flatten(), bins=50, alpha=0.7)
    hist_ax.set_xlabel('Feature Value')
    hist_ax.set_ylabel('Frequency')
    hist_ax.set_title('Image Feature Distribution')

    # Correlation heatmap between the first 20 embedding dimensions
    # (transpose so each dimension is one variable for np.corrcoef).
    corr = np.corrcoef(sample_features[:, :20].T)
    im = corr_ax.imshow(corr, cmap='RdBu_r', vmin=-1, vmax=1)
    fig.colorbar(im, ax=corr_ax)
    corr_ax.set_title('Feature Correlation (first 20 dims)')

    fig.tight_layout()

    # savefig does not create missing directories, so make sure the target exists.
    figure_path = Path('outputs/figures/image_features.png')
    figure_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(figure_path, dpi=150)
    plt.show()

## 7. Prepare Final Dataset

In [None]:
# Start from a fresh preprocessor and point it at the raw CSV files.
preprocessor = PropertyDataPreprocessor()
preprocessor.load_data(train_path='data/raw/train.csv', test_path='data/raw/test.csv')

# Build the training dictionary (val_size=0.2, random_state=42).
data = preprocessor.prepare_for_training(val_size=0.2, random_state=42)

# Report split sizes, then list every feature column with a 1-based position.
print(f"Training samples: {len(data['X_train'])}")
print(f"Validation samples: {len(data['X_val'])}")
feature_columns = data['feature_columns']
print(f"Number of features: {len(feature_columns)}")
print("\nFeature columns:")
for position, column_name in enumerate(feature_columns, start=1):
    print(f"  {position}. {column_name}")

In [None]:
# Save preprocessor for later use.
preprocessor_path = Path('data/processed/preprocessor.pkl')
# Create the output directory first so the save does not depend on an earlier
# cell (or a previous run) having created data/processed.
preprocessor_path.parent.mkdir(parents=True, exist_ok=True)
preprocessor.save_preprocessor('data/processed/preprocessor.pkl')
print("Preprocessor saved!")

## Summary

### Data Cleaning
- Handled missing values with median imputation
- Converted date to year/month features

### Feature Engineering (30+ features)
- **Age features**: property age, years since renovation
- **Size ratios**: living/lot, above/living, basement ratio
- **Neighborhood comparison**: living vs neighbors, lot vs neighbors
- **Room features**: total rooms, sqft per room
- **Quality**: grade × condition score
- **Binary**: has basement, was renovated

### Image Features
- 256-dimensional embeddings from EfficientNet-B0 (pretrained on ImageNet)
- Captures visual context from satellite imagery

### Next Steps
- Proceed to model training notebook (03_model_training.ipynb)
- Or run `python run_improved_pipeline.py` for the best model (R² = 0.9003)