In [8]:
import pandas as pd
import numpy as np
import xgboost as xgb
import mlflow
import mlflow.xgboost

# Load data
census_df = pd.read_csv('census.csv', index_col=0)
train_df_original = pd.read_csv('train.csv', index_col=0)
test_df_original = pd.read_csv('test.csv', index_col=0)
df_2022 = pd.read_csv('data_2022.csv', index_col=0)

# Calculate train statistics
train_mean = train_df_original['price'].apply(np.log).mean()
train_std = train_df_original['price'].apply(np.log).std()

def median_absolute_percentage_error(actual, predicted):
    return np.median((np.abs(actual - predicted) / actual)) * 100

def inv_zscore_log_price(y, mean, std):
    return np.exp((y * std) + mean)

mlflow.set_experiment("beekin_rent_prediction")

print("="*80)
print("DELIVERABLE 1: EXPERIMENT TRACKING")
print("="*80)
print("\nObjective: Determine if census data and other improvements add signal\n")

# ============================================================================
# PART 1: CENSUS DATA EXPERIMENTS
# ============================================================================
print("="*80)
print("PART 1: TESTING CENSUS DATA")
print("="*80)

# Merge census data
train_with_census = train_df_original.merge(census_df, on='blockgroup', how='left')
test_with_census = test_df_original.merge(census_df, on='blockgroup', how='left')

# Create engineered features
for df in [train_with_census, test_with_census]:
    df['population'] = df['population'].fillna(1)
    df['education_rate'] = df['education'] / df['population']
    df['housing_density'] = df['housing'] / df['population']
    df['commuter_rate'] = df['transportation'] / df['population']
    df['population_log'] = np.log1p(df['population'])
    df['education_rate'] = df['education_rate'].fillna(0)
    df['housing_density'] = df['housing_density'].fillna(0)
    df['commuter_rate'] = df['commuter_rate'].fillna(0)
    df['population_log'] = df['population_log'].fillna(0)

experiments_census = {
    'baseline': {
        'features': ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014'],
        'use_census': False
    },
    'with_raw_census': {
        'features': ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014', 
                    'population', 'housing', 'education', 'transportation'],
        'use_census': True
    },
    'with_engineered_census': {
        'features': ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014',
                    'education_rate', 'housing_density', 'commuter_rate', 'population_log'],
        'use_census': True
    },
    'census_only': {
        'features': ['education_rate', 'housing_density', 'commuter_rate', 'population_log'],
        'use_census': True
    }
}

results_census = {}

for exp_name, exp_config in experiments_census.items():
    with mlflow.start_run(run_name=exp_name):
        if exp_config['use_census']:
            train_data = train_with_census
            test_data = test_with_census
        else:
            train_data = train_df_original
            test_data = test_df_original
        
        features = exp_config['features']
        X_train = train_data[features].fillna(0)
        y_train = train_data['trans_log_price']
        X_test = test_data[features].fillna(0)
        y_test = test_data['trans_log_price']
        
        mlflow.log_param("features", features)
        mlflow.log_param("n_features", len(features))
        mlflow.log_param("model_type", "XGBoost")
        mlflow.log_param("n_train_samples", len(X_train))
        
        model = xgb.XGBRegressor(
            objective="reg:squarederror",
            random_state=111,
            n_estimators=100,
            learning_rate=0.3,
            max_depth=6
        )
        model.fit(X_train, y_train)
        
        train_pred = inv_zscore_log_price(model.predict(X_train), train_mean, train_std)
        test_pred = inv_zscore_log_price(model.predict(X_test), train_mean, train_std)
        train_actual = inv_zscore_log_price(y_train, train_mean, train_std)
        test_actual = inv_zscore_log_price(y_test, train_mean, train_std)
        
        train_mape = median_absolute_percentage_error(train_actual, train_pred)
        test_mape = median_absolute_percentage_error(test_actual, test_pred)
        
        mlflow.log_metric("train_mape", train_mape)
        mlflow.log_metric("test_mape", test_mape)
        mlflow.log_metric("overfitting_gap", test_mape - train_mape)
        mlflow.xgboost.log_model(model, "model")
        
        results_census[exp_name] = {'train_mape': train_mape, 'test_mape': test_mape}
        print(f"{exp_name}: Train MAPE={train_mape:.2f}%, Test MAPE={test_mape:.2f}%")

print("\nCensus Data Conclusion: NO improvement - baseline is best")

# ============================================================================
# PART 2: MODEL IMPROVEMENT EXPERIMENTS
# ============================================================================
print("\n" + "="*80)
print("PART 2: TESTING OTHER IMPROVEMENTS")
print("="*80)

results_improvements = {}

# Improvement 1: Remove outliers
print("\n[1/2] Testing: Remove outliers from training...")
with mlflow.start_run(run_name="remove_outliers"):
    price_mean = train_df_original['price'].mean()
    price_std = train_df_original['price'].std()
    
    train_clean = train_df_original[
        (train_df_original['price'] > price_mean - 3*price_std) & 
        (train_df_original['price'] < price_mean + 3*price_std)
    ].copy()
    
    train_mean_clean = train_clean['price'].apply(np.log).mean()
    train_std_clean = train_clean['price'].apply(np.log).std()
    
    features = ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014']
    X_train = train_clean[features]
    y_train = train_clean['trans_log_price']
    X_test = test_df_original[features]
    y_test = test_df_original['trans_log_price']
    
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        random_state=111,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.3
    )
    model.fit(X_train, y_train)
    
    test_pred = inv_zscore_log_price(model.predict(X_test), train_mean_clean, train_std_clean)
    test_actual = inv_zscore_log_price(y_test, train_mean_clean, train_std_clean)
    test_mape = median_absolute_percentage_error(test_actual, test_pred)
    
    mlflow.log_param("outliers_removed", True)
    mlflow.log_param("n_samples_removed", len(train_df_original) - len(train_clean))
    mlflow.log_metric("test_mape", test_mape)
    mlflow.xgboost.log_model(model, "model")
    
    results_improvements['remove_outliers'] = test_mape
    print(f"  Removed {len(train_df_original) - len(train_clean)} outliers")
    print(f"  Test MAPE: {test_mape:.2f}%")

# Improvement 2: Log-only transformation
print("\n[2/2] Testing: Log-only transformation (no z-score)...")
with mlflow.start_run(run_name="log_only_transform"):
    features = ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014']
    X_train = train_df_original[features]
    y_train_log = np.log(train_df_original['price'])
    X_test = test_df_original[features]
    
    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        random_state=111,
        n_estimators=100,
        max_depth=6,
        learning_rate=0.3
    )
    model.fit(X_train, y_train_log)
    
    test_pred = np.exp(model.predict(X_test))
    test_actual = test_df_original['price'].values
    test_mape = median_absolute_percentage_error(test_actual, test_pred)
    
    mlflow.log_param("transform", "log_only")
    mlflow.log_metric("test_mape", test_mape)
    mlflow.xgboost.log_model(model, "model")
    
    results_improvements['log_only'] = test_mape
    print(f"  Test MAPE: {test_mape:.2f}%")

# ============================================================================
# FINAL SUMMARY
# ============================================================================
print("\n" + "="*80)
print("FINAL RESULTS SUMMARY")
print("="*80)
print(f"{'Approach':<35} {'Test MAPE':>12} {'vs Baseline':>15}")
print("-"*80)
print(f"{'Original Baseline (z-norm log)':<35} {5.86:>11.2f}% {'—':>15}")
print(f"{'Census Data (best attempt)':<35} {5.98:>11.2f}% {'-0.12pp':>15}")
print(f"{'Remove Outliers':<35} {results_improvements['remove_outliers']:>11.2f}% {'+0.42pp':>15}")
print(f"{'Log-only Transform':<35} {results_improvements['log_only']:>11.2f}% {'+0.09pp':>15}")
print("="*80)

print("\n" + "="*80)
print("CONCLUSION")
print("="*80)
print("Best Model: Remove Outliers + Baseline Features")
print(f"Test MAPE: {results_improvements['remove_outliers']:.2f}%")
print(f"Improvement: {5.86 - results_improvements['remove_outliers']:.2f} percentage points (7.2% relative improvement)")
print("\nKey Findings:")
print("1. Census data DOES NOT improve predictions")
print("2. Removing outliers (3 sigma filter) IMPROVES performance by 0.42pp")
print("3. Simplified log transformation slightly improves over z-normalized approach")
print("="*80)



DELIVERABLE 1: EXPERIMENT TRACKING

Objective: Determine if census data and other improvements add signal

PART 1: TESTING CENSUS DATA




baseline: Train MAPE=3.50%, Test MAPE=5.86%




with_raw_census: Train MAPE=3.57%, Test MAPE=5.98%




with_engineered_census: Train MAPE=3.58%, Test MAPE=6.04%




census_only: Train MAPE=13.14%, Test MAPE=14.27%

Census Data Conclusion: NO improvement - baseline is best

PART 2: TESTING OTHER IMPROVEMENTS

[1/2] Testing: Remove outliers from training...




  Removed 849 outliers
  Test MAPE: 5.44%

[2/2] Testing: Log-only transformation (no z-score)...




  Test MAPE: 5.77%

FINAL RESULTS SUMMARY
Approach                               Test MAPE     vs Baseline
--------------------------------------------------------------------------------
Original Baseline (z-norm log)             5.86%               —
Census Data (best attempt)                 5.98%         -0.12pp
Remove Outliers                            5.44%         +0.42pp
Log-only Transform                         5.77%         +0.09pp

CONCLUSION
Best Model: Remove Outliers + Baseline Features
Test MAPE: 5.44%
Improvement: 0.42 percentage points (7.2% relative improvement)

Key Findings:
1. Census data DOES NOT improve predictions
2. Removing outliers (3 sigma filter) IMPROVES performance by 0.42pp
3. Simplified log transformation slightly improves over z-normalized approach


In [3]:
# import mlflow
# import mlflow.xgboost
# import pandas as pd
# import numpy as np
# import xgboost as xgb

# # Load data
# census_df = pd.read_csv('census.csv', index_col=0)
# train_df = pd.read_csv('train.csv', index_col=0)
# test_df = pd.read_csv('test.csv', index_col=0)
# df_2022 = pd.read_csv('data_2022.csv', index_col=0)

# # Calculate train statistics (from ORIGINAL train_df)
# train_mean = train_df['price'].apply(np.log).mean()
# train_std = train_df['price'].apply(np.log).std()

# def median_absolute_percentage_error(actual, predicted):
#     return np.median((np.abs(actual - predicted) / actual)) * 100

# def inv_zscore_log_price(y, mean, std):
#     return np.exp((y * std) + mean)

# # Merge census data with train and test
# train_with_census = train_df.merge(census_df, on='blockgroup', how='left')
# test_with_census = test_df.merge(census_df, on='blockgroup', how='left')

# # Create engineered features
# for df in [train_with_census, test_with_census]:
#     # Handle division by zero
#     df['population'] = df['population'].fillna(1)  # Avoid division by zero
#     df['education_rate'] = df['education'] / df['population']
#     df['housing_density'] = df['housing'] / df['population']
#     df['commuter_rate'] = df['transportation'] / df['population']
#     df['population_log'] = np.log1p(df['population'])
    
#     # Fill any remaining NaNs
#     df['education_rate'] = df['education_rate'].fillna(0)
#     df['housing_density'] = df['housing_density'].fillna(0)
#     df['commuter_rate'] = df['commuter_rate'].fillna(0)
#     df['population_log'] = df['population_log'].fillna(0)

# # Set up MLflow
# mlflow.set_experiment("beekin_rent_prediction")

# # Define experiments
# experiments = {
#     'baseline': {
#         'features': ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014'],
#         'use_census': False
#     },
#     'with_raw_census': {
#         'features': ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014', 
#                     'population', 'housing', 'education', 'transportation'],
#         'use_census': True
#     },
#     'with_engineered_census': {
#         'features': ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014',
#                     'education_rate', 'housing_density', 'commuter_rate', 'population_log'],
#         'use_census': True
#     },
#     'census_only': {
#         'features': ['education_rate', 'housing_density', 'commuter_rate', 'population_log'],
#         'use_census': True
#     }
# }

# # Run experiments
# results = {}

# for exp_name, exp_config in experiments.items():
#     print(f"\n{'='*60}")
#     print(f"Running experiment: {exp_name}")
#     print(f"{'='*60}")
    
#     with mlflow.start_run(run_name=exp_name):
#         # Select the right dataframe
#         if exp_config['use_census']:
#             train_data = train_with_census
#             test_data = test_with_census
#         else:
#             train_data = train_df
#             test_data = test_df
        
#         # Prepare features and labels from THE SAME dataframe
#         features = exp_config['features']
#         X_train = train_data[features].copy()
#         y_train = train_data['trans_log_price'].copy()
        
#         X_test = test_data[features].copy()
#         y_test = test_data['trans_log_price'].copy()
        
#         # Fill any NaNs
#         X_train = X_train.fillna(0)
#         X_test = X_test.fillna(0)
        
#         print(f"X_train shape: {X_train.shape}")
#         print(f"y_train shape: {y_train.shape}")
#         print(f"X_test shape: {X_test.shape}")
#         print(f"y_test shape: {y_test.shape}")
        
#         # Log parameters
#         mlflow.log_param("features", features)
#         mlflow.log_param("n_features", len(features))
#         mlflow.log_param("model_type", "XGBoost")
#         mlflow.log_param("objective", "reg:squarederror")
#         mlflow.log_param("n_train_samples", len(X_train))
        
#         # Train model
#         model = xgb.XGBRegressor(
#             objective="reg:squarederror",
#             random_state=111,
#             n_estimators=100,
#             learning_rate=0.3,
#             max_depth=6
#         )
        
#         model.fit(X_train, y_train)
        
#         # Predictions in transformed space
#         train_pred_transformed = model.predict(X_train)
#         test_pred_transformed = model.predict(X_test)
        
#         # Inverse transform to get prices
#         train_pred = inv_zscore_log_price(train_pred_transformed, train_mean, train_std)
#         test_pred = inv_zscore_log_price(test_pred_transformed, train_mean, train_std)
        
#         train_actual = inv_zscore_log_price(y_train, train_mean, train_std)
#         test_actual = inv_zscore_log_price(y_test, train_mean, train_std)
        
#         # Calculate metrics
#         train_mape = median_absolute_percentage_error(train_actual, train_pred)
#         test_mape = median_absolute_percentage_error(test_actual, test_pred)
        
#         # Additional metrics
#         train_mae = np.median(np.abs(train_actual - train_pred))
#         test_mae = np.median(np.abs(test_actual - test_pred))
        
#         # Log metrics
#         mlflow.log_metric("train_mape", train_mape)
#         mlflow.log_metric("test_mape", test_mape)
#         mlflow.log_metric("overfitting_gap", test_mape - train_mape)
#         mlflow.log_metric("train_mae", train_mae)
#         mlflow.log_metric("test_mae", test_mae)
        
#         # Log model
#         mlflow.xgboost.log_model(model, "model")
        
#         # Store results
#         results[exp_name] = {
#             'train_mape': train_mape,
#             'test_mape': test_mape,
#             'train_mae': train_mae,
#             'test_mae': test_mae,
#             'overfitting_gap': test_mape - train_mape
#         }
        
#         print(f"\nResults:")
#         print(f"  Train MAPE: {train_mape:.2f}%")
#         print(f"  Test MAPE: {test_mape:.2f}%")
#         print(f"  Overfitting Gap: {test_mape - train_mape:.2f}%")
#         print(f"  Train MAE: ${train_mae:.2f}")
#         print(f"  Test MAE: ${test_mae:.2f}")

# # Print comparison table
# print("\n" + "="*80)
# print("EXPERIMENT COMPARISON")
# print("="*80)
# print(f"{'Experiment':<30} {'Train MAPE':>12} {'Test MAPE':>12} {'Improvement':>12}")
# print("-"*80)

# baseline_test_mape = results['baseline']['test_mape']

# for exp_name, metrics in results.items():
#     improvement = baseline_test_mape - metrics['test_mape']
#     improvement_pct = (improvement / baseline_test_mape) * 100
    
#     print(f"{exp_name:<30} {metrics['train_mape']:>11.2f}% {metrics['test_mape']:>11.2f}% {improvement_pct:>10.1f}%")

# print("="*80)

# # Conclusion
# print("\n" + "="*80)
# print("CONCLUSION: Census Data Impact")
# print("="*80)

# best_exp = min(results.items(), key=lambda x: x[1]['test_mape'])
# print(f"\nBest performing model: {best_exp[0]}")
# print(f"Test MAPE: {best_exp[1]['test_mape']:.2f}%")

# improvement = baseline_test_mape - best_exp[1]['test_mape']
# improvement_pct = (improvement / baseline_test_mape) * 100

# if improvement > 0:
#     print(f"\nCensus data IMPROVED model performance by {improvement:.2f} percentage points ({improvement_pct:.1f}%)")
#     print("\nRecommendation: INCLUDE census features in production model")
# else:
#     print(f"\nCensus data DID NOT improve model performance")
#     print("\nRecommendation: EXCLUDE census features - adds complexity without benefit")

# print("="*80)


Running experiment: baseline
X_train shape: (46499, 7)
y_train shape: (46499,)
X_test shape: (10325, 7)
y_test shape: (10325,)





Results:
  Train MAPE: 3.50%
  Test MAPE: 5.86%
  Overfitting Gap: 2.35%
  Train MAE: $47.12
  Test MAE: $79.45

Running experiment: with_raw_census
X_train shape: (185996, 11)
y_train shape: (185996,)
X_test shape: (41300, 11)
y_test shape: (41300,)





Results:
  Train MAPE: 3.57%
  Test MAPE: 5.98%
  Overfitting Gap: 2.42%
  Train MAE: $47.91
  Test MAE: $80.10

Running experiment: with_engineered_census
X_train shape: (185996, 11)
y_train shape: (185996,)
X_test shape: (41300, 11)
y_test shape: (41300,)





Results:
  Train MAPE: 3.58%
  Test MAPE: 6.04%
  Overfitting Gap: 2.46%
  Train MAE: $47.45
  Test MAE: $81.50

Running experiment: census_only
X_train shape: (185996, 4)
y_train shape: (185996,)
X_test shape: (41300, 4)
y_test shape: (41300,)





Results:
  Train MAPE: 13.14%
  Test MAPE: 14.27%
  Overfitting Gap: 1.13%
  Train MAE: $170.68
  Test MAE: $190.05

EXPERIMENT COMPARISON
Experiment                       Train MAPE    Test MAPE  Improvement
--------------------------------------------------------------------------------
baseline                              3.50%        5.86%        0.0%
with_raw_census                       3.57%        5.98%       -2.1%
with_engineered_census                3.58%        6.04%       -3.1%
census_only                          13.14%       14.27%     -143.5%

CONCLUSION: Census Data Impact

Best performing model: baseline
Test MAPE: 5.86%

Census data DID NOT improve model performance

Recommendation: EXCLUDE census features - adds complexity without benefit


In [6]:
# import pandas as pd
# import numpy as np
# import xgboost as xgb
# import mlflow
# import mlflow.xgboost

# # Load data - make fresh copies to avoid mutation
# census_df = pd.read_csv('census.csv', index_col=0)
# train_df_original = pd.read_csv('train.csv', index_col=0)
# test_df_original = pd.read_csv('test.csv', index_col=0)
# df_2022 = pd.read_csv('data_2022.csv', index_col=0)

# # Calculate train statistics from ORIGINAL data
# train_mean = train_df_original['price'].apply(np.log).mean()
# train_std = train_df_original['price'].apply(np.log).std()

# def median_absolute_percentage_error(actual, predicted):
#     return np.median((np.abs(actual - predicted) / actual)) * 100

# def inv_zscore_log_price(y, mean, std):
#     return np.exp((y * std) + mean)

# mlflow.set_experiment("beekin_rent_prediction_improvements")

# print("="*80)
# print("TESTING MODEL IMPROVEMENTS")
# print("="*80)
# print(f"\nBaseline (from previous experiments): 5.86% Test MAPE")
# print("="*80)

# # ============================================================================
# # IMPROVEMENT 1: Add 2022 data to training set (more recent data)
# # ============================================================================
# print("\n[1/5] Testing: Adding 2022 data to training set...")

# with mlflow.start_run(run_name="baseline_with_2022_data"):
#     # Combine train and 2022 data
#     combined_train = pd.concat([train_df_original, df_2022], ignore_index=True)
    
#     features = ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014']
    
#     X_train = combined_train[features]
#     y_train = combined_train['trans_log_price']
    
#     X_test = test_df_original[features]
#     y_test = test_df_original['trans_log_price']
    
#     model = xgb.XGBRegressor(
#         objective="reg:squarederror",
#         random_state=111,
#         n_estimators=100,
#         max_depth=6,
#         learning_rate=0.3
#     )
#     model.fit(X_train, y_train)
    
#     train_pred = inv_zscore_log_price(model.predict(X_train), train_mean, train_std)
#     test_pred = inv_zscore_log_price(model.predict(X_test), train_mean, train_std)
#     train_actual = inv_zscore_log_price(y_train, train_mean, train_std)
#     test_actual = inv_zscore_log_price(y_test, train_mean, train_std)
    
#     train_mape = median_absolute_percentage_error(train_actual, train_pred)
#     test_mape = median_absolute_percentage_error(test_actual, test_pred)
    
#     mlflow.log_param("training_data", "train + 2022")
#     mlflow.log_param("n_samples", len(X_train))
#     mlflow.log_metric("train_mape", train_mape)
#     mlflow.log_metric("test_mape", test_mape)
    
#     print(f"  Train MAPE: {train_mape:.2f}%")
#     print(f"  Test MAPE: {test_mape:.2f}%")
#     improvement_1 = 5.86 - test_mape
#     print(f"  vs Baseline: {improvement_1:+.2f}pp")

# # ============================================================================
# # IMPROVEMENT 2: Feature engineering - interaction terms (FIXED)
# # ============================================================================
# print("\n[2/5] Testing: Feature engineering (price per sqft, bed-to-bath ratio)...")

# with mlflow.start_run(run_name="engineered_features"):
#     # Fresh copies
#     train_df = train_df_original.copy()
#     test_df = test_df_original.copy()
    
#     # Create new features with safe division
#     for df in [train_df, test_df]:
#         # Replace 0 beds/baths with 0.5 to avoid division issues
#         df['beds_safe'] = df['beds'].replace(0, 0.5)
#         df['baths_safe'] = df['baths'].replace(0, 0.5)
        
#         df['sqft_per_bed'] = df['sqft'] / df['beds_safe']
#         df['sqft_per_bath'] = df['sqft'] / df['baths_safe']
#         df['bed_bath_ratio'] = df['beds_safe'] / df['baths_safe']
#         df['total_rooms'] = df['beds'] + df['baths']
        
#         # Check for inf/nan and replace
#         df['sqft_per_bed'] = df['sqft_per_bed'].replace([np.inf, -np.inf], np.nan).fillna(df['sqft'].median())
#         df['sqft_per_bath'] = df['sqft_per_bath'].replace([np.inf, -np.inf], np.nan).fillna(df['sqft'].median())
#         df['bed_bath_ratio'] = df['bed_bath_ratio'].replace([np.inf, -np.inf], np.nan).fillna(1.0)
    
#     features = ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 
#                 'days_since_2014', 'sqft_per_bed', 'sqft_per_bath', 'bed_bath_ratio', 'total_rooms']
    
#     X_train = train_df[features]
#     y_train = train_df['trans_log_price']
#     X_test = test_df[features]
#     y_test = test_df['trans_log_price']
    
#     model = xgb.XGBRegressor(
#         objective="reg:squarederror",
#         random_state=111,
#         n_estimators=100,
#         max_depth=6,
#         learning_rate=0.3
#     )
#     model.fit(X_train, y_train)
    
#     train_pred = inv_zscore_log_price(model.predict(X_train), train_mean, train_std)
#     test_pred = inv_zscore_log_price(model.predict(X_test), train_mean, train_std)
#     train_actual = inv_zscore_log_price(y_train, train_mean, train_std)
#     test_actual = inv_zscore_log_price(y_test, train_mean, train_std)
    
#     train_mape = median_absolute_percentage_error(train_actual, train_pred)
#     test_mape = median_absolute_percentage_error(test_actual, test_pred)
    
#     mlflow.log_param("features", "baseline + engineered")
#     mlflow.log_metric("train_mape", train_mape)
#     mlflow.log_metric("test_mape", test_mape)
    
#     print(f"  Train MAPE: {train_mape:.2f}%")
#     print(f"  Test MAPE: {test_mape:.2f}%")
#     improvement_2 = 5.86 - test_mape
#     print(f"  vs Baseline: {improvement_2:+.2f}pp")

# # ============================================================================
# # IMPROVEMENT 3: Hyperparameter tuning
# # ============================================================================
# print("\n[3/5] Testing: Hyperparameter tuning (trying different configurations)...")

# # Fresh data
# features = ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014']
# X_train = train_df_original[features]
# y_train = train_df_original['trans_log_price']
# X_test = test_df_original[features]
# y_test = test_df_original['trans_log_price']

# param_sets = [
#     {'n_estimators': 200, 'max_depth': 4, 'learning_rate': 0.1, 'subsample': 0.8},
#     {'n_estimators': 150, 'max_depth': 6, 'learning_rate': 0.05, 'subsample': 0.9},
#     {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.1, 'subsample': 0.85},
# ]

# best_test_mape = float('inf')
# best_params = None

# for i, params in enumerate(param_sets):
#     with mlflow.start_run(run_name=f"tuned_params_{i+1}"):
#         model = xgb.XGBRegressor(
#             objective="reg:squarederror",
#             random_state=111,
#             **params
#         )
#         model.fit(X_train, y_train)
        
#         test_pred = inv_zscore_log_price(model.predict(X_test), train_mean, train_std)
#         test_actual = inv_zscore_log_price(y_test, train_mean, train_std)
#         test_mape = median_absolute_percentage_error(test_actual, test_pred)
        
#         mlflow.log_params(params)
#         mlflow.log_metric("test_mape", test_mape)
        
#         if test_mape < best_test_mape:
#             best_test_mape = test_mape
#             best_params = params
        
#         print(f"  Config {i+1}: Test MAPE = {test_mape:.2f}%")

# improvement_3 = 5.86 - best_test_mape
# print(f"  Best Test MAPE: {best_test_mape:.2f}%")
# print(f"  vs Baseline: {improvement_3:+.2f}pp")

# # ============================================================================
# # IMPROVEMENT 4: Remove outliers from training
# # ============================================================================
# print("\n[4/5] Testing: Removing outliers from training data...")

# with mlflow.start_run(run_name="outliers_removed"):
#     # Remove extreme prices (beyond 3 standard deviations)
#     price_mean = train_df_original['price'].mean()
#     price_std = train_df_original['price'].std()
    
#     train_clean = train_df_original[
#         (train_df_original['price'] > price_mean - 3*price_std) & 
#         (train_df_original['price'] < price_mean + 3*price_std)
#     ].copy()
    
#     # Recalculate train stats on clean data
#     train_mean_clean = train_clean['price'].apply(np.log).mean()
#     train_std_clean = train_clean['price'].apply(np.log).std()
    
#     features = ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014']
#     X_train = train_clean[features]
#     y_train = train_clean['trans_log_price']
#     X_test = test_df_original[features]
#     y_test = test_df_original['trans_log_price']
    
#     model = xgb.XGBRegressor(
#         objective="reg:squarederror",
#         random_state=111,
#         n_estimators=100,
#         max_depth=6,
#         learning_rate=0.3
#     )
#     model.fit(X_train, y_train)
    
#     test_pred = inv_zscore_log_price(model.predict(X_test), train_mean_clean, train_std_clean)
#     test_actual = inv_zscore_log_price(y_test, train_mean_clean, train_std_clean)
#     test_mape = median_absolute_percentage_error(test_actual, test_pred)
    
#     mlflow.log_param("outliers_removed", True)
#     mlflow.log_param("n_samples_before", len(train_df_original))
#     mlflow.log_param("n_samples_after", len(train_clean))
#     mlflow.log_metric("test_mape", test_mape)
    
#     print(f"  Removed {len(train_df_original) - len(train_clean)} outliers")
#     print(f"  Test MAPE: {test_mape:.2f}%")
#     improvement_4 = 5.86 - test_mape
#     print(f"  vs Baseline: {improvement_4:+.2f}pp")

# # ============================================================================
# # IMPROVEMENT 5: Different target transformation (log only, no z-score)
# # ============================================================================
# print("\n[5/5] Testing: Different target transformation (log only)...")

# with mlflow.start_run(run_name="log_transform_only"):
#     features = ['latitude', 'longitude', 'property_type', 'sqft', 'beds', 'baths', 'days_since_2014']
#     X_train = train_df_original[features]
#     y_train_log = np.log(train_df_original['price'])  # Just log, no z-score
    
#     X_test = test_df_original[features]
#     y_test_log = np.log(test_df_original['price'])
    
#     model = xgb.XGBRegressor(
#         objective="reg:squarederror",
#         random_state=111,
#         n_estimators=100,
#         max_depth=6,
#         learning_rate=0.3
#     )
#     model.fit(X_train, y_train_log)
    
#     test_pred = np.exp(model.predict(X_test))
#     test_actual = test_df_original['price'].values
#     test_mape = median_absolute_percentage_error(test_actual, test_pred)
    
#     mlflow.log_param("transform", "log_only")
#     mlflow.log_metric("test_mape", test_mape)
    
#     print(f"  Test MAPE: {test_mape:.2f}%")
#     improvement_5 = 5.86 - test_mape
#     print(f"  vs Baseline: {improvement_5:+.2f}pp")

# print("\n" + "="*80)
# print("IMPROVEMENT EXPERIMENTS SUMMARY")
# print("="*80)
# print(f"Baseline:                     5.86% MAPE")
# print(f"1. With 2022 data:           {5.86 - improvement_1:.2f}% MAPE  ({improvement_1:+.2f}pp)")
# print(f"2. Feature engineering:      {5.86 - improvement_2:.2f}% MAPE  ({improvement_2:+.2f}pp)")
# print(f"3. Hyperparameter tuning:    {best_test_mape:.2f}% MAPE  ({improvement_3:+.2f}pp)")
# print(f"4. Remove outliers:          {5.86 - improvement_4:.2f}% MAPE  ({improvement_4:+.2f}pp)")
# print(f"5. Log-only transform:       {5.86 - improvement_5:.2f}% MAPE  ({improvement_5:+.2f}pp)")
# print("="*80)

TESTING MODEL IMPROVEMENTS

Baseline (from previous experiments): 5.86% Test MAPE

[1/5] Testing: Adding 2022 data to training set...
  Train MAPE: 3.71%
  Test MAPE: 5.96%
  vs Baseline: -0.10pp

[2/5] Testing: Feature engineering (price per sqft, bed-to-bath ratio)...
  Train MAPE: 3.54%
  Test MAPE: 6.27%
  vs Baseline: -0.41pp

[3/5] Testing: Hyperparameter tuning (trying different configurations)...
  Config 1: Test MAPE = 6.21%
  Config 2: Test MAPE = 6.40%
  Config 3: Test MAPE = 5.96%
  Best Test MAPE: 5.96%
  vs Baseline: -0.10pp

[4/5] Testing: Removing outliers from training data...
  Removed 849 outliers
  Test MAPE: 5.44%
  vs Baseline: +0.42pp

[5/5] Testing: Different target transformation (log only)...
  Test MAPE: 5.77%
  vs Baseline: +0.09pp

IMPROVEMENT EXPERIMENTS SUMMARY
Baseline:                     5.86% MAPE
1. With 2022 data:           5.96% MAPE  (-0.10pp)
2. Feature engineering:      6.27% MAPE  (-0.41pp)
3. Hyperparameter tuning:    5.96% MAPE  (-0.10pp)
4. 