In [11]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor


In [12]:
# Load the dataset from 'cleaned_airbnb_crime.csv'. The path is relative, so it
# is resolved against the kernel's current working directory.

airbnb_crime = pd.read_csv('cleaned_airbnb_crime.csv')

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor

# -----------------------------
# 1️⃣ Target Encoding Function
# -----------------------------
def target_encode(train_series, target_series, smoothing=10):
    """Smoothed mean-target encoding of a categorical column.

    Each category is replaced by a weighted blend of its own target mean and
    the global target mean::

        (count * category_mean + smoothing * global_mean) / (count + smoothing)

    so categories with few rows are pulled towards the global mean.

    Parameters
    ----------
    train_series : pandas.Series
        Categorical values of the training rows.
    target_series : pandas.Series
        Target values, aligned with ``train_series`` by index.
    smoothing : float, default 10
        Weight of the global mean in the blend.

    Returns
    -------
    encoded : pandas.Series
        ``train_series`` mapped to its smoothed category mean. Rows whose
        category has no entry in the mapping (e.g. missing values) receive
        ``global_mean``, the same fallback the caller applies to unseen
        categories in held-out data.
    smooth : pandas.Series
        Category -> smoothed mean mapping, reusable on other data via ``.map``.
    global_mean : float
        Mean of ``target_series``.
    """
    global_mean = target_series.mean()

    # Group the target directly by the category values (index-aligned). Unlike
    # a to_frame().join(...) round trip this does not depend on either Series
    # having a name, nor on the two names being different.
    stats = target_series.groupby(train_series).agg(['mean', 'count'])
    smooth = (stats['count'] * stats['mean'] + smoothing * global_mean) / (stats['count'] + smoothing)

    # groupby drops NaN keys, so rows with a missing category have no mapping
    # entry; fall back to the global mean instead of leaving NaN behind.
    encoded = train_series.map(smooth).astype(float).fillna(global_mean)
    return encoded, smooth, global_mean

# -----------------------------
# 2️⃣ Preprocessing
# -----------------------------
# Features are everything except the price; the model is fit on log1p(price)
X = airbnb_crime.drop(columns=['price'])
y = np.log1p(airbnb_crime['price'])

# Swap the raw review date for the number of days between it and the most
# recent review date found in the data
if 'last_review' in X.columns:
    X['last_review'] = pd.to_datetime(X['last_review'])
    reference_date = X['last_review'].max()
    X = X.assign(
        days_since_last_review=(reference_date - X['last_review']).dt.days
    ).drop(columns=['last_review'])

# 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Columns to target-encode (extend the list if needed)
categorical_cols = ['neighbourhood_group_cleansed', 'neighbourhood_cleansed', 'room_type']  # add more if needed

# The encoding is fitted on the training split only; test categories that were
# never seen in training fall back to the training global mean.
# NOTE(review): training rows are encoded with statistics that include their
# own target, and this happens before cross-validation - worth confirming
# whether out-of-fold encoding is wanted here.
for col in categorical_cols:
    if col not in X_train.columns:
        continue
    encoded_name = col + '_enc'
    train_encoded, encoding_map, global_mean = target_encode(X_train[col], y_train)
    X_train[encoded_name] = train_encoded
    X_test[encoded_name] = X_test[col].map(encoding_map).fillna(global_mean)

# Remove the raw categorical columns (whichever are present), then keep only
# the numeric columns
X_train = X_train.drop(columns=categorical_cols, errors='ignore').select_dtypes(include=[np.number])
X_test = X_test.drop(columns=categorical_cols, errors='ignore').select_dtypes(include=[np.number])

# -----------------------------
# 3️⃣ Optional: Interaction Features
# -----------------------------
# Product of the two encoded columns, added to both splits only when both
# encoded columns exist in the training frame
group_enc, room_enc = 'neighbourhood_group_cleansed_enc', 'room_type_enc'
if {group_enc, room_enc}.issubset(X_train.columns):
    for frame in (X_train, X_test):
        frame['neigh_room_interaction'] = frame[group_enc] * frame[room_enc]

# -----------------------------
# 4️⃣ XGBoost Regressor with Hyperparameter Tuning
# -----------------------------
# Base estimator: squared-error objective, fixed seed
xgb = XGBRegressor(objective='reg:squarederror', random_state=42)

# Discrete candidate values for each hyperparameter sampled by the search
param_dist = dict(
    n_estimators=[200, 400, 600],
    max_depth=[3, 5, 7],
    learning_rate=[0.01, 0.05, 0.1],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[1, 1.5, 2],
)

# 25 random draws from the grid, 3-fold CV each, scored by (negated) RMSE on
# the log-transformed target
search = RandomizedSearchCV(
    estimator=xgb,
    param_distributions=param_dist,
    n_iter=25,
    scoring='neg_root_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=-1,
)

# Run the search and keep the best estimator it found
best_model = search.fit(X_train, y_train).best_estimator_

# -----------------------------
# 5️⃣ Evaluate Model
# -----------------------------
def evaluate_model(model, X_train, y_train, X_test, y_test):
    """Print R², RMSE and MAE for the train and test splits.

    Both the model's predictions and the supplied targets are passed through
    ``np.expm1`` before scoring, i.e. the targets are expected to be
    ``log1p``-transformed and the metrics are reported on the untransformed
    scale. Nothing is returned.
    """
    splits = [
        ('--- Train Set ---', X_train, y_train),
        ('\n--- Test Set ---', X_test, y_test),
    ]

    # Predict for both splits first, then undo the log transform on the targets
    predicted = [np.expm1(model.predict(features)) for _, features, _ in splits]
    actual = [np.expm1(target) for _, _, target in splits]

    for (header, _, _), truth, guess in zip(splits, actual, predicted):
        print(header)
        print('R² Score:', r2_score(truth, guess))
        print('RMSE:', np.sqrt(mean_squared_error(truth, guess)))
        print('MAE:', mean_absolute_error(truth, guess))

# Print train/test metrics for the tuned model. evaluate_model applies expm1 to
# predictions and targets, so the numbers are on the original price scale.
evaluate_model(best_model, X_train, y_train, X_test, y_test)



Fitting 3 folds for each of 25 candidates, totalling 75 fits
--- Train Set ---
R² Score: 0.5066141339246377
RMSE: 143.0996877524965
MAE: 31.600302682195714

--- Test Set ---
R² Score: 0.5396764129293494
RMSE: 109.5243820851694
MAE: 37.295600249624336


1️⃣ Comparison of metrics
Metric	Before	After
Train R²	0.551	0.507
Train RMSE	136.58	143.10
Train MAE	31.62	31.60
Test R²	0.572	0.540
Test RMSE	105.64	109.52
Test MAE	37.96	37.30
2️⃣ Observations

Slight drop in R² and increase in RMSE

After the full preprocessing + hyperparameter search, the train R² decreased slightly (0.551 → 0.507) and RMSE increased.

Test R² also dropped (0.572 → 0.540) and RMSE slightly increased.

This suggests the model may be slightly underfitting compared to the simpler baseline.

MAE is roughly unchanged

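MAE stayed almost the same (train 31.62 → 31.60, test 37.96 → 37.30), indicating the average absolute error hasn’t worsened — the model still predicts typically priced listings about as well as before. Because RMSE rose while MAE did not, the extra error is concentrated in a small number of listings with large misses.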

Regularization / hyperparameter tuning effect

The RandomizedSearchCV might have picked more conservative hyperparameters, leading to slightly lower variance (less overfitting) but slightly higher bias.

Sometimes the “improved” pipeline trades small accuracy for more robustness.

Target encoding + interaction features

These changes don’t always guarantee higher R² immediately — the effect depends on correlations in your dataset.

Target encoding helps high-cardinality features, but if the feature doesn’t strongly predict the target, it can slightly increase noise.

3️⃣ Recommendations to push performance further

Feature selection / engineering

Try removing weak features like days_since_last_review if they don’t correlate with price.

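Add more interaction terms or aggregated statistics (e.g., listing count or median price per neighborhood, computed on the training split only). Note that the average log-price per neighborhood is already captured by the target encoding.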

Try leakage-resistant target encoders

Instead of simple target encoding, you could use CatBoost encoding or Leave-One-Out encoding to reduce leakage.

Hyperparameter tuning adjustments

Expand search space (e.g., higher max_depth or more n_estimators) to capture more complexity.

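Consider early stopping with eval_set so that a larger n_estimators can be tried without overfitting; the number of boosting rounds is then chosen on held-out data.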

Outlier treatment

Even with log-transform, extreme Airbnb prices can skew RMSE.

Clipping or filtering extreme listings could slightly improve R².

✅ Summary

The “improved” pipeline is more robust but slightly underfits.

MAE staying similar means typical predictions are stable.

Further gains likely require additional feature engineering, careful hyperparameter tuning, and possibly outlier handling.