# Notebook 5.3 - Testing spatial lag as a feature

# Import libraries

In [97]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np
import pandas as pd
from libpysal.weights import KNN, lag_spatial
from scipy.spatial import cKDTree
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.special import inv_boxcox

# Choosing the city

In [98]:
# City selector: one of 'Madrid', 'Barcelona', 'Valencia' or 'Combined'.
# The loading cell compares it case-insensitively (via city.lower()).
city = "Combined"

# Load the cleaned data sets

In [99]:
# Cleaned base-feature CSV for every supported city. The insertion order
# (Madrid, Valencia, Barcelona) is also the order in which the indicator columns
# are added and in which the frames are stacked for the combined dataset.
CITY_FILES = {
    'Madrid': '../../data/4_data_cleaned/madrid_cleaned_base_features.csv',
    'Valencia': '../../data/4_data_cleaned/valencia_cleaned_base_features.csv',
    'Barcelona': '../../data/4_data_cleaned/barcelona_cleaned_base_features.csv',
}


def _load_city(name, with_city_flags=False):
    """Read the cleaned CSV of a single city.

    Parameters
    ----------
    name : str
        A key of CITY_FILES.
    with_city_flags : bool, default False
        If True, add one boolean column per city in CITY_FILES that is True
        only in the column matching `name` (marks the source dataset).

    Returns
    -------
    pandas.DataFrame
        The loaded data, optionally with the city indicator columns appended.
    """
    frame = pd.read_csv(CITY_FILES[name])
    if with_city_flags:
        for flag in CITY_FILES:
            frame[flag] = flag == name
    return frame


# The selector is matched case-insensitively.
selection = city.lower()
city_by_key = {name.lower(): name for name in CITY_FILES}

if selection == 'combined':
    # Read all 3 datasets, tag each row with its source city, then stack them.
    madrid_df, valencia_df, barcelona_df = (
        _load_city(name, with_city_flags=True) for name in CITY_FILES
    )
    data = pd.concat([madrid_df, valencia_df, barcelona_df], ignore_index=True)
elif selection in city_by_key:
    data = _load_city(city_by_key[selection])
else:
    # Fail here with a clear message instead of a NameError on `data` later on.
    raise ValueError(
        f"Unknown city {city!r}; expected one of {list(CITY_FILES) + ['Combined']}"
    )

In [100]:
# Preview the first five rows of the loaded dataset
data.head(n=5)

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,Madrid,Valencia,Barcelona
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,40.473921,Pinar del Rey,28033,0,0,1,0,True,False,False
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,40.384968,Palomeras sureste,28018,0,1,0,0,True,False,False
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,40.384547,San Diego,28018,1,0,0,0,True,False,False
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,40.430336,Ventas,28017,1,0,0,0,True,False,False
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,40.384103,Buena Vista,28019,0,0,0,1,True,False,False


# Defining X and y

__Transform target variable PRICE as optimized in notebook 5.1__

In [101]:
# Fixed Box-Cox exponent (chosen in notebook 5.1); it is reused later to invert
# the transformation on the model predictions.
specified_lambda = 0

# Box-Cox transform PRICE with that fixed lambda and store the result as a new column
data['PRICE_TRANSFORMED'] = stats.boxcox(data['PRICE'], lmbda=specified_lambda)

__Assign features as X and target as y__

In [102]:
# Target: the transformed price. Features: every column except the raw and
# the transformed price.
y = data['PRICE_TRANSFORMED']
X = data.drop(['PRICE', 'PRICE_TRANSFORMED'], axis='columns')

# Drop features not relevant for modelling

In [103]:
# ASSETID is only an identifier; ZIP_CODE is left out because the base model
# is restricted to the base features.
non_feature_columns = ['ASSETID', 'ZIP_CODE']
X = X.drop(non_feature_columns, axis='columns')

# Split the data into training and test sets

In [104]:
# 70 % training / 30 % test split; the fixed random_state keeps it reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the shape of every resulting piece
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape: {part.shape}")

X_train shape: (104827, 43)
X_test shape: (44927, 43)
y_train shape: (104827,)
y_test shape: (44927,)


# Target encoding of base feature neighborhood

In [105]:
# Mean transformed price per NEIGHBORHOOD, learned from the training split only
# so that no test-set target information enters the feature.
neighborhood_means = y_train.groupby(X_train['NEIGHBORHOOD']).mean()

# Fallback for neighbourhoods that never occur in the training split: without it
# .map() would silently produce NaN for those rows.
global_train_mean = y_train.mean()


def _encode_neighborhood(frame):
    """Return a new frame with NEIGHBORHOOD replaced by NEIGHBORHOOD_ENCODED.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature frame that still contains the NEIGHBORHOOD column.

    Returns
    -------
    pandas.DataFrame
        Copy of `frame` with NEIGHBORHOOD_ENCODED appended as the last column
        (training mean of the neighbourhood, or the overall training mean when
        the neighbourhood was not seen in training) and NEIGHBORHOOD dropped.
    """
    encoded = frame['NEIGHBORHOOD'].map(neighborhood_means).fillna(global_train_mean)
    # .assign builds a new frame rather than writing into the split output in place.
    return frame.assign(NEIGHBORHOOD_ENCODED=encoded).drop(columns=['NEIGHBORHOOD'])


# Apply the training-derived encoding to both splits
X_train = _encode_neighborhood(X_train)
X_test = _encode_neighborhood(X_test)

In [106]:
# Preview the training features after target encoding
X_train.head(n=5)

Unnamed: 0,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,HASAIRCONDITIONING,HASNORTHORIENTATION,...,LONGITUDE,LATITUDE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,Madrid,Valencia,Barcelona,NEIGHBORHOOD_ENCODED
88583,80,3,1,3,0,1.0,0,1,1,0,...,-0.394215,39.48123,0,1,0,0,False,True,False,12.067036
48173,102,3,2,3,0,1.0,0,1,1,0,...,-3.608624,40.374728,0,0,0,1,True,False,False,12.247304
74291,38,1,1,3,0,1.0,0,1,0,0,...,-3.67721,40.429325,0,0,0,1,True,False,False,13.273539
853,43,1,1,3,0,1.0,1,1,1,0,...,-3.697117,40.422022,0,0,1,0,True,False,False,13.151636
95919,65,3,1,3,0,1.0,0,1,0,0,...,-0.400322,39.46897,1,0,0,0,False,True,False,11.740395


# Adding spatial lag as a feature

In [107]:
# Number of nearest neighbours used for the spatial weights.
# NOTE(review): this value was described as reducing the disconnected components
# to 5, but the warning printed by this cell reports 6 - confirm the tuning run.
optimized_k = 29

# (latitude, longitude) pairs of the training rows, in training-row order
coordinates_train = X_train[['LATITUDE', 'LONGITUDE']].to_numpy()

# k-nearest-neighbour spatial weights built from the training coordinates only
knn_train = KNN.from_array(coordinates_train, k=optimized_k)

# Row-standardise the weights ('r') before computing the lag
knn_train.transform = 'r'

# Spatial lag of the transformed price for every training row
train_spatial_lag = lag_spatial(knn_train, y_train)
X_train['SPATIAL_LAG'] = train_spatial_lag

 There are 6 disconnected components.
  W.__init__(self, neighbors, id_order=ids, **kwargs)


In [108]:
# Spatial lag for the test set: each test point borrows the transformed prices of
# its nearest TRAINING points, so no test-set target is used to build the feature.
tree = cKDTree(coordinates_train)

# Use the neighbour count of the training-set lag (optimized_k) instead of a
# second hard-coded 29, so both features stay in sync if k is ever re-tuned.
distances, indices = tree.query(
    X_test[['LATITUDE', 'LONGITUDE']].values, k=optimized_k
)

# Distance-decay weights for the neighbours of every test point.
# NOTE(review): the training lag uses row-standardised KNN weights, whereas this
# uses exp(-distance) on raw coordinate distances - confirm the difference is
# intended.
weights_test = np.exp(-distances)

# Normalise so the weights of every test row sum to 1
weights_test /= weights_test.sum(axis=1, keepdims=True)

# `indices` are row positions within coordinates_train, which shares its row order
# with y_train; hence the positional lookup through .values.
X_test['SPATIAL_LAG'] = np.sum(weights_test * y_train.values[indices], axis=1)

# Inspecting final data before running model

In [109]:
# Preview the training features, now including SPATIAL_LAG
X_train.head(n=5)

Unnamed: 0,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,HASAIRCONDITIONING,HASNORTHORIENTATION,...,LATITUDE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,Madrid,Valencia,Barcelona,NEIGHBORHOOD_ENCODED,SPATIAL_LAG
88583,80,3,1,3,0,1.0,0,1,1,0,...,39.48123,0,1,0,0,False,True,False,12.067036,12.067894
48173,102,3,2,3,0,1.0,0,1,1,0,...,40.374728,0,0,0,1,True,False,False,12.247304,12.41834
74291,38,1,1,3,0,1.0,0,1,0,0,...,40.429325,0,0,0,1,True,False,False,13.273539,13.170042
853,43,1,1,3,0,1.0,1,1,1,0,...,40.422022,0,0,1,0,True,False,False,13.151636,13.154386
95919,65,3,1,3,0,1.0,0,1,0,0,...,39.46897,1,0,0,0,False,True,False,11.740395,11.55967


In [110]:
# Display the first few rows of the test set
X_test.head()

Unnamed: 0,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,HASAIRCONDITIONING,HASNORTHORIENTATION,...,LATITUDE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,Madrid,Valencia,Barcelona,NEIGHBORHOOD_ENCODED,SPATIAL_LAG
55930,82,3,2,3,0,1.0,0,1,0,0,...,40.406918,0,1,0,0,True,False,False,13.275478,12.842973
110070,86,3,1,3,0,1.0,1,1,1,0,...,41.416478,0,0,1,0,False,False,True,12.368969,12.412621
133225,93,4,1,3,0,1.0,1,1,0,0,...,41.419345,0,0,1,0,False,False,True,12.092169,12.045041
57088,40,1,1,3,1,1.0,0,1,1,0,...,40.358942,0,0,0,1,True,False,False,12.247304,12.158632
103352,84,3,2,3,0,1.0,0,0,0,0,...,41.377668,1,0,0,0,False,False,True,12.413428,12.603313


# Test the model with the new feature

__Define the best parameters found by GridSearchCV in Notebook 5.1__

In [111]:
# Hyper-parameters passed as keyword arguments to the XGBoost regressor
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=500,
    subsample=0.9,
)

__Run the optimized XGBoost model__

In [112]:
def _score(y_true, y_pred):
    """Return (MAE, MSE, RMSE, MAPE) of `y_pred` measured against `y_true`."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        mean_absolute_percentage_error(y_true, y_pred),
    )


def _report(heading, scores):
    """Print `heading`, then one 'NAME: value' line per metric with 4 decimals."""
    print(heading)
    for label, value in zip(("MAE", "MSE", "RMSE", "MAPE"), scores):
        print(f"{label}: {value:.4f}")


# Fit the tuned XGBoost regressor on the Box-Cox transformed target
xgb_model = xgb.XGBRegressor(**best_params)
xgb_model.fit(X_train, y_train)

# Test-set predictions, still on the Box-Cox transformed scale
y_pred_boxcox_xgb = xgb_model.predict(X_test)

# Invert the Box-Cox transform to get predictions and targets back on the PRICE scale
y_pred_xgb = inv_boxcox(y_pred_boxcox_xgb, specified_lambda)
y_test_price = inv_boxcox(y_test, specified_lambda)

# Same four metrics on both scales
mae_xgb, mse_xgb, rmse_xgb, mape_xgb = _score(y_test_price, y_pred_xgb)
mae_boxcox_xgb, mse_boxcox_xgb, rmse_boxcox_xgb, mape_boxcox_xgb = _score(
    y_test, y_pred_boxcox_xgb
)

# Print the metrics
_report(
    "Metrics on the original PRICE scale:",
    (mae_xgb, mse_xgb, rmse_xgb, mape_xgb),
)
_report(
    "\nMetrics on the Box-Cox transformed scale:",
    (mae_boxcox_xgb, mse_boxcox_xgb, rmse_boxcox_xgb, mape_boxcox_xgb),
)

Metrics on the original PRICE scale:
MAE: 43585.3020
MSE: 8550317193.3201
RMSE: 92467.9252
MAPE: 0.1296

Metrics on the Box-Cox transformed scale:
MAE: 0.1253
MSE: 0.0318
RMSE: 0.1784
MAPE: 0.0101
