# Notebook 5.7 - Testing the feature "Park"

# Import libraries

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.special import inv_boxcox

# Choose the city

In [2]:
# Choose the city ('Madrid', 'Barcelona', 'Valencia', or 'Combined').
# The data-loading cell compares this value case-insensitively (via city.lower()).
# 'Combined' loads all three city data sets and concatenates them.
city = 'Combined'

# Load the cleaned data set incl. the feature "Park"

In [3]:
# Directory holding the per-city cleaned data sets that include the park feature
PARKS_DATA_DIR = '../../data/5_cleaned_and_feature_engineering/feature_parks'

# Order matters: it fixes the row order of the combined frame as well as the
# column order of the city indicator flags (Madrid, Valencia, Barcelona).
CITY_NAMES = ['Madrid', 'Valencia', 'Barcelona']


def load_city_parks(city_name):
    """Read the cleaned data set (incl. park feature) of a single city.

    Parameters
    ----------
    city_name : str
        One of the entries of CITY_NAMES; the case is ignored.

    Returns
    -------
    pandas.DataFrame
        Content of '<city>_cleaned_incl_parks.csv' for the requested city.
    """
    return pd.read_csv(f'{PARKS_DATA_DIR}/{city_name.lower()}_cleaned_incl_parks.csv')


selected_city = city.lower()

if selected_city == 'combined':
    # Read all 3 datasets and add one boolean column per city that indicates
    # the source dataset of every row
    city_frames = []
    for source_city in CITY_NAMES:
        city_df = load_city_parks(source_city)
        for flag_city in CITY_NAMES:
            city_df[flag_city] = (flag_city == source_city)
        city_frames.append(city_df)
    data = pd.concat(city_frames, ignore_index=True)
elif selected_city in [name.lower() for name in CITY_NAMES]:
    data = load_city_parks(selected_city)
else:
    # Fail right here instead of leaving `data` undefined, which would only
    # surface as a NameError in a later cell
    raise ValueError(
        f"Unknown city {city!r}: expected 'Madrid', 'Barcelona', 'Valencia' or 'Combined'"
    )

In [4]:
# Preview the first rows of the loaded data set as a quick sanity check
data.head()

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,park_area_percentage,Madrid,Valencia,Barcelona
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,Pinar del Rey,28033,0,0,1,0,8.733098,True,False,False
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,Palomeras sureste,28018,0,1,0,0,18.51648,True,False,False
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,San Diego,28018,1,0,0,0,10.185739,True,False,False
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,Ventas,28017,1,0,0,0,0.0,True,False,False
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,Buena Vista,28019,0,0,0,1,13.537461,True,False,False


# Defining X and y

__Transform target variable PRICE as optimized in notebook 5.1__

In [5]:
# Box-Cox parameter; lambda = 0 corresponds to the natural-log case of the transform
specified_lambda = 0

# Transform PRICE with the fixed lambda (no lambda estimation takes place)
price_transformed = stats.boxcox(data['PRICE'], lmbda=specified_lambda)

# Store the transformed target next to the original PRICE column
data = data.assign(PRICE_TRANSFORMED=price_transformed)

__Assign features as X and target as Y__

In [6]:
# The model is trained on the transformed price; the raw PRICE must not stay
# among the features because it would leak the target
target_column = 'PRICE_TRANSFORMED'
y = data[target_column]
X = data.drop(columns=['PRICE', target_column])

# Drop features not relevant for modelling

In [7]:
# ASSETID is only an identifier; ZIP_CODE is left out because the base model
# only includes base features
columns_not_for_modelling = ['ASSETID', 'ZIP_CODE']
X = X.drop(columns=columns_not_for_modelling)

# Splitting the data into training and test set

In [8]:
# Hold out 30 % of the rows for testing (7:3 ratio); the fixed seed keeps the
# split reproducible between runs
TEST_FRACTION = 0.3
SPLIT_SEED = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

# Report the dimensions of every part of the split
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train shape: {y_train.shape}")
print(f"y_test shape: {y_test.shape}")

X_train shape: (104827, 44)
X_test shape: (44927, 44)
y_train shape: (104827,)
y_test shape: (44927,)


# Target encoding of base feature NEIGHBORHOOD

In [9]:
# Target-encode NEIGHBORHOOD: mean transformed price per neighborhood, computed
# on the training set only so that no test targets leak into the features
neighborhood_means = y_train.groupby(X_train['NEIGHBORHOOD']).mean()

# Fallback for rows without a training-set mean (a neighborhood that only occurs
# in the test set, or a missing NEIGHBORHOOD value): use the overall training
# mean instead of leaving NaN in the encoded feature
global_train_mean = y_train.mean()


def encode_neighborhood(features):
    """Replace NEIGHBORHOOD by its training-set target mean.

    Parameters
    ----------
    features : pandas.DataFrame
        Feature frame that still contains the NEIGHBORHOOD column.

    Returns
    -------
    pandas.DataFrame
        New frame with NEIGHBORHOOD_ENCODED appended as the last column and the
        NEIGHBORHOOD column removed; the input frame is left untouched.
    """
    encoded = features['NEIGHBORHOOD'].map(neighborhood_means).fillna(global_train_mean)
    return features.assign(NEIGHBORHOOD_ENCODED=encoded).drop(columns=['NEIGHBORHOOD'])


# Apply the same training-derived mapping to both the training and the test set
X_train = encode_neighborhood(X_train)
X_test = encode_neighborhood(X_test)

# Inspect data before running the model

In [10]:
# Preview the test features to confirm that NEIGHBORHOOD was replaced by
# NEIGHBORHOOD_ENCODED before the model is run
X_test.head()

Unnamed: 0,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,HASAIRCONDITIONING,HASNORTHORIENTATION,...,LATITUDE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,park_area_percentage,Madrid,Valencia,Barcelona,NEIGHBORHOOD_ENCODED
55930,82,3,2,3,0,1.0,0,1,0,0,...,40.406918,0,1,0,0,2.296912,True,False,False,13.275478
110070,86,3,1,3,0,1.0,1,1,1,0,...,41.416478,0,0,1,0,9.028562,False,False,True,12.368969
133225,93,4,1,3,0,1.0,1,1,0,0,...,41.419345,0,0,1,0,2.359327,False,False,True,12.092169
57088,40,1,1,3,1,1.0,0,1,1,0,...,40.358942,0,0,0,1,21.774896,True,False,False,12.247304
103352,84,3,2,3,0,1.0,0,0,0,0,...,41.377668,1,0,0,0,2.398357,False,False,True,12.413428


# Test the model with new feature

__Define the best parameters found by GridSearchCV in Notebook 5.1__

In [11]:
# Hyperparameters for the XGBoost regressor, passed on as keyword arguments
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=500,
    subsample=0.9,
)

__Running the optimized model with the new feature__

In [12]:
def _regression_metrics(y_true, y_pred):
    """Return the tuple (MAE, MSE, RMSE, MAPE) for one truth/prediction pair."""
    mse = mean_squared_error(y_true, y_pred)
    return (
        mean_absolute_error(y_true, y_pred),
        mse,
        np.sqrt(mse),
        mean_absolute_percentage_error(y_true, y_pred),
    )


# Fit the XGBoost regressor (configured with best_params) on the Box-Cox
# transformed target
xgb_model = xgb.XGBRegressor(**best_params)
xgb_model.fit(X_train, y_train)

# Test-set predictions, still on the Box-Cox transformed scale
y_pred_boxcox_xgb = xgb_model.predict(X_test)

# Undo the Box-Cox transform for predictions and ground truth alike
y_pred_xgb = inv_boxcox(y_pred_boxcox_xgb, specified_lambda)
y_test_price = inv_boxcox(y_test, specified_lambda)

# Errors on the original PRICE scale
mae_xgb, mse_xgb, rmse_xgb, mape_xgb = _regression_metrics(y_test_price, y_pred_xgb)

# Errors on the Box-Cox transformed scale
mae_boxcox_xgb, mse_boxcox_xgb, rmse_boxcox_xgb, mape_boxcox_xgb = _regression_metrics(
    y_test, y_pred_boxcox_xgb
)

# Report both sets of metrics
print("Metrics on the original PRICE scale:")
print(f"MAE: {mae_xgb:.4f}")
print(f"MSE: {mse_xgb:.4f}")
print(f"RMSE: {rmse_xgb:.4f}")
print(f"MAPE: {mape_xgb:.4f}")

print("\nMetrics on the Box-Cox transformed scale:")
print(f"MAE: {mae_boxcox_xgb:.4f}")
print(f"MSE: {mse_boxcox_xgb:.4f}")
print(f"RMSE: {rmse_boxcox_xgb:.4f}")
print(f"MAPE: {mape_boxcox_xgb:.4f}")

Metrics on the original PRICE scale:
MAE: 43759.7532
MSE: 8720744755.8811
RMSE: 93384.9279
MAPE: 0.1297

Metrics on the Box-Cox transformed scale:
MAE: 0.1255
MSE: 0.0320
RMSE: 0.1788
MAPE: 0.0101
