# Notebook 5.2 - Testing ZIP_CODE vs NEIGHBORHOOD as features

# Import libraries

In [1]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.special import inv_boxcox

# Choose the city

In [2]:
# City selector; accepted values are 'Madrid', 'Barcelona', 'Valencia', or 'Combined'
city = "Combined"

# Load the cleaned data sets after adding ZIP_CODE

In [3]:
# Load the cleaned base-feature CSV for the selected city into `data`.
# For 'combined', all three city files are read, each is tagged with boolean
# Madrid / Valencia / Barcelona indicator columns, and the frames are stacked.
# Raises ValueError when `city` is not one of the supported options, instead of
# silently leaving `data` undefined for the following cells.
city_key = city.lower()  # normalise once so the selection is case-insensitive

if city_key == 'madrid':
    data = pd.read_csv('../../data/4_data_cleaned/madrid_cleaned_base_features.csv')
elif city_key == 'barcelona':
    data = pd.read_csv('../../data/4_data_cleaned/barcelona_cleaned_base_features.csv')
elif city_key == 'valencia':
    data = pd.read_csv('../../data/4_data_cleaned/valencia_cleaned_base_features.csv')
elif city_key == 'combined':
    # Read all 3 datasets and add columns that indicate the source dataset
    madrid_df = pd.read_csv('../../data/4_data_cleaned/madrid_cleaned_base_features.csv')
    valencia_df = pd.read_csv('../../data/4_data_cleaned/valencia_cleaned_base_features.csv')
    barcelona_df = pd.read_csv('../../data/4_data_cleaned/barcelona_cleaned_base_features.csv')

    # Insertion order fixes the order in which the indicator columns are appended
    city_frames = {'Madrid': madrid_df, 'Valencia': valencia_df, 'Barcelona': barcelona_df}
    for source_city, city_frame in city_frames.items():
        for flag_column in city_frames:
            # True only in the column that names the frame's own city
            city_frame[flag_column] = flag_column == source_city

    data = pd.concat([madrid_df, valencia_df, barcelona_df], ignore_index=True)
else:
    raise ValueError(
        f"Unsupported city {city!r}; expected 'Madrid', 'Barcelona', 'Valencia', or 'Combined'"
    )

In [4]:
# Preview the first five rows of the loaded data
data.head(n=5)

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,LATITUDE,NEIGHBORHOOD,ZIP_CODE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,Madrid,Valencia,Barcelona
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,40.473921,Pinar del Rey,28033,0,0,1,0,True,False,False
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,40.384968,Palomeras sureste,28018,0,1,0,0,True,False,False
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,40.384547,San Diego,28018,1,0,0,0,True,False,False
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,40.430336,Ventas,28017,1,0,0,0,True,False,False
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,40.384103,Buena Vista,28019,0,0,0,1,True,False,False


# Defining X and y

__Transform target variable PRICE as optimized in notebook 5.1__

In [5]:
# Box-Cox parameter for the target; with lambda = 0 the transform is the natural log
specified_lambda = 0

# Transform PRICE with the fixed lambda (no lambda estimation is performed)
price_transformed = stats.boxcox(data['PRICE'], lmbda=specified_lambda)

# Store the transformed target alongside the original columns
data = data.assign(PRICE_TRANSFORMED=price_transformed)

__Assign features as X and target as y__

In [6]:
# Target is the transformed price; features are every column except the raw
# and transformed price.
y = data['PRICE_TRANSFORMED']
X = data.drop(['PRICE', 'PRICE_TRANSFORMED'], axis=1)

# Splitting the data into training and test sets

In [7]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the shape of each piece of the split
for split_name, split_part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{split_name} shape: {split_part.shape}")

X_train shape: (104827, 45)
X_test shape: (44927, 45)
y_train shape: (104827,)
y_test shape: (44927,)


# Testing neighborhood vs zip code as features

__Target encoding of neighborhood__

In [8]:
# Target-encode NEIGHBORHOOD with the mean transformed (log) price per
# neighborhood. The means come from the training split only, so no test-set
# prices are used to build the feature.
neighborhood_means = y_train.groupby(X_train['NEIGHBORHOOD']).mean()

# A neighborhood that does not occur in the training split has no entry in
# `neighborhood_means`, and Series.map would leave it as NaN. Fall back to the
# overall training mean so every row gets a numeric encoding.
neighborhood_fallback = y_train.mean()

# Build the encoded column with .assign (returns a new frame instead of writing
# into the frames produced by train_test_split) and drop the raw NEIGHBORHOOD
# column from both training and testing sets.
X_train = (
    X_train
    .assign(
        NEIGHBORHOOD_ENCODED=X_train['NEIGHBORHOOD']
        .map(neighborhood_means)
        .fillna(neighborhood_fallback)
    )
    .drop(columns=['NEIGHBORHOOD'])
)
X_test = (
    X_test
    .assign(
        NEIGHBORHOOD_ENCODED=X_test['NEIGHBORHOOD']
        .map(neighborhood_means)
        .fillna(neighborhood_fallback)
    )
    .drop(columns=['NEIGHBORHOOD'])
)

__Target encoding of zip code__

In [9]:
# Target-encode ZIP_CODE with the mean transformed target per zip code. The
# means come from the training split only, so no test-set prices are used to
# build the feature.
zipcode_means = y_train.groupby(X_train['ZIP_CODE']).mean()

# A zip code that does not occur in the training split has no entry in
# `zipcode_means`, and Series.map would leave it as NaN. Fall back to the
# overall training mean so every row gets a numeric encoding.
zipcode_fallback = y_train.mean()

# Build the encoded column with .assign (returns a new frame instead of writing
# into the frames produced by train_test_split) and drop the raw ZIP_CODE
# column from both training and testing sets.
X_train = (
    X_train
    .assign(
        ZIP_CODE_ENCODED=X_train['ZIP_CODE']
        .map(zipcode_means)
        .fillna(zipcode_fallback)
    )
    .drop(columns=['ZIP_CODE'])
)
X_test = (
    X_test
    .assign(
        ZIP_CODE_ENCODED=X_test['ZIP_CODE']
        .map(zipcode_means)
        .fillna(zipcode_fallback)
    )
    .drop(columns=['ZIP_CODE'])
)

# Dropping features not relevant for modelling

In [10]:
# Remove ASSETID from both splits so it is not passed to the model as a feature
X_train, X_test = (
    split_frame.drop(columns=['ASSETID']) for split_frame in (X_train, X_test)
)

# Inspect data before running model 

In [11]:
# Preview the first five test rows, including the two encoded columns
X_test.head(n=5)

Unnamed: 0,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,HASAIRCONDITIONING,HASNORTHORIENTATION,...,LATITUDE,PERIOD_201803,PERIOD_201806,PERIOD_201809,PERIOD_201812,Madrid,Valencia,Barcelona,NEIGHBORHOOD_ENCODED,ZIP_CODE_ENCODED
55930,82,3,2,3,0,1.0,0,1,0,0,...,40.406918,0,1,0,0,True,False,False,13.275478,12.92332
110070,86,3,1,3,0,1.0,1,1,1,0,...,41.416478,0,0,1,0,False,False,True,12.368969,12.68885
133225,93,4,1,3,0,1.0,1,1,0,0,...,41.419345,0,0,1,0,False,False,True,12.092169,12.68885
57088,40,1,1,3,1,1.0,0,1,1,0,...,40.358942,0,0,0,1,True,False,False,12.247304,12.278367
103352,84,3,2,3,0,1.0,0,0,0,0,...,41.377668,1,0,0,0,False,False,True,12.413428,12.68885


# Test the model with each feature separately

__Define the best parameters found by GridSearchCV in Notebook 5.1__

In [12]:
# Hyperparameters passed as keyword arguments to xgb.XGBRegressor below
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=500,
    subsample=0.9,
)

__Running the optimized model with neighborhood only__

In [13]:
# Drop ZIP_CODE
X_train_NEIGHBORHOOD = X_train.drop(columns=['ZIP_CODE_ENCODED'])
X_test_NEIGHBORHOOD = X_test.drop(columns=['ZIP_CODE_ENCODED'])

In [14]:
# Initialize XGBoost model with the best parameters
xgb_model = xgb.XGBRegressor(**best_params)

# Train the model on the Box-Cox transformed target
xgb_model.fit(X_train_NEIGHBORHOOD, y_train)

# Predictions in Box-Cox transformed scale
y_pred_boxcox_xgb = xgb_model.predict(X_test_NEIGHBORHOOD)

# Transform predictions back to the original scale
y_pred_xgb = inv_boxcox(y_pred_boxcox_xgb, specified_lambda)
y_test_price = inv_boxcox(y_test, specified_lambda)

# Evaluation metrics on the original scale
mae_xgb = mean_absolute_error(y_test_price, y_pred_xgb)
mse_xgb = mean_squared_error(y_test_price, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mape_xgb = mean_absolute_percentage_error(y_test_price, y_pred_xgb)

# Evaluation metrics on the Box-Cox transformed scale
mae_boxcox_xgb = mean_absolute_error(y_test, y_pred_boxcox_xgb)
mse_boxcox_xgb = mean_squared_error(y_test, y_pred_boxcox_xgb)
rmse_boxcox_xgb = np.sqrt(mse_boxcox_xgb)
mape_boxcox_xgb = mean_absolute_percentage_error(y_test, y_pred_boxcox_xgb)

# Print the metrics
print("Metrics on the original PRICE scale:")
print(f"MAE: {mae_xgb:.4f}")
print(f"MSE: {mse_xgb:.4f}")
print(f"RMSE: {rmse_xgb:.4f}")
print(f"MAPE: {mape_xgb:.4f}")

print("\nMetrics on the Box-Cox transformed scale:")
print(f"MAE: {mae_boxcox_xgb:.4f}")
print(f"MSE: {mse_boxcox_xgb:.4f}")
print(f"RMSE: {rmse_boxcox_xgb:.4f}")
print(f"MAPE: {mape_boxcox_xgb:.4f}")

Metrics on the original PRICE scale:
MAE: 43491.0829
MSE: 8348453364.1819
RMSE: 91369.8712
MAPE: 0.1292

Metrics on the Box-Cox transformed scale:
MAE: 0.1249
MSE: 0.0319
RMSE: 0.1785
MAPE: 0.0101


__Running the optimized model with ZIP_CODE only__

In [15]:
# Drop NEIGHBORHOOD
X_train_ZIP = X_train.drop(columns=['NEIGHBORHOOD_ENCODED'])
X_test_ZIP = X_test.drop(columns=['NEIGHBORHOOD_ENCODED'])

In [16]:
# Initialize XGBoost model with the best parameters
xgb_model = xgb.XGBRegressor(**best_params)

# Train the model on the Box-Cox transformed target
xgb_model.fit(X_train_ZIP, y_train)

# Predictions in Box-Cox transformed scale
y_pred_boxcox_xgb = xgb_model.predict(X_test_ZIP)

# Transform predictions back to the original scale
y_pred_xgb = inv_boxcox(y_pred_boxcox_xgb, specified_lambda)
y_test_price = inv_boxcox(y_test, specified_lambda)

# Evaluation metrics on the original scale
mae_xgb = mean_absolute_error(y_test_price, y_pred_xgb)
mse_xgb = mean_squared_error(y_test_price, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mape_xgb = mean_absolute_percentage_error(y_test_price, y_pred_xgb)

# Evaluation metrics on the Box-Cox transformed scale
mae_boxcox_xgb = mean_absolute_error(y_test, y_pred_boxcox_xgb)
mse_boxcox_xgb = mean_squared_error(y_test, y_pred_boxcox_xgb)
rmse_boxcox_xgb = np.sqrt(mse_boxcox_xgb)
mape_boxcox_xgb = mean_absolute_percentage_error(y_test, y_pred_boxcox_xgb)

# Print the metrics
print("Metrics on the original PRICE scale:")
print(f"MAE: {mae_xgb:.4f}")
print(f"MSE: {mse_xgb:.4f}")
print(f"RMSE: {rmse_xgb:.4f}")
print(f"MAPE: {mape_xgb:.4f}")

print("\nMetrics on the Box-Cox transformed scale:")
print(f"MAE: {mae_boxcox_xgb:.4f}")
print(f"MSE: {mse_boxcox_xgb:.4f}")
print(f"RMSE: {rmse_boxcox_xgb:.4f}")
print(f"MAPE: {mape_boxcox_xgb:.4f}")

Metrics on the original PRICE scale:
MAE: 43202.0740
MSE: 8368802905.1321
RMSE: 91481.1615
MAPE: 0.1283

Metrics on the Box-Cox transformed scale:
MAE: 0.1241
MSE: 0.0315
RMSE: 0.1775
MAPE: 0.0100
