# Notebook 5.6 - Testing the feature "Type of road"

# Import libraries

In [17]:
import xgboost as xgb
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import stats
from scipy.special import inv_boxcox
from scipy.stats import pointbiserialr

# Choose the city

In [18]:
# Select the data set to analyse: 'Madrid', 'Barcelona', 'Valencia', or 'Combined'
city = "Combined"

# Load the cleaned data set incl. the feature "type of road"

In [19]:
# Folder with the cleaned per-city CSV files that include the road-type feature
road_type_dir = '../../data/5_cleaned_and_feature_engineering/feature_road_type'

# One CSV per city, keyed by the lower-cased city name
city_files = {
    'madrid': f'{road_type_dir}/madrid_cleaned_incl_road_type.csv',
    'barcelona': f'{road_type_dir}/barcelona_cleaned_incl_road_type.csv',
    'valencia': f'{road_type_dir}/valencia_cleaned_incl_road_type.csv',
}

# Normalise the user's choice so that case and stray whitespace do not matter
city_key = city.strip().lower()

if city_key in city_files:
    data = pd.read_csv(city_files[city_key])
elif city_key == 'combined':
    # Read all 3 data sets and add one boolean column per city to indicate the source data set
    madrid_df = pd.read_csv(city_files['madrid'])
    valencia_df = pd.read_csv(city_files['valencia'])
    barcelona_df = pd.read_csv(city_files['barcelona'])

    source_flags = ('Madrid', 'Valencia', 'Barcelona')
    for source_df, source_name in zip((madrid_df, valencia_df, barcelona_df), source_flags):
        for flag in source_flags:
            # True only in the column named after the frame's own city
            source_df[flag] = (flag == source_name)

    data = pd.concat([madrid_df, valencia_df, barcelona_df], ignore_index=True)
else:
    # Fail loudly instead of leaving `data` undefined (or stale from an earlier run)
    raise ValueError(
        f"Unknown city {city!r}; expected 'Madrid', 'Barcelona', 'Valencia', or 'Combined'"
    )

In [20]:
# Preview the first five rows of the loaded data set
data.head(n=5)

Unnamed: 0,ASSETID,PRICE,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,...,PERIOD_201806,PERIOD_201809,PERIOD_201812,MOTORWAY/PRIMARY,OTHER,PEDESTRIAN,SECONDARY/TERTIARY,Madrid,Valencia,Barcelona
0,A10000037964896093228,255000,97,3,2,3,0,1.0,0,1,...,0,1,0,False,True,False,False,True,False,False
1,A10000072440601830803,82000,62,2,1,3,0,1.0,0,1,...,1,0,0,False,True,False,False,True,False,False
2,A10000538600815177437,133000,67,3,1,3,0,1.0,1,0,...,0,0,0,False,True,False,False,True,False,False
3,A10000654405436195291,204000,180,3,2,3,0,1.0,0,1,...,0,0,0,False,True,False,False,True,False,False
4,A10000872160480475600,161000,54,2,1,3,0,1.0,0,0,...,0,0,1,False,True,False,False,True,False,False


# Defining X and y

__Transform target variable PRICE as optimized in notebook 5.1__

In [21]:
# Box-Cox parameter, fixed here rather than estimated from the data
# (scipy defines the Box-Cox transform with lambda = 0 as the natural logarithm)
specified_lambda = 0

# Transform PRICE with the fixed lambda, then store the result as a new column
# next to the original PRICE
price_transformed = stats.boxcox(data['PRICE'], lmbda=specified_lambda)
data['PRICE_TRANSFORMED'] = price_transformed

__Assign features as X and target as y__

In [22]:
# Target: the transformed price. Features: every column except the raw and the transformed price
price_columns = ['PRICE', 'PRICE_TRANSFORMED']
y = data['PRICE_TRANSFORMED']
X = data.drop(columns=price_columns)

# Drop features not relevant for modelling

In [23]:
# ASSETID is only an identifier; ZIP_CODE is left out because the base model only includes base features
X = X.drop(['ASSETID', 'ZIP_CODE'], axis=1)

# Splitting the data into training and test set

In [24]:
# Hold out 30% of the rows as test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Report the shape of each part of the split
split_parts = (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
)
for part_name, part in split_parts:
    print(f"{part_name} shape: {part.shape}")

X_train shape: (105074, 47)
X_test shape: (45033, 47)
y_train shape: (105074,)
y_test shape: (45033,)


# Target encoding of base feature NEIGHBORHOOD

In [25]:
# Mean transformed price per NEIGHBORHOOD, computed from the training set only so
# that the test targets never influence the encoding
neighborhood_means = y_train.groupby(X_train['NEIGHBORHOOD']).mean()

# Series.map returns NaN for a neighborhood that has no entry in neighborhood_means
# (i.e. one that never occurs in the training set); fall back to the overall
# training mean for those rows instead of passing NaN on to the model
global_train_mean = y_train.mean()

# NOTE(review): each training row's own target contributes to its neighborhood mean,
# which can make the encoded feature look more informative on the training set than
# on unseen data - consider out-of-fold encoding if overfitting shows up.

# Add the encoded column and drop the raw NEIGHBORHOOD column. assign/drop return new
# frames, so the frames produced by train_test_split are not modified in place.
X_train = (
    X_train
    .assign(
        NEIGHBORHOOD_ENCODED=X_train['NEIGHBORHOOD']
        .map(neighborhood_means)
        .fillna(global_train_mean)
    )
    .drop(columns=['NEIGHBORHOOD'])
)
X_test = (
    X_test
    .assign(
        NEIGHBORHOOD_ENCODED=X_test['NEIGHBORHOOD']
        .map(neighborhood_means)
        .fillna(global_train_mean)
    )
    .drop(columns=['NEIGHBORHOOD'])
)

# Inspect data before running the model

In [26]:
# Preview the first five rows of the test features after encoding
X_test.head(n=5)

Unnamed: 0,CONSTRUCTEDAREA,ROOMNUMBER,BATHNUMBER,AMENITYID,HASPARKINGSPACE,PARKINGSPACEPRICE,HASTERRACE,HASLIFT,HASAIRCONDITIONING,HASNORTHORIENTATION,...,PERIOD_201809,PERIOD_201812,MOTORWAY/PRIMARY,OTHER,PEDESTRIAN,SECONDARY/TERTIARY,Madrid,Valencia,Barcelona,NEIGHBORHOOD_ENCODED
32056,87,3,1,3,1,1.0,0,1,1,0,...,1,0,False,True,False,False,True,False,False,13.135965
147130,110,3,2,3,0,1.0,1,1,1,0,...,0,1,False,False,True,False,False,False,True,13.750688
35420,137,4,2,3,1,1.0,0,1,1,1,...,1,0,False,True,False,False,True,False,False,12.77224
128474,81,3,2,3,0,1.0,1,1,1,1,...,0,1,False,True,False,False,False,False,True,12.848363
149036,195,3,2,3,0,1.0,1,1,0,0,...,0,1,False,False,True,False,False,False,True,12.938447


# Explore correlation between road_type and target variable

In [27]:
# Boolean indicator columns describing the type of road
road_columns = ['MOTORWAY/PRIMARY', 'OTHER', 'PEDESTRIAN', 'SECONDARY/TERTIARY']


def calculate_pointbiserialr(target, binary_feature, df=None):
    """Point-biserial correlation between a continuous and a binary column.

    Parameters
    ----------
    target : str
        Name of the continuous column (e.g. 'PRICE').
    binary_feature : str
        Name of the binary (boolean / 0-1) column.
    df : pandas.DataFrame, optional
        Frame holding both columns. Defaults to the notebook-level ``data``
        frame, which keeps existing two-argument calls working.

    Returns
    -------
    tuple
        ``(correlation, p_value)`` as returned by ``scipy.stats.pointbiserialr``.
    """
    frame = data if df is None else df
    # scipy documents the signature as pointbiserialr(x, y) with x the binary
    # variable and y the continuous one, so pass the columns in that order
    correlation, p_value = pointbiserialr(frame[binary_feature], frame[target])
    return correlation, p_value


# Calculate and print the correlation of each road-type indicator with PRICE
for col in road_columns:
    correlation, p_value = calculate_pointbiserialr('PRICE', col)
    print(f"Correlation between PRICE and {col}: {correlation:.4f}, p-value: {p_value:.4f}")

Correlation between PRICE and MOTORWAY/PRIMARY: 0.0364, p-value: 0.0000
Correlation between PRICE and OTHER: -0.0130, p-value: 0.0000
Correlation between PRICE and PEDESTRIAN: -0.0457, p-value: 0.0000
Correlation between PRICE and SECONDARY/TERTIARY: 0.0557, p-value: 0.0000


# Test the model with new feature

__Define the best parameters found by GridSearchCV in Notebook 5.1__

In [31]:
# Hyperparameters passed to the XGBoost regressor as keyword arguments
best_params = dict(
    colsample_bytree=0.8,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=500,
    subsample=0.9,
)

__Running the optimized model with the new feature__

In [32]:
# Build the XGBoost regressor from the tuned hyperparameters and fit it on the
# Box-Cox transformed target
xgb_model = xgb.XGBRegressor(**best_params)
xgb_model.fit(X_train, y_train)

# Predict on the test set (still on the Box-Cox scale) ...
y_pred_boxcox_xgb = xgb_model.predict(X_test)

# ... then undo the Box-Cox transform for both the predictions and the true values
y_test_price = inv_boxcox(y_test, specified_lambda)
y_pred_xgb = inv_boxcox(y_pred_boxcox_xgb, specified_lambda)

# Error metrics on the original PRICE scale
mse_xgb = mean_squared_error(y_test_price, y_pred_xgb)
rmse_xgb = np.sqrt(mse_xgb)
mae_xgb = mean_absolute_error(y_test_price, y_pred_xgb)
mape_xgb = mean_absolute_percentage_error(y_test_price, y_pred_xgb)

# The same error metrics on the Box-Cox transformed scale
mse_boxcox_xgb = mean_squared_error(y_test, y_pred_boxcox_xgb)
rmse_boxcox_xgb = np.sqrt(mse_boxcox_xgb)
mae_boxcox_xgb = mean_absolute_error(y_test, y_pred_boxcox_xgb)
mape_boxcox_xgb = mean_absolute_percentage_error(y_test, y_pred_boxcox_xgb)

# Print both groups of metrics, each value with four decimals
metric_labels = ("MAE", "MSE", "RMSE", "MAPE")
metric_report = (
    ("Metrics on the original PRICE scale:",
     (mae_xgb, mse_xgb, rmse_xgb, mape_xgb)),
    ("\nMetrics on the Box-Cox transformed scale:",
     (mae_boxcox_xgb, mse_boxcox_xgb, rmse_boxcox_xgb, mape_boxcox_xgb)),
)
for heading, metric_values in metric_report:
    print(heading)
    for metric_label, metric_value in zip(metric_labels, metric_values):
        print(f"{metric_label}: {metric_value:.4f}")

Metrics on the original PRICE scale:
MAE: 43300.4526
MSE: 8136346830.0925
RMSE: 90201.7008
MAPE: 0.1268

Metrics on the Box-Cox transformed scale:
MAE: 0.1233
MSE: 0.0306
RMSE: 0.1749
MAPE: 0.0100
