In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [3]:
# Importing dataset
housing_madrid = pd.read_csv("housing_madrid_cleaned.csv", dtype={'id': str, 'floor': str, 'built_year': str, 'neighborhood_id': str})
pd.set_option("display.max_columns", None)
housing_madrid.head()

Unnamed: 0,id,neighborhood,sq_m,rooms,bathrooms,floors,sq_m_allotment,floor,neighborhood_id,price,house_type_id,is_renewal_needed,built_year,has_ac,has_fitted_wardrobes,has_lift,has_garden,has_pool,has_terrace,has_balcony,has_storage_room,has_green_zones,has_parking,parking_included,parking_price,orientation_north,orientation_west,orientation_south,orientation_east,house_type,district_id,district
0,21742,San Cristóbal,64.0,2,1,0,0.0,3,135,85000.0,1.0,False,1960,True,False,False,False,False,False,False,False,False,False,False,0.0,False,True,False,False,Apartment,21,Villaverde
1,21741,Los Ángeles,70.0,3,1,0,0.0,4,132,129900.0,1.0,True,1969,False,True,True,False,False,True,False,False,False,False,False,0.0,False,False,False,False,Apartment,21,Villaverde
2,21740,San Andrés,94.0,2,2,0,0.0,1,134,144247.0,1.0,False,1969,False,True,True,False,False,False,False,True,False,False,False,0.0,False,False,False,False,Apartment,21,Villaverde
3,21739,San Andrés,64.0,2,1,0,0.0,0,134,109900.0,1.0,False,1955,False,False,True,False,False,False,False,True,False,False,False,0.0,False,False,True,False,Apartment,21,Villaverde
4,21738,Los Rosales,108.0,2,2,0,0.0,4,133,260000.0,1.0,False,2003,True,True,True,False,True,False,False,True,True,True,True,0.0,True,True,True,True,Apartment,21,Villaverde


In [4]:
housing_madrid.dtypes

id                       object
neighborhood             object
sq_m                    float64
rooms                     int64
bathrooms                 int64
floors                    int64
sq_m_allotment          float64
floor                    object
neighborhood_id          object
price                   float64
house_type_id           float64
is_renewal_needed          bool
built_year               object
has_ac                     bool
has_fitted_wardrobes       bool
has_lift                   bool
has_garden                 bool
has_pool                   bool
has_terrace                bool
has_balcony                bool
has_storage_room           bool
has_green_zones            bool
has_parking                bool
parking_included           bool
parking_price           float64
orientation_north          bool
orientation_west           bool
orientation_south          bool
orientation_east           bool
house_type               object
district_id               int64
district

In [5]:
# Get dummies out of house_type_id and district columns
categorical = pd.get_dummies(housing_madrid[['house_type_id','district']])
categorical

Unnamed: 0,house_type_id,district_Arganzuela,district_Barajas,district_Carabanchel,district_Centro,district_Chamartín,district_Chamberí,district_Ciudad Lineal,district_Fuencarral,district_Hortaleza,district_Latina,district_Moncloa,district_Moratalaz,district_Puente de Vallecas,district_Retiro,district_Salamanca,district_Tetuán,district_Usera,district_Vicálvaro,district_Villa de Vallecas,district_Villaverde
0,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
1,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
2,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
3,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
4,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21595,3.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
21596,1.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
21597,1.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
21598,4.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [6]:
numerical_df = housing_madrid.select_dtypes(include=['number'])
numerical_df.describe()

Unnamed: 0,sq_m,rooms,bathrooms,floors,sq_m_allotment,price,house_type_id,parking_price,district_id
count,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0,21600.0
mean,146.836019,2.980926,2.058565,0.196991,15.967037,636185.8,1.378241,949.866019,9.745139
std,134.059618,1.465313,1.33206,0.794642,87.408889,742976.9,0.913089,8087.751928,5.602551
min,13.0,0.0,1.0,0.0,0.0,36000.0,1.0,0.0,1.0
25%,70.0,2.0,1.0,0.0,0.0,198000.0,1.0,0.0,5.0
50%,100.0,3.0,2.0,0.0,0.0,370000.0,1.0,0.0,9.0
75%,162.0,4.0,2.0,0.0,0.0,750000.0,1.0,0.0,14.0
max,999.0,24.0,14.0,5.0,997.0,8800000.0,4.0,600000.0,21.0


In [7]:
# Select features and target
target = numerical_df['price']
numerical_features = numerical_df.drop('price', axis = 1)

In [8]:
features = pd.concat([numerical_features,categorical,housing_madrid.select_dtypes('bool')],axis=1)
features

Unnamed: 0,sq_m,rooms,bathrooms,floors,sq_m_allotment,house_type_id,parking_price,district_id,house_type_id.1,district_Arganzuela,district_Barajas,district_Carabanchel,district_Centro,district_Chamartín,district_Chamberí,district_Ciudad Lineal,district_Fuencarral,district_Hortaleza,district_Latina,district_Moncloa,district_Moratalaz,district_Puente de Vallecas,district_Retiro,district_Salamanca,district_Tetuán,district_Usera,district_Vicálvaro,district_Villa de Vallecas,district_Villaverde,is_renewal_needed,has_ac,has_fitted_wardrobes,has_lift,has_garden,has_pool,has_terrace,has_balcony,has_storage_room,has_green_zones,has_parking,parking_included,orientation_north,orientation_west,orientation_south,orientation_east
0,64.0,2,1,0,0.0,1.0,0.0,21,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,False,False,False,False,False,False,False,False,False,False,False,True,False,False
1,70.0,3,1,0,0.0,1.0,0.0,21,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,True,False,False,True,False,False,False,False,False,False,False,False,False
2,94.0,2,2,0,0.0,1.0,0.0,21,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,True,True,False,False,False,False,True,False,False,False,False,False,False,False
3,64.0,2,1,0,0.0,1.0,0.0,21,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False,False,False,False,True,False,False,False,False,False,True,False
4,108.0,2,2,0,0.0,1.0,0.0,21,1.0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,False,True,True,True,False,True,False,False,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21595,78.0,2,2,0,0.0,3.0,0.0,2,3.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,False,False,False,False,True,False,True,True,False,False,False,False
21596,96.0,2,2,0,0.0,1.0,0.0,2,1.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,True,False,True,False,False,True,True,True,True,False,True,True,True
21597,175.0,4,2,0,0.0,1.0,0.0,2,1.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,False,True,True,False,True,False,True,True,False,False,False,False
21598,289.0,4,3,3,304.0,4.0,0.0,2,4.0,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True,True,False,True,False,True,True,True,False,True,True,False,False,True,False


In [9]:
target

0         85000.0
1        129900.0
2        144247.0
3        109900.0
4        260000.0
           ...   
21595    350000.0
21596    425000.0
21597    680000.0
21598    695000.0
21599    424000.0
Name: price, Length: 21600, dtype: float64

In [10]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=42)

scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [11]:
# Define function to evaluate and store model results for regression
def evaluate_model_regression(model, X_train_scaled, y_train, X_test_scaled, y_test):
    model.fit(X_train_scaled, y_train)
    pred = model.predict(X_test_scaled)
    mae = mean_absolute_error(y_test, pred)
    rmse = np.sqrt(mean_squared_error(y_test, pred)) 
    r2 = r2_score(y_test, pred)
    return mae, rmse, r2

# Define parameter grids for each model
param_grids = {
    'KNN': {'n_neighbors': [5, 10]},
    'Bagging': {'n_estimators': [50, 100], 'estimator__max_depth': [10, 20]},
    'Random Forest': {'n_estimators': [50, 100], 'max_depth': [10, 20]},
    'Gradient Boosting': {'n_estimators': [50, 100], 'max_depth': [10, 20]},
    'AdaBoost': {'n_estimators': [50, 100], 'estimator__max_depth': [10, 20]},
    'XGBoost': {'n_estimators': [50, 100], 'max_depth': [10, 20]}
}

# Initialize models
models = {
    'KNN': KNeighborsRegressor(),
    'Bagging': BaggingRegressor(estimator=DecisionTreeRegressor(), random_state=1),
    'Random Forest': RandomForestRegressor(random_state=1),
    'Gradient Boosting': GradientBoostingRegressor(random_state=1),
    'AdaBoost': AdaBoostRegressor(estimator=DecisionTreeRegressor(), random_state=1),
    'XGBoost': XGBRegressor(random_state=1)
}

# Define function for hyperparameter tuning with Grid Search and Randomized Search
def best_model(model, param_grid, X_train_scaled, y_train):
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=10, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
    
    grid_search.fit(X_train_scaled, y_train)
    random_search.fit(X_train_scaled, y_train)
    
    best_grid = grid_search.best_estimator_
    best_random = random_search.best_estimator_
    
    # Compare results and return the best model
    if grid_search.best_score_ > random_search.best_score_:
        print(f"Best parameters for {model.__class__.__name__} (Grid Search): {grid_search.best_params_}")
        return best_grid
    else:
        print(f"Best parameters for {model.__class__.__name__} (Randomized Search): {random_search.best_params_}")
        return best_random

# Perform hyperparameter tuning and evaluate models
results = {}
for model_name, model in models.items():
    best_model_selected = best_model(model, param_grids[model_name], X_train_scaled, y_train)
    results[model_name] = evaluate_model_regression(best_model_selected, X_train_scaled, y_train, X_test_scaled, y_test)

# Display results
results_df = pd.DataFrame(results, index=['MAE', 'RMSE', 'R2 Score']).T
print(results_df)



Best parameters for KNeighborsRegressor (Randomized Search): {'n_neighbors': 5}




Best parameters for BaggingRegressor (Randomized Search): {'n_estimators': 100, 'estimator__max_depth': 20}




Best parameters for RandomForestRegressor (Randomized Search): {'n_estimators': 100, 'max_depth': 20}




Best parameters for GradientBoostingRegressor (Randomized Search): {'n_estimators': 100, 'max_depth': 10}




Best parameters for AdaBoostRegressor (Randomized Search): {'n_estimators': 100, 'estimator__max_depth': 20}




Best parameters for XGBRegressor (Randomized Search): {'n_estimators': 50, 'max_depth': 10}
                             MAE           RMSE  R2 Score
KNN                212645.142778  446221.786758  0.649880
Bagging            113202.226693  255086.862254  0.885583
Random Forest      113285.087235  254884.169352  0.885764
Gradient Boosting  113267.822344  262184.606438  0.879127
AdaBoost           109234.366710  253857.046533  0.886683
XGBoost            117279.152071  271279.143374  0.870596


In [12]:
# Save the file
housing_madrid.to_csv('housing_madrid_ML.csv', index=False)