## Model Training

In [251]:
import pandas as pd
import numpy as np
# Handling missing values and feature scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

In [252]:
df=pd.read_csv('data/realestate.csv')
df.head()

Unnamed: 0,time,province,county,city,area,neighborhood,title,type,price,parking,rooms,sqrm,floor,surface,elevator,tag
0,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Chalet en Arenys de Munt,venta,360000,1,3.0,182,,,,
1,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Casa o chalet independiente en Arenys de Munt,venta,570000,0,5.0,266,,,,
2,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,"Chalet en riera i Penya, Arenys de Munt",venta,220000,0,8.0,320,,,,
3,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,"Casa o chalet independiente en Urbanització Collsacreu, Arenys de Munt",venta,495000,1,6.0,199,,,,
4,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Chalet en Arenys de Munt,venta,575000,1,5.0,202,,,,Lujo


In [253]:
# Copy paste the data cleaning code from the EDA notebook
from sklearn.base import BaseEstimator, TransformerMixin

class RealEstatePreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        print('Initialising transformer...')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        print('Transforming..')
        X=X.drop(labels=['time', 'tag'], axis=1)

        # type
        X.rename(columns={'type': 'deal_type'}, inplace=True)

        # sqrm
        X['sqrm'] = X['sqrm'].replace('[^0-9]', np.nan, regex=True)
        X = X.dropna(subset=['sqrm'])
        X['sqrm'] = X['sqrm'].astype(int)

        # property_type_encoded & dropping duplicates
        X['property_type'] = [x.split()[0] for x in X['title']]
        X = X[X['property_type'] != 'Estudio']
        X = X[X['property_type'] != 'Finca']
        X=X.drop(labels=['title'], axis=1)
        X=X.drop_duplicates()
        # ptype_price_sqrm = X.groupby('property_type', as_index=False).apply(lambda x: pd.Series({'property_type_encoded':x['price'].sum() / x['sqrm'].sum()})).set_index('property_type')['property_type_encoded'].to_dict()
        # X['property_type_encoded'] = X['property_type'].map(ptype_price_sqrm)

        # floor
        X['floor'] = X['floor'].replace('-', '-1')
        X['floor'] = X['floor'].astype(float)
        X.loc[X['property_type'].isin(['Casa', 'Castillo', 'Chalet', 'Cortijo', 'Finca', 'Masía', 'Torre']), 'floor'] = '0'
        X['floor'] = X['floor'].astype(float)
        X['floor'] = X['floor'].fillna(X.groupby('property_type')['floor'].transform('mean'))
        X['floor'] = X['floor'].astype(int)

        # elevator
        X.loc[X['property_type'].isin(['Casa', 'Castillo', 'Chalet', 'Cortijo', 'Finca', 'Masía', 'Torre']), 'elevator'] = '0'
        X['elevator'] = X['elevator'].astype(float)
        X['elevator'] = X['elevator'].fillna(X.groupby('property_type')['elevator'].transform('mean'))
        X['elevator'] = X['elevator'].astype(int)

        # surface
        X['surface'] = X['surface'].fillna('0')
        X['surface'] = X['surface'].replace('outdoor', '1')
        X['surface'] = X['surface'].astype(int)

        # rooms
        X = X.dropna(subset=['rooms'])
        X['rooms'] = X['rooms'].astype(int)

        # location_encoded
        X['province'] = X['province'].fillna('empty')
        X['county'] = X['county'].fillna('empty')
        X['city'] = X['city'].fillna('empty')
        X['area'] = X['area'].fillna('empty')
        X['neighborhood'] = X['neighborhood'].fillna('empty')
        X['city_filled'] = X['province'] + '-' + X['county'] + '-' + X['city']
        X['area_filled'] = X['province'] + '-' + X['county'] + '-' + X['city'] + '-' + X['area']
        X['neighborhood_filled'] = X['province'] + '-' + X['county'] + '-' + X['city'] + '-' + X['area'] + '-' + X['neighborhood']
        city_price_sqrm = X.groupby('city_filled', as_index=False).apply(lambda x: pd.Series({'location_encoded':x['price'].sum() / x['sqrm'].sum()}))
        city_count = X.groupby('city_filled')['province'].count()
        city_joined = pd.merge(city_price_sqrm, city_count, on='city_filled')
        area_price_sqrm = X.groupby('area_filled', as_index=False).apply(lambda x: pd.Series({'location_encoded':x['price'].sum() / x['sqrm'].sum()}))
        area_count = X.groupby('area_filled')['province'].count()
        area_joined = pd.merge(area_price_sqrm, area_count, on='area_filled')
        area_joined = area_joined.drop(area_joined[area_joined.province < 100].index)
        neighborhood_price_sqrm = X.groupby('neighborhood_filled', as_index=False).apply(lambda x: pd.Series({'location_encoded':x['price'].sum() / x['sqrm'].sum()}))
        neighborhood_count = X.groupby('neighborhood_filled')['province'].count()
        neighborhood_joined = pd.merge(neighborhood_price_sqrm, neighborhood_count, on='neighborhood_filled')
        neighborhood_joined = neighborhood_joined.drop(neighborhood_joined[neighborhood_joined.province < 100].index)
        dict_city = city_joined.drop(columns='province').set_index('city_filled')['location_encoded'].to_dict()
        dict_area = area_joined.drop(columns='province').set_index('area_filled')['location_encoded'].to_dict()
        dict_neighborhood = neighborhood_joined.drop(columns='province').set_index('neighborhood_filled')['location_encoded'].to_dict()
        X['location_encoded'] = X['neighborhood_filled'].map(dict_neighborhood)
        X['location_encoded'] = np.where(X['location_encoded'].isna(), X['area_filled'].map(dict_area), X['location_encoded'])
        X['location_encoded'] = np.where(X['location_encoded'].isna(), X['city_filled'].map(dict_city), X['location_encoded'])
        X=X.drop(labels=['province', 'county', 'city', 'area', 'neighborhood', 'city_filled', 'area_filled', 'neighborhood_filled'], axis=1)
        return X

In [254]:
# df for rent and another one for sale
df_venta = df[df['type']=='venta']
df_alquiler = df[df['type']=='alquiler']

In [255]:
# Handling missing values and feature scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from src.utils import RealEstatePreprocessor
from sklearn.compose import make_column_selector as selector

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [256]:
estate_pipeline = Pipeline(
    steps=[
        ('real_estate_prep', RealEstatePreprocessor())
    ]
)

## Numerical pipline that scales
num_pipeline = Pipeline(
    steps=[
    ('scaler', StandardScaler())
    ]
)

# Categorical Pipeline. We don't scale if we are going to One Hot Encode
cat_pipeline = Pipeline(
    steps=[
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, selector(dtype_exclude=object)),
    ('cat_pipeline', cat_pipeline, selector(dtype_include=object))
])

Initialising transformer...


In [257]:
pipe = Pipeline(
    steps=[
        ('estate_pipeline', estate_pipeline),
        ('preprocessor', preprocessor)
    ]
)

In [258]:
from sklearn.model_selection import train_test_split

In [259]:
train_df, test_df = train_test_split(df_venta, test_size=0.30, random_state=30)

In [260]:
target_column_name = 'price'

In [261]:
train_df = pipe[0].transform(train_df)
test_df = pipe[0].transform(test_df)

In [267]:
train_df

Unnamed: 0,deal_type,parking,rooms,sqrm,floor,surface,elevator,property_type,property_type_encoded,location_encoded
270,venta,1,4,454,0,0,0,Casa,2441.961066,3250.939702
5633,venta,0,3,83,3,1,1,Piso,2318.840874,1703.570117
6138,venta,0,4,87,0,0,0,Chalet,2029.032996,2502.954484
7716,venta,1,5,770,0,0,0,Casa,2441.961066,3180.977326
1416,venta,1,4,185,0,0,0,Casa,2441.961066,1361.122653
...,...,...,...,...,...,...,...,...,...,...
5260,venta,0,3,100,1,1,0,Piso,2318.840874,2225.656319
500,venta,1,4,431,0,0,0,Casa,2441.961066,2445.903382
3885,venta,0,3,91,2,1,0,Piso,2318.840874,2312.581750
4517,venta,0,5,240,0,0,0,Casa,2441.961066,2323.856621


In [203]:
# Train test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_venta, Y_venta, test_size=0.30, random_state=42)

In [204]:
pipe.fit_transform(X_train)

array([[ 1.09592341,  0.01611078,  1.11659683, ...,  0.        ,
         0.        ,  0.        ],
       [-0.91247252,  0.01611078,  0.49664268, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.09592341,  0.4513846 ,  0.36292708, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.91247252,  0.01611078, -0.28377018, ...,  1.        ,
         0.        ,  0.        ],
       [ 1.09592341,  0.88665841,  1.16522069, ...,  0.        ,
         0.        ,  0.        ],
       [-0.91247252, -0.85443684, -0.39560505, ...,  1.        ,
         0.        ,  0.        ]])

In [205]:
X_train = pd.DataFrame(pipe.fit_transform(X_train), columns=preprocessor.get_feature_names_out())
X_test = pd.DataFrame(pipe.transform(X_test), columns=preprocessor.get_feature_names_out())

In [206]:
X_train.head()

Unnamed: 0,num_pipeline__parking,num_pipeline__rooms,num_pipeline__sqrm,num_pipeline__floor,num_pipeline__surface,num_pipeline__elevator,num_pipeline__property_type_encoded,num_pipeline__location_encoded,cat_pipeline__deal_type_venta,cat_pipeline__property_type_Casa,cat_pipeline__property_type_Castillo,cat_pipeline__property_type_Chalet,cat_pipeline__property_type_Cortijo,cat_pipeline__property_type_Dúplex,cat_pipeline__property_type_Masía,cat_pipeline__property_type_Piso,cat_pipeline__property_type_Torre,cat_pipeline__property_type_Ático
0,1.095923,0.016111,1.116597,-0.633848,-0.80513,-0.667162,-4.632584,0.018836,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,-0.912473,0.016111,0.496643,-0.633848,-0.80513,-0.667162,0.545512,0.226801,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.095923,0.451385,0.362927,-0.633848,-0.80513,-0.667162,0.545512,1.716804,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1.095923,-0.419163,-0.130605,-0.633848,-0.80513,-0.667162,0.545512,-1.244365,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.912473,0.451385,-0.147623,0.036261,-0.80513,-0.667162,0.045297,0.563084,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [207]:
# Model Training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [208]:
regression=LinearRegression()
regression.fit(X_train, y_train)

ValueError: Found input variables with inconsistent numbers of samples: [4620, 5439]

In [None]:
regression.coef_

array([[  36030.14605057,  223108.75607339,  292293.95527525,
          -7303.41220288,  -17172.90595457,   12871.16098184,
         185298.58281446,  217723.17941913, -148492.97444558,
         504803.75264361,  -35431.28784721, -356958.620994  ,
         548083.07860143, -265564.01289488,  592378.06401199,
        -838817.99907537]])

In [None]:
regression.intercept_

array([747893.28551344])

In [None]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [None]:
## Train multiple models

models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'RandomForestRegressor':RandomForestRegressor(),
    'xgb':xgb.XGBRegressor(eta= 0.01, n_estimators= 1200, max_depth= 4, subsample= 0.8, colsample_bytree= 1,gamma= 1),
    'GradientBoostRegressor':GradientBoostingRegressor(),
    'LGBMRegressor':LGBMRegressor()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train.values.ravel())

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 491994.32376399776
MAE: 236901.08347371925
R2 score 59.78930352647849


Lasso
Model Training Performance
RMSE: 491933.5923002461
MAE: 236761.89936216167
R2 score 59.79923007893873


Ridge
Model Training Performance
RMSE: 493365.82492744393
MAE: 236993.18877637305
R2 score 59.56480547019739


Elasticnet
Model Training Performance
RMSE: 540310.027582229
MAE: 232391.76321096558
R2 score 51.503827230111575




  model = cd_fast.enet_coordinate_descent(


RandomForestRegressor
Model Training Performance
RMSE: 448149.43404086307
MAE: 148405.75498460335
R2 score 66.63684481885863


xgb
Model Training Performance
RMSE: 431369.1247266651
MAE: 149454.32909952375
R2 score 69.08853947729996


GradientBoostRegressor
Model Training Performance
RMSE: 437338.80828547536
MAE: 153483.01599437097
R2 score 68.22705687941988


LGBMRegressor
Model Training Performance
RMSE: 488022.33369324385
MAE: 159363.3241018718
R2 score 60.435944206610095




In [288]:
model_list

['LinearRegression',
 'Lasso',
 'Ridge',
 'Elasticnet',
 'DecisionTreeRegressor',
 'xgb',
 'GradientBoostRegressor',
 'LGBMRegressor']

In [344]:
from sklearn.model_selection import GridSearchCV

In [359]:
model=xgb.XGBRegressor(eta= 0.01, n_estimators= 1200, max_depth= 4, subsample= 0.8, colsample_bytree= 1,gamma= 1)
model.fit(X_train, y_train.values.ravel())
y_pred=model.predict(X_test)
r2_square = r2_score(y_test, y_pred)
print(r2_square*100)

69.08853947729996


In [346]:
param_grid = {'max_depth': [4,6,10],
           'eta': [0.01, 0.05, 0.1],
           'n_estimators': [100, 500, 1000, 1500],
           'colsample_bytree': [0.3, 0.7,1],
           'gamma': [1, 5],
}

In [349]:
grid_search=GridSearchCV(estimator = model, param_grid = param_grid, cv = 10, scoring='r2')

In [350]:
grid_search.fit(X_train,y_train)

In [357]:
grid_search.best_params_

{'colsample_bytree': 0.7,
 'eta': 0.01,
 'gamma': 1,
 'max_depth': 6,
 'n_estimators': 1500,
 'subsample': 1}

In [352]:
grid_search.best_estimator_

In [353]:
best_estim=grid_search.best_estimator_

In [354]:
best_estim.fit(X_train,y_train)
y_pred=best_estim.predict(X_test)
r2_square = r2_score(y_test, y_pred)
print(r2_square*100)

64.20453851112585


In [355]:
model_2=xgb.XGBRegressor(colsample_bytree= 0.7,
 learning_rate= 0.1,
 max_depth= 10,
 n_estimators= 100,
 subsample=1,
 gamma=1)
model_2.fit(X_train,y_train)
y_pred=model_2.predict(X_test)
r2_square = r2_score(y_test, y_pred)

In [356]:
print(r2_square*100)

67.13500131214161
