## Model Training

In [139]:
import pandas as pd
import numpy as np
# Handling missing values and feature scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Relax pandas' display limits so long and wide frames are rendered in full.
display_settings = {
    'display.max_rows': 500,
    'display.max_columns': 500,
    'display.width': 1000,
    'display.max_colwidth': 100,
}
for option_name, option_value in display_settings.items():
    pd.set_option(option_name, option_value)

In [140]:
# Load the raw listings export (the path is relative to the notebook's working
# directory) and preview the first rows.
df=pd.read_csv('data/realestate.csv')
df.head()

Unnamed: 0,time,province,county,city,area,neighborhood,title,type,price,parking,rooms,sqrm,floor,surface,elevator,tag
0,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Chalet en Arenys de Munt,venta,360000,1,3.0,182,,,,
1,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Casa o chalet independiente en Arenys de Munt,venta,570000,0,5.0,266,,,,
2,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,"Chalet en riera i Penya, Arenys de Munt",venta,220000,0,8.0,320,,,,
3,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,"Casa o chalet independiente en Urbanització Collsacreu, Arenys de Munt",venta,495000,1,6.0,199,,,,
4,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Chalet en Arenys de Munt,venta,575000,1,5.0,202,,,,Lujo


In [141]:
# Copy paste the data cleaning code from the EDA notebook

# Columns that are not used by the price models.
df = df.drop(columns=['time', 'tag'])

# 'type' holds the deal kind ('venta' / 'alquiler'); give it an unambiguous name.
df = df.rename(columns={'type': 'deal_type'})

# sqrm: blank out any value containing a non-digit character, discard the rows
# left without a usable size, then store the column as integers.
df['sqrm'] = df['sqrm'].replace('[^0-9]', np.nan, regex=True)
df = df.dropna(subset=['sqrm']).astype({'sqrm': int})

# property_type: the first word of the listing title (e.g. 'Chalet', 'Casa').
# The vectorised .str accessor yields NaN for a missing or blank title instead of
# raising from inside a Python loop; rows that cannot be typed are dropped.
df['property_type'] = df['title'].str.split().str[0]
df = df.dropna(subset=['property_type'])

# 'Estudio' and 'Finca' listings are excluded from the analysis.
df = df[~df['property_type'].isin(['Estudio', 'Finca'])]

# The free-text title is no longer needed; without it exact duplicates can be removed.
df = df.drop(columns=['title']).drop_duplicates()

# property_type_encoded: price per square metre of each property type, computed
# as the group's total price over its total area. A grouped sum replaces the
# groupby.apply + pd.Series construction and gives the same ratios.
# NOTE(review): the ratio uses `price` from every row still in `df` at this point
# (the sale/rent split and the train/test split happen later in the notebook),
# so this feature is derived from the target of rows that end up in the test set.
ptype_totals = df.groupby('property_type')[['price', 'sqrm']].sum()
ptype_price_sqrm = (ptype_totals['price'] / ptype_totals['sqrm']).to_dict()
df['property_type_encoded'] = df['property_type'].map(ptype_price_sqrm)

# floor
# Property types that are forced to floor 0 regardless of the scraped value.
ground_level_types = ['Casa', 'Castillo', 'Chalet', 'Cortijo', 'Finca', 'Masía', 'Torre']

# '-' is rewritten to '-1' before the numeric cast.
df['floor'] = df['floor'].replace('-', '-1').astype(float)

# Assign a numeric 0.0 so the column stays float; writing the string '0' into a
# float column would change its dtype and require casting a second time.
df.loc[df['property_type'].isin(ground_level_types), 'floor'] = 0.0

# Remaining gaps take the mean floor of the same property type.
df['floor'] = df['floor'].fillna(df.groupby('property_type')['floor'].transform('mean'))

# The integer cast truncates the imputed means toward zero. It raises if any
# value is still NaN (i.e. a property type with no known floor at all).
df['floor'] = df['floor'].astype(int)

# elevator
# The listed property types are forced to 0. A numeric 0 is assigned (rather than
# the string '0') so a numeric column is not mixed with text before the cast.
df.loc[df['property_type'].isin(['Casa', 'Castillo', 'Chalet', 'Cortijo', 'Finca', 'Masía', 'Torre']), 'elevator'] = 0
df['elevator'] = df['elevator'].astype(float)

# Remaining gaps take the mean of the same property type.
df['elevator'] = df['elevator'].fillna(df.groupby('property_type')['elevator'].transform('mean'))

# NOTE(review): the integer cast truncates, so an imputed mean below 1 becomes 0.
# Presumably elevator is a 0/1 flag; confirm truncation (not rounding) is intended.
df['elevator'] = df['elevator'].astype(int)

# surface: missing values become 0, the 'outdoor' marker becomes 1, and the
# column is stored as integers.
df['surface'] = df['surface'].fillna('0').replace('outdoor', '1').astype(int)

# rooms: listings without a room count are discarded; the rest become integers.
df = df.dropna(subset=['rooms']).astype({'rooms': int})

# location_encoded
def _price_per_sqrm_by(frame, key, min_listings=0):
    """Return ``{group value: total price / total sqrm}`` for each group of `key`.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain `key`, 'price' and 'sqrm'.
    key : str
        Column to group by.
    min_listings : int, default 0
        Groups with fewer rows than this are left out of the mapping.

    Returns
    -------
    dict
        Price per square metre keyed by group value.
    """
    grouped = frame.groupby(key)
    totals = grouped[['price', 'sqrm']].sum()
    rates = totals['price'] / totals['sqrm']
    return rates[grouped.size() >= min_listings].to_dict()


# Missing location parts get a placeholder so the composite keys below are
# always well-formed strings.
for location_part in ['province', 'county', 'city', 'area', 'neighborhood']:
    df[location_part] = df[location_part].fillna('empty')

# Composite keys at three levels of granularity; each one extends the previous.
df['city_filled'] = df['province'] + '-' + df['county'] + '-' + df['city']
df['area_filled'] = df['city_filled'] + '-' + df['area']
df['neighborhood_filled'] = df['area_filled'] + '-' + df['neighborhood']

# Price per square metre at each level. Areas and neighbourhoods with fewer than
# 100 listings are excluded; cities are always kept.
# NOTE(review): these rates are computed from `price` over every row of `df`,
# before the sale/rent split and the train/test split that come later in the
# notebook, so the feature is derived from the target of test rows as well.
dict_city = _price_per_sqrm_by(df, 'city_filled')
dict_area = _price_per_sqrm_by(df, 'area_filled', min_listings=100)
dict_neighborhood = _price_per_sqrm_by(df, 'neighborhood_filled', min_listings=100)

# Use the most specific level available: neighbourhood, then area, then city.
# The float cast keeps the column numeric even when a mapping is empty.
df['location_encoded'] = (
    df['neighborhood_filled'].map(dict_neighborhood)
    .fillna(df['area_filled'].map(dict_area))
    .fillna(df['city_filled'].map(dict_city))
    .astype(float)
)

# The raw and composite location columns are replaced by the single encoded feature.
df = df.drop(columns=['province', 'county', 'city', 'area', 'neighborhood', 'city_filled', 'area_filled', 'neighborhood_filled'])

In [142]:
#df=df.drop(labels=['floor', 'surface', 'elevator'], axis=1)

In [143]:
# One frame per deal type: sale ('venta') and rent ('alquiler').
df_venta = df.loc[df['deal_type'].eq('venta')]
df_alquiler = df.loc[df['deal_type'].eq('alquiler')]

In [144]:
# Sale model inputs: every column except the target and the deal type
# (which is constant within df_venta).
X_venta = df_venta.drop(columns=['price', 'deal_type'])
# The target is kept as a one-column DataFrame rather than a Series.
Y_venta = df_venta.loc[:, ['price']]

In [145]:
# Object-dtype columns are one-hot encoded; every other column is scaled.
categorical_cols_venta = X_venta.select_dtypes(include=['object']).columns
numerical_cols_venta = X_venta.select_dtypes(exclude=['object']).columns

In [146]:
# Inspect the columns routed to the scaling pipeline.
numerical_cols_venta

Index(['parking', 'rooms', 'sqrm', 'floor', 'surface', 'elevator', 'property_type_encoded', 'location_encoded'], dtype='object')

In [147]:
# Inspect the columns routed to the one-hot encoding pipeline.
categorical_cols_venta

Index(['property_type'], dtype='object')

In [148]:
# Handling missing values and feature scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [149]:
# Numerical pipeline: standardisation only. No imputation step is included, so
# the numerical columns are expected to be free of missing values at this point.
num_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Categorical pipeline: one-hot encoding without scaling. Categories not seen
# during fit are ignored at transform time instead of raising.
cat_pipeline = Pipeline(steps=[('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

In [150]:
# Route each column group through its own pipeline and concatenate the results.
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, numerical_cols_venta),
        ('cat_pipeline', cat_pipeline, categorical_cols_venta),
    ]
)

In [151]:
# Train test split
from sklearn.model_selection import train_test_split

# Hold out 30% of the sale listings for testing; the fixed seed makes the
# shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_venta,
    Y_venta,
    test_size=0.30,
    random_state=42,
)

In [152]:
# Fit the preprocessor on the training split only, then apply the same fitted
# transformation to the test split.
# NOTE: this cell overwrites X_train / X_test with their transformed versions, so
# it cannot be re-run without first re-running the train/test split cell.
train_matrix = preprocessor.fit_transform(X_train)
feature_names = preprocessor.get_feature_names_out()
X_train = pd.DataFrame(train_matrix, columns=feature_names)
X_test = pd.DataFrame(preprocessor.transform(X_test), columns=feature_names)

In [153]:
# Preview the transformed training features (scaled numerics + one-hot columns).
X_train.head()

Unnamed: 0,num_pipeline__parking,num_pipeline__rooms,num_pipeline__sqrm,num_pipeline__floor,num_pipeline__surface,num_pipeline__elevator,num_pipeline__property_type_encoded,num_pipeline__location_encoded,cat_pipeline__property_type_Casa,cat_pipeline__property_type_Castillo,cat_pipeline__property_type_Chalet,cat_pipeline__property_type_Dúplex,cat_pipeline__property_type_Masía,cat_pipeline__property_type_Piso,cat_pipeline__property_type_Torre,cat_pipeline__property_type_Ático
0,-0.899618,-0.430203,-0.413332,0.023317,-0.813227,-0.677867,-0.086959,-0.316123,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.111583,0.953279,0.862482,-0.636201,-0.813227,-0.677867,0.499691,1.572413,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.899618,-0.430203,-0.345349,-0.636201,1.229669,-0.677867,-0.086959,1.724941,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,-0.899618,0.492118,-0.145932,-0.636201,-0.813227,-0.677867,-1.04636,1.315699,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,-0.899618,-0.891364,-0.402002,0.682835,1.229669,1.475216,-0.086959,-0.563926,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [297]:
# Model Training

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [155]:
# Baseline: ordinary least squares on the preprocessed training features.
# y_train is passed as a one-column DataFrame here.
regression=LinearRegression()
regression.fit(X_train, y_train)

In [156]:
# Fitted coefficients, one per preprocessed feature column.
regression.coef_

array([[  36030.14605057,  223108.75607339,  292293.95527525,
          -7303.41220288,  -17172.90595457,   12871.16098184,
         185298.58281446,  217723.17941913, -148492.97444558,
         504803.75264361,  -35431.28784721, -356958.620994  ,
         548083.07860143, -265564.01289488,  592378.06401199,
        -838817.99907537]])

In [157]:
# Fitted intercept of the baseline model.
regression.intercept_

array([747893.28551344])

In [158]:
def evaluate_model(true, predicted):
    """Score predictions against the observed target values.

    Parameters
    ----------
    true : array-like
        Observed target values.
    predicted : array-like
        Model predictions, aligned with `true`.

    Returns
    -------
    tuple
        ``(mae, rmse, r2_square)``: mean absolute error, root mean squared
        error, and the R^2 score.
    """
    mae = mean_absolute_error(true, predicted)
    # The MSE is only needed as an intermediate for the RMSE, so compute it once.
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [318]:
## Train multiple models

# Candidate regressors keyed by display name.
# NOTE(review): RandomForestRegressor, GradientBoostingRegressor and LGBMRegressor
# are created without a random_state, so their scores can differ between runs.
models={
    'LinearRegression':LinearRegression(),
    'Lasso':Lasso(),
    'Ridge':Ridge(),
    'Elasticnet':ElasticNet(),
    'RandomForestRegressor':RandomForestRegressor(),
    'xgb':xgb.XGBRegressor(eta= 0.01, n_estimators= 1200, max_depth= 4, subsample= 0.8, colsample_bytree= 1,gamma= 1),
    'GradientBoostRegressor':GradientBoostingRegressor(),
    'LGBMRegressor':LGBMRegressor()
}
trained_model_list=[]   # fitted estimators, in the same order as model_list
model_list=[]           # model names
r2_list=[]              # R^2 on the test split (as a fraction, not percent)

for model_name, model in models.items():
    # The estimators expect a 1-D target, so the one-column frame is flattened.
    model.fit(X_train, y_train.values.ravel())
    trained_model_list.append(model)

    # Make predictions on the held-out test split
    y_pred = model.predict(X_test)

    mae, rmse, r2_square = evaluate_model(y_test, y_pred)

    print(model_name)
    model_list.append(model_name)

    # The heading says "Training", but the metrics below are computed on X_test.
    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)

    print('='*35)
    print('\n')


LinearRegression
Model Training Performance
RMSE: 491994.32376399776
MAE: 236901.08347371925
R2 score 59.78930352647849


Lasso
Model Training Performance
RMSE: 491933.5923002461
MAE: 236761.89936216167
R2 score 59.79923007893873


Ridge
Model Training Performance
RMSE: 493365.82492744393
MAE: 236993.18877637305
R2 score 59.56480547019739


Elasticnet
Model Training Performance
RMSE: 540310.027582229
MAE: 232391.76321096558
R2 score 51.503827230111575




  model = cd_fast.enet_coordinate_descent(


RandomForestRegressor
Model Training Performance
RMSE: 448149.43404086307
MAE: 148405.75498460335
R2 score 66.63684481885863


xgb
Model Training Performance
RMSE: 431369.1247266651
MAE: 149454.32909952375
R2 score 69.08853947729996


GradientBoostRegressor
Model Training Performance
RMSE: 437338.80828547536
MAE: 153483.01599437097
R2 score 68.22705687941988


LGBMRegressor
Model Training Performance
RMSE: 488022.33369324385
MAE: 159363.3241018718
R2 score 60.435944206610095




In [288]:
# Names of the models evaluated by the training loop.
# NOTE(review): the saved output of this cell lists 'DecisionTreeRegressor', which
# is not a key of the current `models` dict; the cell was last executed before
# the training cell was changed and should be re-run.
model_list

['LinearRegression',
 'Lasso',
 'Ridge',
 'Elasticnet',
 'DecisionTreeRegressor',
 'xgb',
 'GradientBoostRegressor',
 'LGBMRegressor']

In [344]:
from sklearn.model_selection import GridSearchCV

In [359]:
# Fit the XGBoost configuration used in the comparison loop on its own and
# report its R^2 on the test split, in percent.
model = xgb.XGBRegressor(
    eta=0.01,
    n_estimators=1200,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=1,
    gamma=1,
)
model.fit(X_train, y_train.values.ravel())
y_pred = model.predict(X_test)
r2_square = r2_score(y_test, y_pred)
print(r2_square * 100)

69.08853947729996


In [346]:
# Hyper-parameter candidates for the XGBoost grid search.
# NOTE(review): the saved best_params_ output further down also reports a
# 'subsample' value, so the grid that produced it presumably contained a
# 'subsample' entry as well; confirm and restore it if so.
param_grid = {
    'max_depth': [4, 6, 10],
    'eta': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000, 1500],
    'colsample_bytree': [0.3, 0.7, 1],
    'gamma': [1, 5],
}

In [349]:
# Exhaustive search over param_grid with 10-fold cross-validation, scored by R^2.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=10,
    scoring='r2',
)

In [350]:
# Run the search on the training split only. Every parameter combination is
# fitted once per CV fold, so this is by far the slowest cell in the notebook.
# y_train is passed as a one-column DataFrame here (other cells flatten it).
grid_search.fit(X_train,y_train)

In [357]:
# Parameter combination with the best mean cross-validated R^2.
grid_search.best_params_

{'colsample_bytree': 0.7,
 'eta': 0.01,
 'gamma': 1,
 'max_depth': 6,
 'n_estimators': 1500,
 'subsample': 1}

In [352]:
# Estimator configured with the best parameters found by the search.
grid_search.best_estimator_

In [353]:
# Keep a handle on the winning estimator for evaluation on the test split.
best_estim=grid_search.best_estimator_

In [354]:
# best_estimator_ only exists when the search refits the winning configuration
# on the full training data, so best_estim is already trained; fitting it again
# on the same data would only repeat that work.
y_pred = best_estim.predict(X_test)

# R^2 of the tuned model on the test split, in percent.
r2_square = r2_score(y_test, y_pred)
print(r2_square*100)

64.20453851112585


In [355]:
# A second, manually chosen XGBoost configuration (fewer but deeper trees with a
# higher learning rate), scored on the test split.
model_2 = xgb.XGBRegressor(
    colsample_bytree=0.7,
    learning_rate=0.1,
    max_depth=10,
    n_estimators=100,
    subsample=1,
    gamma=1,
)
model_2.fit(X_train, y_train)
y_pred = model_2.predict(X_test)
r2_square = r2_score(y_test, y_pred)

In [356]:
# R^2 in percent; r2_square holds whatever the most recently executed cell
# assigned to it.
print(r2_square*100)

67.13500131214161
