## Model Training

In [446]:
import pandas as pd
import numpy as np
# Handling missing values and feature scaling
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 100)

In [447]:
df=pd.read_csv('data/realestate.csv')
df.head()

Unnamed: 0,time,province,county,city,area,neighborhood,title,type,price,parking,rooms,sqrm,floor,surface,elevator,tag
0,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Chalet en Arenys de Munt,venta,360000,1,3.0,182,,,,
1,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Casa o chalet independiente en Arenys de Munt,venta,570000,0,5.0,266,,,,
2,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,"Chalet en riera i Penya, Arenys de Munt",venta,220000,0,8.0,320,,,,
3,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,"Casa o chalet independiente en Urbanització Collsacreu, Arenys de Munt",venta,495000,1,6.0,199,,,,
4,01/04/2023,Barcelona,Maresme,Arenys de Munt,,,Chalet en Arenys de Munt,venta,575000,1,5.0,202,,,,Lujo


In [448]:
# Copy paste the data cleaning code from the EDA notebook
from sklearn.base import BaseEstimator, TransformerMixin

class iRealEstatePreprocessor(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        print('Initialising transformer...')
        
    def fit(self, X, y = None):
        return self
    
    def transform(self, X):
        print('Transforming..')
        X=X.drop(labels=['time', 'tag'], axis=1)

        # type
        X.rename(columns={'type': 'deal_type'}, inplace=True)

        # sqrm
        X['sqrm'] = X['sqrm'].replace('[^0-9]', np.nan, regex=True)
        X = X.dropna(subset=['sqrm'])
        X['sqrm'] = X['sqrm'].astype(int)

        # property_type_encoded & dropping duplicates
        X['property_type'] = [x.split()[0] for x in X['title']]
        X = X[X['property_type'] != 'Estudio']
        X = X[X['property_type'] != 'Finca']
        X=X.drop(labels=['title'], axis=1)
        X=X.drop_duplicates()
        # ptype_price_sqrm = X.groupby('property_type', as_index=False).apply(lambda x: pd.Series({'property_type_encoded':x['price'].sum() / x['sqrm'].sum()})).set_index('property_type')['property_type_encoded'].to_dict()
        # X['property_type_encoded'] = X['property_type'].map(ptype_price_sqrm)

        # floor
        X['floor'] = X['floor'].replace('-', '-1')
        X['floor'] = X['floor'].astype(float)
        X.loc[X['property_type'].isin(['Casa', 'Castillo', 'Chalet', 'Cortijo', 'Finca', 'Masía', 'Torre']), 'floor'] = '0'
        X['floor'] = X['floor'].astype(float)
        X['floor'] = X['floor'].fillna(X.groupby('property_type')['floor'].transform('mean'))
        X['floor'] = X['floor'].astype(int)

        # elevator
        X.loc[X['property_type'].isin(['Casa', 'Castillo', 'Chalet', 'Cortijo', 'Finca', 'Masía', 'Torre']), 'elevator'] = '0'
        X['elevator'] = X['elevator'].astype(float)
        X['elevator'] = X['elevator'].fillna(X.groupby('property_type')['elevator'].transform('mean'))
        X['elevator'] = X['elevator'].astype(int)

        # surface
        X['surface'] = X['surface'].fillna('0')
        X['surface'] = X['surface'].replace('outdoor', '1')
        X['surface'] = X['surface'].astype(int)

        # rooms
        X = X.dropna(subset=['rooms'])
        X['rooms'] = X['rooms'].astype(int)

        # location_encoded
        X['province'] = X['province'].fillna('empty')
        X['county'] = X['county'].fillna('empty')
        X['city'] = X['city'].fillna('empty')
        X['area'] = X['area'].fillna('empty')
        X['neighborhood'] = X['neighborhood'].fillna('empty')
        X['city_filled'] = X['province'] + '-' + X['county'] + '-' + X['city']
        X['area_filled'] = X['province'] + '-' + X['county'] + '-' + X['city'] + '-' + X['area']
        X['neighborhood_filled'] = X['province'] + '-' + X['county'] + '-' + X['city'] + '-' + X['area'] + '-' + X['neighborhood']
        city_price_sqrm = X.groupby('city_filled', as_index=False).apply(lambda x: pd.Series({'location_encoded':x['price'].sum() / x['sqrm'].sum()}))
        city_count = X.groupby('city_filled')['province'].count()
        city_joined = pd.merge(city_price_sqrm, city_count, on='city_filled')
        area_price_sqrm = X.groupby('area_filled', as_index=False).apply(lambda x: pd.Series({'location_encoded':x['price'].sum() / x['sqrm'].sum()}))
        area_count = X.groupby('area_filled')['province'].count()
        area_joined = pd.merge(area_price_sqrm, area_count, on='area_filled')
        area_joined = area_joined.drop(area_joined[area_joined.province < 100].index)
        neighborhood_price_sqrm = X.groupby('neighborhood_filled', as_index=False).apply(lambda x: pd.Series({'location_encoded':x['price'].sum() / x['sqrm'].sum()}))
        neighborhood_count = X.groupby('neighborhood_filled')['province'].count()
        neighborhood_joined = pd.merge(neighborhood_price_sqrm, neighborhood_count, on='neighborhood_filled')
        neighborhood_joined = neighborhood_joined.drop(neighborhood_joined[neighborhood_joined.province < 100].index)
        dict_city = city_joined.drop(columns='province').set_index('city_filled')['location_encoded'].to_dict()
        dict_area = area_joined.drop(columns='province').set_index('area_filled')['location_encoded'].to_dict()
        dict_neighborhood = neighborhood_joined.drop(columns='province').set_index('neighborhood_filled')['location_encoded'].to_dict()
        X['location_encoded'] = X['neighborhood_filled'].map(dict_neighborhood)
        X['location_encoded'] = np.where(X['location_encoded'].isna(), X['area_filled'].map(dict_area), X['location_encoded'])
        X['location_encoded'] = np.where(X['location_encoded'].isna(), X['city_filled'].map(dict_city), X['location_encoded'])
        X=X.drop(labels=['province', 'county', 'city', 'area', 'neighborhood', 'city_filled', 'area_filled', 'neighborhood_filled'], axis=1)
        return X

In [449]:
# df for rent and another one for sale
df_venta = df[df['type']=='venta']
df_alquiler = df[df['type']=='alquiler']

In [450]:
# Handling missing values and feature scaling
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from src.utils import RealEstatePreprocessor
from sklearn.compose import make_column_selector as selector

# pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [451]:
estate_pipeline = Pipeline(
    steps=[
        ('real_estate_prep', iRealEstatePreprocessor())
    ]
)

## Numerical pipline that scales
num_pipeline = Pipeline(
    steps=[
    ('scaler', StandardScaler())
    ]
)

# Categorical Pipeline. We don't scale if we are going to One Hot Encode
cat_pipeline = Pipeline(
    steps=[
        ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
    ]
)

preprocessor = ColumnTransformer([
    ('num_pipeline', num_pipeline, selector(dtype_exclude=object)),
    ('cat_pipeline', cat_pipeline, selector(dtype_include=object))
])

Initialising transformer...


In [452]:
pipe = Pipeline(
    steps=[
        ('estate_pipeline', estate_pipeline),
        ('preprocessor', preprocessor)
    ]
)

In [453]:
from sklearn.model_selection import train_test_split

In [454]:
train_df, test_df = train_test_split(df_venta, test_size=0.30, random_state=30)

In [455]:
target_column_name = 'price'

In [456]:
train_df = pipe[0].fit_transform(train_df)
test_df = pipe[0].transform(test_df)

Transforming..
Transforming..


In [457]:
X_train = train_df.drop(labels=['price'],axis=1)
y_train = train_df[target_column_name]

X_test = test_df.drop(labels=['price'],axis=1)
y_test = test_df[target_column_name]

In [458]:
X_train

Unnamed: 0,deal_type,parking,rooms,sqrm,floor,surface,elevator,property_type,location_encoded
270,venta,1,4,454,0,0,0,Casa,3250.939702
5633,venta,0,3,83,3,1,1,Piso,1703.570117
6138,venta,0,4,87,0,0,0,Chalet,2502.954484
7716,venta,1,5,770,0,0,0,Casa,3180.977326
1416,venta,1,4,185,0,0,0,Casa,1361.122653
...,...,...,...,...,...,...,...,...,...
5260,venta,0,3,100,1,1,0,Piso,2225.656319
500,venta,1,4,431,0,0,0,Casa,2445.903382
3885,venta,0,3,91,2,1,0,Piso,2312.581750
4517,venta,0,5,240,0,0,0,Casa,2323.856621


In [442]:
X_train=pipe[1].fit_transform(X_train)
X_test=pipe[1].transform(X_test)

In [443]:
from sklearn.linear_model import LinearRegression, Ridge,Lasso,ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
from lightgbm import LGBMRegressor

In [444]:
models={
            'LinearRegression':LinearRegression(),
            'Lasso':Lasso(),
            'Ridge':Ridge(),
            'Elasticnet':ElasticNet(),
            'RandomForestRegressor':RandomForestRegressor(),
            'xgb':xgb.XGBRegressor(eta= 0.01, n_estimators= 1200, max_depth= 4, subsample= 0.8, colsample_bytree= 1,gamma= 1),
            'GradientBoostRegressor':GradientBoostingRegressor(),
            'LGBMRegressor':LGBMRegressor()
            }

In [445]:
from src.utils import evaluate_model

model_report:dict=evaluate_model(X_train,y_train,X_test,y_test,models)
print(model_report)
print('\n====================================================================================\n')

  model = cd_fast.enet_coordinate_descent(


{'LinearRegression': 0.1501563260815002, 'Lasso': 0.1501653110530421, 'Ridge': 0.15574057261699836, 'Elasticnet': 0.29460829591995186, 'RandomForestRegressor': 0.6986704896907536, 'xgb': 0.6911891630000861, 'GradientBoostRegressor': 0.6301880673104755, 'LGBMRegressor': 0.6420977988758791}


