PREDICTING HOUSE PRICES IN BELGIUM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

*COLUMNS TO USE* — IMPORT CLEANED DATASET

In [None]:

# Columns pulled from the cleaned export: the row identifier, the sale-price
# target, and the property attributes considered as predictors.
columns_to_use = [
    'id',
    'transaction.sale.price',
    'flags.isNewlyBuilt',
    'property.subtype',
    'property.bedroomCount',
    'property.bathroomCount',
    'property.netHabitableSurface',
    'property.hasBasement',
    'property.hasDressingRoom',
    'property.building.condition',
    'property.building.constructionYear',
    'property.hasCaretakerOrConcierge',
    'property.hasDisabledAccess',
    'property.hasLift',
    'property.kitchen.type',
    'property.land.surface',
    'property.hasLaundryRoom',
    'property.hasGarden',
    'property.parkingCountIndoor',
    'property.parkingCountOutdoor',
    'property.hasAirConditioning',
    'property.hasArmoredDoor',
    'property.hasVisiophone',
    'property.hasSecureAccessAlarm',
    'property.hasCableTV',
    'property.hasDoorPhone',
    'property.showerRoomCount',
    'property.toiletCount',
    'property.hasSwimmingPool',
    'property.hasSauna',
    'property.hasJacuzzi',
    'property.fireplaceExists',
    'property.hasTerrace',
    'transaction.certificates.epcScore',
    'property.energy.hasHeatPump',
    'property.energy.hasPhotovoltaicPanels',
    'property.energy.hasThermicPanels',
    'property.energy.hasCollectiveWaterHeater',
    'property.energy.hasDoubleGlazing',
    'property.livingRoom.surface',
]

# Input file. The commented alternative is the file this notebook itself
# writes after prediction.
FILE = '05_cleaned.csv'
#FILE='predfix.csv'

# Only the listed columns are parsed; the last expression displays (rows, columns).
df = pd.read_csv(FILE, usecols=columns_to_use, low_memory=False)
df.shape

*BOOL & CATEGORY & NUM & ORDINAL FEATURES*

In [None]:
# Boolean-style flags.
# NOTE(review): some entries (e.g. 'property.land.isFlat',
# 'transaction.sale.isSubjectToVat') are not among the columns loaded into `df`,
# the two parkingCount* entries look like counts rather than flags, and this
# list is not referenced by the preprocessing below — confirm before using it.
bool_features=['flags.isNewlyBuilt','property.hasBasement','property.hasDressingRoom','property.hasCaretakerOrConcierge','property.hasDisabledAccess','property.hasLift',
            'property.constructionPermit.hasPlotDivisionAuthorization','property.constructionPermit.hasPossiblePriorityPurchaseRight','property.land.hasPlotToRear',
            'property.land.isFlat','property.land.isWooded','property.hasLaundryRoom','property.hasGarden','property.parkingCountIndoor','property.parkingCountOutdoor',
            'property.hasAirConditioning','property.hasArmoredDoor','property.hasVisiophone','property.hasSecureAccessAlarm','property.hasCableTV','property.hasDoorPhone',
            'property.hasSwimmingPool','property.hasSauna','property.hasJacuzzi','property.fireplaceExists','property.hasTerrace','transaction.sale.isSubjectToVat',
            'property.energy.hasHeatPump','property.energy.hasPhotovoltaicPanels','property.energy.hasThermicPanels','property.energy.hasCollectiveWaterHeater',
            'property.energy.hasDoubleGlazing','transaction.investor.isInvestmentProperty'
]

# Categorical columns whose levels have a meaningful order; these are
# ordinal-encoded during preprocessing.
ord_features=['property.subtype','property.building.condition','property.kitchen.type','transaction.certificates.epcScore']

# Columns earmarked for imputation.
# NOTE(review): not referenced by the preprocessing in the visible notebook.
imp_features=['flags.isNewlyBuilt','property.bedroomCount','property.bathroomCount','property.netHabitableSurface','property.hasCaretakerOrConcierge',
            'property.hasDisabledAccess','property.hasLift','property.hasLaundryRoom','property.hasGarden','property.parkingCountIndoor','property.parkingCountOutdoor',
            'property.hasSwimmingPool','property.hasSauna','property.hasJacuzzi','property.fireplaceExists','property.hasTerrace','property.energy.hasPhotovoltaicPanels',
            'property.energy.hasDoubleGlazing','property.livingRoom.surface'

]

# Numeric columns that are rescaled during preprocessing.
scale_features=['property.bathroomCount','property.netHabitableSurface','property.livingRoom.surface'
]


# Split the loaded frame into a full snapshot (`df2`), the target (`y`) and the
# feature frame (`df`, without target and id).
# The guard makes the cell safe to re-run: once the target has been dropped
# from `df`, a second pass would raise KeyError and would also overwrite the
# `df2` snapshot with the already-stripped frame.
target_col = 'transaction.sale.price'
if target_col in df.columns:
    df2 = df.copy(deep=True)
    y = df[target_col]
    df = df.drop(columns=[target_col, 'id'])


*ORDINAL DATA LEVELS*

In [None]:

# Level order for each ordinal column. Position in the list is the rank used
# when the column is ordinal-encoded (first entry -> 0).
# NOTE(review): presumably ordered from least to most valuable — confirm the
# ranking with whoever owns the domain knowledge.

# property.subtype
ord_subtype = [
    'CHALET',
    'BUNGALOW',
    'TOWN_HOUSE',
    'HOUSE',
    'FARMHOUSE',
    'MIXED_USE_BUILDING',
    'COUNTRY_COTTAGE',
    'MANOR_HOUSE',
    'APARTMENT_BLOCK',
    'VILLA',
    'MANSION',
    'EXCEPTIONAL_PROPERTY',
    'CASTLE',
]

# property.building.condition
ord_property_building_condition = [
    'TO_RESTORE',
    'TO_RENOVATE',
    'TO_BE_DONE_UP',
    'GOOD',
    'JUST_RENOVATED',
    'AS_NEW',
]

# property.kitchen.type
ord_kitchen_type_ordinal = [
    'NOT_INSTALLED',
    'INSTALLED',
    'USA_INSTALLED',
    'SEMI_EQUIPPED',
    'USA_SEMI_EQUIPPED',
    'HYPER_EQUIPPED',
    'USA_HYPER_EQUIPPED',
]

# transaction.certificates.epcScore
ord_transaction_certificates_epcScore = [
    'G',
    'F',
    'E',
    'D',
    'C',
    'B',
    'A',
    'A+',
    'A++',
]


*Encode Categorical Values and create X*

In [None]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import RobustScaler, OneHotEncoder,OrdinalEncoder

# One level list per ordinal column, in the same order as `ord_features`.
ordinal_levels = [
    ord_subtype,
    ord_property_building_condition,
    ord_kitchen_type_ordinal,
    ord_transaction_certificates_epcScore,
]

# Ordinal columns -> numeric ranks; any value outside the declared levels is
# encoded as NaN instead of raising.
ord_pipeline = Pipeline(steps=[
    ('ordencode', OrdinalEncoder(
        categories=ordinal_levels,
        handle_unknown='use_encoded_value',
        unknown_value=np.nan,
    )),
])

# NOTE(review): this imputation pipeline is defined but is not passed to the
# ColumnTransformer below, so no imputation is applied to X.
imp_pipeline = Pipeline(steps=[
    ('impute', IterativeImputer()),
])

# Scaling for the numeric columns listed in `scale_features`.
scale_pipeline = Pipeline(steps=[
    ('scale', RobustScaler()),
])

# Apply each pipeline to its column group; remainder='passthrough' keeps every
# other column untouched. The output columns are ordered: ordinal columns,
# scaled columns, then the passed-through remainder.
preprocess_pipeline = ColumnTransformer(
    transformers=[
        ('ord', ord_pipeline, ord_features),
        ('scale', scale_pipeline, scale_features),
    ],
    remainder='passthrough',
)

# NOTE(review): the transformer is fitted on the full frame, before the
# train/test split performed later in the notebook.
X = preprocess_pipeline.fit_transform(df)

Create the model.
The best model was chosen with PyCaret.

In [None]:
from sklearn.model_selection import ParameterGrid, train_test_split,cross_val_score,cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector, make_column_transformer,ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor


# Extra-trees regressor with every hyperparameter spelled out.
# NOTE(review): presumably copied from the PyCaret model-selection output —
# confirm the source of these values.
regr = ExtraTreesRegressor(
    n_estimators=100,
    max_depth=50,
    max_features=1.0,
    min_samples_split=7,
    min_samples_leaf=2,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.002,
    ccp_alpha=0.0,
    bootstrap=False,
    oob_score=False,
    max_samples=None,
    warm_start=False,
    n_jobs=-1,
    random_state=7733,
    verbose=0,
)

# 70/30 hold-out split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=3
)

regr.fit(X_train, y_train)

# `score` on a regressor is the R^2 on the given data.
holdout_score = regr.score(X_test, y_test)
print("model score: %.3f" % holdout_score)


In [None]:
# Rank features by the fitted model's importances.
# The names must come from the fitted ColumnTransformer, not from `df.columns`:
# the transformer emits the ordinal columns first, then the scaled columns,
# then the passed-through remainder, so the matrix the model was trained on is
# not in `df`'s column order and zipping with `df.columns` mislabels the scores.
feature_names = preprocess_pipeline.get_feature_names_out()
assert len(feature_names) == len(regr.feature_importances_), \
    "feature names and importances are out of sync"

a = pd.DataFrame(list(zip(feature_names, regr.feature_importances_)))
# Column 0 holds the feature name, column 1 its importance.
a = a.sort_values(1, ascending=False)
a.head()


*GridSearchCV*

In [None]:
from sklearn.model_selection import GridSearchCV

# Search space: 3 depths x 3 learning rates x 3 iteration counts = 27 candidates.
parameters = {
    'depth': [6, 8, 10],
    'learning_rate': [0.01, 0.05, 0.1],
    'iterations': [30, 50, 100],
}

model = CatBoostRegressor()

# Exhaustive search with 2-fold cross-validation on the training split,
# using all available cores.
grid = GridSearchCV(
    estimator=model,
    param_grid=parameters,
    cv=2,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

# Score of the refitted best candidate on the hold-out split.
grid.score(X_test, y_test)

In [None]:
# Parameter combination that achieved the best cross-validated score in the search.
grid.best_params_

In [None]:
# Predict a price for every row of X (training and hold-out rows alike), attach
# the predictions to the `df2` snapshot, which still carries the id and the
# sale price, and export the result.
pred=regr.predict(X)
df2['pred']=pred
# The DataFrame index is written as the first, unnamed CSV column.
df2.to_csv("predfix.csv")

In [None]:
# Wrap the hold-out feature matrix in a DataFrame to inspect per-column dtypes
# and non-null counts.
a=pd.DataFrame(X_test)
a.info()


K-fold feature selection and tests of other models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold,GridSearchCV


# Gradient-boosting baseline, fitted directly with fixed hyperparameters.
# It is bound to its own name rather than `grid`, so the fitted GridSearchCV
# object from the CatBoost search stays available; rebinding `grid` to a plain
# regressor would make `grid.best_params_` fail when that cell is re-run.
gbr = GradientBoostingRegressor(
    n_estimators=1000,
    max_depth=4,
    min_samples_split=2,
    learning_rate=0.1,
    random_state=42,
)

# Scaffolding for a K-fold hyperparameter search; not used by the fit below.
# To run the search instead of the fixed fit, wrap the estimator, e.g.
# GridSearchCV(estimator=gbr, param_grid=hp_candidates, cv=kfold, scoring='r2').
seed = 13
kfold = KFold(n_splits=3, shuffle=True, random_state=seed)
hp_candidates = [{'n_estimators': [400, 1000], 'max_depth': [3, 16]}]

gbr.fit(X_train, y_train)

# Score on the hold-out split, displayed as the cell output.
gbr.score(X_test, y_test)


In [None]:

from sklearn.linear_model import Ridge
# Ridge regression baseline (alpha=0.5), trained on the training split;
# the cell displays its score on the hold-out split.
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
ridge.score(X_test, y_test)



In [None]:
from sklearn.linear_model import ElasticNet
# Elastic-net baseline (alpha=0.01), trained on the training split;
# the cell displays its score on the hold-out split.
en = ElasticNet(alpha=0.01).fit(X_train, y_train)
en.score(X_test, y_test)

In [None]:
from sklearn.linear_model import BayesianRidge
# Bayesian ridge baseline with default settings, trained on the training split;
# the cell displays its score on the hold-out split.
bayesian = BayesianRidge().fit(X_train, y_train)
bayesian.score(X_test, y_test)
# Uncomment to inspect the fitted coefficients:
#bayesian.coef_


In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares baseline, trained on the training split;
# the cell displays its score on the hold-out split.
ols = LinearRegression().fit(X_train, y_train)
ols.score(X_test, y_test)

