PREDICTING APARTMENT PRICES IN BELGIUM

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Import cleaned dataset

In [None]:
# Path of the CSV that feeds the rest of the notebook.
FILE = 'data_last.csv'

# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, so each column gets a single consistently inferred dtype.
df = pd.read_csv(
    FILE,
    low_memory=False,
)

Check for NaN values

In [None]:
# Fraction of missing entries per column, computed once for the whole frame.
na_fraction = df.isnull().mean()

# Every column with at least one missing value. (The previous `> 1` test
# silently skipped columns that had exactly one NaN.)
features_with_na = na_fraction[na_fraction > 0].index.tolist()

for feature in features_with_na:
    # Scale the fraction to a real percentage so the number matches its "%" label.
    print(f"{feature}: {na_fraction[feature] * 100:.2f} % missing values")

Data Types

In [None]:
# Display the dtype pandas assigned to every column of df.
df.dtypes

Create encoder, drop unnecessary columns

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Ordered levels for each ordinal feature. The encoder maps every value to its
# index in the list, so the first entry becomes 0 and the last the highest code.
epc_ordinal = ['G', 'F', 'E', 'D', 'C', 'B', 'A', 'A+', 'A++']
building_condition_ordinal = ['TO_BE_DONE_UP', 'JUST_RENOVATED', 'GOOD', 'AS_NEW']
kitchen_type_ordinal = ['INSTALLED', 'USA_INSTALLED', 'SEMI_EQUIPPED', 'USA_SEMI_EQUIPPED',
                        'HYPER_EQUIPPED', 'USA_HYPER_EQUIPPED']
subtype_ordinal = ['APARTMENT', 'DUPLEX', 'PENTHOUSE', 'TRIPLEX', 'LOFT']

# Single source of truth pairing each ordinal column with its level order.
# OrdinalEncoder matches `categories` to columns purely by position, so the
# column list and the categories list are both derived from this mapping
# instead of being typed out separately (where they could drift apart).
ordinal_categories = {
    'transaction.certificates.epcScore': epc_ordinal,
    'building.condition': building_condition_ordinal,
    'kitchen.type': kitchen_type_ordinal,
    'subtype': subtype_ordinal,
}

drop_cols = ['transaction.sale.isSubjectToVat', 'id']
num_cols = ['bedroomCount', 'bathroomCount', 'netHabitableSurface', 'toiletCount']
cat_cols = ['subtype', 'transaction.certificates.epcScore', 'building.condition', 'kitchen.type']
ord_col = list(ordinal_categories)

# Snapshot of the frame before encoding and column removal.
dl = df.copy()

# NOTE: this cell overwrites `df` in place, so it is not re-runnable on its own;
# re-run the data-loading cell first if it has to be executed again.
enc = OrdinalEncoder(categories=[ordinal_categories[col] for col in ord_col])
df[ord_col] = enc.fit_transform(df[ord_col])

df = df.drop(drop_cols, axis=1)

# Target vector and feature matrix (every remaining column except the price).
y = df['transaction.sale.price'].values
X = df.drop(['transaction.sale.price'], axis=1)

In [None]:
# Preview the first five rows of the encoded frame.
df.head(n=5)

Check for correlations

In [None]:
# Compute the pairwise correlation matrix once and reuse it for both views
# (it was previously recomputed for the heatmap and again for the styled table).
corr_matrix = df.corr()

# Keep explicit handles to the figure/axes and draw the heatmap on that axes.
fig, ax = plt.subplots(figsize=(16, 10))
sns.heatmap(corr_matrix, ax=ax)

# Last expression of the cell: the same matrix rendered as a colour-graded table.
corr_matrix.style.background_gradient(cmap="Blues")



Create the model
The best model was chosen with PyCaret

In [None]:
from sklearn.model_selection import  train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

# Hold out 30 % of the rows for evaluation; the fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Full CatBoost configuration, kept in one dict so it is easy to read and diff.
# (Per the heading above, the model was selected with PyCaret.)
catboost_params = dict(
    nan_mode='Min',
    eval_metric='RMSE',
    iterations=1000,
    sampling_frequency='PerTree',
    leaf_estimation_method='Newton',
    grow_policy='SymmetricTree',
    penalties_coefficient=1,
    boosting_type='Plain',
    model_shrink_mode='Constant',
    feature_border_type='GreedyLogSum',
    l2_leaf_reg=3,
    random_strength=1,
    rsm=1,
    boost_from_average=True,
    model_size_reg=0.5,
    subsample=0.800000011920928,
    use_best_model=False,
    random_seed=10,
    depth=6,
    posterior_sampling=False,
    border_count=254,
    sparse_features_conflict_fraction=0,
    leaf_estimation_backtracking='AnyImprovement',
    best_model_min_trees=1,
    model_shrink_rate=0,
    min_data_in_leaf=1,
    loss_function='RMSE',
    learning_rate=0.0396099984645843,
    score_function='Cosine',
    task_type='CPU',
    leaf_estimation_iterations=1,
    bootstrap_type='MVS',
    max_leaves=64,
    verbose=False,
)
regr = CatBoostRegressor(**catboost_params)

# Train on the training split, then report the estimator's score on the held-out rows.
regr.fit(X_train, y_train)
test_score = regr.score(X_test, y_test)
print("model score: %.3f" % test_score)


In [None]:
# Print the parameter dictionary reported by regr.get_params().
print(regr.get_params())

In [None]:
# Pair each feature column name with the importance reported by the fitted model.
feature_imp = [
    (column, importance)
    for column, importance in zip(X.columns, regr.feature_importances_)
]
feature_imp

*Export model to pkl file*

In [None]:
import joblib

# Serialize the estimator currently bound to `regr` into clf.pkl in the working directory.
joblib.dump(value=regr, filename="clf.pkl")

*Some Useful Code and other tests applied to model*

In [None]:
# Predict for every row of X (training and test rows alike) and attach the
# predictions to the pre-encoding copy `dl` so they can be read next to the raw values.
dk = regr.predict(X)
dl = dl.assign(predicted=dk)
dl.head()

In [None]:
# Print, unrounded, the score of regr on the test split (X_test, y_test).
print(regr.score(X_test, y_test))

K-fold hyperparameter grid search and tests of other models

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold,GridSearchCV

# One seed shared by the estimator and the fold shuffling so the whole search
# is reproducible (the estimator was previously left unseeded).
seed = 13

# NOTE: this rebinds `regr`, which until here referred to the CatBoost model
# trained and exported above. Re-running earlier cells that use `regr` after
# this one will pick up this gradient-boosting estimator instead.
# Alternative estimators (swap in to compare):
#regr = LinearRegression()
#regr=KNeighborsRegressor()
regr = GradientBoostingRegressor(
    n_estimators=400,
    max_depth=2,
    min_samples_split=2,
    learning_rate=0.1,
    random_state=seed,
)

kfold = KFold(n_splits=3, shuffle=True, random_state=seed)

# Grid of 5 x 5 = 25 combinations; n_estimators and max_depth given above are
# overridden by the grid during the search.
hp_candidates = [{'n_estimators': [200,300,400,500,1000], 'max_depth': [2,3,4,5,16]}]

# Search for best hyperparameters, scoring each candidate by cross-validated R^2
# over the full data set (X, y).
grid = GridSearchCV(estimator=regr, param_grid=hp_candidates, cv=kfold, scoring='r2')
grid.fit(X, y)

print(grid.best_score_)
print(grid.best_estimator_)
print(grid.best_params_)


In [None]:

from sklearn.linear_model import Ridge

# Ridge regression baseline: fit on the training split, then show the
# estimator's score on the test split as the cell output.
ridge = Ridge(alpha=0.5).fit(X_train, y_train)
ridge.score(X_test, y_test)



In [None]:
from sklearn.linear_model import ElasticNet

# Elastic-net baseline: fit on the training split, then show the
# estimator's score on the test split as the cell output.
en = ElasticNet(alpha=0.01).fit(X_train, y_train)
en.score(X_test, y_test)

In [None]:
from sklearn.linear_model import BayesianRidge

# Bayesian ridge baseline fitted on the training split.
bayesian = BayesianRidge()
bayesian.fit(X_train, y_train)

# A notebook cell only displays its last expression, so the score was being
# computed and then thrown away; print it explicitly.
print("model score: %.3f" % bayesian.score(X_test, y_test))

# Fitted coefficients, shown as the cell output.
bayesian.coef_


In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares baseline: fit on the training split, then show the
# estimator's score on the test split as the cell output.
ols = LinearRegression().fit(X_train, y_train)
ols.score(X_test, y_test)


In [None]:
from sklearn.linear_model import Lasso

# Lasso baseline with a raised iteration cap: fit on the training split, then
# show the estimator's score on the test split as the cell output.
lasso = Lasso(alpha=0.01, max_iter=5000).fit(X_train, y_train)
lasso.score(X_test, y_test)

