In [168]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector as selector
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np


### Load data

In [169]:
# Load data
file = "./data/cleaned_houses.csv"
df = pd.read_csv(file)

In [170]:
def count_nulls(dataframe):
    return dataframe.isnull().sum()/ len(df)

def see_shape ():
    print(f"Train set X: r- {X_train.shape[0]}, c-{X_train.shape[1]}")
    print(f"Test set X:  r: {X_test.shape[0]}, c- {X_test.shape[1]}")
    print(f"Y test set: r- {y_test.shape[0]}")
    print(f"Y final: r- {y.shape[0]}")

In [171]:
count_nulls(df)

district              0.000000
price                 0.000000
state_construction    0.253930
living_area           0.000000
bedrooms              0.000000
bathrooms             0.000000
livingroom_surface    0.550937
kitchen_surface       0.581876
facades               0.209785
has_garden            0.000000
kitchen               0.000000
has_terrace           0.000000
has_attic             0.000000
has_basement          0.000000
construction_year     0.404226
epc                   0.215382
area_total            0.000000
dtype: float64

In [172]:
# Explerimental column dropping of some of categories
# Construction year hurt the model by 2%, lots of missing data, maybe badly inputed
# and likely covered by state_construction, size, epc already
df.drop(columns=["construction_year", "state_construction"], inplace= True)

### Split Data

In [173]:
# Split in training set and test set
X = df.drop(columns=["price"])
y = df['price']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42, test_size=0.2)

In [174]:
see_shape()

Train set X: r- 12721, c-14
Test set X:  r: 3181, c- 14
Y test set: r- 3181
Y final: r- 15902


### Encoding

In [175]:
# Encoding training
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")
enctransform_train = enc.fit_transform(X_train[["district"]])

enctransform_train


Unnamed: 0,district_Aalst,district_Antwerp,district_Arlon,district_Ath,district_Bastogne,district_Brugge,district_Brussels,district_Charleroi,district_Dendermonde,district_Diksmuide,...,district_Soignies,district_Thuin,district_Tielt,district_Tongeren,district_Tournai,district_Turnhout,district_Verviers,district_Veurne,district_Virton,district_Waremme
2582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14550,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13441,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14644,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13418,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
15795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [176]:
X_train = pd.concat([X_train, enctransform_train], axis = 1).drop(["district"], axis = 1)

In [177]:
columns = X_train.columns

In [178]:
# Encoding test set

enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")
enctransform_test = enc.fit_transform(X_test[["district"]])
enctransform_test

Unnamed: 0,district_Aalst,district_Antwerp,district_Arlon,district_Ath,district_Bastogne,district_Brugge,district_Brussels,district_Charleroi,district_Dendermonde,district_Diksmuide,...,district_Soignies,district_Thuin,district_Tielt,district_Tongeren,district_Tournai,district_Turnhout,district_Verviers,district_Veurne,district_Virton,district_Waremme
13537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15574,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8489,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1617,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6216,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13660,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8067,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [179]:
X_test = pd.concat([X_test, enctransform_test], axis = 1).drop(["district"], axis = 1)

In [180]:
see_shape()

Train set X: r- 12721, c-56
Test set X:  r: 3181, c- 56
Y test set: r- 3181
Y final: r- 15902


### Imputing

In [181]:
# Replace NAN in kitchen and living room by multiplying living area for a %
percent_k = X_train["kitchen_surface"].sum()/X_train["living_area"].sum() 
percent_l = X_train["livingroom_surface"].sum()/X_train["living_area"].sum() 
print (percent_k)
print(percent_l )

X_train['livingroom_surface'] = X_train['livingroom_surface'].fillna(round(X_train["living_area"]*percent_l,0))
X_train['kitchen_surface'] = X_train['kitchen_surface'].fillna(round(X_train["living_area"]*percent_k,0))

0.034800811385703206
0.08451325534264241


In [182]:
# KNN imputation
from sklearn.impute import KNNImputer
imputer = KNNImputer(missing_values = np.nan, n_neighbors=5, weights = "distance")
X_train = imputer.fit_transform(X_train)

imputer = KNNImputer(n_neighbors=5)
X_test = imputer.fit_transform(X_test)


In [183]:
#FORNOW 
#For now fill missing with -1 for epc and state_construction
#This will mess up the nulls at year of construction, maybe facades that are numerical
# Ideas: do year with nearest neighbors, but after scaling!
#X_train.fillna(-1,inplace=True)
#X_test.fillna(-1,inplace=True)

In [184]:
see_shape()

Train set X: r- 12721, c-56
Test set X:  r: 3181, c- 56
Y test set: r- 3181
Y final: r- 15902


### Scaling

In [185]:
# Scale the features using StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [186]:
see_shape()

Train set X: r- 12721, c-56
Test set X:  r: 3181, c- 56
Y test set: r- 3181
Y final: r- 15902


### Training

## Trying random tree reg

In [187]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error, PredictionErrorDisplay, root_mean_squared_error

In [209]:

from sklearn.ensemble import RandomForestRegressor
n = 100
j = 4

treeregressor = RandomForestRegressor(n_estimators = n, min_samples_leaf= j)
treeregressor.fit(X_train,y_train)
y_pred_3 = treeregressor.predict(X_test)
y_pred_train_3 = treeregressor.predict(X_train)
score = r2_score(y_test, y_pred_3)


### Testing

In [210]:
y_pred_3 = treeregressor.predict(X_test)
y_pred_train_3 = treeregressor.predict(X_train)



# DATA INFO
print(f"mean {pd.Series(y).mean()}")
print(f"var {"{:e}".format(pd.Series(y).var())}")
print(f"std {pd.Series(y).std()}")
print()

#R2
print(f"R2 train: {r2_score(y_train, y_pred_train_3)}")
print(f"R2 test: {r2_score(y_test, y_pred_3)}")
print()

#MSE
print(f"MSE train: {"{:e}".format(mean_squared_error(y_train, y_pred_train_3))}")
print(f"MSE test: {"{:e}".format(mean_squared_error(y_test, y_pred_3))}")
print()

#RMSE
print(f"Root mean sq error train: {root_mean_squared_error(y_train, y_pred_train_3)}")
print(f"RMSE test: {root_mean_squared_error(y_test, y_pred_3)}")
print()


#MAE (mean absolute error) 
print(f"mean absolute error train: {mean_absolute_error(y_train, y_pred_train_3)}")
print(f"mean absolute error test: {mean_absolute_error(y_test, y_pred_3)}")
print()




mean 358314.56131304236
var 2.278152e+10
std 150935.48513346163

R2 train: 0.8582375249006543
R2 test: 0.6328362446347809

MSE train: 3.254991e+09
MSE test: 8.097911e+09

Root mean sq error train: 57052.52615133254
RMSE test: 89988.39191393004

mean absolute error train: 40991.13731456832
mean absolute error test: 66738.92120382455

