In [377]:
# Imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_selector as selector
from sklearn.neighbors import NearestNeighbors
import pandas as pd
import numpy as np


### Load data

In [378]:
# Load data
file = "./data/cleaned_houses2.csv"
df = pd.read_csv(file)

In [379]:
def count_nulls(dataframe):
    return dataframe.isnull().sum()/ len(df)

def see_shape ():
    print(f"Train set X: r- {X_train.shape[0]}, c-{X_train.shape[1]}")
    print(f"Test set X:  r: {X_test.shape[0]}, c- {X_test.shape[1]}")
    print(f"Y test set: r- {y_test.shape[0]}")
    print(f"Y final: r- {y.shape[0]}")

In [380]:
count_nulls(df)

district              0.000000
price                 0.000000
state_construction    0.253759
living_area           0.000000
garden_area           0.000000
bedrooms              0.000000
bathrooms             0.000000
livingroom_surface    0.550984
kitchen_surface       0.581871
facades               0.209725
kitchen               0.000000
has_terrace           0.000000
has_attic             0.000000
has_basement          0.000000
construction_year     0.404101
epc                   0.215261
area_total            0.000000
dtype: float64

In [381]:
# Explerimental column dropping of some of categories
# Construction year hurt the model by 2%, lots of missing data, maybe badly inputed
# and likely covered by state_construction, size, epc already
df.drop(columns=["construction_year"], inplace= True)

In [382]:
# Experimenting setting districts


### Split Data

In [383]:
# Split in training set and test set
X = df.drop(columns=["price"])
y = df['price']

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=42, test_size=0.2)

In [384]:
see_shape()

Train set X: r- 12717, c-15
Test set X:  r: 3180, c- 15
Y test set: r- 3180
Y final: r- 15897


### Encoding

In [385]:
# Encoding training
enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")
enctransform_train = enc.fit_transform(X_train[["district", "state_construction"]])

enctransform_train


Unnamed: 0,district_Aalst,district_Antwerp,district_Arlon,district_Ath,district_Bastogne,district_Brugge,district_Brussels,district_Charleroi,district_Dendermonde,district_Diksmuide,...,district_Veurne,district_Virton,district_Waremme,state_construction_AS_NEW,state_construction_GOOD,state_construction_JUST_RENOVATED,state_construction_TO_BE_DONE_UP,state_construction_TO_RENOVATE,state_construction_TO_RESTORE,state_construction_nan
2582,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
11119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
10868,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
12424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
724,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13418,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5390,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
860,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
15795,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [386]:
X_train = pd.concat([X_train, enctransform_train], axis = 1).drop(["district","state_construction"], axis = 1)

In [387]:
# Encoding test set

enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False).set_output(transform="pandas")
enctransform_test = enc.fit_transform(X_test[["district", "state_construction"]])
enctransform_test

Unnamed: 0,district_Aalst,district_Antwerp,district_Arlon,district_Ath,district_Bastogne,district_Brugge,district_Brussels,district_Charleroi,district_Dendermonde,district_Diksmuide,...,district_Veurne,district_Virton,district_Waremme,state_construction_AS_NEW,state_construction_GOOD,state_construction_JUST_RENOVATED,state_construction_TO_BE_DONE_UP,state_construction_TO_RENOVATE,state_construction_TO_RESTORE,state_construction_nan
8159,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
5763,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
13349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
6306,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14987,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1665,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
13024,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [388]:
columns = X_train.columns

In [389]:
X_test = pd.concat([X_test, enctransform_test], axis = 1).drop(["district","state_construction"], axis = 1)

In [390]:
see_shape()

Train set X: r- 12717, c-63
Test set X:  r: 3180, c- 63
Y test set: r- 3180
Y final: r- 15897


### Imputing

In [391]:
# Replace NAN in kitchen and living room by multiplying living area for a %
percent_k = X_train["kitchen_surface"].sum()/X_train["living_area"].sum() 
percent_l = X_train["livingroom_surface"].sum()/X_train["living_area"].sum() 
print (percent_k)
print(percent_l )

X_train['livingroom_surface'] = X_train['livingroom_surface'].fillna(round(X_train["living_area"]*percent_l,0))
X_train['kitchen_surface'] = X_train['kitchen_surface'].fillna(round(X_train["living_area"]*percent_k,0))

0.034773725563686395
0.08456050058733997


In [392]:
# KNN imputation
from sklearn.impute import KNNImputer
imputer = KNNImputer(missing_values = np.nan, n_neighbors=5, weights = "distance")
X_train = imputer.fit_transform(X_train)

imputer = KNNImputer(n_neighbors=5)
X_test = imputer.fit_transform(X_test)


In [393]:
see_shape()

Train set X: r- 12717, c-63
Test set X:  r: 3180, c- 63
Y test set: r- 3180
Y final: r- 15897


### Scaling

In [394]:
# Scale the features using StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [395]:
see_shape()

Train set X: r- 12717, c-63
Test set X:  r: 3180, c- 63
Y test set: r- 3180
Y final: r- 15897


### Training

In [396]:
# Lets start with linear regression
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X_train,y_train)

### Testing

In [397]:

from sklearn.metrics import r2_score, mean_squared_error,root_mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, median_absolute_error, PredictionErrorDisplay

# Make predictions using the testing set
y_pred = regressor.predict(X_test)
y_pred_train = regressor.predict(X_train)

#DATA
print(f"mean {pd.Series(y).mean()}")
print(f"var {"{:e}".format(pd.Series(y).var())}")
print(f"std {pd.Series(y).std()}")
print()

#R2
print(f"R2 train: {r2_score(y_train, y_pred_train)}")
print(f"R2 test: {r2_score(y_test, y_pred)}")
print()

#MSE
print(f"MSE train: {"{:e}".format(mean_squared_error(y_train, y_pred_train))}")
print(f"MSE test: {"{:e}".format(mean_squared_error(y_test, y_pred))}")
print()

#RMSE

print(f"RMSE train: {"{:e}".format(root_mean_squared_error(y_train, y_pred_train))}")
print(f"RMSE test: {"{:e}".format(root_mean_squared_error(y_test, y_pred))}")
print()

#MAE (mean absolute error) 
print(f"mean absolute error train: {mean_absolute_error(y_train, y_pred_train)}")
print(f"mean absolute error test: {mean_absolute_error(y_test, y_pred)}")



mean 358331.67132163304
var 2.278381e+10
std 150943.05454013648

R2 train: 0.6622311245943133
R2 test: 0.6620293508693103

MSE train: 7.687722e+09
MSE test: 7.729033e+09

RMSE train: 8.767965e+04
RMSE test: 8.791492e+04

mean absolute error train: 66660.36104692939
mean absolute error test: 66650.66367552118


In [398]:
coef_list = pd.DataFrame(zip(columns, regressor.coef_))
coef_list.columns = ["feature", "coef"]
most_important = coef_list.sort_values(by= "coef", ascending= False)

most_important.to_csv("coeficients_linear.csv", index=True)