In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the raw listings. The relative path uses a forward slash so it resolves
# on Windows, Linux and macOS alike (a backslash separator only works on Windows).
df = pd.read_csv('data/Kangaroo.csv')

# Keep a very small subset of columns: the model features plus the target ("price").
df_subset = df[["type",
                "bedroomCount",
                "bathroomCount",
                "province",
                "habitableSurface",
                "gardenSurface",
                "epcScore",
                "facedeCount",
                "price"]]

# Drop rows with a null in any of the columns considered essential.
# Nulls in the remaining columns are left in place at this stage.
df_subset = df_subset.dropna(
    axis=0,
    subset=['price', 'habitableSurface', 'bedroomCount', 'bathroomCount', 'epcScore'],
)

# Keep only listings priced strictly between 100k and 1.5M;
# everything outside that range is discarded.
df_subset = df_subset[(df_subset['price'] > 1e5) & (df_subset['price'] < 1.5e6)]

# Separate what is left into the feature matrix X and the target y.
X = df_subset.drop(columns=["price"])
y = df_subset["price"]

# Shuffled 80/20 train/test split; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, random_state=42, train_size=0.8
)


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.compose import make_column_selector as selector
from sklearn.metrics import mean_absolute_error
import numpy as np

# EPC labels in the order used for ordinal encoding: each label is encoded
# as its position in this list.
epc_order = [
    'A++', 'A+', 'A',
    'B', 'C', 'D',
    'E', 'F', 'G',
]
enc_epc = OrdinalEncoder(
    categories=[epc_order],
    handle_unknown='use_encoded_value',
    unknown_value=-1,  # labels that are not in epc_order are encoded as -1
)

# One (name, transformer, columns) triple per preprocessing branch.
epc_step = ("epc_ord", enc_epc, ["epcScore"])
numeric_step = (
    "num",
    SimpleImputer(strategy='constant', fill_value=0),  # missing numbers become 0
    selector(dtype_include=np.number),                 # every numeric column
)
onehot_step = (
    "oh",
    OneHotEncoder(handle_unknown="ignore"),  # unseen categories are ignored, not raised
    ["type", "province"],
)

# Columns not claimed by any branch are dropped.
preprocessor = ColumnTransformer(
    transformers=[epc_step, numeric_step, onehot_step],
    remainder="drop",
)

# Full pipeline: preprocessing followed by a gradient-boosting regressor
# trained with the absolute-error loss.
regressor = GradientBoostingRegressor(
    n_estimators=200,
    loss='absolute_error',
    random_state=42,
)
gbr_pipeline = Pipeline(steps=[("prep", preprocessor), ("reg", regressor)])

# Train on the training split.
gbr_pipeline.fit(X_train, y_train)

# Evaluate on the held-out split using mean absolute error.
y_pred = gbr_pipeline.predict(X_test)
test_mae = mean_absolute_error(y_test, y_pred)
print("MAE on test: ", test_mae)

MAE on test:  93260.58826658524


In [108]:
# Try the fitted pipeline on a single hand-written listing.
# NOTE(review): 'Bruxelles' must match a province label present in the training
# data; the one-hot encoder is configured to ignore unknown labels, so a
# mismatch would pass silently. Confirm against the dataset's province values.
house_to_predict = {
    "type": 'HOUSE',
    "bedroomCount": 4,
    "bathroomCount": 2,
    "province": 'Bruxelles',
    "habitableSurface": 200,
    "gardenSurface": np.nan,  # left missing on purpose
    "epcScore": 'C',
    "facedeCount": 2,
}

# The pipeline expects a DataFrame, so wrap the single record in a one-row frame.
house_frame = pd.DataFrame([house_to_predict])
print(gbr_pipeline.predict(house_frame))

[427871.6500321]
