In [66]:
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor

In [48]:
# Load the train data
df = pd.read_csv('../data/houses_train_analysed.csv', index_col=0)
df.shape

(17836, 8)

In [49]:
df.head(3)

Unnamed: 0_level_0,lat,zipcode,municipality_name,object_type_name,build_year,living_area,num_rooms,price
long,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9.066771,47.038963,8750,Glarus,Wohnung,1981,134.0,4.0,380000.0
8.522013,47.097042,6318,Walchwil,Wohnung,2018,113.0,3.0,1195000.0
7.774141,47.429871,4434,Hölstein,Mehrfamilienhaus,2007,194.0,5.0,880000.0


In [50]:
# Split data into features and labels.
X_data = df.drop(columns='price')
y_data = df['price']

In [51]:
# Split features and labels into train (X_train, y_train) and validation set (X_val, y_val).
X_train, X_val, y_train, y_val = train_test_split(X_data, y_data, stratify=X_data['object_type_name'], test_size=0.1)

In [52]:
def evaluate_model(model, X, y):
    # Predict with the model the validation data.
    y_pred = model.predict(X)

    # How good are we on the validation data?
    print(mean_absolute_percentage_error(y, y_pred))

# Decision Tree

In [53]:
model = Pipeline([
    ('ohe', make_column_transformer((OneHotEncoder(handle_unknown='ignore'), ['zipcode', 'municipality_name', 'object_type_name']), remainder='passthrough')),
    ('reg', DecisionTreeRegressor())
])

# Train (fit) the model with the train data.
_ = model.fit(X_train, y_train)

# Evaluate and make submission
evaluate_model(model, X_val, y_val)

0.27963788410016877


In [55]:
def single_pred(model, long:float, lat:float, zipcode: int, municipality_name: str, object_type_name: str, build_year:int, living_area:float, num_rooms:float) -> float:
    data_frame= pd.DataFrame(data={'long': long, 'lat': lat, 'zipcode': zipcode, 'municipality_name': municipality_name, 'object_type_name': object_type_name, 'build_year': build_year, 'living_area': living_area, 'num_rooms': num_rooms}, index=[0])
    return model.predict(data_frame)[0]

In [64]:
"""
TODO: Extend UI to be able to get lat and long from user
INPUT ---
long: float (MAP Interface)
lat: float (MAP Interface)
zipcode: int (MAP Interface)
municipality_name: str (MAP Interface)
oject_type_name: str (Drop Down - Wohnung, Einfamilienhaus, Mehrfamilienhaus, Sonstiges)
build_year: int (1970-2022)
living_area:int
number_of_rooms:int
"""
long: float = 0.0
lat: float  = 0.0
zipcode: int = 5420
municipality_name: str = 'Ehrendingen'
object_type_name: str = 'Einfamilienhaus'
build_year: int = 2000
living_area: float = 200.0
num_rooms: float = 7

In [65]:
single_pred(model, long, lat, zipcode, municipality_name, object_type_name, build_year, living_area, num_rooms)

1750000.0

In [67]:
################
# Export model
################
# save the model to disk
filename = '../model/decision_tree_model.sav'
pickle.dump(model, open(filename, 'wb'))