In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
data_path = "..\\Kaggle\\data\\melb_data.csv"
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [3]:
numerical_feature_columns = data.select_dtypes(include='number').columns.to_list()
categorigal_feature_columns = data.select_dtypes(exclude='number').columns.to_list()
print(numerical_feature_columns)
print(categorigal_feature_columns)

['Rooms', 'Price', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude', 'Propertycount']
['Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date', 'CouncilArea', 'Regionname']


In [4]:
# selected_feature = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
selected_feature = data.select_dtypes(include='number').columns.to_list()

In [5]:
y = data.Price
X = data[selected_feature]

In [6]:
numeric_preprocessor = Pipeline(
    steps=[
        ("imputation_mean", SimpleImputer(missing_values=np.nan, strategy="mean")),
        ("Scaler", StandardScaler())
    ]
)

# categorical_preprocessor = Pipeline(
#     steps=[
#         ("imputation_constant", SimpleImputer(fill_value="missing", strategy="constant")),
#         ("onehot", OneHotEncoder(handle_unknown="ignore"))
#     ]
# )

preprocessor = ColumnTransformer(
    [
        ("numerical", numeric_preprocessor, selected_feature),
    ]
)
# pipe


In [7]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [8]:
pipe = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=1))
pipe.named_steps['decisiontreeregressor'].get_params()

pipe.fit(train_X, train_y)
predictions = pipe.predict(val_X)
print(mean_absolute_error(val_y, predictions))

435.4703976435935


In [19]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    pipe = pipe = make_pipeline(preprocessor, DecisionTreeRegressor(random_state=1))
    pipe.named_steps['decisiontreeregressor'].set_params(max_leaf_nodes=max_leaf_nodes)
    pipe.fit(train_X, train_y)
    predictions = pipe.predict(val_X)
    mae = mean_absolute_error(val_y, predictions)
    return mae

# [590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600]
for max_leaf_nodes in range(1900, 1910, 1):
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f"{mae} \t\t {max_leaf_nodes}")

462.2928444574399 		 1900
462.28306683396085 		 1901
462.26494063026763 		 1902
462.30912325176985 		 1903
462.40337951097456 		 1904
462.40337951097456 		 1905
462.4401983622264 		 1906
462.4401983622264 		 1907
462.4401983622264 		 1908
462.4401983622264 		 1909


In [21]:
pipe = make_pipeline(preprocessor, DecisionTreeRegressor(max_leaf_nodes=1900 ,random_state=1))
pipe.named_steps['decisiontreeregressor'].get_params()

pipe.fit(X, y)
predictions = pipe.predict(val_X)
print(mean_absolute_error(val_y, predictions))

12.662097811954736
