In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# support function

def create_histogram(data, x: str, bins: int, kde: bool):
    sns.histplot(data=data, x=x, bins=bins, kde=kde)
    plt.title(f"{x} Distribution")
    plt.xlabel(x)
    plt.ylabel("Frequency")
    plt.show()

def create_scatter(data, x, y, bins: int, kde: bool, title: str=""):
    sns.scatterplot(data=data, x=x, y=y, bins=bins, kde=kde)
    plt.title(title)
    plt.xlabel(x)
    plt.ylabel(y)
    plt.show()

In [3]:
melbourne_path = "..\\..\\Kaggle\\data\\melb_data.csv"
melbourne_data = pd.read_csv(melbourne_path, index_col=0)
melbourne_data.head()

Unnamed: 0_level_0,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
Suburb,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,2.0,1.0,1.0,202.0,,,Yarra,-37.7996,144.9984,Northern Metropolitan,4019.0
Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.8079,144.9934,Northern Metropolitan,4019.0
Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.8093,144.9944,Northern Metropolitan,4019.0
Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,3.0,2.0,1.0,94.0,,,Yarra,-37.7969,144.9969,Northern Metropolitan,4019.0
Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.8072,144.9941,Northern Metropolitan,4019.0


In [4]:
melbourne_data.columns

Index(['Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG', 'Date',
       'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
       'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude', 'Longtitude',
       'Regionname', 'Propertycount'],
      dtype='object')

In [5]:
melbourne_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13580 entries, Abbotsford to Yarraville
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Address        13580 non-null  object 
 1   Rooms          13580 non-null  int64  
 2   Type           13580 non-null  object 
 3   Price          13580 non-null  float64
 4   Method         13580 non-null  object 
 5   SellerG        13580 non-null  object 
 6   Date           13580 non-null  object 
 7   Distance       13580 non-null  float64
 8   Postcode       13580 non-null  float64
 9   Bedroom2       13580 non-null  float64
 10  Bathroom       13580 non-null  float64
 11  Car            13518 non-null  float64
 12  Landsize       13580 non-null  float64
 13  BuildingArea   7130 non-null   float64
 14  YearBuilt      8205 non-null   float64
 15  CouncilArea    12211 non-null  object 
 16  Lattitude      13580 non-null  float64
 17  Longtitude     13580 non-null  float64
 1

In [6]:
melbourne_data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Rooms,13580.0,2.937997,0.955748,1.0,2.0,3.0,3.0,10.0
Price,13580.0,1075684.0,639310.724296,85000.0,650000.0,903000.0,1330000.0,9000000.0
Distance,13580.0,10.13778,5.868725,0.0,6.1,9.2,13.0,48.1
Postcode,13580.0,3105.302,90.676964,3000.0,3044.0,3084.0,3148.0,3977.0
Bedroom2,13580.0,2.914728,0.965921,0.0,2.0,3.0,3.0,20.0
Bathroom,13580.0,1.534242,0.691712,0.0,1.0,1.0,2.0,8.0
Car,13518.0,1.610075,0.962634,0.0,1.0,2.0,2.0,10.0
Landsize,13580.0,558.4161,3990.669241,0.0,177.0,440.0,651.0,433014.0
BuildingArea,7130.0,151.9676,541.014538,0.0,93.0,126.0,174.0,44515.0
YearBuilt,8205.0,1964.684,37.273762,1196.0,1940.0,1970.0,1999.0,2018.0


In [7]:
melbourne_data.isna().sum()

Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

In [8]:
# simple missing value handling
melbourne_data_clean = melbourne_data.dropna(axis=0)
melbourne_data_clean.isna().sum()

Address          0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64

In [9]:
# y = melbourne_data_clean.Price
# numeric_feature = melbourne_data_clean.select_dtypes(include='number')
# selected_feature = numeric_feature.columns
# print(selected_feature)
# selected_feature = ['Rooms', 'Bathroom', 'Landsize', 'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude']
# X = melbourne_data_clean[selected_feature]

y = melbourne_data_clean.Price
X = melbourne_data_clean.select_dtypes(include='number')
X = X.drop(columns=['Price'], axis=1)

# print(f"Selected feature {X.columns}")

In [10]:
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

In [11]:
model = DecisionTreeRegressor()
model.fit(train_X, train_y)

val_predict = model.predict(val_X)
print(mean_absolute_error(val_y, val_predict))

235183.00581020012


In [12]:
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    model = DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=1)
    model.fit(train_X, train_y)
    val_predict = model.predict(val_X)
    mae = mean_absolute_error(val_y, val_predict)
    return(mae)

for max_leaf_nodes in range(600, 610):
    mae = get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y)
    print(f"{max_leaf_nodes} \t\t {mae}")


600 		 227091.6021328713
601 		 226928.74089412147
602 		 226928.74089412147
603 		 226725.48259101674
604 		 226760.81260423575
605 		 226760.81260423575
606 		 226774.67684844858
607 		 226704.1244183093
608 		 227089.58054384938
609 		 227250.97499188036


In [15]:
model = DecisionTreeRegressor(random_state=1)
model.fit(train_X, train_y)
val_predict = model.predict(val_X)
print(mean_absolute_error(val_y, val_predict))

233220.79793415105
