In [2]:
# Processing data
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing or feature engineering
import category_encoders as ce
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# data train test split
from sklearn.model_selection import train_test_split

# model 
from sklearn.linear_model import Ridge, Lasso

# ML Models Tree
from sklearn.tree import DecisionTreeClassifier

# Accuracy Calculation
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

## 1. Load Data

> ### Open the data set

In [3]:
# Read the Melbourne housing CSV into a DataFrame.
df_melb = pd.read_csv(filepath_or_buffer='melb_data.csv')
# A bare trailing expression lets the notebook render the frame.
df_melb

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra,-37.79960,144.99840,Northern Metropolitan,4019.0
1,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra,-37.80790,144.99340,Northern Metropolitan,4019.0
2,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra,-37.80930,144.99440,Northern Metropolitan,4019.0
3,Abbotsford,40 Federation La,3,h,850000.0,PI,Biggin,4/03/2017,2.5,3067.0,...,2.0,1.0,94.0,,,Yarra,-37.79690,144.99690,Northern Metropolitan,4019.0
4,Abbotsford,55a Park St,4,h,1600000.0,VB,Nelson,4/06/2016,2.5,3067.0,...,1.0,2.0,120.0,142.0,2014.0,Yarra,-37.80720,144.99410,Northern Metropolitan,4019.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13575,Wheelers Hill,12 Strada Cr,4,h,1245000.0,S,Barry,26/08/2017,16.7,3150.0,...,2.0,2.0,652.0,,1981.0,,-37.90562,145.16761,South-Eastern Metropolitan,7392.0
13576,Williamstown,77 Merrett Dr,3,h,1031000.0,SP,Williams,26/08/2017,6.8,3016.0,...,2.0,2.0,333.0,133.0,1995.0,,-37.85927,144.87904,Western Metropolitan,6380.0
13577,Williamstown,83 Power St,3,h,1170000.0,S,Raine,26/08/2017,6.8,3016.0,...,2.0,4.0,436.0,,1997.0,,-37.85274,144.88738,Western Metropolitan,6380.0
13578,Williamstown,96 Verdon St,4,h,2500000.0,PI,Sweeney,26/08/2017,6.8,3016.0,...,1.0,5.0,866.0,157.0,1920.0,,-37.85908,144.89299,Western Metropolitan,6380.0


> ### Preprocessing

In [4]:
# Handling missing values.
# Four features contain missing values: Car, BuildingArea, YearBuilt and CouncilArea.
# Each one is filled by an imputer whose strategy suits the character of the feature.

cols_with_missing = ['Car', 'BuildingArea', 'YearBuilt', 'CouncilArea']
df_melb[cols_with_missing].info()

# Car          (float)  -> mean (SimpleImputer)
# BuildingArea (float)  -> mean (SimpleImputer)
# YearBuilt    (float)  -> mode (SimpleImputer)
# CouncilArea  (object) -> mode (SimpleImputer)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13580 entries, 0 to 13579
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Car           13518 non-null  float64
 1   BuildingArea  7130 non-null   float64
 2   YearBuilt     8205 non-null   float64
 3   CouncilArea   12211 non-null  object 
dtypes: float64(3), object(1)
memory usage: 424.5+ KB


Next, we check how many missing values each variable has.

In [5]:
# Count the missing entries in every column before imputation.
df_melb.isna().sum()

Suburb              0
Address             0
Rooms               0
Type                0
Price               0
Method              0
SellerG             0
Date                0
Distance            0
Postcode            0
Bedroom2            0
Bathroom            0
Car                62
Landsize            0
BuildingArea     6450
YearBuilt        5375
CouncilArea      1369
Lattitude           0
Longtitude          0
Regionname          0
Propertycount       0
dtype: int64

> ## Simple Imputer

In [6]:
# Mean imputation for the numeric columns Car and BuildingArea.
# NOTE(review): both imputers are fitted on the full frame before the
# train/test split further down, so test rows influence the fill values —
# consider moving imputation into the modelling pipeline.
imputer_mean = SimpleImputer(strategy='mean')
df_melb[['Car','BuildingArea']] = imputer_mean.fit_transform(df_melb[['Car','BuildingArea']])

# Mode imputation for YearBuilt (float) and CouncilArea (object).
# Fitting on a mixed float/object pair makes the imputer hand back an object
# array, so the result is cast back to each column's original dtype;
# otherwise YearBuilt can silently end up as an object column.
imputer_mode = SimpleImputer(strategy='most_frequent')
mode_cols = ['YearBuilt', 'CouncilArea']
mode_dtypes = df_melb[mode_cols].dtypes.to_dict()  # captured before overwriting
mode_filled = pd.DataFrame(
    imputer_mode.fit_transform(df_melb[mode_cols]),
    columns=mode_cols,
    index=df_melb.index,
)
df_melb[mode_cols] = mode_filled.astype(mode_dtypes)

In [7]:
# Re-count missing entries per column after imputation.
df_melb.isna().sum()

Suburb           0
Address          0
Rooms            0
Type             0
Price            0
Method           0
SellerG          0
Date             0
Distance         0
Postcode         0
Bedroom2         0
Bathroom         0
Car              0
Landsize         0
BuildingArea     0
YearBuilt        0
CouncilArea      0
Lattitude        0
Longtitude       0
Regionname       0
Propertycount    0
dtype: int64

In [8]:
# Encoding for features.
# Five categorical features are encoded (listed at the bottom of this cell);
# the summary table also shows the remaining object-typed columns.

is_object_col = df_melb.dtypes == object
cat_var = df_melb.columns[is_object_col]
df_melb[cat_var].describe()

# Suburb      has 314 categories -> Binary
# Type        has 3 categories   -> One Hot
# Method      has 5 categories   -> One Hot
# CouncilArea has 33 categories  -> Binary
# Regionname  has 8 categories   -> Binary

Unnamed: 0,Suburb,Address,Type,Method,SellerG,Date,CouncilArea,Regionname
count,13580,13580,13580,13580,13580,13580,13580,13580
unique,314,13378,3,5,268,58,33,8
top,Reservoir,36 Aberfeldie St,h,S,Nelson,27/05/2017,Moreland,Southern Metropolitan
freq,359,3,9449,9022,1565,473,2532,4695


We can also look at the value counts of each categorical variable below to decide which ones will use One Hot Encoding and which ones will use Binary Encoding.

In [9]:
# Print the frequency of every category for each feature that will be encoded.
inspected_columns = ('Type', 'Method', 'CouncilArea', 'Regionname', 'Suburb')
for column_name in inspected_columns:
    category_counts = df_melb[column_name].value_counts()
    print(category_counts)

h    9449
u    3017
t    1114
Name: Type, dtype: int64
S     9022
SP    1703
PI    1564
VB    1199
SA      92
Name: Method, dtype: int64
Moreland             2532
Boroondara           1160
Moonee Valley         997
Darebin               934
Glen Eira             848
Stonnington           719
Maribyrnong           692
Yarra                 647
Port Phillip          628
Banyule               594
Bayside               489
Melbourne             470
Hobsons Bay           434
Brimbank              424
Monash                333
Manningham            311
Whitehorse            304
Kingston              207
Whittlesea            167
Hume                  164
Wyndham                86
Maroondah              80
Knox                   80
Melton                 66
Frankston              53
Greater Dandenong      52
Casey                  38
Nillumbik              36
Yarra Ranges           18
Cardinia                8
Macedon Ranges          7
Unavailable             1
Moorabool               1
Name:

> ## One Hot Encoding

In [10]:
# Columns handled by each encoder
onehot_var = ['Type', 'Method']
binary_var = ['CouncilArea', 'Regionname', 'Suburb']

# One-hot encoder (first level of each feature dropped) and binary encoder
onehot = OneHotEncoder(drop='first')
binary_encoding = ce.BinaryEncoder()

# Apply each encoder to its own columns; every other column passes through unchanged.
transformer = ColumnTransformer(
    transformers=[
        ('one hot', onehot, onehot_var),
        ('binary', binary_encoding, binary_var),
    ],
    remainder='passthrough',
)

> ## Data Splitting

In [11]:
# Feature columns fed to the model ("fitur" = features); Price is the target.
fitur = [
    'Suburb', 'Rooms', 'Type', 'Method', 'Distance',
    'Bedroom2', 'Bathroom', 'Car', 'Landsize', 'BuildingArea',
    'YearBuilt', 'CouncilArea', 'Regionname', 'Propertycount',
]
y = df_melb['Price']
x = df_melb[fitur]

In [12]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=10)

> ## Data Transform

In [14]:
# Column transformation: fit the encoders on the training split,
# then reuse the fitted transformer on the test split.
x_train_preprocessed = transformer.fit_transform(X=x_train)
x_test_preprocessed = transformer.transform(X=x_test)

  elif pd.api.types.is_categorical(cols):


In [15]:
# Min-max scaling: the scaler is fitted on the training matrix only,
# and that same fitted scaler transforms both splits.
scaler = MinMaxScaler().fit(x_train_preprocessed)

x_train_preprocessed_scaled = scaler.transform(x_train_preprocessed)
x_test_preprocessed_scaled = scaler.transform(x_test_preprocessed)

> ## Model Lasso Regression

In [16]:
# Lasso regression trained on the encoded and scaled training features
model = Lasso()
model.fit(X=x_train_preprocessed_scaled, y=y_train)

# Predictions for the held-out test features
y_pred = model.predict(X=x_test_preprocessed_scaled)

# Mean squared error on the test split, then its square root (RMSE)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)

rmse

404980.8737691327

> ## With Pipeline

In [17]:
# Pipeline is already imported in the notebook's import cell, so it is not
# re-imported here.

# Model (Lasso regression) used as the final pipeline step
lasso = Lasso()

# Model with scaling: column encoding -> min-max scaling -> Lasso
model = Pipeline([
    ('transformer', transformer),
    ('scaling', MinMaxScaler()),
    ('model', lasso)
])

# Fit every step on the raw training features
model.fit(x_train, y_train)

# Predict from the raw test features; the pipeline applies the fitted steps
y_pred = model.predict(x_test)

# MSE on the test split, then its square root (RMSE)
mse = mean_squared_error(y_test, y_pred)

rmse = np.sqrt(mse)
rmse

  elif pd.api.types.is_categorical(cols):


404980.8737691327