In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error 
import joblib
from sklearn import ensemble


# Read the Melbourne housing CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./DATASET/Melbourne_housing_FULL.csv')
# Preview the first five rows as the cell output.
df.head(5)




Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/09/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/02/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/02/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/03/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [52]:
# Remove the unwanted columns from df in a single in-place call.
df.drop(
    columns=[
        'Address',
        'Method',
        'SellerG',
        'Date',
        'Postcode',
        'Lattitude',
        'Longtitude',
        'Regionname',
        'Propertycount',
    ],
    inplace=True,
)



In [53]:
# Drop every row (not column) that contains at least one NaN value.
df.dropna(axis='index', how='any', inplace=True)

# Drop every row that has fewer than 5 non-null values.
df.dropna(axis='index', thresh=5, inplace=True)

In [54]:
# One-hot encode the non-numerical columns so every feature is numerical.
features_df = pd.get_dummies(
    data=df,
    columns=['Suburb', 'CouncilArea', 'Type'],
)

In [55]:
# Price is the prediction target, so it must not remain among the features.
features_df.drop(columns=['Price'], inplace=True)

In [56]:
# Build the model inputs as NumPy arrays.
#   X - independent variables: every column of features_df
#   y - dependent variable: the Price column of df
# float64 is requested explicitly: recent pandas versions emit boolean dummy
# columns from get_dummies, and a frame mixing bool and numeric columns would
# otherwise convert to an object-dtype array.
X = features_df.to_numpy(dtype='float64')
y = df['Price'].to_numpy()


4) Split the dataset

In [57]:
# Hold out 30% of the rows as the test set (test_size=0.3). The rows are
# shuffled before splitting; random_state=0 fixes that shuffle so the same
# split is produced on every run.

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

5) Select the algorithm and configure its hyperparameters

✅ n_estimators represents how many decision trees to build. Remember that a high number of trees will generally improve accuracy (up to a certain point), but it will also increase the model’s processing time. In the code below, I have selected 150 decision trees as an initial starting point.
✅ learning_rate controls the rate at which additional decision trees influence the overall prediction. This effectively shrinks the contribution of each tree by the set learning_rate. Inserting a low rate here, such as 0.1, should improve accuracy.
✅ max_depth defines the maximum number of layers (depth) for each decision tree. If “None” is selected, then nodes expand until all leaves are pure or until all leaves contain fewer than min_samples_split samples. Here, I have selected a high maximum number of layers (30), which will have a dramatic effect on the final result, as we will see later.
✅ min_samples_split defines the minimum number of samples required to execute a new binary split. For example, min_samples_split = 10 means there must be ten available samples in order to create a new branch. min_samples_leaf represents the minimum number of samples that must appear in each child node (leaf) before a new branch can be implemented. This helps to mitigate the impact of outliers and anomalies in the form of a low number of samples found in one leaf as a result of a binary split. For example, min_samples_leaf = 4 requires there to be at least four available
samples within each leaf for a new branch to be created.
✅ max_features is the total number of features presented to the model when
determining the best split. As mentioned in Chapter 11, random forests and
gradient boosting restrict the total number of features shown to each
individual tree to create multiple results that can be voted upon later.
✅ If the max_features value is an integer (whole number), the model will
consider max_features at each split (branch). If the value is a float (e.g. 0.6),
then max_features is the percentage of total features randomly selected.
✅ Although max_features sets a maximum number of features to consider in
identifying the best split, total features may exceed the max_features limit if
no split can initially be made.
✅ loss selects the function used to calculate the model's error rate. For this exercise, we are using huber,
which protects against outliers and anomalies. Alternative error rate options
include squared_error (least squares regression; named ls in older Scikit-learn releases), absolute_error (least absolute deviations; formerly lad), and
quantile (quantile regression). Huber is actually a combination of least squares and least absolute deviations.
To learn more about gradient boosting hyperparameters, you may refer to the
Scikit-learn website: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [58]:
# Gradient boosting regressor from scikit-learn.

model = ensemble.GradientBoostingRegressor(
    n_estimators=150,
    learning_rate=0.1,
    max_depth=30,
    min_samples_split=4,
    min_samples_leaf=6,
    max_features=0.6,
    loss='huber',
    # max_features < 1.0 means each split considers a random subset of the
    # features, so the seed is pinned to make repeated fits give the same model.
    random_state=0,
)

In [59]:
# Fit the gradient boosting model on the training split.
model.fit(X=X_train, y=y_train)

In [60]:
# Persist the trained model to disk so it can be reloaded later.
joblib.dump(value=model, filename='house_trained_model.pkl')

['house_trained_model.pkl']

In [61]:
# Evaluate the fit on the training split.
# The metric is the mean absolute error (MAE), so the result is named
# train_mae rather than the misleading `mse`.
train_mae = mean_absolute_error(y_train, model.predict(X_train))
print("Training Set Mean Absolute Error: %.2f" % train_mae)

Training Set Mean Absolute Error: 29179.51


In [62]:
# Evaluate on the held-out test split.
# The metric is the mean absolute error (MAE), so the result is named
# test_mae rather than the misleading `mse`.
test_mae = mean_absolute_error(y_test, model.predict(X_test))
print("Test Set Mean Absolute Error: %.2f" % test_mae)

Test Set Mean Absolute Error: 164169.00
