In [8]:
#my first machine learning experiment using data from https://www.kaggle.com/anthonypino/melbourne-housing-market#Melbourne_housing_FULL.csv
#as described in the book Machine Learning for Absolute Beginners: A plain English Introdution
#price is the dependent variable (represented as y)
#although I followed the book's instructions, I added all comments for clarity and concept explanation 
#in case I need to reference this experiment in the future


#import libraries 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error
from sklearn.externals import joblib

#Open the file into a dataframe
df = pd.read_csv(r'C:\Users\bendgame\Desktop\Melbourne_housing_FULL.csv')

#check the first 5 rows
df.head()

Unnamed: 0,Suburb,Address,Rooms,Type,Price,Method,SellerG,Date,Distance,Postcode,...,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea,Lattitude,Longtitude,Regionname,Propertycount
0,Abbotsford,68 Studley St,2,h,,SS,Jellis,3/9/2016,2.5,3067.0,...,1.0,1.0,126.0,,,Yarra City Council,-37.8014,144.9958,Northern Metropolitan,4019.0
1,Abbotsford,85 Turner St,2,h,1480000.0,S,Biggin,3/12/2016,2.5,3067.0,...,1.0,1.0,202.0,,,Yarra City Council,-37.7996,144.9984,Northern Metropolitan,4019.0
2,Abbotsford,25 Bloomburg St,2,h,1035000.0,S,Biggin,4/2/2016,2.5,3067.0,...,1.0,0.0,156.0,79.0,1900.0,Yarra City Council,-37.8079,144.9934,Northern Metropolitan,4019.0
3,Abbotsford,18/659 Victoria St,3,u,,VB,Rounds,4/2/2016,2.5,3067.0,...,2.0,1.0,0.0,,,Yarra City Council,-37.8114,145.0116,Northern Metropolitan,4019.0
4,Abbotsford,5 Charles St,3,h,1465000.0,SP,Biggin,4/3/2017,2.5,3067.0,...,2.0,0.0,134.0,150.0,1900.0,Yarra City Council,-37.8093,144.9944,Northern Metropolitan,4019.0


In [9]:
#scrub the data: remove unneeded columns and minimize non-numeric columns
del df['Address'] 
del df['Method']
del df['SellerG']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

#remove rows with missing values
#axis 0 is index (rows);  axis 1 is columns, how any is any row or column with a null
#documentation http://pandas.pydata.org/pandas-docs/stable/search.html?q=dropna&check_keywords=yes&area=default
df.dropna(axis = 0, how = 'any', thresh = None, subset = None, inplace = True)

#verify the dataframe's columns
df.head()




Unnamed: 0,Suburb,Rooms,Type,Price,Distance,Bedroom2,Bathroom,Car,Landsize,BuildingArea,YearBuilt,CouncilArea
2,Abbotsford,2,h,1035000.0,2.5,2.0,1.0,0.0,156.0,79.0,1900.0,Yarra City Council
4,Abbotsford,3,h,1465000.0,2.5,3.0,2.0,0.0,134.0,150.0,1900.0,Yarra City Council
6,Abbotsford,4,h,1600000.0,2.5,3.0,1.0,2.0,120.0,142.0,2014.0,Yarra City Council
11,Abbotsford,3,h,1876000.0,2.5,4.0,2.0,0.0,245.0,210.0,1910.0,Yarra City Council
14,Abbotsford,2,h,1636000.0,2.5,2.0,1.0,2.0,256.0,107.0,1890.0,Yarra City Council


In [10]:
#convert non-numeric data to numeric values using one-hot encoding. Documentation follows:
#http://pandas.pydata.org/pandas-docs/stable/generated/pandas.get_dummies.html?highlight=get%20dummies#pandas.get_dummies
features_df = pd.get_dummies(df, columns = ['Suburb', 'CouncilArea','Type'])

#separate the dependent variable from the independent variable
del features_df['Price']

#create X and y arrays from the dataset using the .values command

X = features_df.values
y = df['Price'].values



In [11]:
#split the data set using the scikit-learn command below. 70/30 training to test is the standard split ratio
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html

X_train, X_test, y_train, y_test, = train_test_split(X, y, test_size = 0.3, shuffle = True)




In [12]:
#apply the selected algorithm, Gradient boosting, and configure the hyperparameters
#Gradient boosting regressor is essentially an algorithm that decides by weighting each iteration of binary classification splitting
model = ensemble.GradientBoostingRegressor(
    n_estimators = 150, #how many decision trees to build
    learning_rate = 0.1, #controls rate at which additional decision trees influes overall prediction
    max_depth = 5, 
    min_samples_split = 4,
    min_samples_leaf = 6, 
    max_features = 0.6,
    loss = 'huber'
)

#begins the model training process
model.fit(X_train, y_train)


#save the training model as a file 
joblib.dump(model, 'house_trained_model.pkl')




['house_trained_model.pkl']

In [13]:
#Evaluate Results accuracy using Mean Absolute Error

mse = mean_absolute_error(y_train, model.predict(X_train))

print("Training setn mean absolute error: %.2f" % mse)

mse = mean_absolute_error(y_test, model.predict(X_test))
print("Test set mean absolute error %2.f" %mse)



Training setn mean absolute error: 134246.31
Test set mean absolute error 166780
