In [51]:
import pandas as pd 
from sklearn.model_selection import train_test_split 
from sklearn import ensemble 
from sklearn.metrics import mean_absolute_error 
import joblib

In [52]:
# Read in data from CSV
df = pd.read_csv('Melbourne_housing_FULL.csv')
df.head()

In [54]:
df.columns

Index(['Suburb', 'Address', 'Rooms', 'Type', 'Price', 'Method', 'SellerG',
       'Date', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car',
       'Landsize', 'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
       'Longtitude', 'Regionname', 'Propertycount'],
      dtype='object')

In [55]:
# Delete unneeded columns
del df['Address']
del df['Method']
del df['Date']
del df['Postcode']
del df['Lattitude']
del df['Longtitude']
del df['Regionname']
del df['Propertycount']

In [56]:
df.columns

Index(['Suburb', 'Rooms', 'Type', 'Price', 'SellerG', 'Distance', 'Bedroom2',
       'Bathroom', 'Car', 'Landsize', 'BuildingArea', 'YearBuilt',
       'CouncilArea'],
      dtype='object')

In [57]:
df.shape

(34857, 13)

In [58]:
df.isnull().sum()

Suburb              0
Rooms               0
Type                0
Price            7610
SellerG             0
Distance            1
Bedroom2         8217
Bathroom         8226
Car              8728
Landsize        11810
BuildingArea    21115
YearBuilt       19306
CouncilArea         3
dtype: int64

In [59]:
# Remove rows with missing values
df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=True)

In [60]:
df.shape

(8895, 13)

In [63]:
# Convert non-numerical data using one-hot encoding
features_df = pd.get_dummies(df, columns=['Suburb', 'CouncilArea', 'Type', 'SellerG'])

In [64]:
# Remove price
del features_df['Price']

In [65]:
# Create X and y arrays from the dataset
X = features_df.values
y = df['Price'].values

In [66]:
# Split data into test/train set (70/30 split) and shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

In [67]:
# Set up algorithm
model = ensemble.GradientBoostingRegressor(
n_estimators=250,
learning_rate=0.1,
max_depth=5,
min_samples_split=4,
min_samples_leaf=6,
max_features=0.6,
loss='huber'
)

In [68]:
# Run model on training data
model.fit(X_train, y_train)

GradientBoostingRegressor(loss='huber', max_depth=5, max_features=0.6,
                          min_samples_leaf=6, min_samples_split=4,
                          n_estimators=250)

In [69]:
# Save model to file
joblib.dump(model, 'house_trained_model.pkl')

['house_trained_model.pkl']

In [70]:
# Check model accuracy (up to two decimal places)
mse = mean_absolute_error(y_train, model.predict(X_train))
print ("Training Set Mean Absolute Error: %.2f" % mse)

Training Set Mean Absolute Error: 122731.30


In [71]:
mse = mean_absolute_error(y_test, model.predict(X_test))
print ("Test Set Mean Absolute Error: %.2f" % mse)


Test Set Mean Absolute Error: 160122.02
