In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
import time
%matplotlib inline

### Data & Cleaning

https://www.kaggle.com/anthonypino/melbourne-housing-market

21 column, 34,857 row dataset containing housing market data for Melbourne

In [2]:
#21 column 34857 row housing dataset
df_raw = pd.read_csv('Melbourne_housing_FULL.csv')

In [3]:
#set dataframe to clean
df = df_raw
df.isnull().sum()

Suburb               0
Address              0
Rooms                0
Type                 0
Price             7610
Method               0
SellerG              0
Date                 0
Distance             1
Postcode             1
Bedroom2          8217
Bathroom          8226
Car               8728
Landsize         11810
BuildingArea     21115
YearBuilt        19306
CouncilArea          3
Lattitude         7976
Longtitude        7976
Regionname           3
Propertycount        3
dtype: int64

In [4]:
#lots of nans
#see how much data is left if drop all of them
df = df.dropna()

#nearly 9000 rows still, see how this performs as is
#set data & target, get dummies for categoricals
X = df.drop('Price',1)
X = pd.get_dummies(X)
y = df['Price']

#set train & test sets
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]

### Random Forest Regression

In [5]:
#default RF regression
from sklearn import ensemble
from sklearn.model_selection import cross_val_score

start_time = time.clock()
rfr = ensemble.RandomForestRegressor().fit(X_train, y_train)
scores_rfr = cross_val_score(rfr, X_train, y_train, cv=5)

print('score array:\n', scores_rfr)
print('\nscore array mean:\n', np.mean(scores_rfr))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.74888642 0.75633057 0.78345858 0.75260314 0.79962301]

score array mean:
 0.7681803460325961

runtime:
 61.737435000000005 seconds


In [6]:
#run on test data
start_time = time.clock()
scores_rfr = cross_val_score(rfr, X_test, y_test, cv=5)

print('score array:\n', scores_rfr)
print('\nscore array mean:\n', np.mean(scores_rfr))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.83136369 0.77059368 0.64755632 0.70542135 0.77209013]

score array mean:
 0.745405033464097

runtime:
 3.819082999999992 seconds


### Gradient Boosting

In [13]:
#default gradient boosting
start_time = time.clock()
gbr = ensemble.GradientBoostingRegressor().fit(X_train, y_train)
scores_gbr = cross_val_score(gbr, X_train, y_train, cv=5)

print('score array:\n', scores_gbr)
print('\nscore array mean:\n', np.mean(scores_gbr))
print('\nruntime:\n',time.clock() - start_time, 'seconds')

score array:
 [0.75588946 0.79583189 0.78545596 0.71599991 0.78584532]

score array mean:
 0.7678045087752388

runtime:
 371.908873 seconds


### Feature Selection

In [12]:
from sklearn.feature_selection import SelectKBest, f_regression
import warnings
warnings.filterwarnings('ignore')

k = 5
kbest = SelectKBest(f_regression, k=k)
kbest.fit(X, y)

mask = kbest.get_support()
k_feats = []
for bool, feat in zip(mask, X.columns):
    if bool:
        k_feats.append(feat)

print(k_feats)

#X_kbest = df[k_feats]
#X_kbest_train, X_kbest_test = X_kbest[:offset], X_kbest[offset:]

['Rooms', 'Bedroom2', 'Bathroom', 'BuildingArea', 'Regionname_Southern Metropolitan']


In [None]:
#tune RF regr
#from sklearn.model_selection import GridSearchCV

#params = [{'n_estimators':[10, 15, 20],
#           'criterion':['mse','mae'],
#           'max_depth':[2,3,4]}]

#rfr2 = ensemble.RandomForestRegressor()
#grid = GridSearchCV(estimator=rfr2, param_grid=params)

#start_time = time.clock()
#grid.fit(X_train, y_train)
#print('\nBest parameters:\n', grid.best_params_)
#print('\nBest score:\n', grid.best_score_)
#print('\nruntime:\n',time.clock() - start_time, 'seconds')

In [None]:
#check categorical data types
#cat = df.select_dtypes(include=['object'])
#for i in cat:
#    column = cat[i]
#    print(i)
#    print(column.nunique())