In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import ensemble
from sklearn.metrics import mean_absolute_error

df = pd.read_csv('../data/Melbourne_housing_FULL.csv')

# Stripping non-important features
df.columns = df.columns.str.strip()
scrubbed = ['Address','Method','SellerG','Date','Postcode','Lattitude','Longtitude','Regionname','Propertycount']
for title in scrubbed:
    del df[title]

# Remove rows with emptiness
df.dropna(axis=0,how='any',thresh=None,subset=None,inplace=True)

# One-hot encoding
df = pd.get_dummies(df,columns=['Suburb','CouncilArea','Type'])

# Set independent and dependent vars
X = df.drop('Price', axis=1)
y = df['Price']

# Shuffle and split training/testing data
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3,shuffle=True)

# Create model
model = ensemble.GradientBoostingRegressor(
                                            n_estimators=150,
                                            learning_rate=0.1,
                                            max_depth=30,
                                            min_samples_split=4,
                                            min_samples_leaf=6,
                                            max_features=0.6,
                                            loss='huber'
                                          )
# Train model
model.fit(X_train, y_train)

GradientBoostingRegressor(loss='huber', max_depth=30, max_features=0.6,
                          min_samples_leaf=6, min_samples_split=4,
                          n_estimators=150)

In [2]:
# First test
mae_train = mean_absolute_error(y_train,model.predict(X_train))
print("TRAIN MAE: %.2f" % mae_train)

mae_test = mean_absolute_error(y_test,model.predict(X_test))
print("TEST MAE: %.2f" % mae_test)

TRAIN MAE: 26381.94
TEST MAE: 169482.66


In [5]:
# Lower depth and increase tree count
model = ensemble.GradientBoostingRegressor(
                                            n_estimators=250,
                                            learning_rate=0.1,
                                            max_depth=5,
                                            min_samples_split=4,
                                            min_samples_leaf=6,
                                            max_features=0.6,
                                            loss='huber'
                                          )
model.fit(X_train, y_train)

GradientBoostingRegressor(loss='huber', max_depth=5, max_features=0.6,
                          min_samples_leaf=6, min_samples_split=4,
                          n_estimators=250)

In [6]:
# Test new model
mae_train = mean_absolute_error(y_train,model.predict(X_train))
print("TRAIN MAE: %.2f" % mae_train)

mae_test = mean_absolute_error(y_test,model.predict(X_test))
print("TEST MAE: %.2f" % mae_test)

TRAIN MAE: 120194.96
TEST MAE: 163573.82


In [None]:
# Implement grid search
from sklearn.model_selection import GridSearchCV

model = ensemble.GradientBoostingRegressor()

hyperparams = {
    'n_estimators':[200,300],
    'max_depth':[4,6],
    'min_samples_split':[3,4],
    'min_samples_leaf':[5,6],
    'learning_rate':[0.01,0.02],
    'max_features':[0.8,0.9],
    'loss':['ls','lad','huber']
}

grid = GridSearchCV(model, hyperparams, n_jobs=4, verbose=10)
grid.fit(X_train, y_train)
grid.best_params_

mae_train = mean_absolute_error(y_train,grid.predict(X_train))
print("TRAIN MAE: %.2f" % mae_train)

mae_test = mean_absolute_error(y_test,grid.predict(X_test))
print("TEST MAE: %.2f" % mae_test)