In [1]:

# coding: utf-8

# In[1]:


# All model saving and loading, and the grid search, have been removed from this script.
# This script only contains the code to load the data, create the models and check the RMSE scores of the models.
# Model training time is about 10 minutes in total for 4 models.
# The surprising finding is that the stacked ensemble does not give a good performance at all.
# This is unexpected - probably I made a mistake when creating the stacked model - I could not find it though.
# So if you find it, please let me know.
# The good news though is that I could find one model whose RMSE is below the 123000 threshold for cross validation and test data.
# This model is a Neural Network.
#
# So overall I have fulfilled the exercise even though the ensemble model is not working as expected.
import h2o


# In[2]:


import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[3]:


# Attach this session to an H2O cluster; everything below runs against it.
h2o.init()


# In[5]:


# Location of the house-sales CSV used for the whole exercise.
url = 'http://coursera.h2o.ai/house_data.3487.csv'


# In[6]:


# Load the CSV at `url` into the frame `data`.
data = h2o.import_file(path=url)


# In[8]:


# problem is that date is of type enum and not a date type
# so we need to convert from enum to string and cut out the T000000 and then cast back to date


# In[9]:


import datetime
# using datetime for converting string to datetime: datetime.datetime.strptime('20141013', "%Y%m%d")
# using datetime for converting datetime to string in a format that is h2o compatible: dt.strftime('%Y-%m-%d')


# In[10]:


# create new column real_date which is of type time
def _to_iso_date(raw):
    """Turn a raw date string such as '20141013T000000' into '2014-10-13'.

    Only the first 8 characters (YYYYMMDD) are used; strptime raises
    ValueError if they do not form a valid date.
    """
    return datetime.datetime.strptime(raw[0:8], "%Y%m%d").strftime('%Y-%m-%d')


# Pull the date column into pandas as strings, reformat each value, and upload the
# single-column result back to H2O as the new column.
# Series.map is used instead of DataFrame.applymap, which pandas 2.1 deprecated;
# to_frame() keeps the upload a one-column DataFrame exactly as before.
_iso_dates = data["date"].ascharacter().as_data_frame().iloc[:, 0].map(_to_iso_date)
data['real_date'] = h2o.H2OFrame(_iso_dates.to_frame())


# In[11]:


# Calendar features taken from the parsed sale date: first the year ...
sale_date = data['real_date']
data['year'] = sale_date.year()


# In[12]:


# ... then the month.
data['month'] = sale_date.month()


# In[13]:


# Categorical (factor) copy of the zipcode column.
zipcode_column = data['zipcode']
data['zip_enum'] = zipcode_column.asfactor()


# In[14]:


# Split the data set into a train and a test set with a 90% / 10% ratio; the seed is 123.
train, test = data.split_frame(ratios=[0.9], seed=123)


# In[15]:


# Report the size of every split. No validation frame is created, so its count is a literal 0.
# A previous run printed:
# Total number of rows: 21613. Number of rows in train data set: 19462. Number of rows in validation data set: 0. Number of rows in  test data set 2151
row_report = "Total number of rows: {}. Number of rows in train data set: {}. Number of rows in validation data set: {}. Number of rows in  test data set {}"
print(row_report.format(data.nrows, train.nrows, 0, test.nrows))


# In[16]:


# RMSE goal is below 123000


# In[18]:


# Response column that every model predicts.
y = "price"


# In[19]:


# Columns that must not be used as predictors: the response itself plus the raw
# "date", "id" and "zipcode" columns (real_date/year/month and zip_enum were derived
# from date and zipcode earlier in this script).
ignore_fields = [y, "date", "id", "zipcode"]


# In[20]:


# Predictor set: every column of `data` that is not listed in ignore_fields.
xAll = {column for column in data.names if column not in ignore_fields}


# In[22]:


# THE FOLLOWING columns are used for all models
#{'bathrooms',
# 'bedrooms',
# 'condition',
# 'floors',
# 'grade',
# 'lat',
# 'long',
# 'month',
# 'real_date',
# 'sqft_above',
# 'sqft_basement',
# 'sqft_living',
# 'sqft_living15',
# 'sqft_lot',
# 'sqft_lot15',
# 'view',
# 'waterfront',
# 'year',
# 'yr_built',
# 'yr_renovated',
# 'zip_enum'}


# In[23]:


# Now generating 4 models and then create an ensemble


# In[24]:


from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.deeplearning import H2ODeepLearningEstimator


# In[25]:


# Number of cross-validation folds, shared by all models.
nfolds = 7


# In[26]:


# GLM model
# training time is 3 seconds (author's measurement)
glm_params = {
    "family": "gaussian",  # the author notes this is also the default
    "model_id": "mGLMs",
    # Same fold count / fold assignment as the other base models, and the
    # cross-validation predictions are kept - presumably for the stacked ensemble built later.
    "nfolds": nfolds,
    "fold_assignment": "Modulo",
    "keep_cross_validation_predictions": True,
    "seed": 50,
}
mGLMs = H2OGeneralizedLinearEstimator(**glm_params)
mGLMs.train(x=xAll, y=y, training_frame=train)


# In[29]:


# Cross-validated RMSE of the GLM is 364369 - that is bad.
print(mGLMs.rmse(xval=True))


# In[28]:


# Random forest model
# training time is 3 minutes (author's measurement)
rf_params = {
    "model_id": "mRFs",
    # Cross-validation settings identical to the other base models.
    "nfolds": nfolds,
    "fold_assignment": "Modulo",
    "keep_cross_validation_predictions": True,
    "seed": 50,
    # NOTE(review): stopping_rounds is not set here; per the H2O docs early stopping is
    # disabled when stopping_rounds is 0, so this metric may have no effect - confirm.
    "stopping_metric": "rmse",
}
mRFs = H2ORandomForestEstimator(**rf_params)
mRFs.train(x=xAll, y=y, training_frame=train)


# In[31]:


# Cross-validated RMSE of the RF is 125650 - much better than the GLM, but still above the goal.
print(mRFs.rmse(xval=True))


# In[32]:


# Gradient Boosting
# A GBM whose tuning parameters were discovered with grid search, trained with cross-validation.
# training time is 1 minute 37 sec (author's measurement)
gbm_params = {
    "model_id": "mGBMs",
    # Cross-validation settings identical to the other base models.
    "nfolds": nfolds,
    "fold_assignment": "Modulo",
    "keep_cross_validation_predictions": True,
    # Tuning values the author reports came from the grid search.
    "learn_rate": 0.1,
    "max_depth": 8,
    "nbins_cats": 2000,
    "nbins_top_level": 2000,
    "ntrees": 100,
    "seed": 50,
    # NOTE(review): stopping_rounds is not set here; per the H2O docs early stopping is
    # disabled when stopping_rounds is 0, so this metric may have no effect - confirm.
    "stopping_metric": "rmse",
}
mGBMs = H2OGradientBoostingEstimator(**gbm_params)
mGBMs.train(x=xAll, y=y, training_frame=train)


# In[33]:


# Cross-validated RMSE of the GBM is 117960, which meets the goal.
print(mGBMs.rmse(xval=True))


# In[34]:


# for NN the following parameters were discovered via grid search
#    distribution epochs     hidden      l1      l2   rate  \
#0           gamma  200.0   [10, 10]   0.001  1.0E-4   0.01   
#1           gamma  200.0  [200, 10]  1.0E-5  1.0E-4   0.01 


# In[35]:


# Neural network model
# NOTE(review): the grid-search table in the preceding comment lists distribution "gamma",
# which is not passed here - confirm whether leaving it out is intentional.
# NOTE(review): H2O documents `rate` as applying only when adaptive_rate is disabled;
# adaptive_rate is not set here, so confirm that the 0.01 is actually used.
nn_params = {
    "epochs": 200,
    "hidden": [10, 10],
    "l1": 0.001,
    "l2": 0.0001,
    "rate": 0.01,
    "activation": "rectifier",
    "seed": 50,
    "model_id": "mNNs",
    # Cross-validation settings identical to the other base models.
    "nfolds": nfolds,
    "stopping_metric": "rmse",
    "fold_assignment": "Modulo",
    "keep_cross_validation_predictions": True,
}
mNNs = H2ODeepLearningEstimator(**nn_params)


# In[36]:


# training time is 5 minutes for NN (author's measurement)
mNNs.train(x=xAll, y=y, training_frame=train)


# In[37]:


# Cross-validated RMSE of the NN was 119069 when this comment was written; the value
# differs somewhat between runs (another captured run printed 119512).
print(mNNs.rmse(xval=True))


# In[38]:


# The four cross-validated base models that the ensemble combines.
models = [mNNs, mGBMs, mRFs, mGLMs]


# In[39]:


# Stacked ensemble over the base models.
ensemble_params = {
    "model_id": 'mSE',
    "base_models": models,
    "seed": 50,
}
mSE = H2OStackedEnsembleEstimator(**ensemble_params)


# In[40]:


mSE.train(x=xAll, y=y, training_frame=train)


# In[41]:


# The stacked ensemble performs terribly even on the training data set - RMSE is 363773,
# which is about as bad as the GLM. It comes nowhere near the NN, the GBM or the RF.
# Something must be wrong...
# NOTE(review): the plain GLM and this ensemble both score around 36xk. H2O documents a GLM
# as the default metalearner, so the ensemble may be inheriting whatever makes the plain GLM
# fail on this data - unconfirmed; trying a different metalearner_algorithm would test this.
mSE.model_performance(test_data=train)


# In[44]:


# The neural network had an RMSE of 117692 on the test data when this comment was written
# (the value differs somewhat between runs) and thus meets the goal.
# The NN meets the goal for cross validation and test data.
# Mission accomplished.
nn_test_perf = mNNs.model_performance(test_data=test)
print("model performance on test for Neural Network:")
print(nn_test_perf)


# In[45]:


# The GBM has a relatively high RMSE of 132440 on the test data and does not meet the goal.
# The GBM suffers from overfitting.
gbm_test_perf = mGBMs.model_performance(test_data=test)
print("model performance on test for Gradient Boosting:")
print(gbm_test_perf)


# In[46]:


# The RF scores better than the GBM. The RF has an RMSE of 131184 on the test data and does not meet the goal.
rf_test_perf = mRFs.model_performance(test_data=test)
print("model performance on test for Random Forest:")
print(rf_test_perf)


# In[47]:


# The GLM is the worst base model. Its RMSE is 360907 on the test data and does not meet the goal.
# Very bad result. The model has very high bias.
glm_test_perf = mGLMs.model_performance(test_data=test)
print("model performance on test for GLM:")
print(glm_test_perf)


# In[48]:


# The stacked ensemble is even worse than the worst base model (GLM). It has an RMSE of 361066
# on the test data and does not meet the goal.
# This result is not expected, it is not logical. Something must be wrong. Not sure what.
# It seems that the stacked ensemble optimized for high RMSE and not low RMSE.
se_test_perf = mSE.model_performance(test_data=test)
print("model performance on test for Stacked Ensemble:")
print(se_test_perf)


# In[50]:


import pandas as pd


# In[51]:


# Every trained model, in the order used for the summary table.
all_models = [mGLMs, mRFs, mGBMs, mNNs, mSE]


# In[53]:


# Row labels for the summary; the order must match all_models.
names = ['GLM', 'RF', "GBM", 'NN', 'SE']


# In[56]:


# Test-set performance object for each model.
test_perf = [model.model_performance(test_data=test) for model in all_models]


# In[59]:


# Summary of the RMSE on the test data - the author reports the NN is below the threshold of 123000.
test_rmse = [perf.rmse() for perf in test_perf]
print(pd.Series(test_rmse, index=names))



Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 hour 36 mins
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.5
H2O cluster version age:,24 days
H2O cluster name:,H2O_from_python_anmol_i32c5d
H2O cluster total nodes:,1
H2O cluster free memory:,429.3 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%


  data = _handle_python_lists(python_obj.as_matrix().tolist(), -1)[1]


Parse progress: |█████████████████████████████████████████████████████████| 100%
Total number of rows: 21613. Number of rows in train data set: 19462. Number of rows in validation data set: 0. Number of rows in  test data set 2151
glm Model Build progress: |███████████████████████████████████████████████| 100%
364369.34656071867
drf Model Build progress: |███████████████████████████████████████████████| 100%
125650.06492025779
gbm Model Build progress: |███████████████████████████████████████████████| 100%
117960.13139155704
deeplearning Model Build progress: |██████████████████████████████████████| 100%
119512.45347264563
stackedensemble Model Build progress: |███████████████████████████████████| 100%
model performance on test for Neural Network:

ModelMetricsRegression: deeplearning
** Reported on test data. **

MSE: 13418958450.001537
RMSE: 115840.22811614943
MAE: 68646.9215215741
RMSLE: 0.17720672728689413
Mean Residual Deviance: 13418958450.001537

model performance on test for Gr