In [1]:
# code for loading the format for the notebook
import os

# path : remember the notebook's working directory; the next cell
# also reads this name, so it stays a notebook-level variable
path = os.getcwd()

# `formats` is presumably only importable from ../notebook_format, so
# switch there just long enough to import it and build the style
os.chdir("../notebook_format")
try:
    from formats import load_style
    notebook_style = load_style()
finally:
    # always switch back, even when the import or load_style() fails,
    # so an error here cannot leave the kernel in the wrong directory
    os.chdir(path)

# bare last expression, so the notebook still displays whatever load_style() returned
notebook_style

In [2]:
# switch to the working directory that was stored in `path`
os.chdir(path)
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = 8, 6 # change default figure size

# 1. watermark : magic to print the author, timestamp, python / ipython version
#    and the version of every package listed after -p
# 2. autoreload : magic so that the notebook will reload external python modules
%load_ext watermark
%load_ext autoreload 
%autoreload 2

%watermark -a 'Ethen' -d -t -v -p numpy,pandas,matplotlib,h2o

Ethen 2016-08-04 08:46:20 

CPython 3.5.2
IPython 4.2.0

numpy 1.11.1
pandas 0.18.1
matplotlib 1.5.1
h2o 3.8.2.9


## h2o Preprocessing

In [3]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
# connect to a local H2O instance (one is started when none is found)
# nthreads = -1    : presumably "use every available core", the cluster info reports 8 of 8 cores allowed
# max_mem_size = 5 : presumably the memory cap in GB, the cluster info reports 4.98 GB free
h2o.init( nthreads = -1, max_mem_size = 5 )



No instance found at ip and port: localhost:54321. Trying to start local jar...


JVM stdout: /var/folders/x3/zghnnw9x1nl2x4vsqd97kxym0000gn/T/tmpcuicty3_/h2o_ethen_started_from_python.out
JVM stderr: /var/folders/x3/zghnnw9x1nl2x4vsqd97kxym0000gn/T/tmpcqx3h7zm/h2o_ethen_started_from_python.err
Using ice_root: /var/folders/x3/zghnnw9x1nl2x4vsqd97kxym0000gn/T/tmpgzu6rbwk


Java Version: java version "1.6.0_65"
Java(TM) SE Runtime Environment (build 1.6.0_65-b14-468-11M4833)
Java HotSpot(TM) 64-Bit Server VM (build 20.65-b04-468, mixed mode)


Starting H2O JVM and connecting: ................. Connection successful!


0,1
H2O cluster uptime:,1 seconds 819 milliseconds
H2O cluster version:,3.8.1.4
H2O cluster name:,H2O_started_from_python_ethen_vhz326
H2O cluster total nodes:,1
H2O cluster total free memory:,4.98 GB
H2O cluster total cores:,8
H2O cluster allowed cores:,8
H2O cluster healthy:,True
H2O Connection ip:,127.0.0.1
H2O Connection port:,54321


In [4]:
# 1. upload the python object directly to h2o, the column_names requires a list
# 2. destination_frame is the key assigned to the imported file, easier to track later
# The triple-quoted block below is only a string literal, so the from_python
# code inside it does not run; it is kept as a reference for that approach
"""
h2o_train = h2o.H2OFrame.from_python( 
    train.values, 
    destination_frame = "h2o_train",
    column_names = train.columns.tolist()
)

h2o_test = h2o.H2OFrame.from_python( 
    test.values, 
    destination_frame = "h2o_test",
    column_names = test.columns.tolist()
)
"""

# faster to save the data to csv and let h2o load the files
# NOTE(review): df_train.csv / df_test.csv are not written in the visible part of
# this notebook, presumably they come from an earlier preprocessing step, confirm
h2o_train = h2o.import_file( path = "df_train.csv", destination_frame = "h2o_train" )
h2o_test  = h2o.import_file( path = "df_test.csv" , destination_frame = "h2o_test"  )


Parse Progress: [##################################################] 100%

Parse Progress: [##################################################] 100%


In [5]:
# 1. we want to train a classification model, so we must ensure that the
#    response is coded as a factor. If the response is numeric 0/1, H2O will
#    train a regression model instead.
#    Use .asfactor() to convert the output column to a factor variable
h2o_train["TARGET"] = h2o_train["TARGET"].asfactor()

# 2. drop the id column, saving the test ids in test_ids before they are removed.
#    The membership checks keep the cell re-runnable: without them a second run
#    fails, because "ID" has already been dropped from both frames
if "ID" in h2o_test.col_names:
    test_ids = h2o_test["ID"]
    h2o_test = h2o_test.drop("ID")

if "ID" in h2o_train.col_names:
    h2o_train = h2o_train.drop("ID")

In [6]:
# extract the input and output columns by name instead of by position,
# so the split does not rely on TARGET being the last column of the frame
y = "TARGET"
X = [ col for col in h2o_train.col_names if col != y ]
print(y)

TARGET


## h2o Modeling

In [7]:
# random forest model
# NOTE(review): no seed is passed, so repeated runs presumably will not
# reproduce exactly the same model, confirm whether that is intended
rf1 = H2ORandomForestEstimator(
    model_id = "rf1",        # key under which the model is stored in the h2o cluster
    ntrees = 250,            # upper bound on the number of trees, early stopping may end sooner
    max_depth = 12,          # maximum depth of each tree
    nfolds = 10,             # 10-fold cross validation
    balance_classes = True,  # resample the training rows to balance the class distribution
    # early stopping, per the h2o docs : stop once the moving average of the AUC
    # has not improved by the relative tolerance (0.05) over 5 scoring rounds
    stopping_rounds = 5,
    stopping_metric = "AUC",
    stopping_tolerance = 0.05,
)

In [8]:
# fit the random forest : X holds the predictor names, y the response name
rf1.train( x = X, y = y, training_frame = h2o_train )


drf Model Build Progress: [##################################################] 100%


In [9]:
# set a small learning rate, and tune the max_depth parameter
# NOTE(review): no seed is passed, and the row / column sampling below is random,
# so repeated runs presumably will not reproduce exactly the same model, confirm
gbm1 = H2OGradientBoostingEstimator(
    model_id = "gbm1",       # key under which the model is stored in the h2o cluster
    ntrees = 200,            # upper bound on the number of trees, early stopping may end sooner
    learn_rate = 0.01,       # shrinkage applied to the contribution of every tree
    max_depth = 6,           # maximum depth of each tree
    sample_rate = 0.7,       # fraction of the rows sampled when growing the trees
    col_sample_rate = 0.7,   # fraction of the columns sampled when growing the trees
    nfolds = 10,             # 10-fold cross validation
    balance_classes = True,  # resample the training rows to balance the class distribution
    # early stopping, per the h2o docs : stop once the moving average of the AUC
    # has not improved by the relative tolerance (0.05) over 5 scoring rounds
    stopping_rounds = 5,
    stopping_metric = "AUC",
    stopping_tolerance = 0.05
)

In [10]:
# fit the gradient boosting model : X holds the predictor names, y the response name
gbm1.train( x = X, y = y, training_frame = h2o_train )


gbm Model Build Progress: [##################################################] 100%


## Creating Submission, Saving & Loading Models

In [31]:
def write_submission( h2o_model, test_ids, dirs, filename, test_frame = None, prob_col = 'p0' ):
    """
    Score a frame with a trained h2o model and write the result
    to `dirs`/submission_`filename`.csv, one row per id

    Parameters
    ----------
    h2o_model : trained h2o estimator
        anything exposing .predict(frame); the returned frame
        must contain the column named by `prob_col`

    test_ids : H2OFrame
        id column(s) of the scored rows, written before the TARGET column

    dirs : str
        output directory, created when it does not exist yet

    filename : str
        suffix of the csv name, e.g. "rf1" gives submission_rf1.csv

    test_frame : H2OFrame, default None
        frame to score; None falls back to the notebook-level `h2o_test`,
        which is the frame the earlier version of this function always used

    prob_col : str, default 'p0'
        prediction column that is written out as TARGET.
        NOTE(review): 'p0' looks like the predicted probability of class 0;
        if the submission is meant to carry the probability of TARGET == 1
        this should presumably be 'p1', confirm before submitting
    """
    if test_frame is None:
        test_frame = h2o_test

    pred = test_ids.as_data_frame()
    proba = h2o_model.predict(test_frame)[prob_col].as_data_frame()

    # single column frame -> plain array, so the values are assigned by
    # position and cannot be misaligned by differing index labels
    pred['TARGET'] = proba.iloc[:, 0].values

    # to_csv does not create missing folders, so do it up front
    # (an empty `dirs` means the current directory, nothing to create)
    if dirs:
        os.makedirs( dirs, exist_ok = True )

    pred.to_csv( os.path.join( dirs, 'submission_' + filename + '.csv' ), index = False )

In [32]:
dirs = "submission"

# one ( model, file suffix ) pair per submission; the trailing
# comment is the auc the author noted down for that submission
submissions = [
    ( rf1,  "rf1"  ),  # auc 0.803467
    ( gbm1, "gbm1" ),  # auc 0.816301
]
for fitted_model, suffix in submissions:
    write_submission( h2o_model = fitted_model, test_ids = test_ids, dirs = dirs, filename = suffix )


drf prediction Progress: [##################################################] 100%

gbm prediction Progress: [##################################################] 100%


In [13]:
# saving and loading model
# path = h2o.save_model( gbm1, force = True )
# print(path)

# gbm_load = h2o.load_model(path)

# access variable importance
# for var_name, _, _, imp in gbm_load.varimp()[:5]:
#     print( var_name, imp )

## Other Tricks Not Implemented

In [34]:
# Build a helper column from a random stratified split and use it to cut
# h2o_train into a "train" and a "test" part. A stratified split is preferred
# for an unbalanced dataset, since it makes sure that each part preserves
# the percentage of samples of every output class
stratified = h2o_train["TARGET"].stratified_split( test_frac = 0.2, seed = 1234 )
h2o_train1 = h2o_train[ stratified == "train" ]
h2o_valid1 = h2o_train[ stratified == "test"  ]

# sanity check : the class proportions of TARGET
# should be (nearly) the same in both parts
train1_tb = h2o_train1["TARGET"].table()
valid1_tb = h2o_valid1["TARGET"].table()
for class_tb in ( train1_tb, valid1_tb ):
    class_count = class_tb["Count"]
    print( class_count / class_count.sum() )

    Count
---------
0.960352
0.0396483

[2 rows x 1 column]
    Count
---------
0.960748
0.0392516

[2 rows x 1 column]


In [35]:
def plot_scoring_history(h2o_model):
    """
    Plot the AUC stored in the scoring history of a h2o model, to spot
    signs of overfitting by comparing the training and validation curves

    The validation curve is only drawn when the scoring history has a
    'validation_AUC' column, so the comparison is only useful if you're
    using a validation_frame instead of doing cross validation

    Parameters
    ----------
    h2o_model : trained h2o estimator
        must expose .scoring_history() returning a pandas DataFrame
        that contains a 'training_AUC' column

    Returns
    -------
    score_fig : matplotlib Figure
        the figure holding the plot
    """
    # fetch the history before creating the figure, so that a failure
    # here does not leave an empty figure behind
    score = h2o_model.scoring_history()
    score_fig, ax = plt.subplots( figsize = ( 8, 6 ) )

    # .loc[ 1: ] skips the row labelled 0
    # NOTE(review): presumably the row scored before any tree was built, confirm
    ax.plot( score.loc[ 1:, 'training_AUC' ], label = "training_AUC" )
    if 'validation_AUC' in score.columns:
        ax.plot( score.loc[ 1:, 'validation_AUC' ], label = "validation_AUC" )

    ax.set_title(h2o_model.__class__.__name__)
    ax.set_xlabel("scoring history row")
    ax.set_ylabel("AUC")
    ax.legend( loc = "lower right" )
    return score_fig

In [36]:
# print( rf1.auc( valid = True ) )
# rf1_fig = plot_scoring_history(rf1)
# print( gbm1.auc( valid = True ) )
# gbm1_fig = plot_scoring_history(gbm1)