In [1]:
## first step is to import h2o module
## make sure h20 is installed
## pip install h2o 
## or it can be installed from h2o git for the latest version
import h2o

In [3]:
h2o.init()
# h2o.init(ip, port) -- use this when h2o is installed on a separate machine in the network
# h2o.init(nthreads = -1, max_mem_size = 8) # nthreads uses all possible cores. mem size is what h2o will work on
# h2o.remove_all()  #clean slate, in case cluster was already running

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,34 secs
H2O cluster version:,3.10.3.4
H2O cluster version age:,20 days
H2O cluster name:,H2O_from_python_Hari_xurkh2
H2O cluster total nodes:,1
H2O cluster free memory:,887 Mb
H2O cluster total cores:,0
H2O cluster allowed cores:,0
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [4]:
## h2o's syntax is very similar to python's scikit learn

## initiate a h2o dataframe with tupple
df = h2o.H2OFrame(zip(*((1, 2, 3), ('a', 'b', 'c'), (0.1, 0.2, 0.3))))
print df

## initiate a h2o dataframe with list
df = h2o.H2OFrame(zip(*[[1, 2, 3], ['a', 'b', 'c'], [0.1, 0.2, 0.3]]))
print df

Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3
1,a,0.1
2,b,0.2
3,c,0.3



Parse progress: |█████████████████████████████████████████████████████████| 100%


C1,C2,C3
1,a,0.1
2,b,0.2
3,c,0.3





In [5]:
## initialize an h2o frame using a dictionary
## this can enable us to name the columns
df = h2o.H2OFrame({'A': [1, 2, 3],'B': ['a', 'b', 'c'],'C': [0.1, 0.2, 0.3]})
print df
print df.types

Parse progress: |█████████████████████████████████████████████████████████| 100%


A,C,B
1,0.1,a
2,0.2,b
3,0.3,c



{u'A': u'int', u'C': u'real', u'B': u'string'}


In [6]:
df = h2o.H2OFrame.from_python({'A': [1, 2, 3], 'B': ['a', 'a', 'b'], 'C': ['hello', 'all', 'world'], \
                               'D': ['12MAR2015:11:00:00', '13MAR2015:12:00:00', '14MAR2015:13:00:00']},\
                              column_types=['numeric', 'enum', 'string', 'time'])

print df
print df.types

Parse progress: |█████████████████████████████████████████████████████████| 100%


A,C,B,D
1,hello,a,2015-03-12 11:00:00
2,all,a,2015-03-13 12:00:00
3,world,b,2015-03-14 13:00:00



{u'A': u'int', u'C': u'enum', u'B': u'string', u'D': u'time'}


In [21]:
## generate a dataframe using randomly created numbers
import numpy as np
df = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [22]:
## see the top 10 rows of the h2o dataframe
df.head() ## df.head(2)

A,B,C,D
0.269896,-0.865952,-0.276396,0.0776507
0.415327,-0.96594,0.490325,0.964605
0.594605,1.19073,1.07275,-0.729646
-1.17624,0.011903,-0.568521,-1.70094
-1.89206,0.466301,0.582788,0.260875
-0.428659,1.30084,0.732052,-0.0062972
-0.235223,1.89863,0.173552,0.729738
1.12253,1.57339,-1.46924,0.982647
0.712093,-1.413,0.279538,1.98873
-2.2179,-0.255462,0.864262,-1.28869




In [9]:
## see the last 10 rows of the dataframe
df.tail() # df.tail(2)

A,B,C,D
-0.0308951,-0.405008,-0.0275623,-1.06173
1.78997,-0.286779,1.7157,-1.50796
1.26621,0.189065,0.331336,1.33962
-0.843966,0.949368,-0.00504118,-0.0725097
-1.82147,-0.739363,-0.792319,-0.507157
0.222296,0.0482048,1.27518,-1.51885
1.38787,0.469047,1.00545,-2.3627
-0.807577,0.0563982,0.228542,-0.404728
-1.05154,0.395779,0.403606,-0.291074
-1.53964,0.126006,0.519661,-2.32339




In [10]:
## get the column names ## same as that in sklearn
df.columns

[u'A', u'B', u'C', u'D']

In [11]:
## give a basic summary about the h2o dataframe
df.describe()

Rows:100
Cols:4




Unnamed: 0,A,B,C,D
type,real,real,real,real
mins,-2.33063930346,-2.45133607033,-2.39741265147,-2.36270095674
mean,-0.00365301674539,-0.000612611345309,-0.256365564892,0.104572359581
maxs,2.72681549885,2.72307907829,2.30684354355,2.35934457647
sigma,1.0250971685,0.968296153304,1.04390877405,1.05233485729
zeros,0,0,0,0
missing,0,0,0,0
0,-0.140334229745,0.383622669798,-1.04370608371,-1.04318294836
1,1.38121149453,-0.802860369666,0.85799043865,1.62352262904
2,0.456932202721,0.895567629279,-1.36004782332,-0.726321831146


In [12]:
## subsetting the h2o dataframe. take out the column A. By column name
df['A']

A
-0.140334
1.38121
0.456932
0.109364
0.579468
0.0958561
0.74104
0.773131
1.43124
-0.125035




In [13]:
## subset the h2o dataframe using index
df[1]

B
0.383623
-0.80286
0.895568
0.0816452
-0.856254
2.12129
-2.19808
0.828201
0.167442
-0.0194855




In [14]:
## subset more than one columns
df[['B', 'C']]

B,C
0.383623,-1.04371
-0.80286,0.85799
0.895568,-1.36005
0.0816452,-1.55697
-0.856254,-0.843682
2.12129,-1.35414
-2.19808,0.912158
0.828201,-1.86374
0.167442,-0.418615
-0.0194855,-1.12859




In [26]:
## Dealing with missing values in H2O

## generate a h2o frame with some NA or None data
df = h2o.H2OFrame.from_python(
{'A': [1, 2, 3,None,''],
 'B': ['a', 'a', 'b', 'NA', 'NA'],
 'C': ['hello', 'all', 'world', None, None],
 'D': ['12MAR2015:11:00:00',None,
'13MAR2015:12:00:00',None,
 '14MAR2015:13:00:00']},
 column_types=['numeric', 'enum', 'string', 'time'])

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [27]:
## check if the column A contains a missing value
df["A"].isna()

isNA(A)
0
0
0
1
1




In [28]:
## assign the missing values in column A to 5
df[ df["A"].isna(), "A"] = 5
print df["A"].isna()

isNA(A)
0
0
0
0
0





In [29]:
## taking mean when NAs are present
df = h2o.H2OFrame.from_python( {'A': [1, 2, 3,None,''],  'B': ['a', 'a', 'b', 'NA', 'NA'],  \
                                 'C': ['hello', 'all', 'world', None, None], 'D': ['12MAR2015:11:00:00',None, \
                                                                                   '13MAR2015:12:00:00',None,\
                                                                                   '14MAR2015:13:00:00']}, \
                               column_types=['numeric', 'enum', 'string', 'time']) 

print '***NA removed***'
print df.mean(na_rm=True)

Parse progress: |█████████████████████████████████████████████████████████| 100%
***NA removed***
[2.0, nan, nan, 1426248000000.0]


In [32]:
## apply functions to multiple columns
df = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))

## like in sklearn, apply function works well with lambda function
print df.apply(lambda x: x.mean()), type(df.apply(lambda x: x.mean()))
print df.mean(), type(df.mean())

Parse progress: |█████████████████████████████████████████████████████████| 100%


A,B,C,D
0.0849616,-0.0195686,0.0534621,0.0128808


 <class 'h2o.frame.H2OFrame'>
[0.08496163922292263, -0.019568649767121055, 0.05346214047440261, 0.012880790432177174] <type 'list'>


In [38]:
df.apply(lambda row: row.sum(), axis=0)

A,B,C,D
-1.41469,-0.643047,-1.49336,-0.535328




In [39]:
## methods to deal with strings
df = h2o.H2OFrame.from_python(['Hello', 'World', 'Welcome', 'To', 'H2O', 'World'])

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [40]:
print 'number of l\'s in the dataframe are = ', df.countmatches('l')

number of l's in the dataframe are =  

C1
2
1
1
0
0
1





In [41]:
## substituting l with x
print df.sub('l','x')

C1
Hexlo
Worxd
Wexcome
To
H2O
Worxd





In [42]:
## merging / joining dataframes
## syntax works like dataframe in R

df1 = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))
df2 = h2o.H2OFrame.from_python(np.random.randn(100,4).tolist(), column_names=list('ABCD'))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [43]:
## row bind. append both rows. need to have same number of columns
df2.rbind(df1)

A,B,C,D
-1.82576,-1.58405,0.217588,-0.637517
0.740811,-2.26648,1.9496,1.43442
0.946057,-0.491886,0.277835,-0.534715
0.000251398,-0.86951,0.699713,0.794792
0.0390592,-0.279418,0.690292,-0.641263
0.438678,0.326225,1.18508,0.00843382
0.531987,0.912445,0.638092,0.29905
-0.770206,0.116926,0.82748,-0.255066
1.78641,-0.502205,0.237441,-0.57727
-0.928735,0.478596,-0.575327,-0.597691




In [44]:
## column bind. append side by side
df2.cbind(df1)

A,B,C,D,A0,B0,C0,D0
-1.82576,-1.58405,0.217588,-0.637517,2.1654,1.02442,0.678053,0.84895
0.740811,-2.26648,1.9496,1.43442,-0.887794,-0.0880076,0.871375,1.46817
0.946057,-0.491886,0.277835,-0.534715,2.5882,0.96214,-0.575813,-0.90839
0.000251398,-0.86951,0.699713,0.794792,-1.00329,1.12423,0.527596,1.85277
0.0390592,-0.279418,0.690292,-0.641263,0.0623377,0.749777,-0.805001,-0.0272182
0.438678,0.326225,1.18508,0.00843382,-2.22313,0.115459,1.00188,0.543189
0.531987,0.912445,0.638092,0.29905,-1.03278,-1.50022,-1.20239,0.159349
-0.770206,0.116926,0.82748,-0.255066,-0.0785205,0.634706,-2.23708,0.0698855
1.78641,-0.502205,0.237441,-0.57727,-2.58661,1.14817,0.581431,0.791463
-0.928735,0.478596,-0.575327,-0.597691,-0.433513,0.506066,-2.49044,-0.499934




In [45]:
## merging dataframe
df1 = h2o.H2OFrame.from_python( {'A': ['Hello', 'World', 'Welcome', 'To', 'H2O', 'World'], \
                                  'n': [0,1,2,3,4,5]} )
df2 = h2o.H2OFrame.from_python(np.random.randint(0,6,(100,1)), column_names= list('n'))

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [46]:
# Combine column "n" from both datasets
df2.merge(df1)

n,A
3,To
4,H2O
1,World
0,Hello
4,H2O
2,Welcome
3,To
4,H2O
1,World
5,World




In [47]:
# grouping

df = h2o.H2OFrame(
 {'A' : ['foo', 'bar', 'foo', 'bar', 'foo', 'bar', 'foo', 'foo'],
 'B' : ['one', 'one', 'two', 'three', 'two', 'two', 'one', 'three'],
 'C' : np.random.randn(8).tolist(),
 'D' : np.random.randn(8).tolist()})

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [48]:
df.group_by('A').sum().frame

A,sum_C,sum_B,sum_D
bar,1.80936,3,1.46001
foo,-4.60595,5,-0.396115




In [49]:
df.group_by('A').mean().frame

A,mean_D,mean_B,mean_C
bar,0.48667,1,0.603119
foo,-0.0792229,1,-0.92119




In [54]:
print df.group_by('A').min().frame
print df.group_by('A').max().get_frame()

A,min_D,min_B,min_C
bar,-0.653349,0,-0.035211
foo,-0.888446,0,-1.42597





A,max_D,max_B,max_C
bar,1.14798,2,1.26203
foo,1.13654,2,0.0350038





In [55]:
df1 = df.group_by(['A','B']).sum().frame
print df1

A,B,sum_C,sum_D
bar,one,1.26203,1.14798
bar,three,-0.035211,-0.653349
bar,two,0.582543,0.965381
foo,one,-0.745753,0.764497
foo,three,-1.42597,-0.888446
foo,two,-2.43423,-0.272166





In [56]:
## working with date and time

df = h2o.H2OFrame.from_python({'D': ['18OCT2015:11:00:00', '19OCT2015:12:00:00', '20OCT2015:13:00:00']}, \
                              column_types=['time'])
print df['D'].day()
print df['D'].dayOfWeek()

Parse progress: |█████████████████████████████████████████████████████████| 100%


D
18
19
20





D
Sun
Mon
Tue





In [58]:
## machine learning

## H2O supports the following models:
# Deep Learning
# Naive Bayes
# Principal Components Analysis (PCA)
# K-means
# Generalized Linear Models (GLM)
# Gradient Boosting Machine (GBM)
# Generalized Low Rank Model (GLRM)
# Distributed Random Forest (DRF)

In [59]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
#https://h2o-release.s3.amazonaws.com/h2o/rel-turan/4/docs-website/h2o-py/docs/modeling.html#h2ogradientboostingestimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
#https://h2o-release.s3.amazonaws.com/h2o/rel-turan/4/docs-website/h2o-py/docs/modeling.html#h2ogeneralizedlinearestimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
#https://h2o-release.s3.amazonaws.com/h2o/rel-turan/4/docs-website/h2o-py/docs/modeling.html#h2orandomforestestimator

from h2o.grid.grid_search import H2OGridSearch
## same as sklearn
## from sklearn.ensemble import GradientBoostingClassifier

In [60]:
## read data set
## data source -- http://archive.ics.uci.edu/ml/datasets/Bank+Marketing
path = 'C:\\Users\\Hari\\Documents\\Experiment_with_H2O\\bank\\bank-full.csv'
df = h2o.import_file(path=path)

## sklearn -- df = pd.read_csv(path) # sklearn has no read method

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [61]:
## check out the first 5 rows
df.head(5)

age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no




In [None]:
# Input variables:
# # bank client data:
# 1 - age (numeric)
# 2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
# 3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
# 4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
# 5 - default: has credit in default? (categorical: 'no','yes','unknown')
# 6 - housing: has housing loan? (categorical: 'no','yes','unknown')
# 7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# # related with the last contact of the current campaign:
# 8 - contact: contact communication type (categorical: 'cellular','telephone') 
# 9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
# 10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
# 11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# # other attributes:
# 12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
# 13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
# 14 - previous: number of contacts performed before this campaign and for this client (numeric)
# 15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
    
# Output variable (desired target):
# 21 - y - has the client subscribed a term deposit? (binary: 'yes','no')

In [62]:
## check what data types h2o parsed
df.types

{u'age': u'int',
 u'balance': u'int',
 u'campaign': u'int',
 u'contact': u'enum',
 u'day': u'int',
 u'default': u'enum',
 u'duration': u'int',
 u'education': u'enum',
 u'housing': u'enum',
 u'job': u'enum',
 u'loan': u'enum',
 u'marital': u'enum',
 u'month': u'enum',
 u'pdays': u'int',
 u'poutcome': u'enum',
 u'previous': u'int',
 u'y': u'enum'}

In [63]:
## to change a variable to a factor. Not there in scikit learn. There in R
## df["var"] = df["var"].asfactor()   

In [64]:
## get the categories for a categorical variable
df['marital'].levels()

[['divorced', 'married', 'single']]

In [65]:
## get a brief summary of the data
df.describe()

Rows:45211
Cols:17




Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
type,int,enum,enum,enum,enum,int,enum,enum,enum,int,enum,int,int,int,int,enum,enum
mins,18.0,,,,,-8019.0,,,,1.0,,0.0,1.0,-1.0,0.0,,
mean,40.9362102143,,,,,1362.27205769,,,,15.8064187919,,258.163079781,2.76384065825,40.1978279622,0.580323372631,,
maxs,95.0,,,,,102127.0,,,,31.0,,4918.0,63.0,871.0,275.0,,
sigma,10.618762041,,,,,3044.76582917,,,,8.32247615304,,257.527812265,3.09802088328,100.128745991,2.30344104493,,
zeros,0,,,,,3514,,,,0,,3,0,0,36954,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,58.0,management,married,tertiary,no,2143.0,yes,no,unknown,5.0,may,261.0,1.0,-1.0,0.0,unknown,no
1,44.0,technician,single,secondary,no,29.0,yes,no,unknown,5.0,may,151.0,1.0,-1.0,0.0,unknown,no
2,33.0,entrepreneur,married,secondary,no,2.0,yes,yes,unknown,5.0,may,76.0,1.0,-1.0,0.0,unknown,no


In [71]:
## any machine learning task, we divide the data into three parts
## training set - the algorithm learns using this data
## validation set - the algorithm prunes or avoids overfitting using this data
## test set - the final output and the accuracy is gauged using this data

## METHOD 1

## a little different syntax that scikit learn
splits = df.split_frame(ratios=[0.70, 0.15], seed=1)  

train = splits[0]
valid = splits[1]
test = splits[2]

## METHOD 2
# # Construct validation and training datasets by sampling (20/80)
iloc = df[0].runif()
train = df[iloc < 0.7]
valid = df[iloc >= 0.7]

In [72]:
## get the shape of each of the datasets
print 'train shape', train.shape
print 'validation shape', valid.shape

train shape (31649, 17)
validation shape (13562, 17)


In [73]:
## read about the documentation
help(H2OGeneralizedLinearEstimator)

Help on class H2OGeneralizedLinearEstimator in module h2o.estimators.glm:

class H2OGeneralizedLinearEstimator(h2o.estimators.estimator_base.H2OEstimator)
 |  Generalized Linear Modeling
 |  
 |  Fits a generalized linear model, specified by a response variable, a set of predictors, and a
 |  description of the error distribution.
 |  
 |  A subclass of :class:`ModelBase` is returned. The specific subclass depends on the machine learning task
 |  at hand (if it's binomial classification, then an H2OBinomialModel is returned, if it's regression then a
 |  H2ORegressionModel is returned). The default print-out of the models is shown, but further GLM-specific
 |  information can be queried out of the object. Upon completion of the GLM, the resulting object has
 |  coefficients, normalized coefficients, residual/null deviance, aic, and a host of model metrics including
 |  MSE, AUC (for logistic regression), degrees of freedom, and confusion matrices.
 |  
 |  Method resolution order:
 |  

In [74]:
print df['y'].unique()

C1
no
yes





In [77]:
## MODEL 1 - GENERALIZED LINEAR MODEL OR LOGISTIC REGRESSION (BINOMIAL)
## lambda, controls the amount of regularization in a GLM model and we can find the optimal
## value for lambda automatically by setting lambda_search = True and passing in a validation 
## frame (which is used to evaluate model performance using a particular value of lambda).

glm_model = H2OGeneralizedLinearEstimator(family='binomial', model_id='glm_model', lambda_search=True)

In [78]:
#Prepare predictors and response columns
predictors = df.col_names[:-1]     #last column is response, our desired response variable 
response = df.col_names[-1] 

In [79]:
glm_model.train(predictors, response, training_frame=train, validation_frame=valid)

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [80]:
glm_model

Model Details
H2OGeneralizedLinearEstimator :  Generalized Linear Modeling
Model Key:  glm_model


ModelMetricsBinomialGLM: glm
** Reported on train data. **

MSE: 0.0715488988629
RMSE: 0.267486259204
LogLoss: 0.240970792281
Null degrees of freedom: 31648
Residual degrees of freedom: 31603
Null deviance: 22949.7138487
Residual deviance: 15252.9692098
AIC: 15344.9692098
AUC: 0.905765966586
Gini: 0.811531933172
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.209074401458: 


0,1,2,3,4
,no,yes,Error,Rate
no,25590.0,2330.0,0.0835,(2330.0/27920.0)
yes,1245.0,2484.0,0.3339,(1245.0/3729.0)
Total,26835.0,4814.0,0.113,(3575.0/31649.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2090744,0.5815287,239.0
max f2,0.1126536,0.6889807,291.0
max f0point5,0.3563224,0.5793646,178.0
max accuracy,0.3862490,0.9029985,167.0
max precision,0.9156518,0.7218543,21.0
max recall,0.0034602,1.0,397.0
max specificity,0.9993865,0.9995344,0.0
max absolute_mcc,0.1854274,0.5237316,250.0
max min_per_class_accuracy,0.1142768,0.8344198,290.0


Gains/Lift Table: Avg response rate: 11.78 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100161,0.9092256,6.1311767,6.1311767,0.7223975,0.7223975,0.0614106,0.0614106,513.1176650,513.1176650
,2,0.0200006,0.8162094,5.8819949,6.0067826,0.6930380,0.7077409,0.0587289,0.1201394,488.1994867,500.6782585
,3,0.0300167,0.7344288,5.9705345,5.9946872,0.7034700,0.7063158,0.0598016,0.1799410,497.0534467,499.4687160
,4,0.0400013,0.6522332,5.2911095,5.8190706,0.6234177,0.6856240,0.0528292,0.2327702,429.1109538,481.9070629
,5,0.0500174,0.5748806,5.0066814,5.6563875,0.5899054,0.6664561,0.0501475,0.2829177,400.6681369,465.6387499
,6,0.1000032,0.3301088,4.4904161,5.0735860,0.5290771,0.5977883,0.2244570,0.5073746,349.0416115,407.3586005
,7,0.1500205,0.2111736,3.0453347,4.3973598,0.3588124,0.5181129,0.1523197,0.6596943,204.5334691,339.7359840
,8,0.2000063,0.1478666,2.1888767,3.8454135,0.2579014,0.4530806,0.1094127,0.7691070,118.8876673,284.5413494
,9,0.3000095,0.0873804,1.1718589,2.9542286,0.1380727,0.3480779,0.1171896,0.8862966,17.1858924,195.4228637




ModelMetricsBinomialGLM: glm
** Reported on validation data. **

MSE: 0.0691761325485
RMSE: 0.263013559629
LogLoss: 0.233285524937
Null degrees of freedom: 13561
Residual degrees of freedom: 13516
Null deviance: 9681.5473721
Residual deviance: 6327.6365784
AIC: 6419.6365784
AUC: 0.913249260807
Gini: 0.826498521614
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.216282244408: 


0,1,2,3,4
,no,yes,Error,Rate
no,11086.0,916.0,0.0763,(916.0/12002.0)
yes,535.0,1025.0,0.3429,(535.0/1560.0)
Total,11621.0,1941.0,0.107,(1451.0/13562.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2162822,0.5855470,229.0
max f2,0.0970539,0.6968203,297.0
max f0point5,0.3747514,0.5871212,166.0
max accuracy,0.3747514,0.9069459,166.0
max precision,0.8305757,0.7136929,39.0
max recall,0.0032239,1.0,397.0
max specificity,0.9995784,0.9997500,0.0
max absolute_mcc,0.1748817,0.5295948,248.0
max min_per_class_accuracy,0.1147125,0.8396934,285.0


Gains/Lift Table: Avg response rate: 11.50 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100280,0.9035688,6.0088047,6.0088047,0.6911765,0.6911765,0.0602564,0.0602564,500.8804676,500.8804676
,2,0.0200560,0.8037839,6.3284219,6.1686133,0.7279412,0.7095588,0.0634615,0.1237179,532.8421946,516.8613311
,3,0.0300103,0.7099888,5.7957265,6.0449285,0.6666667,0.6953317,0.0576923,0.1814103,479.5726496,504.4928495
,4,0.0400383,0.6211027,6.0727281,6.0518912,0.6985294,0.6961326,0.0608974,0.2423077,507.2728130,505.1891203
,5,0.0500664,0.5567199,5.4334936,5.9280295,0.625,0.6818851,0.0544872,0.2967949,443.3493590,492.8029531
,6,0.1000590,0.3145546,4.4878413,5.2084661,0.5162242,0.5991157,0.2243590,0.5211538,348.7841313,420.8466073
,7,0.1500516,0.2032214,2.9235081,4.4471877,0.3362832,0.5115479,0.1461538,0.6673077,192.3508055,344.7187677
,8,0.2000442,0.1416573,2.1028742,3.8613253,0.2418879,0.4441578,0.1051282,0.7724359,110.2874215,286.1325338
,9,0.3000295,0.0841781,1.3591748,3.0274801,0.1563422,0.3482428,0.1358974,0.9083333,35.9174798,202.7480134



Scoring History: 


0,1,2,3,4,5,6,7
,timestamp,duration,iteration,lambda,predictors,deviance_train,deviance_test
,2017-02-23 19:15:14,0.000 sec,3,.23E0,2,0.7126002,0.7018805
,2017-02-23 19:15:14,0.110 sec,5,.21E0,2,0.7012941,0.6910565
,2017-02-23 19:15:14,0.239 sec,7,.19E0,2,0.6911816,0.6813684
,2017-02-23 19:15:14,0.329 sec,9,.18E0,2,0.6821507,0.6727057
,2017-02-23 19:15:14,0.423 sec,11,.16E0,2,0.6740675,0.6649383
---,---,---,---,---,---,---,---
,2017-02-23 19:15:19,5.409 sec,136,.15E-3,44,0.4820060,0.4666700
,2017-02-23 19:15:19,5.463 sec,137,.14E-3,44,0.4819856,0.4666391
,2017-02-23 19:15:19,5.496 sec,138,.12E-3,45,0.4819684,0.4666128



See the whole table with table.as_data_frame()




In [83]:
## accuracy, auc, confusion matrix
print 'accuracy', glm_model.accuracy()
print 'auc', glm_model.auc()
print 'confusion matrix', glm_model.confusion_matrix()
print glm_model.confusion_matrix(thresholds=[0.1, 0.5, 0.99])

accuracy [[0.3862489940360238, 0.9029985149609783]]
auc 0.905765966586
confusion matrix Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.209074401458: 


0,1,2,3,4
,no,yes,Error,Rate
no,25590.0,2330.0,0.0835,(2330.0/27920.0)
yes,1245.0,2484.0,0.3339,(1245.0/3729.0)
Total,26835.0,4814.0,0.113,(3575.0/31649.0)



Could not find exact threshold 0.1; using closest threshold found 0.100344669854.
Could not find exact threshold 0.5; using closest threshold found 0.5001301086.
Could not find exact threshold 0.99; using closest threshold found 0.988340636634.
Confusion Matrix (Act/Pred) @ threshold = 0.100344669854: 


0,1,2,3,4
,no,yes,Error,Rate
no,22550.0,5370.0,0.1923,(5370.0/27920.0)
yes,505.0,3224.0,0.1354,(505.0/3729.0)
Total,23055.0,8594.0,0.1856,(5875.0/31649.0)


Confusion Matrix (Act/Pred) @ threshold = 0.5001301086: 


0,1,2,3,4
,no,yes,Error,Rate
no,27238.0,682.0,0.0244,(682.0/27920.0)
yes,2457.0,1272.0,0.6589,(2457.0/3729.0)
Total,29695.0,1954.0,0.0992,(3139.0/31649.0)


Confusion Matrix (Act/Pred) @ threshold = 0.988340636634: 


0,1,2,3,4
,no,yes,Error,Rate
no,27891.0,29.0,0.001,(29.0/27920.0)
yes,3680.0,49.0,0.9869,(3680.0/3729.0)
Total,31571.0,78.0,0.1172,(3709.0/31649.0)


[, , ]


In [84]:
prediction = glm_model.predict(valid[:-1])

glm prediction progress: |████████████████████████████████████████████████| 100%


In [85]:
print prediction['predict']
print valid['y']

predict
no
no
no
no
no
no
no
no
no
no





y
no
no
no
no
no
no
no
no
no
no





In [87]:
## Gradient Boosting Estimator with cross validation
gbm_model = H2OGradientBoostingEstimator(distribution="bernoulli", ntrees=50, max_depth=3, min_rows=2, learn_rate=0.2, nfolds=5)

In [88]:
# train the model
gbm_model.train(predictors, response, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [90]:
## accuracy, auc, confusion matrix
print 'accuracy', gbm_model.accuracy()
print 'auc', gbm_model.auc()
print 'confusion matrix', gbm_model.confusion_matrix()
print gbm_model.confusion_matrix(thresholds=[0.1, 0.5, 0.99])

accuracy [[0.5178505221352689, 0.9125090840152927]]
auc 0.93525688459
confusion matrix Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.266472480789: 


0,1,2,3,4
,no,yes,Error,Rate
no,25791.0,2129.0,0.0763,(2129.0/27920.0)
yes,1000.0,2729.0,0.2682,(1000.0/3729.0)
Total,26791.0,4858.0,0.0989,(3129.0/31649.0)



Could not find exact threshold 0.1; using closest threshold found 0.100060833882.
Could not find exact threshold 0.5; using closest threshold found 0.499137013588.
Could not find exact threshold 0.99; using closest threshold found 0.978210972288.
Confusion Matrix (Act/Pred) @ threshold = 0.100060833882: 


0,1,2,3,4
,no,yes,Error,Rate
no,22938.0,4982.0,0.1784,(4982.0/27920.0)
yes,330.0,3399.0,0.0885,(330.0/3729.0)
Total,23268.0,8381.0,0.1678,(5312.0/31649.0)


Confusion Matrix (Act/Pred) @ threshold = 0.499137013588: 


0,1,2,3,4
,no,yes,Error,Rate
no,27142.0,778.0,0.0279,(778.0/27920.0)
yes,1998.0,1731.0,0.5358,(1998.0/3729.0)
Total,29140.0,2509.0,0.0877,(2776.0/31649.0)


Confusion Matrix (Act/Pred) @ threshold = 0.978210972288: 


0,1,2,3,4
,no,yes,Error,Rate
no,27920.0,0.0,0.0,(0.0/27920.0)
yes,3725.0,4.0,0.9989,(3725.0/3729.0)
Total,31645.0,4.0,0.1177,(3725.0/31649.0)


[, , ]


In [91]:
## Grid Search
ntrees_opt = [5,50,100]
max_depth_opt = [2,3,5]
learn_rate_opt = [0.1,0.2]

hyper_params = {'ntrees': ntrees_opt, 
                'max_depth': max_depth_opt,
                'learn_rate': learn_rate_opt}
search_criteria = {'strategy': 'RandomDiscrete', 'max_runtime_secs': 60}  #updated

In [92]:
gs = H2OGridSearch(H2OGradientBoostingEstimator, hyper_params = hyper_params, search_criteria = search_criteria)

In [93]:
gs.train(predictors, response, training_frame=train, validation_frame=valid)

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [94]:
print gs

     learn_rate max_depth ntrees  \
0           0.1         5    100   
1           0.2         5    100   
2           0.2         3    100   
3           0.2         5     50   
4           0.2         2    100   
5           0.1         3    100   
6           0.2         3     50   
7           0.1         3     50   
8           0.1         2    100   
9           0.2         5      5   
10          0.2         2      5   
11          0.1         5      5   
12          0.1         3      5   
13          0.1         2      5   

                                                          model_ids  \
0    Grid_GBM_py_62_sid_a435_model_python_1487894265768_230_model_7   
1   Grid_GBM_py_62_sid_a435_model_python_1487894265768_230_model_12   
2    Grid_GBM_py_62_sid_a435_model_python_1487894265768_230_model_1   
3   Grid_GBM_py_62_sid_a435_model_python_1487894265768_230_model_13   
4    Grid_GBM_py_62_sid_a435_model_python_1487894265768_230_model_9   
5   Grid_GBM_py_62_sid_a435_model

In [95]:
# sort the models by auc score
gbm_gridperf = gs.get_grid(sort_by='auc', decreasing=True)

In [96]:
# get the best model
best_model = gbm_gridperf.models[0]

In [97]:
# get the auc score
gbm_perf = best_model.model_performance(valid)
print gbm_perf.auc()

0.939471386179


In [98]:
## RANDOM FOREST

In [99]:
rf = H2ORandomForestEstimator(seed=1, nfolds=5, model_id="rf",
    ntrees=200,
    max_depth=30,
    stopping_rounds=2,
    stopping_tolerance=0.01,
    score_each_iteration=True)

In [100]:
rf.train(x=predictors, y=response, training_frame=train, validation_frame = valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [101]:
rf.varimp()

[(u'duration', 16109.12890625, 1.0, 0.27097335099132025),
 (u'month', 7392.951171875, 0.4589292949916548, 0.12435760893197284),
 (u'age', 5529.138671875, 0.343230146338317, 0.09300622291453499),
 (u'poutcome', 5217.38232421875, 0.3238773713080486, 0.08776213661360202),
 (u'day', 5088.8173828125, 0.31589649647897144, 0.08559953221732469),
 (u'job', 4493.61376953125, 0.27894827806535355, 0.07558754966062746),
 (u'balance', 3706.746337890625, 0.23010222088746748, 0.062351569864422036),
 (u'pdays', 2460.367431640625, 0.1527312523202949, 0.04138609924233117),
 (u'campaign', 2097.118896484375, 0.13018201720831332, 0.03527585744174638),
 (u'education', 1719.081787109375, 0.10671475764542475, 0.02891685547940736),
 (u'marital', 1403.2259521484375, 0.08710750036918605, 0.02360381127151601),
 (u'contact', 1211.5206298828125, 0.07520708518340605, 0.02037911589043721),
 (u'housing', 1178.2987060546875, 0.07314478100659637, 0.019820286416883694),
 (u'previous', 1147.650634765625, 0.0712422528520683

In [102]:
print rf.accuracy

Model Details
H2ORandomForestEstimator :  Distributed Random Forest
Model Key:  rf


ModelMetricsBinomial: drf
** Reported on train data. **

MSE: 0.0701776590924
RMSE: 0.264910662474
LogLoss: 0.47675875444
Mean Per-Class Error: 0.156155161358
AUC: 0.896125706055
Gini: 0.79225141211
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.303231111832: 


0,1,2,3,4
,no,yes,Error,Rate
no,25364.0,2556.0,0.0915,(2556.0/27920.0)
yes,1159.0,2570.0,0.3108,(1159.0/3729.0)
Total,26523.0,5126.0,0.1174,(3715.0/31649.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3032311,0.5804630,235.0
max f2,0.1511398,0.6978951,305.0
max f0point5,0.4741819,0.5626811,154.0
max accuracy,0.5133341,0.8997441,142.0
max precision,0.9464286,0.7681159,6.0
max recall,0.0,1.0,399.0
max specificity,1.0,0.9988539,0.0
max absolute_mcc,0.2352305,0.5264974,266.0
max min_per_class_accuracy,0.1666723,0.8402579,297.0


Gains/Lift Table: Avg response rate: 11.78 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100161,0.9370000,8.4872620,8.4872620,1.0,1.0,0.0850094,0.0850094,748.7262001,748.7262001
,2,0.0200006,0.9072962,8.4872620,8.4872620,1.0,1.0,0.0847412,0.1697506,748.7262001,748.7262001
,3,0.0303643,0.8928571,8.4872620,8.4872620,1.0,1.0,0.0879592,0.2577098,748.7262001,748.7262001
,4,0.0436665,0.8571429,8.4872620,8.4872620,1.0,1.0,0.1128989,0.3706087,748.7262001,748.7262001
,5,0.0557364,0.8214286,8.4872620,8.4872620,1.0,1.0,0.1024403,0.4730491,748.7262001,748.7262001
,6,0.1004139,0.6785714,8.4872620,8.4872620,1.0,1.0,0.3791901,0.8522392,748.7262001,748.7262001
,7,0.1502733,0.1964286,2.9635497,6.6545416,0.3491762,0.7840622,0.1477608,1.0,196.3549659,565.4541632
,8,0.2153939,0.1071429,0.0,4.6426581,0.0,0.5470148,0.0,1.0,-100.0,364.2658061
,9,0.3608645,0.0357143,0.0,2.7711234,0.0,0.3265038,0.0,1.0,-100.0,177.1123369




ModelMetricsBinomial: drf
** Reported on validation data. **

MSE: 0.0626579193298
RMSE: 0.250315639403
LogLoss: 0.268976543602
Mean Per-Class Error: 0.13860894979
AUC: 0.923098927956
Gini: 0.846197855913
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.323041640982: 


0,1,2,3,4
,no,yes,Error,Rate
no,11062.0,940.0,0.0783,(940.0/12002.0)
yes,442.0,1118.0,0.2833,(442.0/1560.0)
Total,11504.0,2058.0,0.1019,(1382.0/13562.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3230416,0.6180210,222.0
max f2,0.1563736,0.7232571,304.0
max f0point5,0.4444131,0.6049945,157.0
max accuracy,0.5000018,0.9098953,131.0
max precision,0.7250000,0.7892977,39.0
max recall,0.0,1.0,399.0
max specificity,1.0,0.9999167,0.0
max absolute_mcc,0.3075893,0.5685077,227.0
max min_per_class_accuracy,0.1785736,0.8539410,293.0


Gains/Lift Table: Avg response rate: 11.50 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0112078,0.8214286,6.7489710,6.7489710,0.7763158,0.7763158,0.0756410,0.0756410,574.8970985,574.8970985
,2,0.0211621,0.75,6.8904748,6.8155320,0.7925926,0.7839721,0.0685897,0.1442308,589.0474834,581.5532029
,3,0.0349506,0.6785714,5.7647333,6.4009764,0.6631016,0.7362869,0.0794872,0.2237179,476.4733306,540.0976415
,4,0.0439463,0.6428571,5.3444199,6.1847014,0.6147541,0.7114094,0.0480769,0.2717949,434.4419924,518.4701428
,5,0.0531633,0.6071429,5.7029949,6.1011878,0.656,0.7018031,0.0525641,0.3243590,470.2994872,510.1187809
,6,0.1013125,0.4642857,4.5797778,5.3781305,0.5267994,0.6186317,0.2205128,0.5448718,357.9777752,437.8130482
,7,0.1501991,0.3285714,3.4092509,4.7373022,0.3921569,0.5449190,0.1666667,0.7115385,240.9250880,373.7302217
,8,0.2102197,0.2142857,2.1360171,3.9945993,0.2457002,0.4594879,0.1282051,0.8397436,113.6017136,299.4599286
,9,0.3374134,0.0714286,0.8668391,2.8155376,0.0997101,0.3238636,0.1102564,0.95,-13.3160907,181.5537587




ModelMetricsBinomial: drf
** Reported on cross-validation data. **

MSE: 0.06679400542
RMSE: 0.258445362543
LogLoss: 0.277824404731
Mean Per-Class Error: 0.142660508206
AUC: 0.918190534616
Gini: 0.836381069231
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.295157658688: 


0,1,2,3,4
,no,yes,Error,Rate
no,25304.0,2616.0,0.0937,(2616.0/27920.0)
yes,977.0,2752.0,0.262,(977.0/3729.0)
Total,26281.0,5368.0,0.1135,(3593.0/31649.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.2951577,0.6050346,240.0
max f2,0.1615565,0.7193602,303.0
max f0point5,0.4394788,0.5775748,175.0
max accuracy,0.4634618,0.9012923,165.0
max precision,1.0,1.0,0.0
max recall,0.0,1.0,399.0
max specificity,1.0,1.0,0.0
max absolute_mcc,0.2951577,0.5534992,240.0
max min_per_class_accuracy,0.1851879,0.8533119,292.0


Gains/Lift Table: Avg response rate: 11.78 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0102373,0.8148148,6.5226180,6.5226180,0.7685185,0.7685185,0.0667739,0.0667739,552.2618019,552.2618019
,2,0.0200006,0.7469953,5.9603102,6.2481265,0.7022654,0.7361769,0.0581925,0.1249665,496.0310207,524.8126528
,3,0.0315018,0.6923077,5.4560970,5.9589603,0.6428571,0.7021063,0.0627514,0.1877179,445.6097000,495.8960281
,4,0.0413915,0.6538462,4.9350853,5.7143245,0.5814696,0.6732824,0.0488067,0.2365245,393.5085253,471.4324492
,5,0.0514076,0.6153846,4.9531340,5.5660163,0.5835962,0.6558082,0.0496112,0.2861357,395.3133975,456.6016321
,6,0.1020885,0.4615385,4.4605747,5.0172301,0.5255611,0.5911483,0.2260660,0.5122017,346.0574730,401.7230090
,7,0.1529274,0.3333333,3.5288864,4.5224481,0.4157862,0.5328512,0.1794047,0.6916063,252.8886438,352.2448078
,8,0.2014914,0.2307692,2.3854894,4.0073931,0.2810670,0.4721656,0.1158488,0.8074551,138.5489385,300.7393113
,9,0.3048122,0.0909091,1.1368259,3.0343699,0.1339450,0.3575205,0.1174578,0.9249128,13.6825919,203.4369922



Cross-Validation Metrics Summary: 


0,1,2,3,4,5,6,7
,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
accuracy,0.8845808,0.0060399,0.86962,0.8916587,0.8814538,0.886885,0.8932869
auc,0.9182724,0.0026785,0.9157462,0.9209911,0.9168177,0.9136567,0.9241503
err,0.1154191,0.0060399,0.1303799,0.1083413,0.1185463,0.1131150,0.1067131
err_count,730.8,40.460846,827.0,678.0,760.0,715.0,674.0
f0point5,0.5435946,0.0107998,0.5168953,0.5463259,0.5412999,0.5498008,0.5636511
f1,0.60691,0.0059715,0.5991275,0.6021127,0.6033403,0.6069269,0.6230425
f2,0.6876460,0.0106300,0.7124740,0.6705882,0.6814430,0.6773006,0.6964241
lift_top_group,6.3798437,0.2627986,6.329697,6.7492576,6.8541565,6.05984,5.906267
logloss,0.2777598,0.0132357,0.2891578,0.2529644,0.2794759,0.3052329,0.2619678


Scoring History: 


0,1,2,3,4,5,6,7,8,9,10,11,12,13
,timestamp,duration,number_of_trees,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
,2017-02-23 19:26:52,54.119 sec,0.0,,,,,,,,,,
,2017-02-23 19:26:52,54.203 sec,1.0,0.3586046,4.4069587,0.6914080,6.7963311,0.1306151,0.3487614,4.1692628,0.7050370,4.0789564,0.1232119
,2017-02-23 19:26:52,54.313 sec,2.0,0.3449803,3.8351851,0.7177361,8.1513728,0.1330026,0.3035062,2.1248388,0.7824082,5.1812798,0.1450376
,2017-02-23 19:26:53,54.495 sec,3.0,0.3379472,3.4512549,0.7361613,8.4252789,0.1381265,0.2871126,1.4264772,0.8235846,5.5283426,0.1633240
,2017-02-23 19:26:53,54.658 sec,4.0,0.3288017,3.0295213,0.7510073,8.4709403,0.1437331,0.2774813,1.0749095,0.8480628,5.6419623,0.1127415
---,---,---,---,---,---,---,---,---,---,---,---,---,---
,2017-02-23 19:27:01,1 min 2.955 sec,24.0,0.2673525,0.5440972,0.8888501,8.4872620,0.1207937,0.2514514,0.2793526,0.9211744,6.6480392,0.1048518
,2017-02-23 19:27:02,1 min 3.608 sec,25.0,0.2665146,0.5211094,0.8913222,8.4872620,0.1240482,0.2509901,0.2789712,0.9215679,6.5360565,0.1086123
,2017-02-23 19:27:02,1 min 4.296 sec,26.0,0.2657235,0.5034343,0.8936158,8.4872620,0.1165598,0.2506824,0.2740809,0.9222134,6.8376549,0.1019761



See the whole table with table.as_data_frame()
Variable Importances: 


0,1,2,3
variable,relative_importance,scaled_importance,percentage
duration,16109.1289062,1.0,0.2709734
month,7392.9511719,0.4589293,0.1243576
age,5529.1386719,0.3432301,0.0930062
poutcome,5217.3823242,0.3238774,0.0877621
day,5088.8173828,0.3158965,0.0855995
job,4493.6137695,0.2789483,0.0755875
balance,3706.7463379,0.2301022,0.0623516
pdays,2460.3674316,0.1527313,0.0413861
campaign,2097.1188965,0.1301820,0.0352759


<bound method ?.accuracy of >


In [69]:
print rf.auc()

0.822187326635


In [None]:
h2o.shutdown(prompt=False)