In [1]:
import numpy as np
import pandas as pd
import platform, time, sys

import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [2]:
start_time = int(time.time())

localH2O = h2o.init(ip = "localhost",
                    port = 54321,
                    max_mem_size="24G",
                    nthreads = 6)
h2o.no_progress()
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.261-b12, mixed mode)
  Starting server from C:\Users\affiqazrin\Anaconda3\envs\h2olime\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\AFFIQA~1\AppData\Local\Temp\tmpe86lnm5d
  JVM stdout: C:\Users\AFFIQA~1\AppData\Local\Temp\tmpe86lnm5d\h2o_affiqazrin_started_from_python.out
  JVM stderr: C:\Users\AFFIQA~1\AppData\Local\Temp\tmpe86lnm5d\h2o_affiqazrin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,04 secs
H2O_cluster_timezone:,Asia/Singapore
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.7
H2O_cluster_version_age:,4 months and 11 days !!!
H2O_cluster_name:,H2O_from_python_affiqazrin_9w15ej
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,21.33 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


In [3]:
DATASET_LOCAL_PATH = "C:/Users/affiqazrin/Desktop/mmlspark/Data_FinalProject_READY4.csv"
df = pd.read_csv(DATASET_LOCAL_PATH)

dfh2o = h2o.H2OFrame(df)

# convert columns to factors for classification task, or else regression
dfh2o["deposit"]=dfh2o["deposit"].asfactor()

dfh2o["age"]=dfh2o["age"].asfactor()
dfh2o["duration"]=dfh2o["duration"].asfactor()
dfh2o["pdays"]=dfh2o["pdays"].asfactor()
dfh2o["previous"]=dfh2o["previous"].asfactor()

predictors = ["job", #categorical
              "marital", #categorical
              "education", #categorical
              "default", #categorical
              "housing", #categorical, binary
              "loan", #categorical, binary
              "contact", #categorical
              "day", #categorical
              "month", #categorical
              "campaign", #categorical
              "poutcome" #categorical
             ]

response_col = "deposit"

In [4]:
train, test = dfh2o.split_frame(ratios = [0.8], seed = 1234)

In [None]:
# Run AutoML for 20 base models (limited to 1 hour max runtime by default)
aml = H2OAutoML(max_models=20, seed=1)
aml.train(x=predictors, y=response_col, training_frame=dfh2o)

In [None]:
lb = aml.leaderboard
lb.head()

In [None]:
lb.head(rows=lb.nrows)

In [None]:
aml.model_performance(test)

In [None]:
# Get model ids for all models in the AutoML Leaderboard
model_ids = list(aml.leaderboard['model_id'].as_data_frame().iloc[:,0])
# Get the "All Models" Stacked Ensemble model
se = h2o.get_model([mid for mid in model_ids if "StackedEnsemble_AllModels" in mid][0])
# Get the Stacked Ensemble metalearner model
metalearner = h2o.get_model(se.metalearner()['name'])

In [None]:
metalearner.coef_norm()

In [None]:
%matplotlib inline
metalearner.std_coef_plot()

In [None]:
leaderboard = aml.getLeaderboard("ALL")
leaderboard.show(truncate = False)

leaderboard.toPandas().to_csv('leaderboard2.csv')

In [5]:
# initialize model training using H2OGeneralizedLinearEstimator
glm_model = H2OGeneralizedLinearEstimator(family= "binomial",
                                          lambda_ = 0,
                                          compute_p_values = True,
                                          remove_collinear_columns = True)

glm_model.train(predictors, response_col, training_frame= dfh2o)

# predict using the model and the testing dataset
predict = glm_model.predict(test)

# View a summary of the prediction
predict.head()

predict,no,yes,StdErr
yes,0.836871,0.163129,0.250841
yes,0.656497,0.343503,0.205746
no,0.921064,0.0789363,0.170798
yes,0.808966,0.191034,0.167274
no,0.942792,0.0572085,0.182846
yes,0.816285,0.183715,0.182061
yes,0.675848,0.324152,0.217425
yes,0.827096,0.172904,0.233215
no,0.90918,0.0908202,0.147639
no,0.975468,0.0245321,0.282695




In [6]:
glm_model.model_performance(test)


ModelMetricsBinomialGLM: glm
** Reported on test data. **

MSE: 0.08606031943745884
RMSE: 0.29336039173252215
LogLoss: 0.3010324293019492
Null degrees of freedom: 2408
Residual degrees of freedom: 2366
Null deviance: 1723.240743441892
Residual deviance: 1450.374244376791
AIC: 1536.374244376791
AUC: 0.7603440476150285
AUCPR: 0.3912596822827905
Gini: 0.520688095230057

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.20964748144024997: 


Unnamed: 0,Unnamed: 1,no,yes,Error,Rate
0,no,2007.0,124.0,0.0582,(124.0/2131.0)
1,yes,159.0,119.0,0.5719,(159.0/278.0)
2,Total,2166.0,243.0,0.1175,(283.0/2409.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.209647,0.456814,147.0
1,max f2,0.153644,0.502008,188.0
2,max f0point5,0.211953,0.479967,145.0
3,max accuracy,0.392124,0.892487,88.0
4,max precision,0.963579,1.0,0.0
5,max recall,0.014608,1.0,395.0
6,max specificity,0.963579,1.0,0.0
7,max absolute_mcc,0.209647,0.392401,147.0
8,max min_per_class_accuracy,0.097378,0.676259,253.0
9,max mean_per_class_accuracy,0.153644,0.71535,188.0



Gains/Lift Table: Avg response rate: 11.54 %, avg score: 19.70 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010378,0.468116,0.0,0.0,0.0,0.668667,0.0,0.668667,0.0,0.0,-100.0,-100.0,-0.011732
1,2,0.02034,0.339271,3.24955,1.591617,0.375,0.388731,0.183673,0.531556,0.032374,0.032374,224.955036,59.16165,0.013604
2,3,0.030303,0.294769,0.361061,1.18705,0.041667,0.310927,0.136986,0.45902,0.003597,0.035971,-63.893885,18.705036,0.006408
3,4,0.040266,0.285267,0.722122,1.072017,0.083333,0.289715,0.123711,0.41713,0.007194,0.043165,-27.78777,7.201661,0.003278
4,5,0.050228,0.277298,0.722122,1.002616,0.083333,0.28039,0.115702,0.390008,0.007194,0.05036,-27.78777,0.261609,0.000149
5,6,0.100042,0.253994,1.444245,1.222514,0.166667,0.264688,0.141079,0.327608,0.071942,0.122302,44.42446,22.25141,0.025165
6,7,0.15027,0.237837,1.432309,1.292639,0.165289,0.245563,0.149171,0.300184,0.071942,0.194245,43.23087,29.263882,0.049712
7,8,0.200083,0.226419,1.660881,1.384317,0.191667,0.232187,0.159751,0.283255,0.082734,0.276978,66.088129,38.431744,0.086927
8,9,0.300125,0.207389,1.725902,1.498179,0.19917,0.216624,0.172891,0.261045,0.172662,0.44964,72.590227,49.817905,0.169021
9,10,0.400166,0.193184,0.934864,1.35735,0.107884,0.200097,0.156639,0.245808,0.093525,0.543165,-6.513627,35.735022,0.161654







https://docs.h2o.ai/h2o-tutorials/latest-stable/h2o-world-2017/automl/index.html