In [2]:
## Import H2O Package and Initialize
import h2o

## nthreads=-1 asks H2O to use every available core
## max_mem_size=8 caps the memory given to the H2O process; the bare integer is
## treated as gigabytes here (the cluster summary reports roughly 7 Gb free)
h2o.init(nthreads=-1, max_mem_size=8)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_161"; Java(TM) SE Runtime Environment (build 1.8.0_161-b12); Java HotSpot(TM) 64-Bit Server VM (build 25.161-b12, mixed mode)
  Starting server from /Users/anirvan/anaconda3/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/l7/n8pwny0j3mq6j8h_gxgzm4200000gn/T/tmpu9jfwrcr
  JVM stdout: /var/folders/l7/n8pwny0j3mq6j8h_gxgzm4200000gn/T/tmpu9jfwrcr/h2o_anirvan_started_from_python.out
  JVM stderr: /var/folders/l7/n8pwny0j3mq6j8h_gxgzm4200000gn/T/tmpu9jfwrcr/h2o_anirvan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,07 secs
H2O cluster version:,3.16.0.4
H2O cluster version age:,20 days
H2O cluster name:,H2O_from_python_anirvan_8w88ke
H2O cluster total nodes:,1
H2O cluster free memory:,7.111 Gb
H2O cluster total cores:,0
H2O cluster allowed cores:,0
H2O cluster status:,"accepting new members, healthy"
H2O connection url:,http://127.0.0.1:54321


In [5]:
## Load the loan dataset from disk into an H2OFrame
loan_csv = "loan.csv"
data = h2o.import_file(path=loan_csv)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [7]:
## Check Data Shape
## The bare last expression makes the notebook display the frame's
## dimensions as a tuple: (rows, columns)
data.shape

(163987, 15)

### Encode A Response Variable
Since this is a binary classification problem, the response must be encoded as a factor (categorical) column.


In [9]:
## Convert the response column to a factor so H2O treats the task as classification
data['bad_loan'] = data['bad_loan'].asfactor()
## Display the factor levels to confirm the response is binary
data['bad_loan'].levels()

[['0', '1']]

### Partition Data into Training, Validation and Test Sets

70% - Train
15% - Valid
15% - Test

In [15]:
## Two ratios yield three frames: 70% train, 15% valid, and the remainder as test.
## A fixed seed keeps the partition repeatable across runs.
splits = data.split_frame(ratios=[0.7, 0.15], seed=1)
train, valid, test = splits

## Report the (rows, columns) shape of each partition
print(f"Train: {train.shape}")
print(f"Valid: {valid.shape}")
print(f"Test: {test.shape}")

Train: (114908, 15)
Valid: (24498, 15)
Test: (24581, 15)


### H2O Uses 'y' to designate the response variable
### and 'x' to designate the list of predictor columns

In [16]:
## Response column name
y = 'bad_loan'
## Start from a fresh copy of every column name; non-predictors are removed afterwards
x = [*data.columns]

In [17]:
## Drop the response and the interest rate from the predictor list.
## int_rate is excluded because it is connected to the response.
## Filtering (instead of calling x.remove) keeps this cell idempotent: re-running it
## does not raise ValueError once the columns have already been removed.
x = [col for col in x if col not in (y, 'int_rate')]

In [19]:
x  ## Display the list of predictor columns (the model inputs)

['loan_amnt',
 'term',
 'emp_length',
 'home_ownership',
 'annual_inc',
 'purpose',
 'addr_state',
 'dti',
 'delinq_2yrs',
 'revol_util',
 'total_acc',
 'longest_credit_length',
 'verification_status']

## Build the DL Model

In [23]:
from h2o.estimators.deeplearning import H2ODeepLearningEstimator as h2odl

#### Train a default model

In [24]:
## Baseline model: only the model id and the seed are set; everything else uses H2O defaults.
## NOTE(review): per the H2O docs, the seed is only fully reproducible when the
## model is trained single-threaded -- confirm if exact repeatability matters.
dl_fit1 = h2odl(seed=1, model_id='dl_fit1')
dl_fit1.train(training_frame=train, x=x, y=y)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


#### Train a model with more epochs and two small hidden layers, with stopping rounds turned off (stopping rounds help prevent overfitting)

In [25]:
## 20 epochs with two hidden layers of 10 units each;
## stopping_rounds=0 turns the stopping rounds off for this model
dl_fit2 = h2odl(
    model_id='dl_fit2',
    hidden=[10, 10],
    epochs=20,
    stopping_rounds=0,
    seed=1,
)
dl_fit2.train(training_frame=train, x=x, y=y)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


#### Train a third model with stopping rounds (early stopping) enabled and a validation frame

In [26]:
## Same architecture as dl_fit2 (20 epochs, hidden=[10, 10]) but with stopping rounds on.
## Per the H2O docs (confirm for the installed version): score_interval is the minimum
## number of seconds between scoring events, and training stops when the moving average
## of stopping_metric fails to improve by stopping_tolerance over stopping_rounds
## scoring events.
dl_fit3 = h2odl(
    model_id='dl_fit3',
    hidden=[10, 10],
    epochs=20,
    stopping_metric='AUC',
    stopping_rounds=3,
    stopping_tolerance=0.0005,
    score_interval=1,
    seed=1,
)
## A validation frame is supplied in addition to the training frame
dl_fit3.train(training_frame=train, validation_frame=valid, x=x, y=y)

deeplearning Model Build progress: |██████████████████████████████████████| 100%


### Compare Model Performance

In [28]:
## Score each fitted model on the held-out test frame, in model order
dl_perf1, dl_perf2, dl_perf3 = (
    fit.model_performance(test) for fit in (dl_fit1, dl_fit2, dl_fit3)
)

## Print the test-set AUC of each model, one value per line
print(dl_perf1.auc(), dl_perf2.auc(), dl_perf3.auc(), sep="\n")

0.6761369141167992
0.6781741816497162
0.6784819944574187


In [29]:
## Display the per-scoring-event training and validation metrics recorded for dl_fit3
dl_fit3.scoring_history()

Unnamed: 0,Unnamed: 1,timestamp,duration,training_speed,epochs,iterations,samples,training_rmse,training_logloss,training_auc,training_lift,training_classification_error,validation_rmse,validation_logloss,validation_auc,validation_lift,validation_classification_error
0,,2018-02-05 11:53:46,0.000 sec,,0.0,0,0.0,,,,,,,,,,
1,,2018-02-05 11:53:46,0.372 sec,404825 obs/sec,0.870192,1,99992.0,0.384857,0.476337,0.661659,2.830353,0.33303,0.384176,0.474208,0.667317,2.751867,0.35656
2,,2018-02-05 11:53:47,1.396 sec,510255 obs/sec,5.222091,6,600060.0,0.378269,0.456025,0.67144,2.830353,0.399879,0.377752,0.454926,0.675431,2.531718,0.357784
3,,2018-02-05 11:53:48,2.465 sec,554493 obs/sec,10.442476,12,1199924.0,0.377579,0.451896,0.674596,3.156932,0.34193,0.377874,0.452428,0.675282,2.443658,0.356274
4,,2018-02-05 11:53:49,3.470 sec,583737 obs/sec,15.666838,18,1800245.0,0.376189,0.449578,0.680251,2.775923,0.394519,0.376923,0.451155,0.676401,2.619777,0.348314
5,,2018-02-05 11:53:50,4.277 sec,603119 obs/sec,20.018589,23,2300296.0,0.375991,0.448671,0.680739,3.211362,0.354571,0.376429,0.449485,0.679535,2.553733,0.338517
