# Reference

1. http://docs.h2o.ai/h2o/latest-stable/h2o-docs/starting-h2o.html
2. https://github.com/h2oai/h2o-tutorials/blob/master/tutorials/gbm-randomforest/GBM_RandomForest_Example.py
3. https://blog.h2o.ai/2017/06/xgboost-in-h2o-machine-learning-platform/
4. https://www.analyticsvidhya.com/blog/2016/05/h2o-data-table-build-models-large-data-sets/
5. https://aichamp.wordpress.com/2017/10/19/calculating-auc-and-gini-model-metrics-for-logistic-classification/
6. Parameter tuning: https://github.com/h2oai/h2o-3/blob/master/h2o-docs/src/product/tutorials/gbm/gbmTuning.ipynb
7. Interpreting h2o predictions: https://stackoverflow.com/questions/45523997/how-should-we-interpret-the-results-of-the-h2o-predict-function



# Load and Preprocess the Data

In [298]:
import pandas as pd
import numpy as np

# this will take sevaral seconds
train = pd.read_csv('data_transactions/train.csv')
test = pd.read_csv('data_transactions/test.csv')

print('train Shape:', train.shape, 'test Shape:', test.shape)

train Shape: (348978, 51) test Shape: (523466, 50)


In [299]:
# drop id column
train_new = train.copy()
test_new = test.copy()

id_train = train_new.transaction_id
sub_ids = test_new.transaction_id

train_new.drop('transaction_id', axis = 1, inplace = True)
test_new.drop('transaction_id', axis = 1, inplace = True)

## Remove Features With Only One Distinct Value

In [300]:
cat_vars = [x for x in train_new.columns if 'cat_' in x]

# in training set
cat_to_drop_train = []
for x in cat_vars:
    if train_new[x].nunique() == 1:
        cat_to_drop_train.append(x)

# in test set
cat_to_drop_test = []
for x in cat_vars:
    if test_new[x].nunique() == 1:
        cat_to_drop_test.append(x)

# drop these features
cat_to_drop = list(set(cat_to_drop_train + cat_to_drop_test))
train_new = train_new.drop(cat_to_drop, axis = 1)
test_new = test_new.drop(cat_to_drop, axis = 1)

print(train_new.shape)
print(test_new.shape)

(348978, 42)
(523466, 41)


## Encode Categorical Features

In [301]:
from sklearn.preprocessing import LabelEncoder

cat_vars = [x for x in train_new.columns if 'cat_' in x]

for x in cat_vars:
    train_new[x] = train_new[x].fillna('NaN')
    test_new[x] = test_new[x].fillna('NaN')
    encoder = LabelEncoder()
    encoder.fit(list(set(list(train_new[x]) + list(test_new[x]))))
    train_new[x] = encoder.transform(train_new[x])
    test_new[x] = encoder.transform(test_new[x])
    
print(train_new.shape)
print(test_new.shape)

(348978, 42)
(523466, 41)


# Starting H20

In [302]:
import h2o
import os
h2o.init()     #h2o.cluster().shutdown() # in the end

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,14 hours 8 mins
H2O cluster version:,3.16.0.2
H2O cluster version age:,"7 days, 16 hours and 14 minutes"
H2O cluster name:,H2O_from_python_araks_byqnlx
H2O cluster total nodes:,1
H2O cluster free memory:,1001 Mb
H2O cluster total cores:,4
H2O cluster allowed cores:,4
H2O cluster status:,"locked, healthy"
H2O connection url:,http://localhost:54321


In [303]:
train_h2o = h2o.H2OFrame(train_new)
test_h2o = h2o.H2OFrame(test_new)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [304]:
## Run this sell when importing the data from the files directly

#train_h2o = h2o.import_file(os.path.realpath("data_transactions/train.csv"))
#test_h2o = h2o.import_file(os.path.realpath("data_transactions/test.csv"))

In [305]:
# Convert columns to factors (don't do this step if the data were encoded before h2o)
#cat_vars = [x for x in train_h2o.col_names if 'cat_' in x]

#for column in cat_vars:
#    train_h2o[column] = train_h2o[column].asfactor()
    
train_h2o['target'] = train_h2o['target'].asfactor()

# Fit the Models

In [306]:
#train_h2o.col_names

In [307]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
#help(H2OGradientBoostingEstimator)
#help(h2o.import_file)

# Prepare Predictors and Response
predictors_X = train_h2o.col_names[:-1]     #last column is our desired response variable 
response_y_train = train_h2o.col_names[-1] 

# Split the data
train, valid, test = train_h2o.split_frame(ratios = [0.8, 0.1], seed = 1234)

## Random Forest 1

In [308]:
rf_v1 = H2ORandomForestEstimator(
    model_id="rf_v1",
    ntrees=200,
    stopping_rounds=2,
    score_each_iteration=True,
    seed=1000000)

rf_v1.train(predictors_X, response_y_train, training_frame = train, validation_frame = valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [309]:
print('training score', rf_v1.auc(train = True))
print('validation score', rf_v1.auc(valid = True))

training score 0.720998398637594
validation score 0.7215307122598251


In [310]:
# Here we can see the hit ratio table.
#rf_v1.hit_ratio_table(valid = True)

rf_v1_perf = rf_v1.model_performance(test)
auc_rf_v1 = rf_v1_perf.auc()
auc_rf_v1

0.7210501376552523

## GBM 1

Default parameters. As the scores show below, this gbm with defailt parameters is worse than previous random forest. Is it overfitting?

1. Default number of trees - 50
2. Default learning rate - 0.1
3. Default depth - 5

In [311]:
# First we will use all default settings, then make some changes to improve our predictions.

gbm_v1 = H2OGradientBoostingEstimator(
    model_id="gbm_covType_v1",
    seed=2000000
)

gbm_v1.train(predictors_X, response_y_train, training_frame = train, validation_frame = valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [312]:
print('training score', gbm_v1.auc(train = True))
print('validation score', gbm_v1.auc(valid = True))

training score 0.7252709618760613
validation score 0.7143121713614224


In [313]:
#gbm_v1.score_history()
#gbm_v1.hit_ratio_table(valid=True)
gbm_v1_perf = gbm_v1.model_performance(test)
auc_gbm_v1 = gbm_v1_perf.auc()
auc_gbm_v1

0.7185211253312612

##  GBM 2

2. increase the learning rate (from default 0.1 to 0.2)
3. increase the depth (from default 5 to 10)

In [314]:
gbm_v2 = H2OGradientBoostingEstimator(
    ntrees=50,
    learn_rate=0.2,
    max_depth=10,
    #stopping_tolerance=0.01, #10-fold increase in threshold as defined in rf_v1
    #stopping_rounds=2,
    score_each_iteration=True,
    model_id="gbm_covType_v2",
    seed=2000000
)

gbm_v2.train(predictors_X, response_y_train, training_frame = train, validation_frame = valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [315]:
print('training score', gbm_v2.auc(train = True))
print('validation score', gbm_v2.auc(valid = True))

training score 0.7544961982216296
validation score 0.7203838970531827


From the scores above, we clearly see that we started overfitting the training data.

In [316]:
#gbm_v2.score_history()
gbm_v2_perf = gbm_v2.model_performance(test)
auc_gbm_v2 = gbm_v2_perf.auc()
auc_gbm_v2

0.722512760954005

Looks like we are improving the results, but I am a little concerned about overfitting.

## GBM 3

1. Increase learning rate (to 0.3)
2. Use a random 70% of rows to fit each tree (to add the nature of randomness). This will help prevent overfitting
3. Use a random 70% of columns to fit each tree (to add the nature of randomness). Will help prevent overfitting

In [320]:
gbm_v3 = H2OGradientBoostingEstimator(
    ntrees=50,
    learn_rate=0.3,
    max_depth=10,
    sample_rate=0.7,
    col_sample_rate=0.7,
    #stopping_rounds=2,
    #stopping_tolerance=0.01, #10-fold increase in threshold as defined in rf_v1
    score_each_iteration=True,
    model_id="gbm_covType_v3",
    seed=2000000
)
gbm_v3.train(predictors_X, response_y_train, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [321]:
print('training score', gbm_v3.auc(train = True))
print('validation score', gbm_v3.auc(valid = True))

training score 0.7677209218922407
validation score 0.7167701304997257


In [322]:
gbm_v3_perf = gbm_v3.model_performance(test)
auc_gbm_v3 = gbm_v3_perf.auc()
auc_gbm_v3

0.7214806279781103

## GBM 4

Adding random nature doesn't seem to help overfitting. But we also increased the learning rate on the previous model. So let's bring the learning rate back to 0.2 as in the gbm2 and increase the number of trees.

1. Decrease learning rate (from 0.3 to 0.2)
2. Increase number of trees (from 50 to 70)

In [328]:
gbm_v4 = H2OGradientBoostingEstimator(
    ntrees=70,
    learn_rate=0.2,
    max_depth=10,
    sample_rate=0.7,
    col_sample_rate=0.7,
    #stopping_rounds=2,
    #stopping_tolerance=0.01, #10-fold increase in threshold as defined in rf_v1
    score_each_iteration=True,
    model_id="gbm_covType_v3",
    seed=2000000
)
gbm_v4.train(predictors_X, response_y_train, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [329]:
print('training score', gbm_v4.auc(train = True))
print('validation score', gbm_v4.auc(valid = True))

training score 0.7643044136441797
validation score 0.7201199168954743


In [331]:
gbm_v4_perf = gbm_v4.model_performance(test)
auc_gbm_v4 = gbm_v4_perf.auc()
auc_gbm_v4

0.7195996493722369

## Random Forest 2

As the first random forest's result are very comparable to the gbm models' results, let's tune the parameters a little and see what happens.

In [323]:
# takes more than 5 minutes without the stopping conditions

rf_v2 = H2ORandomForestEstimator(
    model_id="rf_covType_v2",
    ntrees=200,
    max_depth=30,
    stopping_rounds=2,    
    stopping_tolerance=0.01,
    score_each_iteration=True,
    seed=3000000)

rf_v2.train(predictors_X, response_y_train, training_frame=train, validation_frame=valid)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [327]:
print('training score', rf_v2.auc(train = True))
print('validation score', rf_v2.auc(valid = True))

training score 0.7199379695749945
validation score 0.7189576972585479


In [326]:
rf_v2_perf = rf_v2.model_performance(test)
auc_rf_v2 = rf_v2_perf.auc()
auc_rf_v2

0.7258007575388039

Not so sure, maybe the gbm2 is the best after all.

# And the winner model is GBM 2 (temporarely, until tuning parameters more)

## Predict on Test Data

In [332]:
# fit on the whole training set
gbm_v2.train(predictors_X, response_y_train, training_frame=train_h2o)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [333]:
# Convert columns to factorsin test data (don't do this step if the data were encoded before h2o)
#cat_vars = [x for x in test_h2o.col_names if 'cat_' in x]

#for column in cat_vars:
#    test_h2o[column] = test_h2o[column].asfactor()

# Make predictions
final_gbm_predictions = gbm_v2.predict(test_h2o)

gbm prediction progress: |████████████████████████████████████████████████| 100%


## Understand h2o predictions and convert to pandas data frame

In [368]:
print(final_gbm_predictions.shape)
print(final_gbm_predictions.type)

(523466, 3)


predict,p0,p1
0,0.937585,0.0624148
0,0.935907,0.064093
0,0.936524,0.063476
0,0.93463,0.0653702
0,0.938609,0.061391
0,0.940626,0.0593745
0,0.912474,0.0875262
0,0.921667,0.0783328
0,0.949965,0.0500351
0,0.939633,0.0603669


<bound method H2OFrame.type of >


### Interpretation

Stackoverflow explanation comes to help (find the link in references).

p0 is the probability (between 0 and 1) that class 0 is chosen.  
p1 is the probability (between 0 and 1) that class 1 is chosen.

In our problem 1 is Fraudulent, 0 - Not Fraudulent. Then we need the column **p1**.


In [362]:
preds_pandas = h2o.as_list(final_gbm_predictions)

In [373]:
preds = preds_pandas['p1']

## Submission

In [374]:
print(sub_ids.shape)
print(preds.shape)

(523466,)
(523466,)


In [375]:
from IPython.display import FileLink

sub = pd.DataFrame({'transaction_id': sub_ids, 'target': preds})
sub = sub[['transaction_id','target']]    

filename='sub_transactions.csv'
sub.to_csv(filename, index=False)
FileLink(filename)      # lb 0.73204