In [121]:
!pip install h2o



In [125]:
# import modules
import h2o
# import autoML from H2O
from h2o.automl import H2OAutoML

# Connect to the H2O cluster using h2o.init()'s default arguments
# (the recorded output shows it attaching to an instance at http://localhost:54321).
h2o.init()


def train_model(dataset, max_runtime_secs=180, stopping_metric="logloss", sort_metric="logloss",
                balance_classes=False, target="FraudFound_P", seed=1):
    """Train an H2O AutoML classifier on a pandas DataFrame.

    Args:
        dataset: pandas DataFrame holding the feature columns and the label column.
        max_runtime_secs: forwarded to ``H2OAutoML(max_runtime_secs=...)``.
        stopping_metric: forwarded to ``H2OAutoML(stopping_metric=...)``.
        sort_metric: forwarded to ``H2OAutoML(sort_metric=...)``.
        balance_classes: forwarded to ``H2OAutoML(balance_classes=...)``.
        target: name of the label column; defaults to ``"FraudFound_P"``.
        seed: forwarded to ``H2OAutoML(seed=...)``; defaults to 1.

    Returns:
        The ``H2OAutoML`` object after ``train`` has been called on it.

    Raises:
        ValueError: if ``target`` is not a column of ``dataset``.
    """
    # Fail fast with a readable message, before the frame is uploaded to the cluster.
    if target not in dataset.columns:
        raise ValueError(f"label column {target!r} not found in dataset")

    # Every column is cast to str before conversion to an H2OFrame.
    # NOTE(review): presumably this makes H2O treat the features as categorical -
    # confirm this is intended; make_predictions applies the same cast.
    frame = h2o.H2OFrame(dataset.astype(str))

    # All columns except the label are used as features.
    features = [column for column in frame.columns if column != target]

    # The label must be a factor for H2O to treat this as classification.
    frame[target] = frame[target].asfactor()

    automl = H2OAutoML(max_runtime_secs=max_runtime_secs,
                       # exclude_algos =['DeepLearning'],
                       seed=seed,
                       stopping_metric=stopping_metric,
                       sort_metric=sort_metric,
                       balance_classes=balance_classes)

    automl.train(x=features, y=target, training_frame=frame)

    print("Model Training completed")

    return automl


def view_leaderboard(model):
    """Return every row of the AutoML leaderboard.

    ``head`` is asked for ``nrows`` rows so the result is not cut off at
    its default row count.
    """
    board = model.leaderboard
    total_rows = board.nrows
    return board.head(rows=total_rows)


def get_best_model(model):
    """Return the ``leader`` attribute (top leaderboard model) of an AutoML object."""
    top_model = model.leader
    return top_model


def save_model(model, path="./"):
    """Download the AutoML leader model as a MOJO.

    Args:
        model: AutoML object exposing a ``leader`` with ``download_mojo``.
        path: destination passed to ``download_mojo``; defaults to ``"./"``
            (the current working directory), matching the previous behaviour.

    Returns:
        Whatever ``download_mojo`` returns. The notebook passes this value
        straight to ``h2o.import_mojo``, so it is presumably the path of the
        saved file - confirm against the H2O documentation.
    """
    return model.leader.download_mojo(path=path)


def make_predictions(dataset, path):
    """Score the first row of ``dataset`` with a saved MOJO model.

    Args:
        dataset: pandas DataFrame of feature columns; must have at least one row.
        path: MOJO location passed to ``h2o.import_mojo``.

    Returns:
        int: the value in the first column of the first prediction row.

    Raises:
        ValueError: if ``dataset`` has no rows.
    """
    # Only the first row's prediction is returned, so an empty frame is an error;
    # reject it before loading the model or touching the cluster.
    if len(dataset) == 0:
        raise ValueError("dataset must contain at least one row to predict on")

    # Load the saved model.
    imported_model = h2o.import_mojo(path)

    # Same str cast as train_model applies before building the H2OFrame.
    frame = h2o.H2OFrame(dataset.astype(str))

    # as_data_frame() is H2O's own conversion to pandas, so this function no
    # longer relies on a global `pd` imported by some other cell.
    predictions = imported_model.predict(frame).as_data_frame()
    return int(predictions.iloc[0, 0])

# Runs when this code is executed directly (__name__ is '__main__', which also
# holds inside the notebook, as the recorded output shows); prints a status banner.
if __name__ == '__main__':
    print('model build with automl')


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,4 hours 22 mins
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,17 hours and 37 minutes
H2O_cluster_name:,H2O_from_python_unknownUser_wq4u6y
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,2.667 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


model build with automl


In [126]:
import pandas as pd
from sklearn.model_selection import train_test_split
#import h20_auto_ml

# Load the dataset
df = pd.read_csv('input_dataset.csv')

# Hold out 20% of the rows; a fixed random_state makes the split repeatable across re-runs
train, test = train_test_split(df, test_size=0.2, random_state=1)

# Train AutoML on the training split (returns the AutoML object, not only the best model)
model = train_model(train)

# Export the leader model and keep what save_model returns so it can be reloaded
path = save_model(model)

# Score one held-out row with the label removed; `test` itself is left intact
# so the full hold-out set remains available afterwards
test_row = test.head(1).drop(columns=["FraudFound_P"])
predicted_outcome = make_predictions(test_row, path)
predicted_outcome


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Model Training completed
generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
Model Details
H2OGenericEstimator :  Import MOJO Model
Model Key:  Generic_model_python_1649911591249_24071

No model summary for this model

ModelMetricsBinomialGeneric: generic
** Reported on train data. **

MSE: 0.0028745717461208155
RMSE: 0.05361503283707673
LogLoss: 0.024439482011130026
Mean Per-Class Error: 0.0
AUC: 1.0
AUCPR: 1.0
Gini: 1.0

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.7147386600724938: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,9462.0,0.0,0.0,(0.0/9462.0)
1,1,0.0,591.0,0.0,(0.0/591.0)
2,Total,9462.0,591.0,0.0,(0.0/10053.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.714739,1.0,104.0
1,max f2,0.714739,1.0,104.0
2,max f0point5,0.714739,1.0,104.0
3,max accuracy,0.714739,1.0,104.0
4,max precision,0.995683,1.0,0.0
5,max recall,0.714739,1.0,104.0
6,max specificity,0.995683,1.0,0.0
7,max absolute_mcc,0.714739,1.0,104.0
8,max min_per_class_accuracy,0.714739,1.0,104.0
9,max mean_per_class_accuracy,0.714739,1.0,104.0



Gains/Lift Table: Avg response rate:  5.88 %, avg score:  7.44 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010047,0.975267,17.010152,17.010152,1.0,0.982866,1.0,0.982866,0.170897,0.170897,1601.015228,1601.015228,0.170897
1,2,0.020094,0.962129,17.010152,17.010152,1.0,0.96913,1.0,0.975998,0.170897,0.341794,1601.015228,1601.015228,0.341794
2,3,0.030041,0.948924,17.010152,17.010152,1.0,0.955423,1.0,0.969185,0.169205,0.510998,1601.015228,1601.015228,0.510998
3,4,0.040088,0.929887,17.010152,17.010152,1.0,0.939146,1.0,0.961656,0.170897,0.681895,1601.015228,1601.015228,0.681895
4,5,0.050035,0.902506,17.010152,17.010152,1.0,0.917193,1.0,0.952817,0.169205,0.8511,1601.015228,1601.015228,0.8511
5,6,0.10007,0.118007,2.975931,9.993042,0.17495,0.320703,0.587475,0.63676,0.1489,1.0,197.593121,899.304175,0.95614
6,7,0.150005,0.057124,0.0,6.666446,0.0,0.082375,0.39191,0.45221,0.0,1.0,-100.0,566.644562,0.903086
7,8,0.20004,0.034245,0.0,4.999005,0.0,0.044202,0.293884,0.350157,0.0,1.0,-100.0,399.900547,0.849926
8,9,0.30001,0.015565,0.0,3.333223,0.0,0.023398,0.195955,0.241274,0.0,1.0,-100.0,233.322281,0.743712
9,10,0.39998,0.007387,0.0,2.500124,0.0,0.011082,0.146978,0.18374,0.0,1.0,-100.0,150.012435,0.637497




ModelMetricsBinomialGeneric: generic
** Reported on cross-validation data. **

MSE: 0.033747123949699476
RMSE: 0.18370390292451458
LogLoss: 0.11683951157204443
Mean Per-Class Error: 0.20911511165210955
AUC: 0.9473472687884246
AUCPR: 0.6055522400632498
Gini: 0.8946945375768491

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.30741058592602366: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,11296.0,314.0,0.027,(314.0/11610.0)
1,1,284.0,442.0,0.3912,(284.0/726.0)
2,Total,11580.0,756.0,0.0485,(598.0/12336.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.307411,0.596491,178.0
1,max f2,0.142262,0.679398,256.0
2,max f0point5,0.472019,0.605304,124.0
3,max accuracy,0.472019,0.954199,124.0
4,max precision,0.992129,1.0,0.0
5,max recall,0.001396,1.0,392.0
6,max specificity,0.992129,1.0,0.0
7,max absolute_mcc,0.307411,0.570848,178.0
8,max min_per_class_accuracy,0.079592,0.885357,297.0
9,max mean_per_class_accuracy,0.070985,0.887537,304.0



Gains/Lift Table: Avg response rate:  5.89 %, avg score:  5.88 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010052,0.779722,14.388163,14.388163,0.846774,0.866907,0.846774,0.866907,0.144628,0.144628,1338.816316,1338.816316,0.142992
1,2,0.020023,0.671127,11.465968,12.932981,0.674797,0.727034,0.761134,0.797254,0.114325,0.258953,1046.596788,1193.298089,0.253871
2,3,0.030075,0.54599,9.181018,11.678956,0.540323,0.603337,0.687332,0.73244,0.092287,0.35124,818.10184,1067.895569,0.341248
3,4,0.040045,0.44489,9.255661,11.075585,0.544715,0.495799,0.651822,0.67352,0.092287,0.443526,825.566082,1007.55847,0.428711
4,5,0.050016,0.373741,8.150507,10.492466,0.479675,0.404553,0.617504,0.619901,0.081267,0.524793,715.050729,949.246554,0.504466
5,6,0.100032,0.168209,4.709217,7.600841,0.277147,0.251504,0.447326,0.435702,0.235537,0.760331,370.921682,660.084118,0.701588
6,7,0.150049,0.08735,2.175603,5.792428,0.128039,0.123943,0.340897,0.331783,0.108815,0.869146,117.560309,479.242848,0.764064
7,8,0.200065,0.050242,0.991414,4.592175,0.058347,0.067077,0.270259,0.265606,0.049587,0.918733,-0.858593,359.217488,0.763608
8,9,0.300016,0.020431,0.34452,3.177055,0.020276,0.032589,0.186976,0.187976,0.034435,0.953168,-65.547981,217.705512,0.693995
9,10,0.400049,0.008641,0.179005,2.427391,0.010535,0.013895,0.142857,0.144447,0.017906,0.971074,-82.099468,142.739079,0.606733




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.95062,0.004798,0.955105,0.953921,0.952665,0.94777,0.943638
1,auc,0.947326,0.009465,0.94028,0.95172,0.960652,0.947226,0.936753
2,err,0.04938,0.004798,0.044895,0.046079,0.047335,0.05223,0.056362
3,err_count,121.6,8.905055,113.0,114.0,119.0,130.0,132.0
4,f0point5,0.586007,0.051292,0.641749,0.584677,0.603751,0.597345,0.502513
5,f1,0.605437,0.033917,0.616949,0.604167,0.633846,0.624278,0.547945
6,f2,0.628451,0.031667,0.593995,0.625,0.667098,0.653753,0.60241
7,lift_top_group,14.416196,1.81231,14.182018,16.735882,12.978833,12.445,15.739247
8,logloss,0.11684,0.008418,0.1253,0.107622,0.108684,0.124512,0.118085
9,max_per_class_error,0.353848,0.04283,0.420382,0.360294,0.308725,0.325,0.354839



See the whole table with table.as_data_frame()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%


0