In [10]:
!pip install h2o



In [11]:
# import modules
import h2o
# import autoML from H2O
from h2o.automl import H2OAutoML

# We will be using default parameter Here with H2O init method
h2o.init()


def train_model(dataset, max_runtime_secs=600, stopping_metric="logloss", sort_metric="logloss", balance_classes=False):
    # convert pandas DataFrame into H2O Frame
    dataset = dataset.astype(str)
    dataset = h2o.H2OFrame(dataset)

    # selecting feature and label columns
    x = dataset.columns
    y = 'FraudFound_P'

    # remove label class variable from feature variable
    x.remove(y)

    # convert to categorical variable for classification
    dataset[y] = dataset[y].asfactor()

    # call h20automl  function
    model = H2OAutoML(max_runtime_secs=max_runtime_secs,
                      # exclude_algos =['DeepLearning'],
                      seed=1,
                      stopping_metric=stopping_metric,
                      sort_metric=sort_metric,
                      balance_classes=balance_classes)

    # train model and record time % time
    model.train(x=x, y=y, training_frame=dataset)

    print("Model Training completed")

    # return the model
    return model


def view_leaderboard(model):
    # View the H2O aml leaderboard
    lb = model.leaderboard

    # return all rows instead of 10 rows
    return lb.head(rows=lb.nrows)


def get_best_model(model):
    # Get the top model of leaderboard
    return model.leader


def save_model(model):
    return model.leader.download_mojo(path="./")


def make_predictions(dataset, path):
    # get the saved model
    imported_model = h2o.import_mojo(path)

    # convert pandas DataFrame into H2O Frame
    dataset = dataset.astype(str)
    dataset = h2o.H2OFrame(dataset)

    return int(pd.DataFrame(imported_model.predict(dataset)).iloc[0, 0])

# Press the green button in the gutter to run the script.
if __name__ == '__main__':
    print('model build with automl')


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,27 mins 57 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.1.1
H2O_cluster_version_age:,1 day
H2O_cluster_name:,H2O_from_python_unknownUser_5rc6mr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.060 Gb
H2O_cluster_total_cores:,2
H2O_cluster_allowed_cores:,2


model build with automl


In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
#import h20_auto_ml

# get dataset
df = pd.read_csv('input_dataset.csv')

# split to train and test
train, test = train_test_split(df, test_size=0.2)

# train the model (and return the best model)
model = train_model(train)

# save the model
path = save_model(model)
print(path)
# view the leaderboard
#h20_auto_ml.view_leaderboard(model)

# get the best model
#best_model = h20_auto_ml.get_best_model(model)



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
Model Training completed
/content/StackedEnsemble_AllModels_3_AutoML_2_20220414_195710.zip


In [14]:
# load the saved model, make predictions with test dataset
test_1 = test.head(1).drop(columns=["FraudFound_P"])
predicted_outcome = make_predictions(test_1, "StackedEnsemble_AllModels_3_AutoML_1_20220414_193005.zip")
predicted_outcome


generic Model Build progress: |██████████████████████████████████████████████████| (done) 100%
Model Details
H2OGenericEstimator :  Import MOJO Model
Model Key:  Generic_model_python_1649964546236_9200

No model summary for this model

ModelMetricsBinomialGeneric: generic
** Reported on train data. **

MSE: 0.00427100048764798
RMSE: 0.06535289196086107
LogLoss: 0.025990158775061313
Mean Per-Class Error: 0.005924893606777348
AUC: 0.999968600131042
AUCPR: 0.9995475509418393
Gini: 0.9999372002620841

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6766901246553924: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,9441.0,3.0,0.0003,(3.0/9444.0)
1,1,7.0,600.0,0.0115,(7.0/607.0)
2,Total,9448.0,603.0,0.001,(10.0/10051.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.67669,0.991736,122.0
1,max f2,0.592923,0.993094,130.0
2,max f0point5,0.70475,0.995642,116.0
3,max accuracy,0.67669,0.999005,122.0
4,max precision,0.998861,1.0,0.0
5,max recall,0.324773,1.0,175.0
6,max specificity,0.998861,1.0,0.0
7,max absolute_mcc,0.67669,0.991212,122.0
8,max min_per_class_accuracy,0.567464,0.996705,133.0
9,max mean_per_class_accuracy,0.567464,0.997505,133.0



Gains/Lift Table: Avg response rate:  6.04 %, avg score:  7.34 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010049,0.975532,16.558484,16.558484,1.0,0.985731,1.0,0.985731,0.166392,0.166392,1555.848435,1555.848435,0.166392
1,2,0.020098,0.95739,16.558484,16.558484,1.0,0.966415,1.0,0.976073,0.166392,0.332784,1555.848435,1555.848435,0.332784
2,3,0.030047,0.936822,16.558484,16.558484,1.0,0.948105,1.0,0.966812,0.164745,0.497529,1555.848435,1555.848435,0.497529
3,4,0.040096,0.911602,16.558484,16.558484,1.0,0.924748,1.0,0.95627,0.166392,0.663921,1555.848435,1555.848435,0.663921
4,5,0.050045,0.865027,16.558484,16.558484,1.0,0.890316,1.0,0.943158,0.164745,0.828666,1555.848435,1555.848435,0.828666
5,6,0.10009,0.114307,3.423623,9.991054,0.206759,0.360864,0.60338,0.652011,0.171334,1.0,242.362301,899.105368,0.957751
6,7,0.150035,0.044693,0.0,6.665119,0.0,0.071607,0.40252,0.4588,0.0,1.0,-100.0,566.511936,0.904596
7,8,0.20008,0.024262,0.0,4.998011,0.0,0.03308,0.30184,0.352317,0.0,1.0,-100.0,399.801094,0.851334
8,9,0.30007,0.010878,0.0,3.33256,0.0,0.016488,0.20126,0.240411,0.0,1.0,-100.0,233.255968,0.744917
9,10,0.40006,0.004813,0.0,2.499627,0.0,0.007475,0.150957,0.182191,0.0,1.0,-100.0,149.962696,0.638501




ModelMetricsBinomialGeneric: generic
** Reported on cross-validation data. **

MSE: 0.029906384590010465
RMSE: 0.17293462519117003
LogLoss: 0.10202757059741752
Mean Per-Class Error: 0.14189489047526768
AUC: 0.9634163941714838
AUCPR: 0.6824402520860214
Gini: 0.9268327883429677

Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.2368384043514012: 


Unnamed: 0,Unnamed: 1,0,1,Error,Rate
0,0,11168.0,427.0,0.0368,(427.0/11595.0)
1,1,183.0,558.0,0.247,(183.0/741.0)
2,Total,11351.0,985.0,0.0494,(610.0/12336.0)



Maximum Metrics: Maximum metrics at their respective thresholds


Unnamed: 0,metric,threshold,value,idx
0,max f1,0.236838,0.646582,225.0
1,max f2,0.134815,0.751163,269.0
2,max f0point5,0.522104,0.676311,128.0
3,max accuracy,0.522104,0.959549,128.0
4,max precision,0.997373,1.0,0.0
5,max recall,0.001213,1.0,392.0
6,max specificity,0.997373,1.0,0.0
7,max absolute_mcc,0.188166,0.629447,246.0
8,max min_per_class_accuracy,0.083815,0.913238,298.0
9,max mean_per_class_accuracy,0.07703,0.914307,303.0



Gains/Lift Table: Avg response rate:  6.01 %, avg score:  6.01 %


Unnamed: 0,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain,kolmogorov_smirnov
0,1,0.010052,0.8588178,14.365417,14.365417,0.862903,0.924711,0.862903,0.924711,0.144399,0.144399,1336.541727,1336.541727,0.142933
1,2,0.020023,0.7446725,12.722689,13.547378,0.764228,0.802331,0.813765,0.863769,0.126856,0.271255,1172.268852,1254.737826,0.267288
2,3,0.030075,0.6370048,10.874755,12.654103,0.653226,0.692383,0.760108,0.806486,0.109312,0.380567,987.475513,1165.41026,0.372891
3,4,0.040045,0.5302869,11.369211,12.33418,0.682927,0.582124,0.740891,0.750623,0.11336,0.493927,1036.921102,1133.41802,0.482888
4,5,0.050016,0.4393872,7.85017,11.440285,0.471545,0.482475,0.687196,0.697167,0.078273,0.5722,685.016951,1044.028504,0.555555
5,6,0.100032,0.1585573,5.288434,8.364359,0.317666,0.272936,0.502431,0.485052,0.264507,0.836707,428.843365,736.435935,0.783753
6,7,0.150049,0.06777589,1.753817,6.160845,0.105348,0.104082,0.37007,0.358062,0.087719,0.924426,75.381728,516.084532,0.823866
7,8,0.200065,0.03644916,0.404727,4.721816,0.024311,0.050492,0.28363,0.281169,0.020243,0.944669,-59.527293,372.181576,0.79219
8,9,0.300016,0.01448954,0.243033,3.229695,0.014599,0.023408,0.194002,0.195295,0.024291,0.968961,-75.696681,222.969501,0.711695
9,10,0.400049,0.00566573,0.134909,2.455842,0.008104,0.009445,0.147518,0.148823,0.013495,0.982456,-86.509098,145.584173,0.619627




Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,accuracy,0.952694,0.003289,0.956262,0.951939,0.948269,0.951367,0.955631
1,auc,0.963588,0.006053,0.96292,0.956653,0.959121,0.971239,0.968009
2,err,0.047306,0.003289,0.043738,0.048061,0.051731,0.048633,0.044369
3,err_count,116.8,10.084641,110.0,119.0,130.0,121.0,104.0
4,f0point5,0.609462,0.017384,0.612245,0.586797,0.613346,0.600962,0.633958
5,f1,0.657483,0.026964,0.649682,0.617363,0.657895,0.673855,0.688623
6,f2,0.714636,0.046903,0.691995,0.651289,0.709421,0.766871,0.753604
7,lift_top_group,14.161977,1.138656,15.778778,13.949296,13.311607,14.793513,12.97669
8,logloss,0.10198,0.00728,0.09859,0.107888,0.111233,0.093782,0.098409
9,max_per_class_error,0.240649,0.06638,0.276596,0.323944,0.251497,0.155405,0.195804



See the whole table with table.as_data_frame()

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
generic prediction progress: |███████████████████████████████████████████████████| (done) 100%


0