In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
import sklearn.model_selection
import time

In [2]:
import h2o
from h2o.automl import H2OAutoML

In [3]:
def compute(x_path, y_path):
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=7)
    df_train = pd.concat([X_train, y_train], axis=1)
    
    hf_train = h2o.H2OFrame(df_train)
    
    t0 = time.time()
    
    x_names = hf_train.columns
    y_names = "y"
    x_names.remove(y_names)
    
    aml = H2OAutoML(max_runtime_secs=3*3600, exclude_algos=['DeepLearning'], seed = 1, verbosity="NULL")
    aml.train(x=x_names, y=y_names, training_frame=hf_train)
    
    t1 = time.time()
    print("training time cost:",t1-t0)
    
    # View the AutoML Leaderboard
    lb = aml.leaderboard
    print(lb.head(10))  # Print all rows instead of default 
    
    train_predictions = aml.predict(h2o.H2OFrame(X_train))
    train_predictions = h2o.as_list(train_predictions)
    print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
    test_predictions = aml.predict(h2o.H2OFrame(X_test))
    test_predictions = h2o.as_list(test_predictions)
    print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))
        
    t2 = time.time()
    print("prediction time cost",t2-t1)

In [4]:
h2o.init()
for i in range(6,16):
    x_path = 'dataset/QSAR_{}_train_x.csv'.format(i)
    y_path = 'dataset/QSAR_{}_train_y.csv'.format(i)
    compute(x_path, y_path)
    print("now is processing dataset %s" % i)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /cache/home/hx152/.conda/envs/merck/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpzvzb63y0
  JVM stdout: /tmp/tmpzvzb63y0/h2o_hx152_started_from_python.out
  JVM stderr: /tmp/tmpzvzb63y0/h2o_hx152_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 28 days
H2O_cluster_name:,H2O_from_python_hx152_z9a4ol
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,26.67 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |█████████████████████████████████████████████████████████████| (done) 100%
training time cost: 11888.685725927353


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_1_AutoML_1_20220414_34236,0.205822,0.453676,0.205822,0.313536,
StackedEnsemble_BestOfFamily_1_AutoML_1_20220414_34236,0.207184,0.455174,0.207184,0.3158,
StackedEnsemble_BestOfFamily_2_AutoML_1_20220414_34236,0.208002,0.456072,0.208002,0.315908,
GBM_1_AutoML_1_20220414_34236,0.239471,0.489358,0.239471,0.347207,
XGBoost_3_AutoML_1_20220414_34236,0.273026,0.522519,0.273026,0.379547,
GLM_1_AutoML_1_20220414_34236,0.291326,0.539746,0.291326,0.379238,
XGBoost_2_AutoML_1_20220414_34236,0.339825,0.582945,0.339825,0.421083,
XGBoost_1_AutoML_1_20220414_34236,0.355101,0.595904,0.355101,0.437031,
GBM_2_AutoML_1_20220414_34236,0.703225,0.838585,0.703225,0.660691,
GBM_3_AutoML_1_20220414_34236,0.767945,0.876325,0.767945,0.692392,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9071583875730205
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.8451595380208887
prediction time cost 124.17309665679932
now is processing dataset 6
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 8968.960248947144


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_3_AutoML_2_20220414_70254,432.071,20.7863,432.071,15.5187,
StackedEnsemble_AllModels_6_AutoML_2_20220414_70254,434.678,20.8489,434.678,15.7146,
StackedEnsemble_AllModels_2_AutoML_2_20220414_70254,438.079,20.9303,438.079,15.7281,
StackedEnsemble_BestOfFamily_7_AutoML_2_20220414_70254,439.945,20.9749,439.945,15.8713,
StackedEnsemble_BestOfFamily_2_AutoML_2_20220414_70254,440.716,20.9932,440.716,15.8581,
StackedEnsemble_BestOfFamily_3_AutoML_2_20220414_70254,441.496,21.0118,441.496,15.8639,
StackedEnsemble_AllModels_1_AutoML_2_20220414_70254,442.066,21.0254,442.066,15.8523,
StackedEnsemble_BestOfFamily_4_AutoML_2_20220414_70254,444.69,21.0877,444.69,15.8723,
StackedEnsemble_Best1000_1_AutoML_2_20220414_70254,450.303,21.2204,450.303,16.6686,1.17653
StackedEnsemble_AllModels_5_AutoML_2_20220414_70254,451.125,21.2397,451.125,15.6699,1.06377



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9794076165277259
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.6745653400096651
prediction time cost 5.0137779712677
now is processing dataset 7
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10801.369727134705


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_5_AutoML_3_20220414_93257,0.393085,0.626965,0.393085,0.470565,0.0697539
StackedEnsemble_BestOfFamily_7_AutoML_3_20220414_93257,0.40039,0.632764,0.40039,0.478591,0.0704596
StackedEnsemble_AllModels_3_AutoML_3_20220414_93257,0.403248,0.635018,0.403248,0.477382,0.0707433
StackedEnsemble_AllModels_1_AutoML_3_20220414_93257,0.403865,0.635504,0.403865,0.477939,0.0708068
StackedEnsemble_AllModels_2_AutoML_3_20220414_93257,0.404276,0.635827,0.404276,0.478131,0.0708426
StackedEnsemble_BestOfFamily_1_AutoML_3_20220414_93257,0.404649,0.63612,0.404649,0.478635,0.0708894
StackedEnsemble_BestOfFamily_2_AutoML_3_20220414_93257,0.405724,0.636965,0.405724,0.479296,0.0709754
StackedEnsemble_BestOfFamily_4_AutoML_3_20220414_93257,0.405729,0.636969,0.405729,0.480201,0.0709757
StackedEnsemble_BestOfFamily_6_AutoML_3_20220414_93257,0.406156,0.637304,0.406156,0.480876,0.070945
StackedEnsemble_BestOfFamily_3_AutoML_3_20220414_93257,0.408508,0.639146,0.408508,0.482488,0.0712415



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.8779004185220038
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.7440634276173057
prediction time cost 28.0816068649292
now is processing dataset 8
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10714.526232481003


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_6_AutoML_4_20220414_123340,0.31675,0.562806,0.31675,0.408622,0.0786082
StackedEnsemble_AllModels_5_AutoML_4_20220414_123340,0.32297,0.568305,0.32297,0.408612,0.0791341
StackedEnsemble_AllModels_3_AutoML_4_20220414_123340,0.327185,0.572001,0.327185,0.415978,0.0798913
StackedEnsemble_Best1000_1_AutoML_4_20220414_123340,0.327273,0.572078,0.327273,0.415941,0.0798992
StackedEnsemble_BestOfFamily_7_AutoML_4_20220414_123340,0.327968,0.572685,0.327968,0.4208,0.0800918
StackedEnsemble_BestOfFamily_4_AutoML_4_20220414_123340,0.329317,0.573862,0.329317,0.419568,0.0801987
StackedEnsemble_AllModels_1_AutoML_4_20220414_123340,0.333483,0.57748,0.333483,0.422718,0.0806968
StackedEnsemble_BestOfFamily_2_AutoML_4_20220414_123340,0.334054,0.577974,0.334054,0.423476,0.0807278
StackedEnsemble_AllModels_2_AutoML_4_20220414_123340,0.334132,0.578042,0.334132,0.422797,0.0807691
StackedEnsemble_BestOfFamily_3_AutoML_4_20220414_123340,0.335989,0.579645,0.335989,0.424412,0.0809671



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9655220091858868
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.8140721333856284
prediction time cost 16.94817543029785
now is processing dataset 9
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10802.025424957275


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_3_AutoML_5_20220414_153305,0.465736,0.682448,0.465736,0.517425,0.0875518
StackedEnsemble_AllModels_2_AutoML_5_20220414_153305,0.466907,0.683306,0.466907,0.518998,0.0877017
StackedEnsemble_AllModels_1_AutoML_5_20220414_153305,0.470614,0.686013,0.470614,0.521324,0.088075
StackedEnsemble_BestOfFamily_2_AutoML_5_20220414_153305,0.472925,0.687695,0.472925,0.523707,0.0882558
StackedEnsemble_BestOfFamily_1_AutoML_5_20220414_153305,0.473448,0.688076,0.473448,0.524001,0.0883099
StackedEnsemble_BestOfFamily_3_AutoML_5_20220414_153305,0.473639,0.688214,0.473639,0.524302,0.0882401
GBM_1_AutoML_5_20220414_153305,0.485454,0.696745,0.485454,0.532419,0.0894098
StackedEnsemble_BestOfFamily_4_AutoML_5_20220414_153305,0.493433,0.702448,0.493433,0.535207,0.0898682
XGBoost_3_AutoML_5_20220414_153305,0.548951,0.740912,0.548951,0.56684,0.0953564
GBM_2_AutoML_5_20220414_153305,0.566919,0.75294,0.566919,0.592528,0.0960149



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9267756098430852
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.7945234513315509
prediction time cost 24.908323287963867
now is processing dataset 10
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10768.209941148758


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_6_AutoML_6_20220414_183348,0.0937642,0.306209,0.0937642,0.226534,0.239457
StackedEnsemble_Best1000_1_AutoML_6_20220414_183348,0.0938258,0.30631,0.0938258,0.226374,0.239603
StackedEnsemble_AllModels_3_AutoML_6_20220414_183348,0.0939151,0.306456,0.0939151,0.22653,0.239643
StackedEnsemble_AllModels_2_AutoML_6_20220414_183348,0.09513,0.308431,0.09513,0.229899,0.241646
StackedEnsemble_AllModels_1_AutoML_6_20220414_183348,0.095736,0.309412,0.095736,0.230313,0.24202
StackedEnsemble_AllModels_5_AutoML_6_20220414_183348,0.0957721,0.309471,0.0957721,0.226277,0.240229
StackedEnsemble_BestOfFamily_3_AutoML_6_20220414_183348,0.0971966,0.311764,0.0971966,0.232937,0.244373
StackedEnsemble_BestOfFamily_7_AutoML_6_20220414_183348,0.0973254,0.31197,0.0973254,0.232774,0.244125
StackedEnsemble_BestOfFamily_2_AutoML_6_20220414_183348,0.0974525,0.312174,0.0974525,0.232112,0.244182
StackedEnsemble_BestOfFamily_4_AutoML_6_20220414_183348,0.0975134,0.312271,0.0975134,0.232394,0.244295



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9560655135093112
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.6917540398522294
prediction time cost 18.140254259109497
now is processing dataset 11
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10801.259329319


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_5_AutoML_7_20220414_213357,0.145354,0.381253,0.145354,0.281822,
StackedEnsemble_AllModels_3_AutoML_7_20220414_213357,0.145697,0.381703,0.145697,0.281711,
StackedEnsemble_AllModels_2_AutoML_7_20220414_213357,0.146604,0.382889,0.146604,0.282279,
StackedEnsemble_AllModels_1_AutoML_7_20220414_213357,0.146985,0.383387,0.146985,0.282604,
StackedEnsemble_BestOfFamily_2_AutoML_7_20220414_213357,0.148462,0.385308,0.148462,0.284222,
StackedEnsemble_BestOfFamily_3_AutoML_7_20220414_213357,0.148731,0.385657,0.148731,0.285138,
StackedEnsemble_BestOfFamily_6_AutoML_7_20220414_213357,0.148922,0.385904,0.148922,0.285357,
StackedEnsemble_BestOfFamily_1_AutoML_7_20220414_213357,0.148955,0.385947,0.148955,0.284549,
StackedEnsemble_BestOfFamily_5_AutoML_7_20220414_213357,0.15197,0.389833,0.15197,0.290339,
GBM_1_AutoML_7_20220414_213357,0.157142,0.396411,0.157142,0.294506,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9387665068777332
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.7541078353437852
prediction time cost 24.90511989593506
now is processing dataset 12
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10784.775527715683


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_Best1000_1_AutoML_8_20220415_03441,0.166532,0.408084,0.166532,0.298783,
StackedEnsemble_AllModels_6_AutoML_8_20220415_03441,0.166671,0.408253,0.166671,0.299747,
StackedEnsemble_AllModels_3_AutoML_8_20220415_03441,0.166713,0.408305,0.166713,0.298845,
StackedEnsemble_AllModels_1_AutoML_8_20220415_03441,0.167343,0.409076,0.167343,0.30036,
StackedEnsemble_AllModels_2_AutoML_8_20220415_03441,0.167576,0.40936,0.167576,0.300593,
StackedEnsemble_BestOfFamily_4_AutoML_8_20220415_03441,0.168223,0.41015,0.168223,0.30124,
StackedEnsemble_AllModels_5_AutoML_8_20220415_03441,0.168276,0.410214,0.168276,0.301476,
StackedEnsemble_BestOfFamily_7_AutoML_8_20220415_03441,0.168904,0.41098,0.168904,0.302751,
StackedEnsemble_BestOfFamily_2_AutoML_8_20220415_03441,0.168987,0.41108,0.168987,0.302421,
StackedEnsemble_BestOfFamily_3_AutoML_8_20220415_03441,0.170375,0.412765,0.170375,0.304262,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.8969535784047189
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.5280032494119464
prediction time cost 20.61082434654236
now is processing dataset 13
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10742.083377122879


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_3_AutoML_9_20220415_33500,0.112112,0.334831,0.112112,0.249792,
StackedEnsemble_AllModels_6_AutoML_9_20220415_33500,0.112163,0.334907,0.112163,0.250387,
StackedEnsemble_Best1000_1_AutoML_9_20220415_33500,0.112206,0.334972,0.112206,0.250243,
StackedEnsemble_BestOfFamily_7_AutoML_9_20220415_33500,0.113805,0.33735,0.113805,0.253807,
StackedEnsemble_BestOfFamily_4_AutoML_9_20220415_33500,0.113965,0.337588,0.113965,0.253869,
StackedEnsemble_AllModels_2_AutoML_9_20220415_33500,0.114654,0.338606,0.114654,0.25427,
StackedEnsemble_AllModels_1_AutoML_9_20220415_33500,0.115203,0.339415,0.115203,0.254818,
StackedEnsemble_BestOfFamily_2_AutoML_9_20220415_33500,0.115222,0.339444,0.115222,0.254396,
StackedEnsemble_BestOfFamily_3_AutoML_9_20220415_33500,0.11558,0.339971,0.11558,0.255243,
StackedEnsemble_AllModels_5_AutoML_9_20220415_33500,0.115589,0.339984,0.115589,0.254053,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9439266788279845
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.46311902264248284
prediction time cost 18.165236711502075
now is processing dataset 14
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |██████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10764.776508331299


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_6_AutoML_10_20220415_63435,0.677045,0.822828,0.677045,0.603312,0.117271
StackedEnsemble_AllModels_5_AutoML_10_20220415_63435,0.687565,0.829195,0.687565,0.605153,0.117184
StackedEnsemble_Best1000_1_AutoML_10_20220415_63435,0.687586,0.829208,0.687586,0.607822,0.118033
StackedEnsemble_AllModels_3_AutoML_10_20220415_63435,0.687618,0.829227,0.687618,0.607824,0.118028
StackedEnsemble_BestOfFamily_4_AutoML_10_20220415_63435,0.701444,0.837523,0.701444,0.617218,0.119428
StackedEnsemble_BestOfFamily_7_AutoML_10_20220415_63435,0.703043,0.838477,0.703043,0.618485,0.119594
StackedEnsemble_AllModels_2_AutoML_10_20220415_63435,0.703308,0.838635,0.703308,0.618473,0.119805
StackedEnsemble_AllModels_1_AutoML_10_20220415_63435,0.704572,0.839388,0.704572,0.620749,0.119893
StackedEnsemble_BestOfFamily_2_AutoML_10_20220415_63435,0.71266,0.844192,0.71266,0.627963,0.120609
StackedEnsemble_BestOfFamily_3_AutoML_10_20220415_63435,0.714079,0.845032,0.714079,0.625952,0.120901



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9716114818185428
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.8276266179095901
prediction time cost 17.88947319984436
now is processing dataset 15
