In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
import sklearn.model_selection
import time

In [2]:
import h2o
from h2o.automl import H2OAutoML

In [3]:
def compute(x_path, y_path, comp_time, target="y"):
    """Train an H2O AutoML regressor on a CSV dataset and report how well it fits.

    The features and target are read from two CSV files, split 80/20 into
    train and test partitions (fixed ``random_state=7``), and an H2O AutoML
    run (DeepLearning excluded, ``seed=1``) is trained on the train partition
    for at most ``comp_time`` seconds. The leaderboard head, train/test R2
    scores and timings are printed, and the same numbers are returned so that
    several runs can be compared programmatically.

    Requires a running H2O cluster (``h2o.init()`` must have been called).

    Parameters
    ----------
    x_path : str
        Path to the CSV file holding the feature columns.
    y_path : str
        Path to the CSV file holding the target column.
    comp_time : int
        Time budget passed to ``H2OAutoML(max_runtime_secs=...)``.
    target : str, default "y"
        Name of the target column inside the ``y_path`` CSV.

    Returns
    -------
    dict
        ``train_time``, ``predict_time`` (seconds) and ``train_r2``,
        ``test_r2`` for the AutoML leader.

    Raises
    ------
    ValueError
        If ``target`` is not a column of the ``y_path`` CSV.
    """
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path)
    if target not in y.columns:
        raise ValueError(
            "target column %r not found in %s (columns: %s)"
            % (target, y_path, list(y.columns))
        )

    # Split features and the target column together so their rows stay aligned.
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y[target], test_size=0.2, random_state=7
    )
    del X, y  # release the full copies; only the partitions are needed below

    df_train = pd.concat([X_train, y_train], axis=1)
    hf_train = h2o.H2OFrame(df_train)

    t0 = time.time()

    # Every column except the target is a predictor. Build a new list rather
    # than mutating whatever list the frame hands back.
    x_names = [name for name in hf_train.columns if name != target]

    # NOTE(review): verbosity is the string "NULL", kept as in the original;
    # the H2O docs describe None as the "no live log" value - confirm intent.
    aml = H2OAutoML(
        max_runtime_secs=comp_time,
        exclude_algos=['DeepLearning'],
        seed=1,
        verbosity="NULL",
    )
    aml.train(x=x_names, y=target, training_frame=hf_train)

    t1 = time.time()
    train_time = t1 - t0
    print("training time cost:", train_time)

    # Show the ten best models on the AutoML leaderboard.
    lb = aml.leaderboard
    print(lb.head(10))

    # Time only the scoring step; leaderboard retrieval/printing is excluded.
    t_pred_start = time.time()

    train_predictions = h2o.as_list(aml.predict(h2o.H2OFrame(X_train)))
    train_r2 = sklearn.metrics.r2_score(y_train, train_predictions)
    print("Train R2 score:", train_r2)

    test_predictions = h2o.as_list(aml.predict(h2o.H2OFrame(X_test)))
    test_r2 = sklearn.metrics.r2_score(y_test, test_predictions)
    print("Test R2 score:", test_r2)

    predict_time = time.time() - t_pred_start
    print("prediction time cost", predict_time)

    m = aml.get_best_model()
    if hasattr(m, "metalearner"):
        # The leader is a stacked ensemble: list the metalearner's normalized
        # coefficients (one per base model), largest first. These are
        # coefficients, not R2 values, so the column is labelled accordingly.
        coefs = pd.Series(m.metalearner().coef_norm(), name="coef_norm")
        print(coefs.sort_values(ascending=False).to_frame())
    else:
        # A single model can win the leaderboard; it has no metalearner.
        print("best model %s is not a stacked ensemble; no metalearner coefficients" % m.model_id)

    return {
        "train_time": train_time,
        "predict_time": predict_time,
        "train_r2": train_r2,
        "test_r2": test_r2,
    }

In [4]:
# Start (or connect to) the local H2O cluster used by compute().
h2o.init()

# Index of the QSAR dataset to benchmark.
i = 13

# Re-run AutoML on the same dataset with increasing time budgets (seconds).
for comp_time in [600, 1200, 1800, 2400, 3000, 7200]:
    x_path = 'dataset/QSAR_{}_train_x.csv'.format(i)
    y_path = 'dataset/QSAR_{}_train_y.csv'.format(i)

    # Announce the run BEFORE it starts so the message labels the output
    # that follows it (it used to be printed after compute() had finished).
    print("now is processing dataset %s" % i)
    print("max_runtime_secs:", comp_time)

    compute(x_path, y_path, comp_time)

    # Drop the frames and models of this run from the cluster so memory does
    # not accumulate across the six budgets; nothing from the run is reused.
    h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /cache/home/hx152/.conda/envs/merck/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpeymj2nmn
  JVM stdout: /tmp/tmpeymj2nmn/h2o_hx152_started_from_python.out
  JVM stderr: /tmp/tmpeymj2nmn/h2o_hx152_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 24 days
H2O_cluster_name:,H2O_from_python_hx152_76g8zy
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,26.67 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 605.7228455543518


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_1_AutoML_1_20220409_210624,0.187938,0.433518,0.187938,0.317335,
StackedEnsemble_AllModels_2_AutoML_1_20220409_210624,0.188184,0.433801,0.188184,0.318242,
StackedEnsemble_AllModels_3_AutoML_1_20220409_210624,0.189229,0.435004,0.189229,0.318043,
StackedEnsemble_BestOfFamily_2_AutoML_1_20220409_210624,0.190344,0.436284,0.190344,0.321659,
StackedEnsemble_BestOfFamily_1_AutoML_1_20220409_210624,0.190487,0.436448,0.190487,0.320091,
StackedEnsemble_BestOfFamily_3_AutoML_1_20220409_210624,0.191612,0.437735,0.191612,0.322189,
GBM_4_AutoML_1_20220409_210624,0.196323,0.443083,0.196323,0.322979,
GBM_3_AutoML_1_20220409_210624,0.196596,0.443391,0.196596,0.325281,
GBM_2_AutoML_1_20220409_210624,0.197861,0.444815,0.197861,0.329211,
GBM_1_AutoML_1_20220409_210624,0.198806,0.445877,0.198806,0.32371,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.7580348651265019
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.49681962265735047
prediction time cost 16.14554786682129
                                          r2
Intercept                           1.222523
GBM_4_AutoML_1_20220409_210624      0.117838
GBM_1_AutoML_1_20220409_210624      0.112800
XGBoost_1_AutoML_1_20220409_210624  0.076571
GLM_1_AutoML_1_20220409_210624      0.068572
XGBoost_2_AutoML_1_20220409_210624  0.043812
DRF_1_AutoML_1_20220409_210624      0.030989
GBM_2_AutoML_1_20220409_210624      0.005760
GBM_3_AutoML_1_20220409_210624      0.000000
now is processing dataset 13
Parse progress: |██████████████

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_4_AutoML_2_20220409_211703,0.185931,0.431198,0.185931,0.317974,
StackedEnsemble_AllModels_1_AutoML_2_20220409_211703,0.18808,0.433682,0.18808,0.318177,
StackedEnsemble_AllModels_2_AutoML_2_20220409_211703,0.188486,0.43415,0.188486,0.317749,
StackedEnsemble_BestOfFamily_2_AutoML_2_20220409_211703,0.189903,0.435778,0.189903,0.321044,
StackedEnsemble_AllModels_3_AutoML_2_20220409_211703,0.190456,0.436412,0.190456,0.320688,
StackedEnsemble_BestOfFamily_1_AutoML_2_20220409_211703,0.190487,0.436448,0.190487,0.320091,
StackedEnsemble_BestOfFamily_3_AutoML_2_20220409_211703,0.190626,0.436608,0.190626,0.322211,
XGBoost_grid_1_AutoML_2_20220409_211703_model_15,0.192399,0.438634,0.192399,0.319086,
GBM_grid_1_AutoML_2_20220409_211703_model_1,0.193676,0.440086,0.193676,0.324957,
GBM_3_AutoML_2_20220409_211703,0.195206,0.441821,0.195206,0.323366,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.7705808199427746
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.4939404812900027
prediction time cost 14.00875473022461
                                                        r2
Intercept                                         1.222523
GBM_grid_1_AutoML_2_20220409_211703_model_1       0.138895
GLM_1_AutoML_2_20220409_211703                    0.107179
XGBoost_grid_1_AutoML_2_20220409_211703_model_15  0.090196
DRF_1_AutoML_2_20220409_211703                    0.084616
XRT_1_AutoML_2_20220409_211703                    0.036093
now is processing dataset 13
Parse progress: |████████████████████████████████████████████████████

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_4_AutoML_3_20220409_213737,0.18443,0.429453,0.18443,0.313301,
StackedEnsemble_AllModels_1_AutoML_3_20220409_213737,0.187542,0.433061,0.187542,0.316873,
StackedEnsemble_AllModels_2_AutoML_3_20220409_213737,0.188054,0.433652,0.188054,0.316572,
StackedEnsemble_BestOfFamily_2_AutoML_3_20220409_213737,0.188366,0.434012,0.188366,0.319252,
GBM_grid_1_AutoML_3_20220409_213737_model_4,0.189112,0.43487,0.189112,0.315955,
StackedEnsemble_AllModels_3_AutoML_3_20220409_213737,0.189833,0.435698,0.189833,0.320105,
StackedEnsemble_BestOfFamily_3_AutoML_3_20220409_213737,0.19,0.435889,0.19,0.320436,
StackedEnsemble_BestOfFamily_1_AutoML_3_20220409_213737,0.190487,0.436448,0.190487,0.320091,
XGBoost_grid_1_AutoML_3_20220409_213737_model_15,0.192399,0.438634,0.192399,0.319086,
GBM_grid_1_AutoML_3_20220409_213737_model_1,0.193676,0.440086,0.193676,0.324957,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.7635019182374625
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.495365664959604
prediction time cost 13.929560661315918
                                                        r2
Intercept                                         1.222523
GBM_grid_1_AutoML_3_20220409_213737_model_4       0.172123
DRF_1_AutoML_3_20220409_213737                    0.112117
GLM_1_AutoML_3_20220409_213737                    0.098849
XGBoost_grid_1_AutoML_3_20220409_213737_model_15  0.064491
XRT_1_AutoML_3_20220409_213737                    0.005391
now is processing dataset 13
Parse progress: |████████████████████████████████████████████████████

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_4_AutoML_4_20220409_220816,0.184913,0.430016,0.184913,0.314411,
StackedEnsemble_AllModels_1_AutoML_4_20220409_220816,0.187697,0.43324,0.187697,0.317705,
StackedEnsemble_AllModels_2_AutoML_4_20220409_220816,0.18831,0.433947,0.18831,0.31808,
StackedEnsemble_BestOfFamily_2_AutoML_4_20220409_220816,0.188709,0.434407,0.188709,0.31993,
GBM_grid_1_AutoML_4_20220409_220816_model_4,0.189112,0.43487,0.189112,0.315955,
StackedEnsemble_BestOfFamily_1_AutoML_4_20220409_220816,0.190487,0.436448,0.190487,0.320091,
StackedEnsemble_BestOfFamily_3_AutoML_4_20220409_220816,0.190749,0.436748,0.190749,0.321657,
StackedEnsemble_AllModels_3_AutoML_4_20220409_220816,0.190919,0.436943,0.190919,0.321492,
XGBoost_grid_1_AutoML_4_20220409_220816_model_15,0.192399,0.438634,0.192399,0.319086,
GBM_grid_1_AutoML_4_20220409_220816_model_1,0.193676,0.440086,0.193676,0.324957,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.7613512885063551
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.49600789314129057
prediction time cost 14.208386659622192
                                                        r2
Intercept                                         1.222523
GBM_grid_1_AutoML_4_20220409_220816_model_4       0.176061
GLM_1_AutoML_4_20220409_220816                    0.096897
DRF_1_AutoML_4_20220409_220816                    0.092412
XGBoost_grid_1_AutoML_4_20220409_220816_model_15  0.067154
XRT_1_AutoML_4_20220409_220816                    0.015581
now is processing dataset 13
Parse progress: |██████████████████████████████████████████████████

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_4_AutoML_5_20220409_224914,0.184156,0.429134,0.184156,0.313287,
StackedEnsemble_AllModels_1_AutoML_5_20220409_224914,0.187609,0.433139,0.187609,0.317659,
StackedEnsemble_AllModels_2_AutoML_5_20220409_224914,0.188268,0.433898,0.188268,0.317831,
StackedEnsemble_BestOfFamily_2_AutoML_5_20220409_224914,0.188363,0.434008,0.188363,0.319307,
GBM_grid_1_AutoML_5_20220409_224914_model_4,0.189112,0.43487,0.189112,0.315955,
StackedEnsemble_BestOfFamily_3_AutoML_5_20220409_224914,0.189784,0.435642,0.189784,0.320261,
StackedEnsemble_BestOfFamily_1_AutoML_5_20220409_224914,0.190487,0.436448,0.190487,0.320091,
StackedEnsemble_AllModels_3_AutoML_5_20220409_224914,0.191243,0.437313,0.191243,0.320315,
XGBoost_grid_1_AutoML_5_20220409_224914_model_15,0.192399,0.438634,0.192399,0.319086,
GBM_grid_1_AutoML_5_20220409_224914_model_1,0.193676,0.440086,0.193676,0.324957,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.7629563238328495
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.49442705750650184
prediction time cost 14.011562585830688
                                                        r2
Intercept                                         1.222523
GBM_grid_1_AutoML_5_20220409_224914_model_4       0.179011
DRF_1_AutoML_5_20220409_224914                    0.110707
GLM_1_AutoML_5_20220409_224914                    0.098859
XGBoost_grid_1_AutoML_5_20220409_224914_model_15  0.063350
XRT_1_AutoML_5_20220409_224914                    0.000000
now is processing dataset 13
Parse progress: |██████████████████████████████████████████████████

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_6_AutoML_6_20220409_234019,0.166442,0.407973,0.166442,0.299575,
StackedEnsemble_AllModels_3_AutoML_6_20220409_234019,0.166469,0.408006,0.166469,0.298605,
StackedEnsemble_AllModels_1_AutoML_6_20220409_234019,0.167081,0.408756,0.167081,0.300037,
StackedEnsemble_AllModels_2_AutoML_6_20220409_234019,0.167348,0.409081,0.167348,0.300267,
StackedEnsemble_BestOfFamily_4_AutoML_6_20220409_234019,0.168438,0.410412,0.168438,0.301605,
StackedEnsemble_AllModels_5_AutoML_6_20220409_234019,0.168549,0.410547,0.168549,0.30032,
StackedEnsemble_BestOfFamily_2_AutoML_6_20220409_234019,0.168877,0.410946,0.168877,0.302037,
StackedEnsemble_BestOfFamily_7_AutoML_6_20220409_234019,0.169196,0.411335,0.169196,0.303129,
StackedEnsemble_BestOfFamily_3_AutoML_6_20220409_234019,0.17024,0.412602,0.17024,0.303745,
StackedEnsemble_BestOfFamily_1_AutoML_6_20220409_234019,0.170929,0.413435,0.170929,0.303717,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9008513993584026
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.5249527449864487
prediction time cost 18.08031177520752
                                                        r2
Intercept                                         1.254492
GBM_1_AutoML_6_20220409_234019                    0.051021
XGBoost_1_AutoML_6_20220409_234019                0.042263
XGBoost_grid_1_AutoML_6_20220409_234019_model_2   0.041822
XGBoost_grid_1_AutoML_6_20220409_234019_model_9   0.040515
XGBoost_grid_1_AutoML_6_20220409_234019_model_5   0.036833
GBM_4_AutoML_6_20220409_234019                    0.035730
DRF_1_AutoML_6_20220409_234019         