In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
import sklearn.model_selection
import time

In [2]:
import h2o
from h2o.automl import H2OAutoML

In [3]:
def compute(x_path, y_path,comp_time):
    """Train an H2O AutoML regressor on one dataset and print its evaluation.

    The feature and target CSVs are read with pandas, 20% of the rows are
    held out as a test set (``random_state=7``), and H2OAutoML is trained on
    the remaining rows with DeepLearning excluded. The function then prints,
    in order: the training wall-clock time, the first 10 leaderboard rows,
    the train and test R2 scores, the prediction wall-clock time and, when
    the best model exposes a metalearner (i.e. it is a stacked ensemble),
    the metalearner's ``coef_norm()`` values sorted in descending order.

    This function does not call ``h2o.init()``; the caller must have an H2O
    cluster running before invoking it.

    Parameters
    ----------
    x_path : str
        Path of the CSV file holding the predictor columns.
    y_path : str
        Path of the CSV file holding the target; it must contain a column
        named ``"y"``.
    comp_time : int
        Time budget forwarded to ``H2OAutoML(max_runtime_secs=...)``.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If the joined training frame has no ``"y"`` column.
    """
    y_names = "y"

    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=7)
    del X,y
    # X_train and y_train keep the same row index after the split, so the
    # column-wise concat lines features and target up row by row.
    df_train = pd.concat([X_train, y_train], axis=1)

    # Fail before the H2O upload and the long training run rather than after.
    if y_names not in df_train.columns:
        raise ValueError("target column %r not found in %s" % (y_names, y_path))

    hf_train = h2o.H2OFrame(df_train)

    t0 = time.time()

    # Build the predictor list without mutating the list returned by H2O.
    x_names = [name for name in hf_train.columns if name != y_names]

    # NOTE(review): H2O documents None/"debug"/"info"/"warn"/"error" for
    # `verbosity`; "NULL" is kept as-is because the recorded run used it --
    # confirm it is the intended way to silence the live log.
    aml = H2OAutoML(max_runtime_secs=comp_time, exclude_algos=['DeepLearning'], seed = 1, verbosity="NULL")
    aml.train(x=x_names, y=y_names, training_frame=hf_train)

    t1 = time.time()
    print("training time cost:",t1-t0)

    # View the AutoML Leaderboard (first 10 rows)
    lb = aml.leaderboard
    print(lb.head(10))

    # Start a fresh timer so the reported prediction cost does not include
    # fetching and printing the leaderboard above.
    t_pred_start = time.time()

    train_predictions = aml.predict(h2o.H2OFrame(X_train))
    train_predictions = h2o.as_list(train_predictions)
    print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
    test_predictions = aml.predict(h2o.H2OFrame(X_test))
    test_predictions = h2o.as_list(test_predictions)
    print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

    t2 = time.time()
    print("prediction time cost",t2-t_pred_start)

    m = aml.get_best_model()
    # Only stacked ensembles have a metalearner; any other leader would raise
    # AttributeError here and lose the report at the very end of a long run.
    if callable(getattr(m, "metalearner", None)):
        # The values are the metalearner's coef_norm() output, not R2 scores;
        # the "r2" label is kept only so the printed table stays unchanged.
        print(pd.DataFrame(m.metalearner().coef_norm(),index=["r2"]).T.sort_values('r2',ascending = False))
    else:
        print("best model has no metalearner (not a stacked ensemble):", m.model_id)

In [4]:
# Connect to a running local H2O cluster, or start one if none is found.
h2o.init()
i = 13
# Both runs use the same dataset, so the paths are built once.
x_path = 'dataset/QSAR_{}_train_x.csv'.format(i)
y_path = 'dataset/QSAR_{}_train_y.csv'.format(i)
# Repeat the experiment under each AutoML time budget, in seconds.
for comp_time in [10800, 21600]:
    # Announce the run before compute() starts: the call blocks for the whole
    # time budget, so printing afterwards would report a run already finished.
    print("now is processing dataset %s" % i)
    compute(x_path, y_path, comp_time)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /cache/home/hx152/.conda/envs/merck/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpro12r4p3
  JVM stdout: /tmp/tmpro12r4p3/h2o_hx152_started_from_python.out
  JVM stderr: /tmp/tmpro12r4p3/h2o_hx152_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 25 days
H2O_cluster_name:,H2O_from_python_hx152_nta59b
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,26.67 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10742.749240636826


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_3_AutoML_1_20220410_210949,0.166124,0.407583,0.166124,0.298773,
StackedEnsemble_Best1000_1_AutoML_1_20220410_210949,0.16626,0.407749,0.16626,0.29868,
StackedEnsemble_AllModels_6_AutoML_1_20220410_210949,0.16671,0.408301,0.16671,0.300023,
StackedEnsemble_AllModels_1_AutoML_1_20220410_210949,0.167335,0.409066,0.167335,0.300364,
StackedEnsemble_AllModels_5_AutoML_1_20220410_210949,0.167431,0.409183,0.167431,0.300288,
StackedEnsemble_AllModels_2_AutoML_1_20220410_210949,0.167565,0.409346,0.167565,0.30058,
StackedEnsemble_BestOfFamily_4_AutoML_1_20220410_210949,0.168105,0.410006,0.168105,0.300976,
StackedEnsemble_BestOfFamily_7_AutoML_1_20220410_210949,0.168917,0.410995,0.168917,0.302735,
StackedEnsemble_BestOfFamily_2_AutoML_1_20220410_210949,0.168998,0.411094,0.168998,0.302269,
StackedEnsemble_BestOfFamily_3_AutoML_1_20220410_210949,0.170305,0.41268,0.170305,0.304046,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9049888291045766
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.5259253959149157
prediction time cost 23.944796562194824
                                                        r2
Intercept                                         1.254492
XGBoost_grid_1_AutoML_1_20220410_210949_model_33  0.067387
GBM_1_AutoML_1_20220410_210949                    0.053011
GBM_4_AutoML_1_20220410_210949                    0.038710
XGBoost_grid_1_AutoML_1_20220410_210949_model_30  0.037455
...                                                    ...
XGBoost_grid_1_AutoML_1_20220410_210949_model_18  0.000000
XGBoost_grid_1_AutoML_1_20220410_21094

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_Best1000_1_AutoML_2_20220411_00934,0.16553,0.406854,0.16553,0.297511,
StackedEnsemble_AllModels_3_AutoML_2_20220411_00934,0.165729,0.407098,0.165729,0.297954,
StackedEnsemble_AllModels_6_AutoML_2_20220411_00934,0.166221,0.407703,0.166221,0.299195,
StackedEnsemble_BestOfFamily_7_AutoML_2_20220411_00934,0.166556,0.408113,0.166556,0.299638,
StackedEnsemble_AllModels_5_AutoML_2_20220411_00934,0.166749,0.408349,0.166749,0.299735,
StackedEnsemble_AllModels_1_AutoML_2_20220411_00934,0.167296,0.409018,0.167296,0.30014,
StackedEnsemble_AllModels_2_AutoML_2_20220411_00934,0.167461,0.40922,0.167461,0.300267,
StackedEnsemble_BestOfFamily_4_AutoML_2_20220411_00934,0.167965,0.409835,0.167965,0.300788,
StackedEnsemble_BestOfFamily_2_AutoML_2_20220411_00934,0.168924,0.411004,0.168924,0.301892,
XGBoost_lr_search_selection_AutoML_2_20220411_00934_select_grid_model_3,0.169675,0.411916,0.169675,0.301486,



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9060190485760684
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.5270938212902574
prediction time cost 18.316401958465576
                                                       r2
Intercept                                        1.254492
GBM_grid_1_AutoML_2_20220411_00934_model_1       0.074207
GBM_grid_1_AutoML_2_20220411_00934_model_4       0.064777
XGBoost_grid_1_AutoML_2_20220411_00934_model_33  0.051096
GBM_grid_1_AutoML_2_20220411_00934_model_10      0.047966
...                                                   ...
XGBoost_grid_1_AutoML_2_20220411_00934_model_16  0.000000
XGBoost_grid_1_AutoML_2_20220411_00934_model_2