In [1]:
import pandas as pd
import numpy as np
import sklearn.metrics
import sklearn.model_selection
import time

In [2]:
import h2o
from h2o.automl import H2OAutoML

In [3]:
def compute(x_path, y_path):
    X = pd.read_csv(x_path)
    y = pd.read_csv(y_path)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.2, random_state=7)
    df_train = pd.concat([X_train, y_train], axis=1)
    
    hf_train = h2o.H2OFrame(df_train)
    
    t0 = time.time()
    
    x_names = hf_train.columns
    y_names = "y"
    x_names.remove(y_names)
    
    aml = H2OAutoML(max_runtime_secs=3*3600, exclude_algos=['DeepLearning'], seed = 1, verbosity="NULL")
    aml.train(x=x_names, y=y_names, training_frame=hf_train)
    
    t1 = time.time()
    print("training time cost:",t1-t0)
    
    # View the AutoML Leaderboard
    lb = aml.leaderboard
    print(lb.head(10))  # Print all rows instead of default 
    
    train_predictions = aml.predict(h2o.H2OFrame(X_train))
    train_predictions = h2o.as_list(train_predictions)
    print("Train R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))
    test_predictions = aml.predict(h2o.H2OFrame(X_test))
    test_predictions = h2o.as_list(test_predictions)
    print("Test R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))
        
    t2 = time.time()
    print("prediction time cost",t2-t1)

In [4]:
h2o.init()
for i in range(1,16):
    x_path = 'dataset/QSAR_{}_train_x.csv'.format(i)
    y_path = 'dataset/QSAR_{}_train_y.csv'.format(i)
    compute(x_path, y_path)
    print("now is processing dataset %s" % i)

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_152-release"; OpenJDK Runtime Environment (build 1.8.0_152-release-1056-b12); OpenJDK 64-Bit Server VM (build 25.152-b12, mixed mode)
  Starting server from /cache/home/hx152/.conda/envs/merck/lib/python3.8/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp61sixrtq
  JVM stdout: /tmp/tmp61sixrtq/h2o_hx152_started_from_python.out
  JVM stderr: /tmp/tmp61sixrtq/h2o_hx152_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/New_York
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.36.0.3
H2O_cluster_version_age:,1 month and 27 days
H2O_cluster_name:,H2O_from_python_hx152_05kwak
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,26.67 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |██████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10911.236030101776


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_BestOfFamily_3_AutoML_1_20220413_23449,0.135256,0.367772,0.135256,0.242018,0.0602149
StackedEnsemble_AllModels_1_AutoML_1_20220413_23449,0.135337,0.367881,0.135337,0.239861,0.0601496
StackedEnsemble_AllModels_2_AutoML_1_20220413_23449,0.135362,0.367916,0.135362,0.239792,0.0601496
StackedEnsemble_BestOfFamily_2_AutoML_1_20220413_23449,0.135465,0.368056,0.135465,0.242368,0.0602682
StackedEnsemble_BestOfFamily_1_AutoML_1_20220413_23449,0.135619,0.368265,0.135619,0.242884,0.0603035
GBM_1_AutoML_1_20220413_23449,0.140398,0.374698,0.140398,0.250608,0.0613986
XGBoost_1_AutoML_1_20220413_23449,0.158078,0.39759,0.158078,0.26719,0.0656924
XGBoost_3_AutoML_1_20220413_23449,0.164114,0.40511,0.164114,0.275699,0.0664477
XGBoost_2_AutoML_1_20220413_23449,0.166407,0.40793,0.166407,0.271902,0.066857
XGBoost_grid_1_AutoML_1_20220413_23449_model_1,0.180091,0.424371,0.180091,0.270425,0.0689348



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.8135444272923474
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.6605292002244378
prediction time cost 130.72709012031555
now is processing dataset 1
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10804.617581367493


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_6_AutoML_2_20220413_53915,0.416671,0.645501,0.416671,0.493553,0.079584
StackedEnsemble_AllModels_5_AutoML_2_20220413_53915,0.420181,0.648214,0.420181,0.495902,0.0798847
StackedEnsemble_BestOfFamily_7_AutoML_2_20220413_53915,0.423258,0.650583,0.423258,0.500303,0.0804233
StackedEnsemble_AllModels_3_AutoML_2_20220413_53915,0.424265,0.651356,0.424265,0.499095,0.0805857
StackedEnsemble_BestOfFamily_6_AutoML_2_20220413_53915,0.426631,0.65317,0.426631,0.502396,0.0806991
StackedEnsemble_AllModels_1_AutoML_2_20220413_53915,0.427037,0.653481,0.427037,0.50171,0.0808782
StackedEnsemble_AllModels_2_AutoML_2_20220413_53915,0.427214,0.653616,0.427214,0.50186,0.0808945
StackedEnsemble_BestOfFamily_4_AutoML_2_20220413_53915,0.427322,0.653699,0.427322,0.502647,0.0809475
StackedEnsemble_BestOfFamily_3_AutoML_2_20220413_53915,0.429219,0.655148,0.429219,0.503438,0.0811144
StackedEnsemble_BestOfFamily_2_AutoML_2_20220413_53915,0.429524,0.655381,0.429524,0.503463,0.081109



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.86423158593792
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.6974569923684721
prediction time cost 24.21174430847168
now is processing dataset 2
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10740.374825239182


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_5_AutoML_3_20220413_84000,0.442219,0.664995,0.442219,0.478324,0.094836
StackedEnsemble_AllModels_6_AutoML_3_20220413_84000,0.449082,0.670136,0.449082,0.478913,0.0957055
StackedEnsemble_AllModels_3_AutoML_3_20220413_84000,0.455641,0.675012,0.455641,0.483608,0.0964851
StackedEnsemble_Best1000_1_AutoML_3_20220413_84000,0.455734,0.675081,0.455734,0.483442,0.0964843
StackedEnsemble_BestOfFamily_7_AutoML_3_20220413_84000,0.458676,0.677256,0.458676,0.486522,0.0967856
StackedEnsemble_AllModels_2_AutoML_3_20220413_84000,0.459096,0.677566,0.459096,0.486994,0.0969059
StackedEnsemble_BestOfFamily_4_AutoML_3_20220413_84000,0.459657,0.67798,0.459657,0.486074,0.0968414
StackedEnsemble_AllModels_1_AutoML_3_20220413_84000,0.45984,0.678115,0.45984,0.487745,0.0969949
StackedEnsemble_BestOfFamily_3_AutoML_3_20220413_84000,0.463591,0.680875,0.463591,0.490208,0.0972287
StackedEnsemble_BestOfFamily_1_AutoML_3_20220413_84000,0.465054,0.681949,0.465054,0.494762,0.0978844



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.8527927099973129
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.7370728261100883
prediction time cost 27.336482286453247
now is processing dataset 3
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 8699.976075172424


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_6_AutoML_4_20220413_113932,0.115602,0.340003,0.115602,0.255198,0.0468351
StackedEnsemble_BestOfFamily_7_AutoML_4_20220413_113932,0.116419,0.341203,0.116419,0.257302,0.0469747
GBM_grid_1_AutoML_4_20220413_113932_model_59,0.116796,0.341754,0.116796,0.259592,0.0470402
GBM_grid_1_AutoML_4_20220413_113932_model_55,0.117538,0.342838,0.117538,0.258353,0.0473431
StackedEnsemble_BestOfFamily_8_AutoML_4_20220413_113932,0.11796,0.343454,0.11796,0.264339,0.0472955
StackedEnsemble_AllModels_3_AutoML_4_20220413_113932,0.118474,0.344201,0.118474,0.261161,0.0474559
GBM_grid_1_AutoML_4_20220413_113932_model_57,0.119013,0.344983,0.119013,0.260217,0.0474427
GBM_grid_1_AutoML_4_20220413_113932_model_45,0.119128,0.345149,0.119128,0.260536,0.0474608
GBM_grid_1_AutoML_4_20220413_113932_model_12,0.119359,0.345483,0.119359,0.260952,0.0476569
GBM_grid_1_AutoML_4_20220413_113932_model_15,0.119468,0.345641,0.119468,0.259503,0.047734



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9476705167180306
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.6269309705445079
prediction time cost 6.483209133148193
now is processing dataset 4
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%
training time cost: 10719.907083749771


model_id,mean_residual_deviance,rmse,mse,mae,rmsle
StackedEnsemble_AllModels_6_AutoML_5_20220413_140448,0.756598,0.869826,0.756598,0.645832,0.125443
StackedEnsemble_AllModels_3_AutoML_5_20220413_140448,0.759833,0.871684,0.759833,0.641582,0.125546
StackedEnsemble_BestOfFamily_7_AutoML_5_20220413_140448,0.760857,0.872271,0.760857,0.650505,0.125278
StackedEnsemble_BestOfFamily_4_AutoML_5_20220413_140448,0.761158,0.872444,0.761158,0.644,0.125173
StackedEnsemble_Best1000_1_AutoML_5_20220413_140448,0.761966,0.872906,0.761966,0.643527,0.125603
StackedEnsemble_AllModels_1_AutoML_5_20220413_140448,0.765796,0.875098,0.765796,0.649363,0.126428
StackedEnsemble_AllModels_2_AutoML_5_20220413_140448,0.766933,0.875747,0.766933,0.649502,0.126453
StackedEnsemble_BestOfFamily_2_AutoML_5_20220413_140448,0.770467,0.877763,0.770467,0.652018,0.126612
StackedEnsemble_BestOfFamily_3_AutoML_5_20220413_140448,0.774746,0.880197,0.774746,0.653497,0.126858
StackedEnsemble_BestOfFamily_1_AutoML_5_20220413_140448,0.778658,0.882416,0.778658,0.657539,0.126608



Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Train R2 score: 0.9387620544627472
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
stackedensemble prediction progress: |███████████████████████████████████████████| (done) 100%
Test R2 score: 0.7685802616730701
prediction time cost 14.369329690933228
now is processing dataset 5


H2OConnectionError: Local server has died unexpectedly. RIP.