In [1]:
import pandas as pd
import h2o
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from h2o.automl import H2OAutoML
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

In [2]:
#Read in the cancer data set
df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer/breast-cancer.data', header=None)

In [3]:
#Declare the column names of the cancer data set
df.columns=["Class", "Age", "Menopause",
            "Tumor_Size", "Inv_Nodes", 
            "Node_Caps", "Deg_Malig",
            "Breast", "Breast_quad",
            "Irradiat"]

In [4]:
df.head()

Unnamed: 0,Class,Age,Menopause,Tumor_Size,Inv_Nodes,Node_Caps,Deg_Malig,Breast,Breast_quad,Irradiat
0,no-recurrence-events,30-39,premeno,30-34,0-2,no,3,left,left_low,no
1,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,right,right_up,no
2,no-recurrence-events,40-49,premeno,20-24,0-2,no,2,left,left_low,no
3,no-recurrence-events,60-69,ge40,15-19,0-2,no,2,right,left_up,no
4,no-recurrence-events,40-49,premeno,0-4,0-2,no,2,right,right_low,no


In [5]:
#Convert all of the categorical features variables to numeric (use LabelEncoder)
d = defaultdict(LabelEncoder)    
df_label_encoded = df.apply(lambda x: d[x.name].fit_transform(x))

In [6]:
df_label_encoded.head()

Unnamed: 0,Class,Age,Menopause,Tumor_Size,Inv_Nodes,Node_Caps,Deg_Malig,Breast,Breast_quad,Irradiat
0,0,1,2,5,0,1,2,0,2,0
1,0,2,2,3,0,1,1,1,5,0
2,0,2,2,3,0,1,1,0,2,0
3,0,4,0,2,0,1,1,1,3,0
4,0,2,2,0,0,1,1,1,4,0


In [7]:
def run_h2o_automl(dataframe, variable_to_predict,
                   max_number_models):
    """
    This function initiates an h2o cluster, converts
    the dataframe to an h2o dataframe, and then runs
    the autoML function to generate a list of optimal 
    predictor models. The best models are displayed via a 
    scoreboard.
    Arguments:
        dataframe: Pandas dataframe. 
        variable_to_predict: String. Name of the dataframe that we're predicting.
        max_number_models: Int. Total number of models to run.
    Outputs:
        Leader board of best performing models in the console, plus performance of
        best fit model on the test data, including confusion matrix
    """
    h2o.init()
    #Convert the dataframe to an h2o dataframe
    dataframe = h2o.H2OFrame(dataframe)
    #Convert the variable we're predicting to a factor; otherwise this
    #will run as a regression problem
    dataframe[variable_to_predict] = dataframe[variable_to_predict].asfactor()
    #Declare the x- and y- variables for the database. 
    #x-variables are predictor variables, and y-variable is what
    #we wish to predict
    x = dataframe.columns
    y = variable_to_predict
    x.remove(y)
    #Pull the training and test data out at a 75/25 split.
    train, test, validate = dataframe.split_frame(ratios=[.75, .125])
    # Run AutoML (limited to 1 hour max runtime by default)
    aml = H2OAutoML(max_models=max_number_models, seed=1)
    aml.train(x=x, y=y, training_frame = train, validation_frame = validate)
    # View the AutoML Leaderboard
    lb = aml.leaderboard
    print(lb.head(rows=lb.nrows))
    #Get performance on test data
    performance = aml.leader.model_performance(test)
    print(performance)

In [10]:
#################################################################################################
###RUN run_h2o_automl() FUNCTION IN MAIN
aml = run_h2o_automl(dataframe=df, 
               variable_to_predict='Deg_Malig',
               max_number_models=10)

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,4 mins 33 secs
H2O_cluster_timezone:,Europe/London
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.32.0.2
H2O_cluster_version_age:,26 days
H2O_cluster_name:,H2O_from_python_Ben_rqqaix
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,1.997 Gb
H2O_cluster_total_cores:,4
H2O_cluster_allowed_cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%
AutoML progress: |
17:24:37.960: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.

████████████
17:24:49.17: GBM_5_AutoML_20201213_172437 [GBM def_5] failed: water.exceptions.H2OModelBuilderIllegalArgumentException: Illegal argument(s) for GBM model: GBM_5_AutoML_20201213_172437.  Details: ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 172.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) rows, but have only 172.0.
ERRR on field: _min_rows: The dataset size is too small to split for min_rows=100.0: must have at least 200.0 (weighted) ro

model_id,mean_per_class_error,logloss,rmse,mse
DRF_1_AutoML_20201213_172437,0.552232,1.38412,0.618719,0.382813
GBM_4_AutoML_20201213_172437,0.571754,1.07606,0.628651,0.395203
GBM_1_AutoML_20201213_172437,0.576742,1.20714,0.6403,0.409984
XGBoost_3_AutoML_20201213_172437,0.582452,1.14157,0.635548,0.403922
XGBoost_1_AutoML_20201213_172437,0.586379,1.06516,0.626313,0.392269
GBM_2_AutoML_20201213_172437,0.589401,1.07653,0.629449,0.396206
GBM_3_AutoML_20201213_172437,0.596917,1.07127,0.629524,0.3963
DeepLearning_1_AutoML_20201213_172437,0.603854,1.16556,0.624043,0.389429
XGBoost_2_AutoML_20201213_172437,0.608262,1.03252,0.624343,0.389805
StackedEnsemble_BestOfFamily_AutoML_20201213_172437,0.666667,1.05513,0.64502,0.416051




ModelMetricsMultinomial: drf
** Reported on test data. **

MSE: 0.4304557472048448
RMSE: 0.6560912643869333
LogLoss: 1.1785792362982555
Mean Per-Class Error: 0.5666666666666668

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,1,2,3,Error,Rate
0,3.0,3.0,0.0,0.5,3 / 6
1,1.0,6.0,3.0,0.4,4 / 10
2,2.0,10.0,3.0,0.8,12 / 15
3,6.0,19.0,6.0,0.612903,19 / 31



Top-3 Hit Ratios: 


Unnamed: 0,k,hit_ratio
0,1,0.387097
1,2,0.774193
2,3,1.0





In [12]:
aml

In [11]:
performance = aml.leader.model_performance(test)
print(performance)

AttributeError: 'NoneType' object has no attribute 'leader'