## Select the best model from the grid and create the stacked ensembles


In [None]:
import pandas as pd

import warnings
# Suppress every Python warning emitted after this point.
warnings.filterwarnings('ignore')


import h2o
# Connect to (or launch) an H2O instance, asking for at least 25G of memory.
h2o.init(min_mem_size='25G')

# Base directories (relative to the working directory) for the data and the saved models.
DATA_LOCATION = "../../data/"
MODELS_LOCATION = "../../models/ALL_FEATURES/"

In [None]:
# Load the processed training split into an H2OFrame and preview its first rows.
train_path = DATA_LOCATION + "processed/final.train.tsv"
train = h2o.import_file(train_path)
train.head()

In [None]:

# Load the processed test split into an H2OFrame and preview its first rows.
# NOTE(review): this path used to be hard-coded as "../data/processed/final.test.tsv",
# which disagrees with DATA_LOCATION (used for the training split). It now goes
# through the shared constant so both splits come from the same directory —
# confirm DATA_LOCATION points at the right place.
test = h2o.import_file(DATA_LOCATION + "processed/final.test.tsv")
test.head()

In [None]:
# Training frame: every column except the sample identifier and the response
# is used as a predictor.
train_response_col = "Resistance_Status"
train_predictor_cols = train.columns
for excluded in ('SampleID', train_response_col):
    train_predictor_cols.remove(excluded)
# Show the first and last predictor names as a quick sanity check.
print("train frame - predictor column: ", train_predictor_cols[0], train_predictor_cols[-1])
print("train frame - response column: ", train_response_col)



# Test frame: same split into predictors and response.
test_response_col = "Resistance_Status"
test_predictor_cols = test.columns
for excluded in ('SampleID', test_response_col):
    test_predictor_cols.remove(excluded)
print("test frame - predictor columns: ", test_predictor_cols[0], test_predictor_cols[-1])
print("test frame - response column: ", test_response_col)

In [None]:

# Predictor names and response name passed to every train() call below.
x = train_predictor_cols
y = train_response_col

# Binary classification needs a categorical response, so convert the response
# column to a factor in both frames.
for frame, response in ((train, train_response_col), (test, test_response_col)):
    frame[response] = frame[response].asfactor()

# Number of CV folds (to generate level-one data for stacking)
nfolds = 5

MAX_GRID_MODELS = 10




# Stacked ensemble from grid models

In [None]:
# Reload the saved hyper-parameter grids, one per algorithm.
# NOTE(review): these paths used to be hard-coded under "../models/ALL_FEATURES/FINAL/",
# which disagrees with MODELS_LOCATION. They are now derived from the shared
# constant — confirm MODELS_LOCATION points at the directory holding the grids.
GRIDS_LOCATION = MODELS_LOCATION + "FINAL/"

nb_grid = h2o.load_grid(GRIDS_LOCATION + "nb_grid/Grid_NaiveBayes_py_3_sid_9644_model_python_1604419067081_1")

glm_grid = h2o.load_grid(GRIDS_LOCATION + "glm_grid/Grid_GLM_py_3_sid_b7a1_model_python_1604419221083_1")

gbm_grid = h2o.load_grid(GRIDS_LOCATION + "gbm_grid/Grid_GBM_py_7_sid_9651_model_python_1604407520638_1")

xgb_grid = h2o.load_grid(GRIDS_LOCATION + "xgb_grid/Grid_XGBoost_py_7_sid_a3b5_model_python_1604427337744_1")

dl_grid = h2o.load_grid(GRIDS_LOCATION + "dl_grid/Grid_DeepLearning_py_3_sid_b7a1_model_python_1604419221083_608")

drf_grid = h2o.load_grid(GRIDS_LOCATION + "drf_grid/Grid_DRF_py_3_sid_9421_model_python_1604478808297_199")


In [None]:
def best_model_from_grid(model_grid, test_frame=None):
    """Return the model in ``model_grid`` with the highest AUC on a frame.

    Each model is scored exactly once. When several models share the highest
    AUC, the first one encountered wins. The winning AUC is printed.

    Args:
        model_grid: iterable of models exposing ``model_performance(frame)``
            whose result exposes ``auc()``.
        test_frame: frame to score on. Defaults to the notebook-level ``test``
            frame when omitted.

    Returns:
        The best-scoring model.

    Raises:
        ValueError: if ``model_grid`` yields no models.
    """
    # Fall back to the global test frame only when no frame is supplied.
    frame = test if test_frame is None else test_frame

    best_model = None
    best_auc = None
    for mdl in model_grid:
        # Scoring is the expensive step, so do it once per model and keep the value.
        auc = mdl.model_performance(frame).auc()
        if best_auc is None or auc > best_auc:
            best_model, best_auc = mdl, auc

    if best_model is None:
        raise ValueError("model_grid contains no models")

    print(best_auc)
    return best_model

In [None]:
# Pick one model per algorithm by running best_model_from_grid over each loaded
# grid, in the order NB, GLM, GBM, XGBoost, Deep Learning, DRF.
(
    best_nb_model,
    best_glm_model,
    best_gbm_model,
    best_xgb_model,
    best_dl_model,
    best_drf_model,
) = [
    best_model_from_grid(grid)
    for grid in (nb_grid, glm_grid, gbm_grid, xgb_grid, dl_grid, drf_grid)
]

In [None]:
# TODO Add additional params so that we can avoid **{**} later
def extract_params_from_model(actual_params_dict, extra_params=()):
    """Return a filtered copy of a model's parameters for building a new estimator.

    The entries ``model_id``, ``validation_frame``, ``response_column``,
    ``ignored_columns`` and ``training_frame`` are always dropped, together
    with any names listed in ``extra_params``.

    Args:
        actual_params_dict: mapping of parameter name to value (the notebook
            passes a model's ``actual_params``).
        extra_params: iterable of additional parameter names to drop.

    Returns:
        A new dict holding the remaining entries. The input mapping is not
        modified (values are shared, not deep-copied), and names to drop that
        are absent from the input are ignored.
    """
    columns_to_be_removed = {
        'model_id',
        'validation_frame',
        'response_column',
        'ignored_columns',
        'training_frame',
        *extra_params,
    }

    # Build a fresh dict instead of deleting from the caller's mapping.
    return {
        name: value
        for name, value in actual_params_dict.items()
        if name not in columns_to_be_removed
    }

In [None]:
from h2o.estimators import H2ONaiveBayesEstimator

# Rebuild a Naive Bayes estimator from the parameters of the best grid model.
top_nb = H2ONaiveBayesEstimator(**extract_params_from_model(best_nb_model.actual_params))

# NOTE(review): the test frame is also passed as the validation frame — confirm this is intended.
top_nb.train(x=x, y=y, training_frame=train, validation_frame=test)

# Score the test frame once and reuse the metrics for the AUC line and the cell output.
top_nb_test_perf = top_nb.model_performance(test)
print('AUC on test data: ', top_nb_test_perf.auc(), "\n\n============================")

# The cell used to end with a bare `top_nb.model_performance`, which never
# called the method; display the computed metrics instead.
top_nb_test_perf

In [None]:
from h2o.estimators import H2OGeneralizedLinearEstimator

# Rebuild a GLM estimator from the parameters of the best grid model.
# 'lambda' is dropped in addition to the default exclusions.
# NOTE(review): the reason for dropping 'lambda' is not visible here — confirm.
top_glm = H2OGeneralizedLinearEstimator(**extract_params_from_model(best_glm_model.actual_params, ['lambda']))

# NOTE(review): the test frame is also passed as the validation frame — confirm this is intended.
top_glm.train(x=x, y=y, training_frame=train, validation_frame=test)

# Score the test frame once and reuse the metrics for the AUC line and the cell output.
top_glm_test_perf = top_glm.model_performance(test)
print('AUC on test data: ', top_glm_test_perf.auc(), "\n\n============================")

# The cell used to end with a bare `top_glm.model_performance`, which never
# called the method; display the computed metrics instead.
top_glm_test_perf

In [None]:
from h2o.estimators import H2OGradientBoostingEstimator

# Rebuild a GBM estimator from the parameters of the best grid model.
top_gbm = H2OGradientBoostingEstimator(**extract_params_from_model(best_gbm_model.actual_params))

# NOTE(review): the test frame is also passed as the validation frame — confirm this is intended.
top_gbm.train(x=x, y=y, training_frame=train, validation_frame=test)

# Score the test frame once and reuse the metrics for the AUC line and the cell output.
top_gbm_test_perf = top_gbm.model_performance(test)
print('AUC on test data: ', top_gbm_test_perf.auc(), "\n\n============================")

# The cell used to end with a bare `top_gbm.model_performance`, which never
# called the method; display the computed metrics instead.
top_gbm_test_perf

In [None]:
from h2o.estimators import H2OXGBoostEstimator

# Rebuild an XGBoost estimator from the parameters of the best grid model.
top_xgb = H2OXGBoostEstimator(**extract_params_from_model(best_xgb_model.actual_params))

# NOTE(review): the test frame is also passed as the validation frame — confirm this is intended.
top_xgb.train(x=x, y=y, training_frame=train, validation_frame=test)

# Score the test frame once and reuse the metrics for the AUC line and the cell output.
top_xgb_test_perf = top_xgb.model_performance(test)
print('AUC on test data: ', top_xgb_test_perf.auc(), "\n\n============================")

# The cell used to end with a bare `top_xgb.model_performance`, which never
# called the method; display the computed metrics instead.
top_xgb_test_perf

In [None]:
from h2o.estimators import H2ODeepLearningEstimator

# Rebuild a Deep Learning estimator from the parameters of the best grid model.
top_dl = H2ODeepLearningEstimator(**extract_params_from_model(best_dl_model.actual_params))

# NOTE(review): the test frame is also passed as the validation frame — confirm this is intended.
top_dl.train(x=x, y=y, training_frame=train, validation_frame=test)

# Score the test frame once and reuse the metrics for the AUC line and the cell output.
top_dl_test_perf = top_dl.model_performance(test)
print('AUC on test data: ', top_dl_test_perf.auc(), "\n\n============================")

# The cell used to end with a bare `top_dl.model_performance`, which never
# called the method; display the computed metrics instead.
top_dl_test_perf

In [None]:
from h2o.estimators import H2ORandomForestEstimator

# Rebuild a DRF estimator from the parameters of the best grid model, dropping
# 'weights_column' and overriding the cross-validation settings.
# NOTE(review): presumably the override aligns DRF's CV setup with the other
# base models for stacking — confirm.
drf_params = extract_params_from_model(best_drf_model.actual_params, ['weights_column'])
# Use the shared fold-count constant rather than repeating the literal 5.
drf_params.update(nfolds=nfolds, fold_assignment='random')

top_drf = H2ORandomForestEstimator(**drf_params)

# NOTE(review): the test frame is also passed as the validation frame — confirm this is intended.
top_drf.train(x=x, y=y, training_frame=train, validation_frame=test)

# Score the test frame once and reuse the metrics for the AUC line and the cell output.
top_drf_test_perf = top_drf.model_performance(test)
print('AUC on test data: ', top_drf_test_perf.auc(), "\n\n============================")

# The cell used to end with a bare `top_drf.model_performance`, which never
# called the method; display the computed metrics instead.
top_drf_test_perf

In [None]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator


# Stack the six retrained models; no metalearner is specified in this cell.
ensemble = H2OStackedEnsembleEstimator(
    base_models=[
        top_nb,
        top_glm,

        # models with checkpoint available
        top_gbm,
        top_xgb,
        top_dl,
        top_drf,
    ]
)

# NOTE(review): the test frame is also passed as the validation frame — confirm this is intended.
ensemble.train(x=x, y=y, training_frame=train, validation_frame=test)

# Score the test frame once and reuse the metrics for the AUC line and the cell output.
ensemble_test_perf = ensemble.model_performance(test)
print('AUC on test data: ', ensemble_test_perf.auc(), "\n\n============================")

# The cell used to end with a bare `ensemble.model_performance`, which never
# called the method; display the computed metrics instead.
ensemble_test_perf

### Check for the best meta-learner

In [None]:
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator

# Base learners shared by every candidate ensemble built below.
collection_of_models = [
    top_nb,
    top_glm,
    # checkpoint-enabled models
    top_gbm,
    top_xgb,
    top_dl,
    top_drf,
]

# Meta-learner algorithms to try, one stacked ensemble per entry.
meta_algos = ["auto", "xgboost", "drf", "gbm", "glm", "naivebayes", "deeplearning"]

# Trained ensembles, collected in the same order as meta_algos.
ensemble_list = []

for metalearner in meta_algos:
    print("\n\n>>>>> ", metalearner, " <<<<<<")

    # The model id embeds the meta-learner name so each ensemble is identifiable.
    ensemble = H2OStackedEnsembleEstimator(
        base_models=collection_of_models,
        model_id="stacked_ensemble_metalearner_" + metalearner,
        metalearner_algorithm=metalearner,
    )
    ensemble.train(x=x, y=y, training_frame=train, validation_frame=test)

    test_auc = ensemble.model_performance(test).auc()
    print("AUC on test data: ", test_auc)

    ensemble_list.append(ensemble)




