## H2O AutoML

In [None]:
import platform, time, sys
import numpy as np
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
%matplotlib inline
import itertools
from imblearn.over_sampling import SMOTE
from tabulate import tabulate
import seaborn as sns

from sklearn.metrics import accuracy_score, auc, classification_report, confusion_matrix, f1_score
from sklearn.metrics import log_loss, mean_squared_error, precision_score, recall_score, r2_score
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split

import h2o
from h2o.automl import H2OAutoML

from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [None]:
# Silence every Python warning for the rest of the session so the notebook
# output stays readable. NOTE: this also hides deprecation warnings emitted
# by the libraries imported above.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Report the runtime environment (OS, interpreter and key library versions).
# Each entry holds the positional arguments of one print() call.
for report_fields in (
    ('Operating system version....', platform.platform()),
    ("Python version is........... %s.%s.%s" % sys.version_info[:3],),
    ('pandas version is...........', pd.__version__),
    ('numpy version is............', np.__version__),
    ('matplotlib version is.......', matplotlib.__version__),
):
    print(*report_fields)

### Start the h2o server

In [None]:
# Remember when the run started; the last cells use it to report the total
# elapsed time.
start_time = int(time.time())

# Connection and resource settings handed to h2o.init().
h2o_settings = {
    "ip": "localhost",
    "port": 54321,
    "max_mem_size": "24G",
    "nthreads": 6,
}
localH2O = h2o.init(**h2o_settings)

# Turn off progress bars and clear whatever is already stored on the cluster.
h2o.no_progress()
h2o.remove_all()

### Define a correlation plot

In [None]:
def Correlation_plot(df_new, TARGET_COL):
    """Draw a seaborn pair plot of ``df_new`` coloured by the target column.

    Parameters
    ----------
    df_new : pandas.DataFrame
        Frame holding the columns to plot, including the target column.
    TARGET_COL : str, or list/tuple containing exactly one str
        Name of the column used as ``hue``. A one-element list is accepted
        because the rest of this notebook keeps the target as a list.

    Raises
    ------
    ValueError
        If ``TARGET_COL`` is a list/tuple that does not hold exactly one name.

    Notes
    -----
    Two marker styles are supplied, so the target is expected to have exactly
    two distinct values.
    """
    # ``hue`` must be a single column name, not a list of names.
    if isinstance(TARGET_COL, (list, tuple)):
        if len(TARGET_COL) != 1:
            raise ValueError("TARGET_COL must name exactly one column")
        hue_column = TARGET_COL[0]
    else:
        hue_column = TARGET_COL

    plt.ioff()
    red_green = ["#ff0000", "#00ff00"]
    sns.set_palette(red_green)
    # Suppress numpy divide/invalid floating-point warnings while plotting.
    np.seterr(divide='ignore', invalid='ignore')
    g = sns.pairplot(df_new,
                     diag_kind='kde',
                     hue=hue_column,
                     markers=["o", "D"],
                     # ``size`` was renamed to ``height`` in seaborn 0.9.
                     height=1.5,
                     aspect=1,
                     plot_kws={"s": 10})
    g.fig.subplots_adjust(right=0.9)
    plt.show()

### Define a method to get the data set and split it into train and test sets

In [None]:
def Get_Model_Data(path="C:/Users/affiqazrin/Desktop/mmlspark/Data_FinalProject_READY2.csv"):
    """Load the CSV, split it into train/test and wrap both parts as H2O frames.

    The data is split 70/30 first; SMOTE oversampling is then applied to the
    training part only, so synthetic samples are never derived from rows that
    end up in the test set.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. Defaults to the path originally hard-coded
        in this notebook.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test, train, test, feature_columns,
        response_column)`` where the first six items are ``h2o.H2OFrame``
        objects and the last two are lists of column names.
    """
    model = pd.read_csv(path)

    feature_columns = ["duration",
                       "campaign",
                       "pdays",
                       "previous"]
    response_column = ["indexedDeposit"]
    target_name = response_column[0]

    X = model[feature_columns].values
    y = model[response_column].values.ravel()

    # Split BEFORE oversampling: resampling the full data set first would put
    # synthetic neighbours of test rows into the training set.
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=0)

    # ``fit_sample`` was removed from imbalanced-learn; ``fit_resample`` is
    # the supported name.
    sm = SMOTE(random_state=12)
    X_train, y_train = sm.fit_resample(X_train, y_train)

    # Rebuild pandas frames with the target as a plain integer column.
    df_train = pd.DataFrame(X_train, columns=feature_columns)
    df_train[target_name] = np.asarray(y_train).astype(int)
    print('train: \n', df_train.head(5))

    df_test = pd.DataFrame(X_test, columns=feature_columns)
    df_test[target_name] = np.asarray(y_test).astype(int)
    print('test: \n',  df_test.head(5))

    # Upload to H2O and mark the target as categorical.
    train = h2o.H2OFrame(df_train)
    train[target_name] = train[target_name].asfactor()
    X_train = train[feature_columns]
    y_train = train[response_column]

    test = h2o.H2OFrame(df_test)
    test[target_name] = test[target_name].asfactor()
    X_test = test[feature_columns]
    y_test = test[response_column]

    return X_train, y_train, X_test, y_test, train, test, feature_columns, response_column

In [None]:
# Load the raw data set and build a fully numeric feature matrix:
# numerical columns are kept as they are, categorical columns are one-hot
# encoded and the target is label-encoded.
DATASET_LOCAL_PATH = "C:/Users/affiqazrin/Desktop/mmlspark/Data_FinalProject_READY4.csv"
df = pd.read_csv(DATASET_LOCAL_PATH)

ALL_COLS = ["age", #numerical
            "job", #categorical
            "marital", #categorical
            "education", #categorical
            "default", #categorical
            "housing", #categorical, binary
            "loan", #categorical, binary
            "contact", #categorical
            "day", #categorical
            "month", #categorical
            "duration", #numerical
            "campaign", #categorical
            "pdays", #numerical
            "previous", #numerical
            "poutcome", #categorical
            "deposit", #categorical, binary
           ]

NUMERICAL_COLS = ["age", #numerical
                  "duration", #numerical
                  "pdays", #numerical
                  "previous", #numerical
                 ]

CATEGORICAL_COLS = ["job", #categorical
                    "marital", #categorical
                    "education", #categorical
                    "default", #categorical
                    "housing", #categorical, binary
                    "loan", #categorical, binary
                    "contact", #categorical
                    "day", #categorical
                    "month", #categorical
                    "campaign", #categorical
                    "poutcome" #categorical
                   ]

TARGET_COL = ["deposit" #categorical, binary
             ]
target_name = TARGET_COL[0]

# Label-encode the target; ``le`` stays fitted so labels can be mapped back.
le = LabelEncoder()
TARGET_COL2 = df[TARGET_COL].apply(le.fit_transform)

# One-hot encode the categorical columns and give every indicator column a
# readable "<column>_<category>" name.
ohe = OneHotEncoder(handle_unknown='ignore')
encoded_values = ohe.fit_transform(df[CATEGORICAL_COLS]).toarray()
encoded_names = ["{}_{}".format(column, category)
                 for column, categories in zip(CATEGORICAL_COLS, ohe.categories_)
                 for category in categories]
CATEGORICAL_COLS2 = pd.DataFrame(encoded_values,
                                 columns=encoded_names,
                                 index=df.index)

# DATA_PROCESSED is the list of feature column names. The original cell used
# it before it was defined, which raised a NameError.
df_processed = pd.concat([df[NUMERICAL_COLS], CATEGORICAL_COLS2], axis=1)
DATA_PROCESSED = list(df_processed.columns)
mask = DATA_PROCESSED + TARGET_COL

X = df_processed.values
y = TARGET_COL2.values.ravel()

# Split first, then oversample the training part only, so no synthetic
# sample is derived from a row that ends up in the test set.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

# ``fit_sample`` was removed from imbalanced-learn; use ``fit_resample``.
sm = SMOTE(random_state=12)
X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
X_train, y_train = X_resampled, y_resampled

print('Size of resampled data:')
print(' train shape... ', X_train.shape, y_train.shape)
print(' test shape.... ', X_test.shape, y_test.shape)

DATA_PROCESSED_HEADER = list(DATA_PROCESSED)
n_features = len(DATA_PROCESSED_HEADER)
n_ALL_COLS = len(ALL_COLS)
n_NUMERICAL_COLS = len(NUMERICAL_COLS)
n_CATEGORICAL_COLS = len(CATEGORICAL_COLS)

# Rebuild pandas frames with the encoded target as an integer column, then
# upload them to H2O and mark the target as categorical.
y_train_ = y_train.reshape(y_train.shape[0], 1)
df_train = pd.DataFrame(X_train, columns=DATA_PROCESSED_HEADER)
df_train[target_name] = y_train.astype(int)

train = h2o.H2OFrame(df_train)
train[target_name] = train[target_name].asfactor()
X_train = train[DATA_PROCESSED]
y_train = train[TARGET_COL]

y_test_ = y_test.reshape(y_test.shape[0], 1)
df_test = pd.DataFrame(X_test, columns=DATA_PROCESSED_HEADER)
df_test[target_name] = y_test.astype(int)

test = h2o.H2OFrame(df_test)
test[target_name] = test[target_name].asfactor()
X_test = test[DATA_PROCESSED]
y_test = test[TARGET_COL]

### Plot the ROC curve

In [None]:
def ROC_Curve(best_model, train, xlim=(0, 0.2), ylim=(0.8, 1)):
    """Plot the ROC curve of ``best_model`` evaluated on the frame ``train``.

    Parameters
    ----------
    best_model : H2O model
        Model whose ``model_performance`` is computed.
    train : h2o.H2OFrame
        Frame the performance is computed on.
    xlim, ylim : tuple of (low, high), optional
        Axis limits. The defaults keep the original zoom on the top-left
        corner of the ROC space; pass ``(0, 1)`` for both to see the whole
        curve together with the chance diagonal.
    """
    performance = best_model.model_performance(train)
    # Named ``auc_value`` so the ``auc`` imported from sklearn is not shadowed.
    auc_value = performance.auc()
    false_positive_rate = performance.fprs
    true_positive_rate = performance.tprs

    plt.style.use('ggplot')
    plt.figure()
    plt.plot(false_positive_rate,
             true_positive_rate,
             color='darkorange',
             lw=2,
             label='ROC curve (area = %0.2f)' % auc_value)
    # Chance diagonal for reference.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim(*xlim)
    plt.ylim(*ylim)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()

### Plot the performance of each predictor

In [None]:
def Plot_predictor_importance(best_model):
    """Draw a horizontal bar chart of the model's scaled variable importances.

    Parameters
    ----------
    best_model : H2O model
        Any model exposing ``varimp()``.

    Returns
    -------
    None
        When the model reports no variable importances, a message is printed
        and nothing is plotted.
    """
    # Use the public accessor rather than the private ``_model_json``. Per
    # the H2O documentation each row is
    # (variable, relative_importance, scaled_importance, percentage).
    importances = best_model.varimp(use_pandas=False)
    if not importances:
        print('No variable importances available for this model.')
        return

    variables = [row[0] for row in importances]
    scaled_importance = [row[2] for row in importances]

    fig, ax = plt.subplots()
    y_pos = np.arange(len(variables))
    ax.barh(y_pos,
            scaled_importance,
            align='center',
            color='green',
            ecolor='black',
            height=0.5)
    ax.set_yticks(y_pos)
    ax.set_yticklabels(variables)
    # Most important variable at the top.
    ax.invert_yaxis()
    ax.set_xlabel('Scaled Importance')
    ax.set_title('Variable Importance')
    plt.show()

### Plot the confusion matrix

In [None]:
def Plot_Confusion_Matrix(best_model):
    """Plot the confusion matrix of ``best_model`` and print a classification report.

    Predictions are made on the module-level ``X_test`` frame and compared
    with the ``TARGET_COL`` column(s) of the module-level ``y_test`` frame.
    """
    # Pull predictions and ground truth out of H2O into pandas.
    predicted_labels = best_model.predict(X_test).as_data_frame(
        use_pandas=True, header=False)['predict']
    true_labels = y_test.as_data_frame(use_pandas=True, header=False)[TARGET_COL]

    matrix = confusion_matrix(true_labels, predicted_labels)

    # Heat map with one tick per class.
    plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.Blues)
    plt.title('Confusion matrix (on test data)')
    plt.colorbar()
    class_labels = [0, 1]
    tick_positions = np.arange(len(class_labels))
    plt.xticks(tick_positions, class_labels, rotation=0)
    plt.yticks(tick_positions, class_labels)

    # Write each count into its cell; white text on dark cells, black on light.
    half_max = matrix.max() / 2.
    n_rows, n_cols = matrix.shape
    for row in range(n_rows):
        for col in range(n_cols):
            count = matrix[row, col]
            if count > half_max:
                text_color = "white"
            else:
                text_color = "black"
            plt.text(col, row, count,
                     horizontalalignment="center",
                     color=text_color)
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

    report = classification_report(true_labels, predicted_labels)
    print('\nClassification report:\n', report)

### Print the model metrics for the test data set

In [None]:
def Print_Metrics(best_model, X_test, y_test):
    """Print a table of classification metrics for ``best_model``.

    Parameters
    ----------
    best_model : H2O model
        Binary classifier to evaluate.
    X_test : h2o.H2OFrame
        Feature columns of the evaluation data.
    y_test : h2o.H2OFrame
        Single-column frame holding the true labels for ``X_test``.

    Notes
    -----
    ``accuracy``, ``precision``, ``recall`` and ``F1`` are each taken from
    the first ``[threshold, value]`` pair H2O returns for that metric, so
    they are not necessarily measured at the same threshold.
    """
    print('\nModel performance on test data set:')

    predictions = best_model.predict(X_test)
    # Fraction of rows where the predicted label equals the true label. The
    # misclassification rate is its complement (the original reported the
    # match rate itself, and as a pandas Series rather than a number).
    matches = (predictions['predict'] == y_test).as_data_frame(use_pandas=True)
    misclassification_rate = 1.0 - float(matches.values.mean())

    # Score on the frames passed in instead of the module-level ``test``.
    performance = best_model.model_performance(X_test.cbind(y_test))
    accuracy  = performance.accuracy()[0][1]
    precision = performance.precision()[0][1]
    # Older h2o releases did not expose recall(); report NaN there instead of
    # a misleading 0.0.
    recall_fn = getattr(performance, "recall", None)
    recall    = recall_fn()[0][1] if callable(recall_fn) else float("nan")
    F1        = performance.F1()[0][1]
    r2        = performance.r2()
    auc       = performance.auc()
    mse       = performance.mse()
    logloss   = performance.logloss()

    header = ["Metric", "Test dataset"]
    table = [["accuracy",               accuracy],
             ["precision",              precision],
             ["recall",                 recall],
             ["misclassification rate", misclassification_rate],
             ["F1",                     F1],
             ["r2",                     r2],
             ["AUC",                    auc],
             ["mse",                    mse],
             ["logloss",                logloss]
            ]

    print(tabulate(table, header, tablefmt="fancy_grid", floatfmt=".8f"))

### Load the Bottle Rocket dataset

In [None]:
 # Build the H2O train/test frames. This rebinds DATA_PROCESSED and TARGET_COL
 # to the feature and response column lists returned by Get_Model_Data().
 # NOTE(review): this line starts with a stray space; IPython tolerates it,
 # but it is an IndentationError if the notebook is exported to a .py script.
 X_train, y_train, X_test, y_test, train, test, DATA_PROCESSED, TARGET_COL = Get_Model_Data()

### Define the AutoML model.

In [None]:
def Run():
    """Train an H2O AutoML run on the module-level ``train`` frame.

    Reads the module-level names ``DATA_PROCESSED`` (feature column names),
    ``TARGET_COL`` (response column name, or a one-element list holding it),
    ``train``, ``test``, ``X_test`` and ``y_test``.

    Returns
    -------
    tuple
        ``(leaderboard, X_test, y_test, train, test, DATA_PROCESSED,
        TARGET_COL)``; the leaderboard is the first element.
    """
    aml = H2OAutoML(max_models = 500,
                    seed = 7,
                    max_runtime_secs = 1*60*60,
                    nfolds = 5,
                    stopping_metric = "misclassification",
                    project_name="bottle_rocket",
                    stopping_rounds = 5)

    # ``y`` must be a single column name or index, but the notebook keeps the
    # target as a one-element list.
    if isinstance(TARGET_COL, (list, tuple)):
        response = TARGET_COL[0]
    else:
        response = TARGET_COL

    aml.train(x = DATA_PROCESSED,
              y = response,
              training_frame = train)

    lb = aml.leaderboard
    return lb, X_test, y_test, train, test, DATA_PROCESSED, TARGET_COL

### Now let’s run the model and get the best model from the leaderboard

In [None]:
# Run() returns a 7-element tuple, not just the leaderboard: the leaderboard
# frame is lb[0].
lb = Run()

### Choose the best model from the leader board.
#### At this time, we cannot choose a Stacked Ensemble model because it lacks the metrics we want.

In [None]:
# Run() returns a tuple whose first element is the leaderboard frame; accept
# a bare leaderboard frame as well.
leaderboard_frame = lb[0] if isinstance(lb, tuple) else lb

# First leaderboard column: the model ids, best model first.
df = leaderboard_frame[0]
print('df:\n', df)
df2 = df.as_data_frame(use_pandas=True, header=True)

# Walk every leaderboard row (the original looped over len(lb), which is the
# length of the returned tuple) and keep the first model that is not a
# Stacked Ensemble.
best_model = None
for model_id in df2.iloc[:, 0]:
    print('model_id:', model_id)
    if 'StackedEnsemble' in model_id:
        continue
    best_model = h2o.get_model(model_id)
    break

if best_model is None:
    raise RuntimeError('The leaderboard contains no model other than Stacked Ensembles.')

### Plot the ROC Curve

In [None]:
# The curve is computed on `train`, the training frame, so it shows training
# performance rather than test performance.
ROC_Curve(best_model, train)

### Plot the confusion matrix

In [None]:
# Uses the module-level X_test / y_test frames internally.
Plot_Confusion_Matrix(best_model)

### Save the model

In [None]:
# Persist the chosen model under ./h2o_automl; force=True is passed to
# h2o.save_model, and the returned path is kept so the model can be reloaded.
model_path = h2o.save_model(model=best_model, path="h2o_automl", force=True)

### Load the model

In [None]:
# Reload the model from the path returned by h2o.save_model above.
saved_model = h2o.load_model(model_path)

### Plot predictor importance

In [None]:
# Plot the importances of the reloaded model, not the in-memory best_model.
Plot_predictor_importance(saved_model)

### Print the performance of the model
#### Note that ‘recall’ is not available in this version of h2o.automl

In [None]:
# Report test-set metrics for the reloaded model.
Print_Metrics(saved_model, X_test, y_test)

### Print the computation time

In [None]:
# Break the elapsed wall-clock time (whole seconds since start_time) into
# days, hours, minutes and seconds.
end_time = int(time.time())
days, leftover = divmod(end_time - start_time, 86400)
hours, leftover = divmod(leftover, 3600)
minutes, seconds = divmod(leftover, 60)
print('%d days, %d hours, %d minutes, %d seconds' % (days, hours, minutes, seconds))

In [None]:
# `lb` is the tuple returned by Run(); its first element is the leaderboard
# H2OFrame. The original called lb.getLeaderboard("ALL") and
# .show(truncate=False), neither of which exists on a tuple or an H2OFrame.
leaderboard = lb[0] if isinstance(lb, tuple) else lb

# Convert to pandas and print every row and column without truncation.
print(leaderboard.as_data_frame(use_pandas=True).to_string())