# 4. Finding the best ML model with H2o

Earlier we tested the models through trial and error. Now, we do the same using __autoMachineLearning__ from __H2o__.

In [1]:
#Data management
import pandas as pd
import joblib

#autoMachineLearning
import h2o
from h2o.automl import H2OAutoML

In [2]:
train = pd.read_csv('output/clean_diamonds_train.csv') #Loading TRAIN data

In [3]:
h2o.init() #To start h2o

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,12 days 6 hours 22 mins
H2O cluster timezone:,Europe/Madrid
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,2 months and 26 days
H2O cluster name:,H2O_from_python_fernandocosta_sinund
H2O cluster total nodes:,1
H2O cluster free memory:,1.232 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [4]:
h2train = h2o.H2OFrame(train) #H20Frame is equal to a Pandas dataframe. We parse so it can be later trained.

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [5]:
x = list(train.columns)
x.remove('price')

y = "price"

print("X:", x)
print("y:", y)

X: ['id', 'carat', 'cut', 'color', 'clarity', 'depth', 'table']
y: price


In [6]:
#TRAINING all the h20 models

automl = H2OAutoML(max_models=30, max_runtime_secs=3600, sort_metric='RMSE')
automl.train(x=x, y=y, training_frame=h2train)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [7]:
#Showing the best performers

leader_board = automl.leaderboard
leader_board.head()

model_id,rmse,mean_residual_deviance,mse,mae,rmsle
StackedEnsemble_AllModels_AutoML_20200501_195126,536.661,288005,288005,276.297,0.103422
StackedEnsemble_BestOfFamily_AutoML_20200501_195126,537.244,288631,288631,276.47,0.103107
GBM_1_AutoML_20200501_195126,545.384,297444,297444,286.231,0.111327
XGBoost_grid__1_AutoML_20200501_195126_model_2,547.807,300093,300093,278.647,0.0995881
GBM_2_AutoML_20200501_195126,552.232,304960,304960,296.842,0.127661
XGBoost_3_AutoML_20200501_195126,552.58,305345,305345,298.074,
GBM_3_AutoML_20200501_195126,558.104,311480,311480,299.07,0.127623
GBM_5_AutoML_20200501_195126,561.65,315451,315451,300.645,
XGBoost_1_AutoML_20200501_195126,565.024,319252,319252,310.757,0.141617
GBM_4_AutoML_20200501_195126,571.623,326753,326753,313.911,0.14207




### In the leaderboard ⬆️ there are two types of models: stacked ensamble models & single models. We'll select the best of each category (since the stacked ones cannot be exported).

## 4.1. The best model is a stacked ensamble

In [8]:
#Loading the TEST dataset

stacked_test = pd.read_csv('output/clean_diamonds_test.csv') #Cleaned Test dataset from Kaggle
h2test_stacked = h2o.H2OFrame(stacked_test) #Conversion into a H20 frame to train
h2test_stacked.head() #preview

Parse progress: |█████████████████████████████████████████████████████████| 100%


id,carat,cut,color,clarity,depth,table
0,1.1,4,30,2,62.2,58
1,0.51,5,20,3,62.5,57
2,2.03,4,40,3,61.9,59
3,1.21,4,50,3,60.0,60
4,0.55,5,50,3,61.8,55
5,0.7,4,30,3,61.1,58
6,0.59,5,60,4,60.2,61
7,0.54,5,30,7,61.9,54
8,0.9,4,70,2,58.1,60
9,1.33,5,50,8,60.2,57




Stacked Ensamble models cannot be exported, so we'll just apply it here: 

In [9]:
predicted_price_h2_stacked = automl.leader.predict(h2test_stacked).as_data_frame() #PREDICTING the PRICES on the TEST dataset
predicted_price_h2_stacked #Result

stackedensemble prediction progress: |████████████████████████████████████| 100%


Unnamed: 0,predict
0,4359.154800
1,1072.671123
2,17289.115505
3,6599.684610
4,1511.681404
...,...
13444,755.702161
13445,2676.910646
13446,540.731617
13447,772.001157


In [10]:
#Adding the PRICE column
stacked_test["price"] = predicted_price_h2_stacked 

#Creating a DF with just the two columns we need
stacked_columns = ["id","price"]
stacked_submission = stacked_test["price"]
stacked_test = stacked_test[stacked_columns]
stacked_test.head()

Unnamed: 0,id,price
0,0,4359.1548
1,1,1072.671123
2,2,17289.115505
3,3,6599.68461
4,4,1511.681404


In [11]:
stacked_test.to_csv("output/submissions/predicted_price_h2_stacked.csv") #Saving the CSV

## 4.2. Saving the best single model predicted by h20

In [12]:
#Loading the TEST dataset

single_test = pd.read_csv('output/clean_diamonds_test.csv') #Cleaned Test dataset from Kaggle
h2test_single = h2o.H2OFrame(single_test) #Conversion into a H20 frame to train
h2test_single.head() #preview

Parse progress: |█████████████████████████████████████████████████████████| 100%


id,carat,cut,color,clarity,depth,table
0,1.1,4,30,2,62.2,58
1,0.51,5,20,3,62.5,57
2,2.03,4,40,3,61.9,59
3,1.21,4,50,3,60.0,60
4,0.55,5,50,3,61.8,55
5,0.7,4,30,3,61.1,58
6,0.59,5,60,4,60.2,61
7,0.54,5,30,7,61.9,54
8,0.9,4,70,2,58.1,60
9,1.33,5,50,8,60.2,57




In [13]:
single_model = h2o.get_model(automl.leaderboard.as_data_frame()['model_id'][3]) #Saving the best NON-STACKED model

In [14]:
joblib.dump(single_model, 'output/trained_models/predicted_price_h2_single.sav') #Saving the model


#Another way to save it:
model_path = h2o.save_model(model=single_model, path="output/trained_models/predicted_price_h2_single_again", force=True)
saved_model = h2o.load_model(model_path)

In [15]:
predicted_price_h2_single = single_model.predict(h2test_single).as_data_frame() #PREDICTING the PRICES on the TEST dataset
predicted_price_h2_single #preview

xgboost prediction progress: |████████████████████████████████████████████| 100%


Unnamed: 0,predict
0,4473.162109
1,1058.432739
2,17640.648438
3,6226.746582
4,1470.134399
...,...
13444,752.353394
13445,2706.625488
13446,520.586365
13447,753.365112


In [16]:
#Adding the PRICE column
single_test["price"] = predicted_price_h2_single

#Creating a DF with just the two columns we need
single_columns = ["id","price"]
single_submission = single_test["price"]
single_test = single_test[single_columns]
single_test.head()

Unnamed: 0,id,price
0,0,4473.162109
1,1,1058.432739
2,2,17640.648438
3,3,6226.746582
4,4,1470.134399


In [17]:
predicted_price_h2_single.to_csv("output/submissions/predicted_price_h2_single.csv") #Saving the CSV

## Now we have two different files to upload to Kaggle to see which one did better.


# We're done 🚀
