# 1. Importing dependencies 

In [1]:
import os

import h2o
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# 2. Starting H2O

H2O will automatically check whether an instance is already running and connect to it; if none is found, it starts a new local instance.

In [2]:
# Initialise the H2O connection; the call reports which instance it connected to
# and prints a summary of the cluster (version, nodes, memory, cores).
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,3 hours 35 mins
H2O cluster timezone:,America/Sao_Paulo
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.3
H2O cluster version age:,"14 days, 16 hours and 15 minutes"
H2O cluster name:,H2O_from_python_Semantix_qmsw3y
H2O cluster total nodes:,1
H2O cluster free memory:,3.204 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


# 3. Importing data

In [3]:
# Location of the smoking dataset. Set the SMOKING_CSV environment variable to
# point at your own copy of the file; when it is unset, the path originally used
# by the author is kept as the default so existing runs behave the same.
url = os.environ.get("SMOKING_CSV", "C:/Users/Semantix/Downloads/smoking.csv")

# Parse the CSV into an H2OFrame stored in the cluster under the key "smoking".
smoking = h2o.import_file(url, destination_frame="smoking")

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [4]:
# Per-column overview: type, min/mean/max, sigma, zero and missing counts,
# followed by the first few rows of the frame.
smoking.summary()

Unnamed: 0,C1,id,age,smoke,population,dead,pct
type,int,int,enum,enum,int,int,int
mins,1.0,1.0,,,98.0,18.0,3.0
mean,18.5,18.5,,,1558.9444444444443,1007.3333333333331,72.55555555555559
maxs,36.0,36.0,,,6052.0,6052.0,252.0
sigma,10.535653752852738,10.535653752852738,,,1562.232174887577,1529.5398000705966,69.84695287123716
zeros,0,0,,,0,0,0
missing,0,0,0,0,0,0,0
0,1.0,1.0,40-44,no,656.0,18.0,3.0
1,2.0,2.0,45-59,no,359.0,22.0,6.0
2,3.0,3.0,50-54,no,249.0,19.0,8.0


In [5]:
# Total of the "population" column across all rows.
smoking[:, "population"].sum()

56122.0

# 3. Running a GLM model 

In [25]:
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [26]:
# Select predictors and response by column NAME rather than by position:
# positional indices silently point at the wrong columns if the file layout
# changes (for instance if the leading "C1" index column is dropped).
x = ["age", "smoke"]  # predictors (previously column indices 2 and 3)
y = "pct"  # response (previously column index 6)

In [27]:
# Poisson-family GLM, registered in the cluster under an explicit model id.
mGLM = H2OGeneralizedLinearEstimator(model_id="smoking_andre_bootcamp", family="poisson")

# Fit on the smoking frame with the predictor/response selection made earlier.
mGLM.train(x=x, y=y, training_frame=smoking)

glm Model Build progress: |███████████████████████████████████████████████| 100%


Let's look at the model performance.

In [28]:
# No frame is passed, so the metrics shown are the ones reported on the training data.
mGLM.model_performance()


ModelMetricsRegressionGLM: glm
** Reported on train data. **

MSE: 1152.5051975914891
RMSE: 33.948566944592656
MAE: 26.581720633155037
RMSLE: 0.7413361129878597
R^2: 0.7570133462655522
Mean Residual Deviance: 16.45701542948929
Null degrees of freedom: 35
Residual degrees of freedom: 25
Null deviance: 2266.8887959248736
Residual deviance: 592.4525554616143
AIC: 814.4007230239404




In [29]:
# Fitted coefficients as a dict: the intercept plus one entry per level of each
# categorical predictor (age.*, smoke.*).
mGLM.coef()

{'Intercept': 4.111104754071415,
 'age.40-44': -0.3220519225700471,
 'age.45-59': 0.0,
 'age.50-54': 0.0,
 'age.55-59': -0.44860283701419945,
 'age.60-64': -0.2667237747172493,
 'age.65-69': 0.0,
 'age.70-74': 0.030909545520565018,
 'age.75-79': 0.2480823071341831,
 'age.80+': 0.5749518333250379,
 'smoke.cigarPipeOnly': 0.3154449339678749,
 'smoke.cigarretteOnly': -0.27857024755800097,
 'smoke.cigarrettePlus': 0.7152524902046729,
 'smoke.no': -0.7521271766145503}

# 4. Naive Bayes 

### Observations
* Only for classification, not regression
* Doesn't deal well with NAs
* Doesn't handle new data well (e.g., in production)
* Mostly associated with text

### importing the iris dataset

In [30]:
# Remote location of the iris CSV (the "wheader" in the file name suggests it
# includes a header row). Note: this reuses the name `url` from the smoking import.
url = "http://h2o-public-test-data.s3.amazonaws.com/smalldata/iris/iris_wheader.csv"

In [31]:
# Fetch and parse the iris CSV into an H2OFrame.
iris = h2o.import_file(url)

Parse progress: |█████████████████████████████████████████████████████████| 100%


### splitting the data

In [32]:
# 80/20 train/test split. The split is random, so a fixed seed is supplied to
# make Restart-and-Run-All reproduce the same frames (and therefore the same
# metrics) every time. The split is approximate rather than exact: the unseeded
# run recorded in this notebook produced 123/27 rows instead of 120/30, and the
# counts obtained with this seed may differ from those recorded outputs.
train, test = iris.split_frame(ratios=[0.8], seed=42)

In [33]:
# Number of rows that landed in the training split.
train.nrows

123

In [34]:
# Number of rows that landed in the test split.
test.nrows

27

In [35]:
# Per-column overview of the training split: four real-valued measurements and
# the categorical "class" column.
train.summary()

Unnamed: 0,sepal_len,sepal_wid,petal_len,petal_wid,class
type,real,real,real,real,enum
mins,4.3,2.0,1.0,0.1,
mean,5.820325203252032,3.058536585365855,3.7341463414634144,1.194308943089431,
maxs,7.7,4.4,6.9,2.5,
sigma,0.8518022695970809,0.43133928474992006,1.8116575721522281,0.7821052728822288,
zeros,0,0,0,0,
missing,0,0,0,0,0
0,4.7,3.2,1.3,0.2,Iris-setosa
1,4.6,3.1,1.5,0.2,Iris-setosa
2,5.0,3.6,1.4,0.2,Iris-setosa


In [36]:
from h2o.estimators.naive_bayes import H2ONaiveBayesEstimator

In [37]:
# Naive Bayes classifier with default settings, predicting the species ("class")
# from the four flower measurements in the training split.
mNB = H2ONaiveBayesEstimator()
mNB.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

naivebayes Model Build progress: |████████████████████████████████████████| 100%


In [38]:
# Display the fitted model: summary, training metrics, confusion matrix and hit ratios.
mNB

Model Details
H2ONaiveBayesEstimator :  Naive Bayes
Model Key:  NaiveBayes_model_python_1567871777231_5


Model Summary: 

Unnamed: 0,Unnamed: 1,number_of_response_levels,min_apriori_probability,max_apriori_probability
0,,3.0,0.308943,0.349593




ModelMetricsMultinomial: naivebayes
** Reported on train data. **

MSE: 0.02784168640186024
RMSE: 0.16685828238915873
LogLoss: 0.09412664048544672
Mean Per-Class Error: 0.034252297410192145

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica,Error,Rate
0,43.0,0.0,0.0,0.0,0 / 43
1,0.0,35.0,3.0,0.078947,3 / 38
2,0.0,1.0,41.0,0.02381,1 / 42
3,43.0,36.0,44.0,0.03252,4 / 123



Top-3 Hit Ratios: 

Unnamed: 0,k,hit_ratio
0,1,0.96748
1,2,1.0
2,3,1.0




In [39]:
# Score the held-out test split with the default Naive Bayes model.
p = mNB.predict(test)

naivebayes prediction progress: |█████████████████████████████████████████| 100%


In [40]:
# Show the predictions: the predicted class followed by one probability column per class.
p

predict,Iris-setosa,Iris-versicolor,Iris-virginica
Iris-setosa,1.0,4.77113e-17,4.3951299999999996e-27
Iris-setosa,1.0,4.30358e-16,1.2448700000000001e-26
Iris-setosa,1.0,1.4858e-16,4.07078e-26
Iris-setosa,1.0,7.22595e-15,4.48232e-25
Iris-setosa,1.0,1.38442e-16,1.2791900000000001e-26
Iris-setosa,1.0,1.70616e-18,6.98351e-28
Iris-setosa,1.0,1.40402e-16,1.90382e-26
Iris-versicolor,7.18423e-95,0.745993,0.254007
Iris-versicolor,1.56087e-75,0.997676,0.00232406
Iris-versicolor,2.0598500000000002e-53,0.999998,2.15788e-06




In [41]:
# Evaluate the default model on the held-out test split (metrics are reported on test data).
mNB.model_performance(test)


ModelMetricsMultinomial: naivebayes
** Reported on test data. **

MSE: 0.06196330723684956
RMSE: 0.24892430021363837
LogLoss: 0.26427364753122023
Mean Per-Class Error: 0.08333333333333333

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica,Error,Rate
0,7.0,0.0,0.0,0.0,0 / 7
1,0.0,12.0,0.0,0.0,0 / 12
2,0.0,2.0,6.0,0.25,2 / 8
3,7.0,14.0,6.0,0.074074,2 / 27



Top-3 Hit Ratios: 

Unnamed: 0,k,hit_ratio
0,1,0.925926
1,2,1.0
2,3,1.0




### Let's tune one of the key parameters (`laplace`) to see what performance we get

In [46]:
# Second Naive Bayes model: same predictors, response and training split as the
# first one, differing only in the Laplace smoothing setting.
mNB2 = H2ONaiveBayesEstimator(laplace=2)
mNB2.train(
    x=["sepal_len", "sepal_wid", "petal_len", "petal_wid"],
    y="class",
    training_frame=train,
)

naivebayes Model Build progress: |████████████████████████████████████████| 100%


In [47]:
# Evaluate the laplace=2 model on the same test split for comparison with the default model.
mNB2.model_performance(test)


ModelMetricsMultinomial: naivebayes
** Reported on test data. **

MSE: 0.06196862923988915
RMSE: 0.24893498998712324
LogLoss: 0.26444772917327364
Mean Per-Class Error: 0.08333333333333333

Confusion Matrix: Row labels: Actual class; Column labels: Predicted class


Unnamed: 0,Iris-setosa,Iris-versicolor,Iris-virginica,Error,Rate
0,7.0,0.0,0.0,0.0,0 / 7
1,0.0,12.0,0.0,0.0,0 / 12
2,0.0,2.0,6.0,0.25,2 / 8
3,7.0,14.0,6.0,0.074074,2 / 27



Top-3 Hit Ratios: 

Unnamed: 0,k,hit_ratio
0,1,0.925926
1,2,1.0
2,3,1.0


