# Week 2 - Assignment

In [1]:
# imports
import h2o
import numpy as np
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [2]:
# connect to a running H2O instance, or start a local one if none is found
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_232"; OpenJDK Runtime Environment (build 1.8.0_232-8u232-b09-0ubuntu1~19.04.1-b09); OpenJDK 64-Bit Server VM (build 25.232-b09, mixed mode)
  Starting server from /home/megan/Projects/h2oclass/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp2ucw_rgp
  JVM stdout: /tmp/tmp2ucw_rgp/h2o_megan_started_from_python.out
  JVM stderr: /tmp/tmp2ucw_rgp/h2o_megan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,9 days
H2O cluster name:,H2O_from_python_megan_2omhio
H2O cluster total nodes:,1
H2O cluster free memory:,1.520 Gb
H2O cluster total cores:,3
H2O cluster allowed cores:,3


In [3]:
# seed NumPy's global random generator so the np.random draws used to build
# the synthetic columns below are the same on every run
np.random.seed(123)

In [4]:
# number of rows in the synthetic dataset; every generated column has this length
N = 1000

In [5]:
# start the dataframe with a single id column running from 0 to N-1
df = pd.DataFrame({'id': range(N)})

In [6]:
# pool of pet labels for a feature that plays no part in the income formula;
# the repeat counts set how often each label appears in the pool
pets = np.repeat(
    ['cat', 'dog', 'fish', 'hamster', 'turtle', 'snake'],
    [3, 3, 2, 1, 1, 1],
)

In [7]:
# assign pets by cycling through the pool in id order (id modulo pool size);
# this is deterministic, not a random draw, so each label's share of rows
# follows how often it appears in the pool
pet_index = df.id.values % len(pets)
df = df.assign(pet = pets[pet_index])

In [8]:
# add an age column: uniform draws between 18 and 71, rounded to whole years
ages = np.random.uniform(18, 71, N).round()
df = df.assign(age = ages)

In [9]:
# years of education: normal draws (mean 15, sd 3) rounded to whole years,
# with any negative value raised to zero
education_years = np.random.normal(15, 3, N).round()
education_years[education_years < 0] = 0
df = df.assign(education = education_years)

In [10]:
# performance review score: normal draws (mean 5, sd 2) rounded to whole
# numbers, then limited to the 1-to-9 scale
review_scores = np.clip(np.random.normal(5, 2, N).round(), 1, 9)
df = df.assign(performance_review = review_scores)

In [11]:
# build the income column from age, education, and performance review
age = df.age.values
education = df.education.values
review = df.performance_review.values
# noise term: normal draws with mean 5000 and sd 5000
noise = np.random.normal(5000, 5000, N)
income = (
    # base amount plus a term that grows with the square of age
    20000
    + (age * 3)**2
    # additive effects of education and performance review
    + education * 1000
    + review * 250
    # interaction between education and performance review
    + education * review * 500
    + noise
)
# round to the nearest hundred before storing
df = df.assign(income = income.round(-2))

In [12]:
# summary statistics for every column, including the non-numeric pet column
df.describe(include='all')

Unnamed: 0,id,pet,age,education,performance_review,income
count,1000.0,1000,1000.0,1000.0,1000.0,1000.0
unique,,6,,,,
top,,cat,,,,
freq,,273,,,,
mean,499.5,,44.464,14.92,5.082,99105.5
std,288.819436,,15.113607,2.901382,1.902865,21939.523752
min,0.0,,18.0,4.0,1.0,47300.0
25%,249.75,,32.0,13.0,4.0,82500.0
50%,499.5,,44.0,15.0,5.0,98800.0
75%,749.25,,57.0,17.0,6.0,112550.0


In [13]:
# load the pandas dataframe into H2O as a frame stored under the name 'people'
people = h2o.H2OFrame(df, destination_frame='people')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [14]:
# split into train (80%), validation (10%), and test (the remainder);
# the seed keeps the split the same across runs
split_names = ['people_train', 'people_valid', 'people_test']
train, valid, test = people.split_frame(ratios=[0.8, 0.1], destination_frames=split_names, seed=123)

In [15]:
# row counts for train/valid/test (slightly off the requested ratios with small data sets)
print("%d/%d/%d" % tuple(frame.nrows for frame in (train, valid, test)))

788/118/94


In [16]:
# response column the models will predict
y = 'income'
# predictors: every column except the response itself and the id column
ignore_fields = [y, 'id']
x = [name for name in train.names if name not in ignore_fields]

In [17]:
# train a baseline GBM that sets no hyperparameters beyond its model id;
# the validation frame is passed so validation metrics are available too
m1 = H2OGradientBoostingEstimator(model_id='defaults')
m1.train(x=x, y=y, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [18]:
# mean absolute error of the default model on the training data
m1.mae(train=True)

3259.036445114213

In [19]:
# mean absolute error of the default model on the validation data
m1.mae(valid=True)

4770.091897295906

In [20]:
# score the default model on the test split and show its mean absolute error;
# perf is reused later when the two models are compared
perf = m1.model_performance(test)
perf.mae()

4110.304684277261

In [21]:
# deliberately overfit: train a second GBM with 1000 trees and a maximum depth of 10
# on the same predictors, response, and frames as the first model
overfit_params = dict(model_id='overfit', ntrees=1000, max_depth=10)
m2 = H2OGradientBoostingEstimator(**overfit_params)
m2.train(x=x, y=y, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [22]:
# compare MAE (default model --> overfit model) on train, valid, and test;
# %d drops the fractional part of each value
train_default, train_overfit = m1.mae(train=True), m2.mae(train=True)
print("Train: %d --> %d" % (train_default, train_overfit))
valid_default, valid_overfit = m1.mae(valid=True), m2.mae(valid=True)
print("Valid: %d --> %d" % (valid_default, valid_overfit))
# perf holds the default model's test-set performance computed earlier
test_default, test_overfit = perf.mae(), m2.model_performance(test).mae()
print(" Test: %d --> %d" % (test_default, test_overfit))

Train: 3259 --> 511
Valid: 4770 --> 5145
 Test: 4110 --> 5184


Model 2 is overfit: compared with model 1, its training MAE drops sharply (3259 → 511), while its validation MAE (4770 → 5145) and test MAE (4110 → 5184) both increase.