In [1]:
import h2o
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import random
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()

# referencing for python code: https://github.com/istrategy/H2O-Course-Week-2-Artificial-Data-Sets-Python/blob/master/coursera6.py
# note: took reference from above transcribing R into Python from the lecture--modified data to look at PhD students in the data science industry, mostly influenced by age and school tier

random.seed(12234516)
sample_size = 150

age_list = []
for i in range(sample_size):
    # assume only PhD graduates... start from age 25
    age_list.append(random.randrange(25, 66, 1))

school_tier_list = []
for i in range(sample_size):
    # assume only from schools in the top 500
    school_tier_list.append(random.randrange(1, 501, 1))
    
income_list = []
for i in range(sample_size):
    income_var = 80000 + round((20000 * age_list[i] / 12))^16
    income_var += (500 - school_tier_list[i]) * 100
    
    # adding random noise
    income_var += random.randrange(0, 4001, 1)
    income_var = round(income_var, -2)
    income_list.append(income_var)
 
peopleDF = pd.DataFrame(list(zip(age_list, school_tier_list, income_list)))
peopleDF.columns =['Age', 'School Tier', 'Income']
people = h2o.H2OFrame(peopleDF, destination_frame = "people")

train, valid, test = people.split_frame(
    ratios = [0.8, 0.1],
    destination_frames = ["people_train", "people_valid", "people_test"],
    seed = 1
)

y = "Income"
ignoreFields = [y, "id"]
x = [i for i in train.names if i not in ignoreFields]

m1 = H2OGradientBoostingEstimator(model_id="defaults")
m1.train(x, y, train, validation_frame=valid)

m2 = H2OGradientBoostingEstimator(model_id="overfits", ntrees=1000, max_depth=10)
m2.train(x, y, train, validation_frame=valid)

print("Train: %d --> %d" % (m1.mae(train=True), m2.mae(train=True)))
print("Valid: %d --> %d" % (m1.mae(valid=True), m2.mae(valid=True)))
print(" Test: %d --> %d" % (m1.model_performance(test).mae(), m2.model_performance(test).mae()))

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,12 mins 22 secs
H2O cluster timezone:,Europe/Paris
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.1
H2O cluster version age:,"7 days, 21 hours and 37 minutes"
H2O cluster name:,H2O_from_python_a_nogue_sanchez_axv9ns
H2O cluster total nodes:,1
H2O cluster free memory:,3.363 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


Parse progress: |█████████████████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
gbm Model Build progress: |███████████████████████████████████████████████| 100%
Train: 1292 --> 211
Valid: 1920 --> 2213
 Test: 2076 --> 1940
