# Week 2 - Artificial Data

This is adapted from the R code shown in the video.

In [1]:
# imports
import h2o
import numpy as np
import pandas as pd
from h2o.estimators.gbm import H2OGradientBoostingEstimator

In [2]:
# start a local H2O server, or connect to one already running on localhost
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_232"; OpenJDK Runtime Environment (build 1.8.0_232-8u232-b09-0ubuntu1~19.04.1-b09); OpenJDK 64-Bit Server VM (build 25.232-b09, mixed mode)
  Starting server from /home/megan/Projects/h2oclass/lib/python3.7/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp2emrz996
  JVM stdout: /tmp/tmp2emrz996/h2o_megan_started_from_python.out
  JVM stderr: /tmp/tmp2emrz996/h2o_megan_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,01 secs
H2O cluster timezone:,America/Chicago
H2O data parsing timezone:,UTC
H2O cluster version:,3.28.0.3
H2O cluster version age:,9 days
H2O cluster name:,H2O_from_python_megan_hp24x1
H2O cluster total nodes:,1
H2O cluster free memory:,1.520 Gb
H2O cluster total cores:,3
H2O cluster allowed cores:,3


In [3]:
# seed NumPy's global random generator so the np.random draws used to
# build the artificial data below are reproducible from run to run
np.random.seed(123)

In [4]:
# number of rows (people) to generate for the artificial data set
N = 1_000

In [5]:
# pool of blood types to assign from; the repeats set the relative frequency
# (A and O appear three times each, AB and B once each)
blood_types = np.array(['A'] * 3 + ['O'] * 3 + ['AB', 'B'])

In [6]:
# start the dataframe with a single `id` column running from 0 to N - 1
df = pd.Series(range(N), name='id').to_frame()

In [7]:
# assign blood types by cycling through the pool in id order:
# row i gets blood_types[i % len(blood_types)], so the assignment is
# deterministic rather than a random draw
df = df.assign(
    blood_type=np.take(blood_types, df['id'].values % blood_types.size)
)

In [8]:
# see our data so far: the first 15 rows, which at this point hold only
# the id and blood_type columns
df.head(15)

Unnamed: 0,id,blood_type
0,0,A
1,1,A
2,2,A
3,3,O
4,4,O
5,5,O
6,6,AB
7,7,B
8,8,A
9,9,A


In [9]:
# add an age column: a uniform draw between 18 and 65, rounded to whole years
df = df.assign(age=np.round(np.random.uniform(low=18, high=65, size=N)))

In [10]:
# create a column for healthy eating, on a scale of 0 to 9
# scores are drawn from a normal distribution (mean 5, sd 2) and rounded
v = np.random.normal(5, 2, N).round()
# clamp to the 0-9 scale; adding 0.0 turns any negative zero left behind by
# rounding (e.g. -0.3 rounds to -0.0, which is not < 0) into a plain 0.0 so
# the column never shows a stray "-0.0"
v = np.clip(v, 0, 9) + 0.0
df = df.assign(healthy_eating = v)

In [11]:
# how often each healthy_eating score occurs; every score should lie in 0-9
df.healthy_eating.value_counts()

6.0    194
5.0    193
4.0    186
3.0    121
7.0    112
8.0     66
2.0     60
1.0     30
9.0     27
0.0     11
Name: healthy_eating, dtype: int64

In [12]:
# create a column for active lifestyle, also on a scale of 0 to 9
# however, add 1 if the person is less than age 30
# base scores are drawn from a normal distribution (mean 5, sd 2) and rounded
v = np.random.normal(5, 2, N).round()
# apply the under-30 bonus first, then clamp to the 0-9 scale in one step;
# adding 0.0 turns any negative zero left behind by rounding (e.g. -0.3
# rounds to -0.0, which is not < 0) into a plain 0.0 so the column no longer
# shows "-0.0" in value_counts() / describe()
df = df.assign(
    active_lifestyle = np.clip(np.where(df.age < 30, v + 1, v), 0, 9) + 0.0
)

In [13]:
# how often each active_lifestyle score occurs; every score should lie in 0-9
df.active_lifestyle.value_counts()

 5.0    207
 6.0    204
 7.0    139
 4.0    132
 3.0     99
 8.0     75
 2.0     59
 9.0     56
 1.0     18
-0.0     11
Name: active_lifestyle, dtype: int64

In [14]:
# create the income column:
#   base of 20000 plus (3 * age) squared,
#   plus 500 per healthy_eating point,
#   minus 300 per active_lifestyle point,
#   plus uniform noise between 0 and 5000,
# then round the result to the nearest 100
v = (
    20000
    + (df['age'].values * 3) ** 2
    + df['healthy_eating'].values * 500
    - df['active_lifestyle'].values * 300
    + np.random.uniform(0, 5000, N)
)
df = df.assign(income=np.round(v, -2))

In [15]:
# summary statistics for every column; include='all' also covers the
# non-numeric blood_type column (count / unique / top / freq)
df.describe(include='all')

Unnamed: 0,id,blood_type,age,healthy_eating,active_lifestyle,income
count,1000.0,1000,1000.0,1000.0,1000.0,1000.0
unique,,4,,,,
top,,A,,,,
freq,,375,,,,
mean,499.5,,41.462,4.941,5.297,40419.7
std,288.819436,,13.394475,1.906083,1.966891,10376.553125
min,0.0,,18.0,0.0,-0.0,22600.0
25%,249.75,,30.0,4.0,4.0,31500.0
50%,499.5,,41.0,5.0,5.0,38700.0
75%,749.25,,53.0,6.0,7.0,48425.0


In [16]:
# upload the pandas dataframe to H2O as a frame stored under the key 'people'
people = h2o.H2OFrame(df, destination_frame='people')

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [17]:
# review the frame (shows the first rows of the H2O frame)
people

id,blood_type,age,healthy_eating,active_lifestyle,income
0,A,51,5,8,46600
1,A,31,3,5,32300
2,A,29,7,5,31800
3,O,44,2,5,39200
4,O,52,4,7,46200
5,O,38,5,4,35600
6,AB,64,5,4,63100
7,B,50,1,4,44900
8,A,41,3,5,36900
9,A,36,5,5,33900




In [18]:
# split data into train, validation, and test sets:
# 80% / 10% / whatever remains, with a fixed seed for the split
train, valid, test = people.split_frame(ratios=[0.8, 0.1],
                                        destination_frames=['people_train', 'people_valid', 'people_test'],
                                        seed=123)

In [19]:
# how many rows in train/valid/test (slightly off with small data sets)
print("%d/%d/%d" % tuple(frame.nrows for frame in (train, valid, test)))

788/118/94


In [20]:
# response column we want to predict
y = 'income'
# predictors: every column of the training frame except the response
# and the id column
ignore_fields = [y, 'id']
x = [name for name in train.names if name not in ignore_fields]

In [21]:
# now train a GBM on this data, leaving every hyperparameter at its default
# (H2OGradientBoostingEstimator is imported in the imports cell at the top of
# the notebook so that all dependencies are declared in one place)
m1 = H2OGradientBoostingEstimator(model_id='defaults')
# fit on the training split; the validation split is only used for scoring
m1.train(x=x, y=y, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [22]:
# mean absolute error of the default model on the training split
m1.mae(train=True)

1003.0623884636739

In [23]:
# mean absolute error of the default model on the validation split
m1.mae(valid=True)

1322.225134423593

In [24]:
# score the default model on the held-out test split and report its MAE;
# `perf` is reused further down when comparing against the overfit model
perf = m1.model_performance(test)
perf.mae()

1233.476877045928

In [25]:
# now deliberately overfit: many more trees (1000) and deeper trees (depth 10)
m2 = H2OGradientBoostingEstimator(
    model_id='overfit',
    ntrees=1000,
    max_depth=10,
)
m2.train(x=x, y=y, training_frame=train, validation_frame=valid)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [26]:
# compare MAE across train, valid, and test
# each line prints: default model (m1) --> overfit model (m2)
# `perf` is m1's test-set performance computed earlier in the notebook
print("Train: %d --> %d" % (m1.mae(train=True), m2.mae(train=True)))
print("Valid: %d --> %d" % (m1.mae(valid=True), m2.mae(valid=True)))
print(" Test: %d --> %d" % (perf.mae(), m2.model_performance(test).mae()))

Train: 1003 --> 338
Valid: 1322 --> 1534
 Test: 1233 --> 1444


Summary of Overfitting

We can make the model fit the training set better and better, shrinking MAE on the training set, but the model stops generalizing as well. The MAE goes up on both the validation and the test sets.

In [27]:
# let's use cross validation now - only a train and a test split are needed,
# so resplit the full frame (this rebinds `train` and `test`)
train, test = people.split_frame(ratios=[0.897],
                                 destination_frames=['people_train', 'people_test'],
                                 seed=123)

In [28]:
# how many rows ended up in train/test after the resplit
print("%d/%d" % tuple(frame.nrows for frame in (train, test)))

900/100


In [29]:
# build the gbm model with 9 CV folds
# rows are assigned to folds at random by default, so pass a fixed seed;
# without one the cross-validation metrics change from run to run
m3 = H2OGradientBoostingEstimator(
    model_id='def9folds',
    nfolds=9,
    seed=123,
)
# no validation frame this time: cross validation takes its place
m3.train(x=x, y=y, training_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [30]:
# mean absolute error of the cross-validated default model on its training data
m3.mae(train=True)

1033.4505078125

In [31]:
# get cross validation error
# (xval=True selects the cross-validation metric instead of the training one)
m3.mae(xval=True)

1329.7299643106753

In [32]:
# note that this is better than our original performance
# final model was built with 900 rows, rather than about 800 originally
# NOTE(review): the test split was also redrawn (100 rows now vs 94 before),
# so the two test MAEs are presumably not measured on the same rows - confirm
# before reading too much into the difference
perf = m3.model_performance(test)
perf.mae()

1186.9082111475288

In [33]:
# overfit the model again but this time with cross validation
# rows are assigned to folds at random by default, so pass a fixed seed;
# without one the cross-validation metrics change from run to run
m4 = H2OGradientBoostingEstimator(
    model_id='overfit9folds',
    ntrees=1000,
    max_depth=10,
    nfolds=9,
    seed=123,
)
m4.train(x=x, y=y, training_frame=train)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [35]:
# compare MAE for the cross-validated models: default (m3) --> overfit (m4)
# the "Valid" line reports the cross-validation MAE (xval=True);
# `perf` is m3's test-set performance computed above
print("Train: %d --> %d" % (m3.mae(train=True), m4.mae(train=True)))
print("Valid: %d --> %d" % (m3.mae(xval=True), m4.mae(xval=True)))
print(" Test: %d --> %d" % (perf.mae(), m4.model_performance(test).mae()))

Train: 1033 --> 377
Valid: 1329 --> 1539
 Test: 1186 --> 1485


We see the same overfitting pattern here with cross-validation: training error improves a lot, but both the cross-validation error and the test error increase.

Cross-validation does not cure overfitting. It simply gives you a better estimate of the error you can expect on unseen data.