In [None]:
#libraries we need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import date
pd.options.mode.chained_assignment = None
import h2o
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt

#libraries we need
# !pip install h2o

from scipy.special import expit
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.grid.grid_search import H2OGridSearch
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from h2o.estimators import H2OGradientBoostingEstimator
SEED  = 1111   # global random seed for better reproducibility

from sklearn.tree import export_graphviz
# `sklearn.externals.six` was removed in scikit-learn 0.23; fall back to the
# standard-library StringIO so this cell still runs on current releases.
try:
    from sklearn.externals.six import StringIO
except ImportError:
    from io import StringIO
from IPython.display import Image  
import pydotplus

# conda install -c conda-forge pydotplus

# Start (or attach to) an H2O cluster, requesting up to 24 GB of memory and 4 threads.
h2o.init(max_mem_size='24G', nthreads=4) # start h2o with plenty of memory and threads
# Remove every object currently held by the cluster so the run starts clean.
h2o.remove_all()                         # clears h2o memory
# Turn off H2O progress bars in the cell output.
h2o.no_progress() 

In [None]:
# The file is read without a header row, so the column names are supplied
# explicitly; the last column, 'class', is the prediction target.
train_column_names = [
    'age', 'workclass', 'fnlwgt', 'education', 'education-num',
    'marital-status', 'occupation', 'relationship', 'race', 'sex',
    'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
    'class',
]
train = pd.read_csv('adult.data.txt', names=train_column_names)

In [None]:
# Load the held-out split with the same column schema as `train`.
#
# NOTE(review): the UCI distribution of `adult.test` begins with a non-data
# banner line ("|1x3 Cross validator") and ends every class label with a
# period (" <=50K." / " >50K."). Left as-is, the banner becomes a junk first
# row and the test labels never match the training labels. Both fixes below
# are no-ops if the file has already been cleaned.
test = pd.read_csv('adult.test.txt',
                   names=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','class'],
                   comment='|')  # skip any line that starts with '|'

# Align the label spelling with the training data (drop the trailing '.').
test['class'] = test['class'].str.rstrip('.')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Show the dtype pandas assigned to each training column.
train.dtypes

In [None]:
# (rows, columns) of the training frame.
train.shape

In [None]:
# Display the loaded test frame.
test

In [None]:
## USE THE ABOVE TO REMOVE THE '?'

Replace every entry that contains `?` with a null value (NaN).

In [None]:
# The placeholder replaced here is ' ?' — note the leading space.
# It is turned into NaN in both frames.
MISSING_MARKER = ' ?'

train = train.replace(to_replace=MISSING_MARKER, value=np.nan)
test = test.replace(to_replace=MISSING_MARKER, value=np.nan)

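Fill the nulls in `native-country`, `occupation` and `workclass` with the string `'None'`, so that "missing" becomes its own category.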

In [None]:
# Columns whose NaNs are replaced by the literal category 'None'.
columns_to_fill = ['native-country', 'occupation', 'workclass']

# Overwrite the three columns in place on each frame.
for frame in (train, test):
    frame[columns_to_fill] = frame[columns_to_fill].fillna('None')

In [None]:
# Count the remaining nulls per column in the training frame.
train.isna().sum()

In [None]:
# Count the remaining nulls per column in the test frame.
test.isna().sum()

In [None]:
# Preview the training frame after the missing-value handling.
train.head()

In [None]:
# Preview the test frame after the missing-value handling.
test.head()

In [None]:
# Write the cleaned training frame to 'train.csv' without the index column.
train.to_csv('train.csv',index=False)

In [None]:
# Write the cleaned test frame to 'test.csv' without the index column.
test.to_csv('test.csv',index=False)

## GLM

GLM can produce two categories of models: classification and regression.

Logistic regression is the GLM performing binary classification.

In [None]:
# Convert both pandas frames to H2OFrames: `hf` holds the training data and
# `gf` the test data.
hf, gf = (h2o.H2OFrame(pandas_frame) for pandas_frame in (train, test))

In [None]:
# Target column; every other column of `train` is used as a predictor.
response_col = 'class'
predictors = [column for column in train.columns if column != response_col]

In [None]:
# Logistic regression with no regularization (lambda_ = 0), with p-values
# requested and collinear columns removed.
glm_settings = dict(
    family="binomial",
    lambda_=0,
    compute_p_values=True,
    remove_collinear_columns=True,
)
glm_model = H2OGeneralizedLinearEstimator(**glm_settings)

# Fit on the training frame; the test frame is passed as the validation frame.
glm_model.train(x=predictors,
                y=response_col,
                training_frame=hf,
                validation_frame=gf)

In [None]:
# Plot the standardized coefficients of the fitted GLM.
glm_model.std_coef_plot()

In [None]:
# Display the fitted GLM (bare expression, so the notebook renders its repr).
glm_model

In [None]:
# Score the test frame `gf` with the fitted GLM.
glm_model.predict(gf)

## GLM WITH HYPERPARAMETERS

In [None]:
# Regularization strengths to try, from strong (1) down to none (0).
lambda_candidates = [1, 0.5, 0.1, 0.01, 0.001, 0.0001, 0.00001, 0]
hyper_params = {'lambda': lambda_candidates}

# Base estimator; the grid varies only `lambda`.
glm = H2OGeneralizedLinearEstimator(family='binomial',
                                    remove_collinear_columns=True)

# Cartesian strategy: every listed lambda value is fitted.
cartesian_search = {'strategy': "Cartesian"}
grid = H2OGridSearch(model=glm,
                     hyper_params=hyper_params,
                     search_criteria=cartesian_search)

# 5-fold cross-validation on the training frame, with the test frame passed
# as the validation frame.
grid.train(x=predictors,
           y=response_col,
           training_frame=hf,
           validation_frame=gf,
           nfolds=5,
           seed=1)

In [None]:
# Rank the GLM grid by AUC, highest first, and display the table.
grid_table = grid.get_grid(sort_by = 'auc', decreasing = True)
grid_table

In [None]:
# The grid is sorted best-first, so the first model is the top-ranked GLM.
best = grid_table.models[0]
# Report the lambda value that model was actually trained with.
print(best.actual_params['lambda'])

In [None]:
# Display the top-ranked GLM from the grid.
best

In [None]:
# Plot variable importances for the top-ranked GLM.
best.varimp_plot()

In [None]:
# Score the test frame `gf` with the top-ranked GLM.
best.predict(gf)

## GBM

#### PRE-PROCESSING FOR GBM


In [None]:
# Split the H2O training frame into a training part and a validation part
# using split_frame's default ratio; the seed fixes the split.
train_hf, valid_hf = hf.split_frame(seed = 1)

### GBM WITHOUT HYPERPARAMETERS

In [None]:
# GBM with default hyperparameters and 5-fold cross-validation; the
# cross-validation predictions are kept on the model.
gbm_settings = dict(
    nfolds=5,
    seed=1111,
    keep_cross_validation_predictions=True,
)
gbm = H2OGradientBoostingEstimator(**gbm_settings)
gbm.train(x=predictors,
          y=response_col,
          training_frame=hf)

# Performance metrics on the test frame.
perf = gbm.model_performance(gf)

# Predictions for the test frame.
pred = gbm.predict(gf)

In [None]:
# Display the GBM's performance metrics on the test frame `gf`.
gbm.model_performance(gf)

### GBM WITH HYPERPARAMETERS

* MAX DEPTH

In [None]:
# Hyperparameter space for the GBM search: 3 * 7 * 5 * 6 * 7 = 4410 combinations.
params = {'learn_rate': [0.01, 0.05, 0.1], 
          'ntrees': [20, 50, 80, 110, 140, 170, 200],
          'sample_rate': [0.5,0.6,0.7,0.9,1], 
          'col_sample_rate': [0.2,0.4,0.5,0.6,0.8,1],
          'max_depth': list(range(3,30,4))
}

# RandomDiscrete samples combinations from `params`. Without a stopping
# criterion nothing bounds the search short of the full 4410 combinations,
# so cap the number of models. The sampler gets its own seed so the same
# combinations are drawn on every run.
GBM_GRID_MAX_MODELS = 30
search_criteria = {"strategy": "RandomDiscrete",
                   "max_models": GBM_GRID_MAX_MODELS,
                   "seed": SEED}

# Prepare the grid object
grid = H2OGridSearch(model=H2OGradientBoostingEstimator,   # Model to be trained
                     grid_id='grid',                       # Grid Search ID
                     hyper_params=params,                  # Dictionary of parameters
                     search_criteria=search_criteria
                     )

# Train the models; `seed` here is the per-model seed and is separate from
# the search seed set above.
grid.train(x=predictors,y=response_col, 
           training_frame=train_hf, 
           validation_frame=valid_hf,
           seed = SEED)

In [None]:
# Rank the GBM grid by AUC with the best model first (higher AUC is better).
sorted_final_grid = grid.get_grid(sort_by='auc', decreasing=True)

# Keep the top-ranked model; the prediction, performance and variable-importance
# cells that follow all read `best_gbm_from_grid`.
best_gbm_from_grid = sorted_final_grid.models[0]

In [None]:
# Display the sorted grid-search table.
sorted_final_grid

In [None]:
# Score the full training frame and pull the result into pandas.
# For a classifier, predict() returns the predicted label plus per-class
# probabilities. These are not log-scale values, so no exp() back-transform
# is applied.
preds_train = best_gbm_from_grid.predict(hf).as_data_frame()

In [None]:
# Display the selected GBM's performance metrics on the test frame `gf`.
best_gbm_from_grid.model_performance(gf)

In [None]:
# Plot variable importances for the selected GBM.
best_gbm_from_grid.varimp_plot()