# Step 03: Modeling

## read modeling dataset

In [None]:
# -------------------------------
# training
# -------------------------------
# Load the feature-selected training set and report its dimensions.
trn_sdf = spark.read.parquet(path + 'train_feat_sel.parquet')
print("number of obs:", trn_sdf.count())
print("number of cols:", len(trn_sdf.columns))

# Per-week positives, row count and target rate.
# The chain is wrapped in parentheses instead of using a backslash
# continuation: the original backslash was followed by a space, which is a
# SyntaxError in Python.
(
    trn_sdf.groupBy(['week_n'])
    .agg(F.sum(F.col('target')).alias('tot pos'),
         F.count('*').alias('tot rows'),
         (F.sum(F.col('target')) / F.count('*')).alias('target rate'))
    .orderBy('week_n')
    .show()
)
print()

## perform downsampling calculations

In [None]:
# Downsample the majority class so that positives make up 20% of the rows.
print("***************** BEFORE DOWNSAMPLING ********************")
down_smpl = trn_sdf.agg(F.sum(F.col('target')).alias('tot pos'),
                        (F.count('*') - F.sum(F.col('target'))).alias('tot neg'),
                        F.count('*').alias('tot rows'),
                        (F.sum(F.col('target')) / F.count('*')).alias('target rate')).toPandas()

# Total size of the downsampled set if every positive is kept at a 20% rate.
down_smpl['new_tot_size'] = down_smpl['tot pos'] / 0.20   #20% desired response

# Negatives to keep = new total - positives (all positives are retained).
# Dividing the whole new total by 'tot neg' (as before) keeps 5x the positives
# in negatives, i.e. a ~16.7% response rate instead of the intended 20%.
# The rate is capped at 1.0 because sampling fractions cannot exceed 100%.
down_smpl['majority_smpl_rate'] = (
    (down_smpl['new_tot_size'] - down_smpl['tot pos']) / down_smpl['tot neg']
).clip(upper=1.0)
down_smpl

In [None]:
# Stratified sample on the target: keep every positive row and keep negative
# rows at the majority sampling rate computed in down_smpl.
trn_sdf_smpl = trn_sdf.sampleBy('target',
                                fractions={1: 1.0,                                          #100% minority
                                           0: float(down_smpl['majority_smpl_rate'][0])},   #majority
                                seed=5432)

# Row count per class after sampling.
(
    trn_sdf_smpl.groupBy('target')
    .agg(F.count('*').alias('tot rows'))
    .show()
)


# Save as parquet, partitioned by week.
# Fix: the DataFrameWriter method is partitionBy; 'partionBy' raised AttributeError.
trn_sdf_smpl.write.partitionBy('week_n').mode('overwrite').parquet(file + 'train_feat_sel_downsampled.parquet')

In [None]:
# beta = [#negative after downsampling / #pos after downsampling] *
#        [#pos before downsampling / #neg before downsampling]

print("***************** AFTER DOWNSAMPLING ********************")
# Fix: aggregate the downsampled frame (trn_sdf_smpl). The original aggregated
# trn_sdf, which at this point is still the full pre-sampling data, so the
# "after" counts equalled the "before" counts in down_smpl and beta was always 1.
beta_calc = trn_sdf_smpl.agg(F.sum(F.col('target')).alias('tot pos'),
                             (F.count('*') - F.sum(F.col('target'))).alias('tot neg'),
                             F.count('*').alias('tot rows'),
                             (F.sum(F.col('target')) / F.count('*')).alias('target rate')).toPandas()

# after-sampling neg/pos ratio times before-sampling pos/neg ratio
beta_calc['beta'] = (beta_calc['tot neg'] / beta_calc['tot pos']) * \
                        (down_smpl['tot pos'][0] / down_smpl['tot neg'][0])
beta_calc

## downsampled data

In [None]:
# Reload the downsampled training set from parquet; from here on trn_sdf
# refers to the downsampled data.
trn_sdf = spark.read.parquet(file + 'train_feat_sel_downsampled.parquet')

# Overall positives, row count and target rate.
pos_total = F.sum(F.col('target'))
row_total = F.count('*')
trn_sdf.agg(
    pos_total.alias('tot pos'),
    row_total.alias('tot rows'),
    (pos_total / row_total).alias('target rate'),
).show()

# (column name, Spark type) pairs, displayed as the cell output
trn_sdf.dtypes

In [None]:
# Build the modelling column lists from the Spark schema of trn_sdf:
#   num_lst_n - numeric columns, excluding primary keys and binary flags
#   cat_lst_n - string columns (excluding primary keys) plus the binary flags
#   lst       - num_lst_n + cat_lst_n, the columns selected for H2O

# list of primary key (dates, claim, prov_id)
pk_lst = ['claim_nbr']

# list of numeric var that must be treated as factor in H2o
binary_lst = ['hosp_ind']

# Spark type-name prefixes treated as numeric; str.startswith accepts a tuple,
# which replaces the bitwise `|` chain on booleans.
num_prefixes = ('int', 'double', 'bigint')

print("*****************************")
print("Before removing PK")
print("*****************************")
# ---------------------------
# numeric
# ---------------------------
num_cols = [name for name, dtype in trn_sdf.dtypes if dtype.startswith(num_prefixes)]

print("Numeric list:", num_cols)
print("there are", len(num_cols), "numeric attributes, including target")
print()


# ---------------------------
# categorical
# ---------------------------
cat_cols = [name for name, dtype in trn_sdf.dtypes if dtype.startswith('string')]

print("Categorical list:", cat_cols)
print("there are", len(cat_cols), "categorical attributes, including target")
print()



#remove PK list
print("*****************************")
print("After removing PK")
print("*****************************")
num_lst_n = [c for c in num_cols if c not in pk_lst and c not in binary_lst]

print("numeric list:", num_lst_n)
print('there are', len(num_lst_n), 'numeric attributes, including target')


cat_lst_n = [c for c in cat_cols if c not in pk_lst]

# add binary variables into categorical; skip any that are already present
# (a string-typed flag) so the later select(*lst) gets no duplicate column
cat_lst_n.extend(c for c in binary_lst if c not in cat_lst_n)
print("Categorical list:", cat_lst_n)
print("there are", len(cat_lst_n), "categorical attributes, including target")
print()



print("*****************************")
print("Final list")
print("*****************************")
# merge into one list
lst = num_lst_n + cat_lst_n
print("Final list:", lst)
# Fix: this count covers numeric + categorical columns, not only categorical.
print("there are", len(lst), "attributes in total, including target")

## Launch H2o

In [None]:
from pysparkling import *
import h2o
from h2o.automl import H2OAutoML
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.grid.grid_search import H2OGridSearch

# Fix: H2OGridSearch, H2OAutoML and the h2o module are used by later cells
# (grid searches, AutoML, h2o.upload_mojo) but no visible cell imported them.

# Get or create the H2OContext; later cells use it to convert Spark
# DataFrames into H2O frames (hc.asH2OFrame).
hc = H2OContext.getOrCreate()

## Transform into h2o dataframe

In [None]:
# Name of the response column.
y = 'target'

# --------------------
# in-time training
# --------------------
print("preparing training")
trn_sdf_n = trn_sdf.select(*lst)  #only required cols

# Convert the Spark DataFrame into an H2O frame.
train_h2o = hc.asH2OFrame(trn_sdf_n)

# Categorical predictors (including the binary flags) become H2O factors.
for col_name in cat_lst_n:
    train_h2o[col_name] = train_h2o[col_name].asfactor()

# Fix: the response must also be a factor. A numeric 0/1 response makes H2O
# fit a regression model, for which the AUC used to rank the grid is not
# available. asfactor() is harmless if the column is already categorical.
train_h2o[y] = train_h2o[y].asfactor()

# Predictor names = every frame column except the response.
trn_cols = train_h2o.columns
trn_cols.remove(y)

In [None]:
# Print the training frame's shape, then preview its first five rows.
frame_shape = train_h2o.shape
print(frame_shape)
train_h2o.head(5)

## feature selection using GBM

In [None]:
# GBM hyperparameter grid (H2OGradientBoostingEstimator, not XGBoost):
# 2 learn rates x 3 depths x 4 tree counts = 24 combinations.
gbm_params1 = {'learn_rate': [0.01, 0.1],
               'max_depth': [3, 4, 5],
               'ntrees': [200, 300, 400, 500]}

# Grid search used for feature selection.
# Fix: hyper_params referenced the undefined name 'gmb_params1'.
# NOTE(review): no validation_frame or nfolds is supplied here, so grid
# metrics are presumably computed on the training data - confirm intent.
gbm_grid1 = H2OGridSearch(model=H2OGradientBoostingEstimator(learn_rate_annealing=0.99, seed=542),
                          grid_id='gbm_grid1',
                          hyper_params=gbm_params1)

# Fix: train was called on the undefined name 'gbm_grd1'.
# NOTE(review): a seed is given both to the estimator (542) and here (4313);
# confirm which one is meant to apply.
gbm_grid1.train(x=trn_cols,
                y=y,
                training_frame=train_h2o,
                seed=4313)

In [None]:
# Rank the grid-1 models by AUC, highest first, and print the table.
gbm_gridperf1 = gbm_grid1.get_grid(
    sort_by='auc',
    decreasing=True,
)
print(gbm_gridperf1)

# The first model in the sorted grid is the top-ranked one.
best_gbm1 = gbm_gridperf1.models[0]

# Plot that model's variable importance.
best_gbm1.varimp_plot()

In [None]:
# Retrieve the variable-importance table of the best grid-1 model as a pandas
# DataFrame and save it to CSV for the feature-selection step.
varimp = best_gbm1.varimp(use_pandas=True)
# Fix: 'fileapth' was a typo for 'filepath' (the next cell reads
# filepath + 'feat_sel.csv').
varimp.to_csv(filepath + 'feat_sel.csv')

## only keep top contributing features (>=0.01)

In [None]:
# Load the saved variable-importance table.
top_feat = pd.read_csv(filepath + 'feat_sel.csv')

# Flag (1/0) the features whose 'percentage' is at least 0.01, i.e. 1%.
meets_cutoff = top_feat['percentage'] >= 0.01
top_feat['sel'] = np.where(meets_cutoff, 1, 0)

# Keep only the flagged features.
is_selected = top_feat['sel'] == 1
top_sel = top_feat[is_selected]
print("number of selected features:", len(top_sel))
print(top_sel)

## save datasets with selected variables

In [None]:
# Columns to carry forward: primary key(s), the target, then the selected features.
lst_keep = top_sel['variable'].tolist()
final_lst = [*pk_lst, 'target', *lst_keep]
print("there are", len(final_lst), "variables")
print(final_lst)

In [None]:
# Columns to persist for the round-2 dataset.
# Fix: the original kept only ['week_n', 'pos_cd_n'], which dropped the target,
# the primary key and the selected features, although the next section
# aggregates on 'target' after reading this file back. Those two columns are
# still kept (week_n is the partition column) and final_lst is appended;
# dict.fromkeys removes duplicates while preserving order.
final_lst2 = list(dict.fromkeys(['week_n', 'pos_cd_n'] + final_lst))

#prepare data
trn_sdf_n = trn_sdf.select(*final_lst2)

print("number of obs:", trn_sdf_n.count())
print("number of cols:", len(trn_sdf_n.columns))

# Save as parquet, partitioned by week.
# Fix: 'partionBy' -> 'partitionBy' (AttributeError), and write under
# 'filepath', the same prefix the round-2 read uses for this file.
trn_sdf_n.write.partitionBy('week_n').mode('overwrite').parquet(filepath + 'train_feat_sel_r2.parquet')

## ===========================================================
## Read modeling dataset after R2 feature selection
## ===========================================================

In [None]:
# -----------------------------------
# Training (downsampled)
# -----------------------------------
# Load the round-2 feature-selected training set and report its dimensions.
trn_sdf = spark.read.parquet(filepath + 'train_feat_sel_r2.parquet')
print("number of obs:", trn_sdf.count())
print("number of cols:", len(trn_sdf.columns))

# Per-week positives, row count and target rate.
# Parentheses replace the backslash continuation: the original backslash was
# followed by a space, which is a SyntaxError in Python.
(
    trn_sdf.groupBy(['week_n'])
    .agg(F.sum(F.col('target')).alias('tot pos'),
         F.count('*').alias('tot rows'),
         (F.sum(F.col('target')) / F.count('*')).alias('target rate'))
    .orderBy('week_n')
    .show()
)
print()

In [None]:
# Rebuild the modelling column lists from the schema of the round-2 trn_sdf:
#   num_lst_n - numeric columns, excluding primary keys and binary flags
#   cat_lst_n - string columns (excluding primary keys) plus the binary flags
#   lst       - num_lst_n + cat_lst_n, the columns selected for H2O

# list of primary keys
pk_lst = ['claim_nbr']

# list of numeric var that must be treated as factor in H2o
binary_lst = ['hosp_ind']

# Spark type-name prefixes treated as numeric; str.startswith accepts a tuple,
# which replaces the bitwise `|` chain on booleans.
num_prefixes = ('int', 'double', 'bigint')

print("*****************************")
print("Before removing PK")
print("*****************************")
# ---------------------------
# numeric
# ---------------------------
num_cols = [name for name, dtype in trn_sdf.dtypes if dtype.startswith(num_prefixes)]

print("Numeric list:", num_cols)
print("there are", len(num_cols), "numeric attributes, including target")
print()


# ---------------------------
# categorical
# ---------------------------
cat_cols = [name for name, dtype in trn_sdf.dtypes if dtype.startswith('string')]

print("Categorical list:", cat_cols)
print("there are", len(cat_cols), "categorical attributes, including target")
print()



#remove PK list
print("*****************************")
print("After removing PK")
print("*****************************")
num_lst_n = [c for c in num_cols if c not in pk_lst and c not in binary_lst]

print("numeric list:", num_lst_n)
print('there are', len(num_lst_n), 'numeric attributes, including target')


cat_lst_n = [c for c in cat_cols if c not in pk_lst]

# add binary variables into categorical; skip any that are already present
# (a string-typed flag) so the later select(*lst) gets no duplicate column
cat_lst_n.extend(c for c in binary_lst if c not in cat_lst_n)
print("Categorical list:", cat_lst_n)
print("there are", len(cat_lst_n), "categorical attributes, including target")
print()



print("*****************************")
print("Final list")
print("*****************************")
# merge into one list
lst = num_lst_n + cat_lst_n
print("Final list:", lst)
# Fix: this count covers numeric + categorical columns, not only categorical.
print("there are", len(lst), "attributes in total, including target")

## Transform into h2o frame

In [None]:
# Name of the response column.
y = 'target'

# --------------------
# in-time training
# --------------------
print("preparing training")
trn_sdf = trn_sdf.select(*lst)  #only required cols

# Convert the Spark DataFrame into an H2O frame.
train_h2o = hc.asH2OFrame(trn_sdf)

# Categorical predictors (including the binary flags) become H2O factors.
for col_name in cat_lst_n:
    train_h2o[col_name] = train_h2o[col_name].asfactor()

# Fix: the response must also be a factor. A numeric 0/1 response makes H2O
# fit a regression model, for which the AUC used to rank the grid is not
# available. asfactor() is harmless if the column is already categorical.
train_h2o[y] = train_h2o[y].asfactor()

# Predictor names = every frame column except the response.
trn_cols = train_h2o.columns
trn_cols.remove(y)


# -------------------------------
# split training into X_calib (currently disabled)
# -------------------------------
#print("preparing calibration datasets")
#train_calib, test_calib = train_h2o.split_frame(ratios=[0.9], seed=6431)


# --------------------
# in-time testing
# --------------------
# NOTE(review): tst_sdf is not loaded anywhere in the visible notebook;
# confirm it is read upstream before this cell runs.
print("preparing testing")
tst_sdf = tst_sdf.select(*lst)
tst_h2o = hc.asH2OFrame(tst_sdf)

# Same factor conversions as for the training frame, response included.
for col_name in cat_lst_n:
    tst_h2o[col_name] = tst_h2o[col_name].asfactor()
tst_h2o[y] = tst_h2o[y].asfactor()

# Predictor names of the test frame, response removed.
tst_cols = tst_h2o.columns
tst_cols.remove(y)

In [None]:
# Print the round-2 training frame's shape, then preview its first five rows.
frame_shape = train_h2o.shape
print(frame_shape)
train_h2o.head(5)

## =============================================
## Fit GBM 
## =============================================

In [None]:
# GBM hyperparameter grid (H2OGradientBoostingEstimator, not XGBoost):
# 2 learn rates x 3 depths x 4 tree counts = 24 combinations.
gbm_params2 = {'learn_rate': [0.01, 0.1],
               'max_depth': [3, 4, 5],
               'ntrees': [200, 300, 400, 500]}

# Grid search with 3-fold cross-validation on the round-2 features.
# Fix: hyper_params referenced the undefined name 'gmb_params2'.
gbm_grid2 = H2OGridSearch(model=H2OGradientBoostingEstimator(learn_rate_annealing=0.99, nfolds=3, seed=542,
                                                             gainslift_bins=20),
                                                             #calibrate_model = True,
                                                             #calibration_frame = test_calib),
                          grid_id='gbm_grid2',
                          hyper_params=gbm_params2)

# Fix: train was called on the undefined name 'gbm_grd2'.
# NOTE(review): a seed is given both to the estimator (542) and here (4313);
# confirm which one is meant to apply.
gbm_grid2.train(x=trn_cols,
                y=y,
                training_frame=train_h2o,   #train on downsampled data
                seed=4313)

In [None]:
# Rank the grid-2 models by AUC, highest first, and print the table.
gbm_gridperf2 = gbm_grid2.get_grid(
    sort_by='auc',
    decreasing=True,
)
print(gbm_gridperf2)

# The first model in the sorted grid is the top-ranked one.
best_gbm2 = gbm_gridperf2.models[0]

# AUC of that model on the cross-validation holdout predictions (cell output).
best_gbm2.auc(xval=True)

## Save model

In [None]:
# Export the best grid-2 model as a MOJO under `filepath` (without the
# genmodel jar) and print the path returned by the download call.
model_path = best_gbm2.download_mojo(
    path=filepath,
    get_genmodel_jar=False,
)
print(model_path)

## Load model

In [None]:
# Upload a previously saved MOJO into the H2O cluster.
# The location is hard-coded; it replaces any model_path set by the save cell.
model_path = '/mapr/datalake/gbm_grid2_model.zip'
imported_model = h2o.upload_mojo(model_path)
print("model imported")

## ====================
## Auto ML
## ====================

In [None]:
# AutoML run limited to 10 models, 3-fold cross-validation, restricted to
# XGBoost, GBM and DRF.
# Fixes: the keyword is 'include_algos' (was 'include_alogs'), and the
# algorithm name is spelled 'XGBoost' (was 'xGBoost').
aml_model = H2OAutoML(max_models=10,
                      nfolds=3,
                      seed=5421,
                      include_algos=['XGBoost', 'GBM', 'DRF'])

# Fix: train was called on the undefined name 'am_model'.
aml_model.train(x=trn_cols,
                y=y,
                training_frame=train_h2o)   #training on downsampled majority

In [None]:
# View the full AutoML leaderboard (all rows, not just the default head).
# Fix: the AutoML object is 'aml_model'; 'am_model' was undefined.
lb = aml_model.leaderboard
lb.head(rows=lb.nrows)

In [None]:
# Take the AutoML leader (the top model on the leaderboard) and display it
# as the cell output.
best_model = aml_model.leader
best_model

In [None]:
# save model

In [None]:
# Tabulate the leader's parameters as a DataFrame with one column per
# parameter; the rows come from the keys of each parameter's dict.
# Nothing is written to disk here - the table is only previewed.
tmp_df = pd.DataFrame()
km_lst = list(best_model.params.keys())

for param_name in km_lst:
    # one single-column frame per parameter, appended column-wise
    param_col = pd.DataFrame.from_dict(
        best_model.params[param_name],
        orient='index',
        columns=[param_name],
    )
    tmp_df = pd.concat([tmp_df, param_col], axis=1)

tmp_df.head()