In [538]:
import numpy as np
import pandas as pd
from datetime import datetime
%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.pylab import rcParams
from sklearn.metrics import classification_report
rcParams['figure.figsize'] = 20, 4
rcParams['font.size'] = 18.0
import h2o
from h2o.estimators import H2ORandomForestEstimator
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
import warnings
warnings.filterwarnings('ignore')
h2o.init(nthreads=-1,min_mem_size=64)
import glob
import math
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelEncoder

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O cluster uptime:,1 day 0 hours 6 mins
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.18.0.11
H2O cluster version age:,1 month and 28 days
H2O cluster name:,H2O_from_python_bryceeb_vmtdeh
H2O cluster total nodes:,1
H2O cluster free memory:,60.22 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [539]:
train = h2o.import_file(path="train_ZoGVYWq.csv")
test_predict = h2o.import_file(path="test_66516Ee.csv")

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [540]:
train = train.as_data_frame(use_pandas=True)
test_predict = test_predict.as_data_frame(use_pandas=True)

In [541]:
lb_make = LabelEncoder()
train["sourcing_channel"] = lb_make.fit_transform(train["sourcing_channel"])
train["residence_area_type"] = lb_make.fit_transform(train["residence_area_type"])
test_predict["sourcing_channel"] = lb_make.fit_transform(test_predict["sourcing_channel"])
test_predict["residence_area_type"] = lb_make.fit_transform(test_predict["residence_area_type"])

In [542]:
fill_NaN = Imputer(missing_values=np.nan, strategy='mean', axis=1)
imputed_DF = pd.DataFrame(fill_NaN.fit_transform(train))
imputed_DF.columns = train.columns
imputed_DF.index = train.index
train = imputed_DF

imputed_DF = pd.DataFrame(fill_NaN.fit_transform(test_predict))
imputed_DF.columns = test_predict.columns
imputed_DF.index = test_predict.index
test_predict = imputed_DF

In [543]:
train = train.apply(pd.to_numeric, errors='ignore')
test_predict = test_predict.apply(pd.to_numeric, errors='ignore')

In [544]:
train = h2o.H2OFrame(train)
test_predict = h2o.H2OFrame(test_predict)

Parse progress: |█████████████████████████████████████████████████████████| 100%
Parse progress: |█████████████████████████████████████████████████████████| 100%


In [545]:
train["renewal"]=train["renewal"].asfactor()
train["Count_3-6_months_late"]=train["Count_3-6_months_late"].ascharacter()
train["Count_6-12_months_late"]=train["Count_6-12_months_late"].ascharacter()
train["Count_more_than_12_months_late"]=train["Count_more_than_12_months_late"].ascharacter()
train["sourcing_channel"]=train["sourcing_channel"].ascharacter()
train["residence_area_type"]=train["residence_area_type"].ascharacter()
train["Count_3-6_months_late"]=train["Count_3-6_months_late"].asfactor()
train["Count_6-12_months_late"]=train["Count_6-12_months_late"].asfactor()
train["Count_more_than_12_months_late"]=train["Count_more_than_12_months_late"].asfactor()
train["sourcing_channel"]=train["sourcing_channel"].asfactor()
train["residence_area_type"]=train["residence_area_type"].asfactor()


test_predict["Count_3-6_months_late"]=test_predict["Count_3-6_months_late"].ascharacter()
test_predict["Count_6-12_months_late"]=test_predict["Count_6-12_months_late"].ascharacter()
test_predict["Count_more_than_12_months_late"]=test_predict["Count_more_than_12_months_late"].ascharacter()
test_predict["sourcing_channel"]=test_predict["sourcing_channel"].ascharacter()
test_predict["residence_area_type"]=test_predict["residence_area_type"].ascharacter()
test_predict["Count_3-6_months_late"]=test_predict["Count_3-6_months_late"].asfactor()
test_predict["Count_6-12_months_late"]=test_predict["Count_6-12_months_late"].asfactor()
test_predict["Count_more_than_12_months_late"]=test_predict["Count_more_than_12_months_late"].asfactor()
test_predict["sourcing_channel"]=test_predict["sourcing_channel"].asfactor()
test_predict["residence_area_type"]=test_predict["residence_area_type"].asfactor()

In [546]:
interaction_cols_1 = ['sourcing_channel','residence_area_type']
interaction_cols_2 = ['Count_3-6_months_late','Count_6-12_months_late','Count_more_than_12_months_late']

In [547]:
train['channel_residence_type'] = train.interaction(factors=interaction_cols_1,
                                   pairwise=False,
                                   max_factors=3125,
                                   min_occurrence=1,
                                   destination_frame="itest")

test_predict['channel_residence_type'] = test_predict.interaction(factors=interaction_cols_1,
                                   pairwise=False,
                                   max_factors=3125,
                                   min_occurrence=1,
                                   destination_frame="itest1")

Interactions progress: |██████████████████████████████████████████████████| 100%
Interactions progress: |██████████████████████████████████████████████████| 100%


In [548]:
train['late_interaction'] = train.interaction(factors=interaction_cols_2,
                                   pairwise=False,
                                   max_factors=100000,
                                   min_occurrence=1,
                                   destination_frame="itest")

test_predict['late_interaction'] = test_predict.interaction(factors=interaction_cols_2,
                                   pairwise=False,
                                   max_factors=100000,
                                   min_occurrence=1,
                                   destination_frame="itest1")

Interactions progress: |██████████████████████████████████████████████████| 100%
Interactions progress: |██████████████████████████████████████████████████| 100%


In [549]:
def model_prep(df, train_ratio, test_ratio):
    train_R, test_R, rest_of_df_R = df.split_frame(ratios=[train_ratio, test_ratio])
    exclude_field = ['id']
    target = ['renewal'] 
    features = [x for x in df.columns if x not in target and x not in exclude_field]
    return train_R, test_R, target, features

In [550]:
train, test, target, features = model_prep(train, 0.95, 0.04)

In [551]:
trained_model = H2ORandomForestEstimator(model_id = 'trained_model'
                               ,balance_classes=True
                               ,binomial_double_trees=True
                               ,ntrees=15
                               ,seed=15)

trained_model.train(x=features, y=target[0], training_frame = train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [552]:
mod_performance = trained_model.model_performance(test_data=test)

In [553]:
feature_importance = trained_model._model_json['output']['variable_importances'].as_data_frame()
feature_importance

Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,channel_residence_type,117894.039062,1.0,0.200366
1,perc_premium_paid_by_cash_credit,105324.734375,0.893385,0.179004
2,late_interaction,63278.945312,0.536744,0.107545
3,age_in_days,61625.890625,0.522723,0.104736
4,no_of_premiums_paid,46452.179688,0.394016,0.078947
5,Income,45660.609375,0.387302,0.077602
6,premium,36751.652344,0.311735,0.062461
7,Count_3-6_months_late,30709.644531,0.260485,0.052192
8,Count_6-12_months_late,26491.171875,0.224703,0.045023
9,application_underwriting_score,24031.8125,0.203842,0.040843


In [554]:
features = feature_importance.variable[:10].tolist()

In [555]:
trained_model = H2ORandomForestEstimator(model_id = 'trained_model'
                               ,balance_classes=True
                               ,binomial_double_trees=True
                               ,ntrees=100
                               ,seed=15)

trained_model.train(x=features, y=target[0], training_frame = train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [556]:
mod_performance = trained_model.model_performance(test_data=test)

In [557]:
print (mod_performance)


ModelMetricsBinomial: drf
** Reported on test data. **

MSE: 0.05212655604902473
RMSE: 0.22831240888095578
LogLoss: 0.23294033889397928
Mean Per-Class Error: 0.23758771929824563
AUC: 0.8199140350877193
Gini: 0.6398280701754386
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.6861074352459597: 


0,1,2,3,4
,0.0,1.0,Error,Rate
0,12.0,178.0,0.9368,(178.0/190.0)
1,6.0,2994.0,0.002,(6.0/3000.0)
Total,18.0,3172.0,0.0577,(184.0/3190.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.6861074,0.9701879,387.0
max f2,0.0,0.9874918,399.0
max f0point5,0.9208396,0.9593791,284.0
max accuracy,0.6861074,0.9423197,387.0
max precision,0.9990156,0.9932886,9.0
max recall,0.0,1.0,399.0
max specificity,0.9997945,0.9947368,0.0
max absolute_mcc,0.9534447,0.3303818,225.0
max min_per_class_accuracy,0.9841934,0.7616667,120.0


Gains/Lift Table: Avg response rate: 94.04 %



0,1,2,3,4,5,6,7,8,9,10,11
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,cumulative_response_rate,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100313,0.9996550,0.996875,0.996875,0.9375,0.9375,0.01,0.01,-0.3125000,-0.3125000
,2,0.0200627,0.9995619,1.0633333,1.0301042,1.0,0.96875,0.0106667,0.0206667,6.3333333,3.0104167
,3,0.0300940,0.9994846,1.0633333,1.0411806,1.0,0.9791667,0.0106667,0.0313333,6.3333333,4.1180556
,4,0.0401254,0.9994030,1.0633333,1.0467187,1.0,0.984375,0.0106667,0.042,6.3333333,4.6718750
,5,0.0501567,0.9993467,1.0633333,1.0500417,1.0,0.9875,0.0106667,0.0526667,6.3333333,5.0041667
,6,0.1,0.9988668,1.0566457,1.0533333,0.9937107,0.9905956,0.0526667,0.1053333,5.6645702,5.3333333
,7,0.1501567,0.9983181,1.0500417,1.0522338,0.9875,0.9895616,0.0526667,0.158,5.0041667,5.2233820
,8,0.2,0.9978279,1.0566457,1.0533333,0.9937107,0.9905956,0.0526667,0.2106667,5.6645702,5.3333333
,9,0.3,0.9968720,1.0466667,1.0511111,0.9843260,0.9885057,0.1046667,0.3153333,4.6666667,5.1111111






In [558]:
#effort-incentives
def effort_incentives(x):
    return (10 * ( 1 - (math.exp(-x/400))))

#% improvement in renewal prob vs effort curve
def percent_renewal_improvement(x):
    return (.20 * ( 1 - (math.exp(-x/5))))

def incentives_perc(predict_renewal, percent_renewal_improvement):
    if (predict_renewal + percent_renewal_improvement) >= 1:
        renewal = (1 - predict_renewal)
    else:
        renewal = percent_renewal_improvement
    return (predict_renewal * ( predict_renewal * renewal)) 

In [559]:
test['predict_renewal'] = trained_model.predict(test)['p1']
test = test.as_data_frame(use_pandas=True)

test['effort_incentives'] = test['predict_renewal'].apply(effort_incentives)
test['percent_renewal_improvement'] = test['predict_renewal'].apply(percent_renewal_improvement)

test['incentives_perc'] = test.apply(lambda row: incentives_perc(row['predict_renewal'], row['percent_renewal_improvement']), axis=1)

test['incentives'] = test.incentives_perc * test.premium

drf prediction progress: |████████████████████████████████████████████████| 100%


In [560]:
test_predict['predict_renewal'] = trained_model.predict(test_predict)['p1']
test_predict = test_predict.as_data_frame(use_pandas=True)

test_predict['effort_incentives'] = test_predict['predict_renewal'].apply(effort_incentives)
test_predict['percent_renewal_improvement'] = test_predict['predict_renewal'].apply(percent_renewal_improvement)

test_predict['incentives_perc'] = test_predict.apply(lambda row: incentives_perc(row['predict_renewal'], row['percent_renewal_improvement']), axis=1)

test_predict['incentives'] = test_predict.incentives_perc * test_predict.premium

drf prediction progress: |████████████████████████████████████████████████| 100%


In [561]:
test_predict = test_predict[['id', 'predict_renewal','incentives']]
test_predict.columns = ['id' , 'renewal','incentives']

In [562]:
test_predict.to_csv('bryce_beckwith_submission.csv')

In [563]:
test_predict.to_csv('final_submission.csv')