In [1]:
!pip install --upgrade git+https://github.com/stanfordmlgroup/ngboost.git
!pip install catboost
!pip install lightgbm
!pip install xgboost


Collecting git+https://github.com/stanfordmlgroup/ngboost.git
  Cloning https://github.com/stanfordmlgroup/ngboost.git to /tmp/pip-req-build-n4j52ajj
  Running command git clone -q https://github.com/stanfordmlgroup/ngboost.git /tmp/pip-req-build-n4j52ajj
Collecting lifelines>=0.22.8
[?25l  Downloading https://files.pythonhosted.org/packages/44/d6/3b0c9c2775ce299275b36246850e51c80f1ba3c1af9eef2a34b7bdbdb2d4/lifelines-0.24.8-py3-none-any.whl (332kB)
[K     |████████████████████████████████| 337kB 2.7MB/s 
Collecting autograd-gamma>=0.3
  Downloading https://files.pythonhosted.org/packages/0a/07/d99339c9420b58b723a9189d1373e5c3889758b2202a1a7fe4a3b7a10c5a/autograd_gamma-0.4.2-py2.py3-none-any.whl
Building wheels for collected packages: ngboost
  Building wheel for ngboost (setup.py) ... [?25l[?25hdone
  Created wheel for ngboost: filename=ngboost-0.2.1.dev0-cp36-none-any.whl size=34441 sha256=c7767536e6f82e8e705bf82c4b4947e54c6fb5c3fb143098a2bcb1fde211ab06
  Stored in directory: /tmp

In [0]:
import pandas as pd 
import numpy as np 
import math
from datetime import datetime
import random

import sklearn 
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import log_loss

from ngboost import NGBClassifier 
from ngboost.distns import Bernoulli
from ngboost.learners import default_tree_learner

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

In [0]:
SEED_VAL = 1000
random.seed(SEED_VAL)
np.random.seed(SEED_VAL)

# Preapring Data


In [0]:
targets = pd.read_csv('drive/My Drive/The Zimnat Insurance Assurance Challenge by #ZindiWeekendz/train.csv')

## Policy Data 

In [0]:
policy_data = pd.read_csv('drive/My Drive/The Zimnat Insurance Assurance Challenge by #ZindiWeekendz/policy_data.csv')

In [16]:
policy_data=policy_data.drop_duplicates()
print(len(policy_data['Policy ID'].unique()),len(policy_data))

51685 278988


In [0]:
policy_data['PPR_PRODCD'] = pd.Categorical(policy_data['PPR_PRODCD'])
policy_data['NLO_TYPE'] = pd.Categorical(policy_data['NLO_TYPE'])

dfDummiesProd = pd.get_dummies(policy_data['PPR_PRODCD'], prefix = 'category')
dfDummiesNLO = pd.get_dummies(policy_data['NLO_TYPE'], prefix = 'category')

dfDummiesNLO['Policy ID'] = policy_data['Policy ID']
dfDummiesProd['Policy ID'] = policy_data['Policy ID']

dfDummiesNLO=dfDummiesNLO.groupby(by='Policy ID').sum()
dfDummiesProd=dfDummiesProd.groupby(by='Policy ID').sum()



In [18]:
#too null values
policy_data.drop(['NPR_SUMASSURED'],axis=1,inplace=True)

#NLO_AMOUNT - amount if there’s an extra charge 
policy_data['NLO_AMOUNT'] = policy_data['NLO_AMOUNT'].fillna(0)
policy_data['NLO_AMOUNT'] = policy_data['NLO_AMOUNT'].apply(lambda x : 1 if x>0 else 0)

#change date to datetime type 
policy_data['NP2_EFFECTDATE']= policy_data['NP2_EFFECTDATE'].apply(lambda a:
datetime(year=int(a.split('/')[2]), month=int(a.split('/')[1]), day=int(a.split('/')[0]))
)
policy_data.head()

Unnamed: 0,Policy ID,NP2_EFFECTDATE,PPR_PRODCD,NPR_PREMIUM,NPH_LASTNAME,CLF_LIFECD,NSP_SUBPROPOSAL,NLO_TYPE,NLO_AMOUNT,AAG_AGCODE,PCL_LOCATCODE,OCCUPATION,CATEGORY
0,PID_EPZDSP8,2019-09-01,PPR_PRODCD_B2KVCE7,265.724174,NPH_LASTNAME_BPN2LEB,2,222,NLO_TYPE_DPBHSAH,0,AAG_AGCODE_APWOOPE,PCL_LOCATCODE_7SHK7I9,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB
1,PID_6M6G9IB,2018-08-01,PPR_PRODCD_64QNIHM,2795.06938,NPH_LASTNAME_U2H3GC6,1,111,NLO_TYPE_XTHV3A3,1,AAG_AGCODE_9Z3FBGA,PCL_LOCATCODE_7VFS3EQ,OCCUPATION_IKCIDKW,CATEGORY_R821UZV
2,PID_UL0F7LH,2017-08-01,PPR_PRODCD_KOFUYNN,2492.759107,NPH_LASTNAME_B68RERV,1,111,NLO_TYPE_XAJI0Y6,1,AAG_AGCODE_Y0LKFF0,PCL_LOCATCODE_SKPRCR4,OCCUPATION_NUJZA7T,CATEGORY_8DALFYO
3,PID_TRGUBTU,2018-04-01,PPR_PRODCD_KOFUYNN,3982.538095,NPH_LASTNAME_NPN3VGI,1,111,NLO_TYPE_XAJI0Y6,1,AAG_AGCODE_1OCF2N0,PCL_LOCATCODE_SPQHMX5,OCCUPATION_W9XA3KX,CATEGORY_LXSLG6M
4,PID_TODLPIB,2019-12-01,PPR_PRODCD_KOFUYNN,1143.953733,NPH_LASTNAME_9VSNH0E,3,555,NLO_TYPE_DPBHSAH,0,AAG_AGCODE_E31VV8B,PCL_LOCATCODE_0T6GYGX,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB


In [19]:
stats_pdata = policy_data.groupby(by="Policy ID").agg({
    'NP2_EFFECTDATE':['min','max'],
})
stats_pdata.columns = ["_".join(x) for x in stats_pdata.columns.ravel()]
policy_data = pd.merge(policy_data,stats_pdata,on="Policy ID",how="left")
#Add counts 
policy_data['count'] = policy_data.groupby(by='Policy ID').transform('count')['NP2_EFFECTDATE']
policy_data.head()

Unnamed: 0,Policy ID,NP2_EFFECTDATE,PPR_PRODCD,NPR_PREMIUM,NPH_LASTNAME,CLF_LIFECD,NSP_SUBPROPOSAL,NLO_TYPE,NLO_AMOUNT,AAG_AGCODE,PCL_LOCATCODE,OCCUPATION,CATEGORY,NP2_EFFECTDATE_min,NP2_EFFECTDATE_max,count
0,PID_EPZDSP8,2019-09-01,PPR_PRODCD_B2KVCE7,265.724174,NPH_LASTNAME_BPN2LEB,2,222,NLO_TYPE_DPBHSAH,0,AAG_AGCODE_APWOOPE,PCL_LOCATCODE_7SHK7I9,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,2019-09-01,2019-09-01,10
1,PID_6M6G9IB,2018-08-01,PPR_PRODCD_64QNIHM,2795.06938,NPH_LASTNAME_U2H3GC6,1,111,NLO_TYPE_XTHV3A3,1,AAG_AGCODE_9Z3FBGA,PCL_LOCATCODE_7VFS3EQ,OCCUPATION_IKCIDKW,CATEGORY_R821UZV,2018-08-01,2018-08-01,2
2,PID_UL0F7LH,2017-08-01,PPR_PRODCD_KOFUYNN,2492.759107,NPH_LASTNAME_B68RERV,1,111,NLO_TYPE_XAJI0Y6,1,AAG_AGCODE_Y0LKFF0,PCL_LOCATCODE_SKPRCR4,OCCUPATION_NUJZA7T,CATEGORY_8DALFYO,2017-08-01,2018-05-01,34
3,PID_TRGUBTU,2018-04-01,PPR_PRODCD_KOFUYNN,3982.538095,NPH_LASTNAME_NPN3VGI,1,111,NLO_TYPE_XAJI0Y6,1,AAG_AGCODE_1OCF2N0,PCL_LOCATCODE_SPQHMX5,OCCUPATION_W9XA3KX,CATEGORY_LXSLG6M,2018-04-01,2018-04-01,10
4,PID_TODLPIB,2019-12-01,PPR_PRODCD_KOFUYNN,1143.953733,NPH_LASTNAME_9VSNH0E,3,555,NLO_TYPE_DPBHSAH,0,AAG_AGCODE_E31VV8B,PCL_LOCATCODE_0T6GYGX,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,2019-12-01,2019-12-01,12


In [0]:
policy_data.drop(['NLO_TYPE','PPR_PRODCD'],axis=1,inplace=True)

In [0]:
#last name ~unique identfiefr , NLO AMount replaced by Nlo_amount_sum 
policy_data.drop(['NPH_LASTNAME','NLO_AMOUNT'],axis=1,inplace=True)

In [0]:
policy_data['monthOfPolicy'] = policy_data['NP2_EFFECTDATE'].apply(lambda x:x.month)
policy_data['diffMaxMinDate'] = (policy_data['NP2_EFFECTDATE_max']-policy_data['NP2_EFFECTDATE_min'])
policy_data['diffMaxMinDate'] =policy_data['diffMaxMinDate'].apply(lambda x : x.days)
policy_data['BOOLdiffMaxMinDate'] = policy_data['diffMaxMinDate'].apply(lambda x: 1 if x>0 else 0)


In [23]:
policy_data = pd.merge(policy_data,dfDummiesNLO,left_on="Policy ID",right_index=True,how="left")
policy_data = pd.merge(policy_data,dfDummiesProd,left_on="Policy ID",right_index=True,how="left")
policy_data.head()

Unnamed: 0,Policy ID,NP2_EFFECTDATE,NPR_PREMIUM,CLF_LIFECD,NSP_SUBPROPOSAL,AAG_AGCODE,PCL_LOCATCODE,OCCUPATION,CATEGORY,NP2_EFFECTDATE_min,NP2_EFFECTDATE_max,count,monthOfPolicy,diffMaxMinDate,BOOLdiffMaxMinDate,category_NLO_TYPE_4V30T9N,category_NLO_TYPE_DPBHSAH,category_NLO_TYPE_T3W5UZB,category_NLO_TYPE_XAJI0Y6,category_NLO_TYPE_XTHV3A3,category_NLO_TYPE_ZMF8MDD,category_PPR_PRODCD_165U4LY,category_PPR_PRODCD_64QNIHM,category_PPR_PRODCD_6J9HUC7,category_PPR_PRODCD_8ZHO2CE,category_PPR_PRODCD_APTRA9E,category_PPR_PRODCD_B2KVCE7,category_PPR_PRODCD_EJ7YKFV,category_PPR_PRODCD_GLE2MHV,category_PPR_PRODCD_H6S21FA,category_PPR_PRODCD_ID7TAK9,category_PPR_PRODCD_KOFUYNN,category_PPR_PRODCD_KYXNF1V,category_PPR_PRODCD_OK3TM96,category_PPR_PRODCD_T8ONK55,category_PPR_PRODCD_V6HBYGK,category_PPR_PRODCD_W0F6GK1,category_PPR_PRODCD_XRWHUER
0,PID_EPZDSP8,2019-09-01,265.724174,2,222,AAG_AGCODE_APWOOPE,PCL_LOCATCODE_7SHK7I9,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,2019-09-01,2019-09-01,10,9,0,0,0,9,0,1,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0
1,PID_6M6G9IB,2018-08-01,2795.06938,1,111,AAG_AGCODE_9Z3FBGA,PCL_LOCATCODE_7VFS3EQ,OCCUPATION_IKCIDKW,CATEGORY_R821UZV,2018-08-01,2018-08-01,2,8,0,0,0,0,0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PID_UL0F7LH,2017-08-01,2492.759107,1,111,AAG_AGCODE_Y0LKFF0,PCL_LOCATCODE_SKPRCR4,OCCUPATION_NUJZA7T,CATEGORY_8DALFYO,2017-08-01,2018-05-01,34,8,273,1,0,31,0,3,0,0,0,0,0,0,0,17,0,0,0,0,17,0,0,0,0,0,0
3,PID_TRGUBTU,2018-04-01,3982.538095,1,111,AAG_AGCODE_1OCF2N0,PCL_LOCATCODE_SPQHMX5,OCCUPATION_W9XA3KX,CATEGORY_LXSLG6M,2018-04-01,2018-04-01,10,4,0,0,0,9,0,1,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0
4,PID_TODLPIB,2019-12-01,1143.953733,3,555,AAG_AGCODE_E31VV8B,PCL_LOCATCODE_0T6GYGX,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,2019-12-01,2019-12-01,12,12,0,0,0,11,0,1,0,0,0,0,0,0,0,6,0,0,0,0,6,0,0,0,0,0,0


In [0]:
policy_data.drop(['NP2_EFFECTDATE','NPR_PREMIUM','CLF_LIFECD',
                  'NSP_SUBPROPOSAL','NP2_EFFECTDATE_min',
                  'NP2_EFFECTDATE_max'],axis=1,inplace=True)

In [25]:
policy_data = policy_data.drop_duplicates(subset=['Policy ID'])
policy_data.head()

Unnamed: 0,Policy ID,AAG_AGCODE,PCL_LOCATCODE,OCCUPATION,CATEGORY,count,monthOfPolicy,diffMaxMinDate,BOOLdiffMaxMinDate,category_NLO_TYPE_4V30T9N,category_NLO_TYPE_DPBHSAH,category_NLO_TYPE_T3W5UZB,category_NLO_TYPE_XAJI0Y6,category_NLO_TYPE_XTHV3A3,category_NLO_TYPE_ZMF8MDD,category_PPR_PRODCD_165U4LY,category_PPR_PRODCD_64QNIHM,category_PPR_PRODCD_6J9HUC7,category_PPR_PRODCD_8ZHO2CE,category_PPR_PRODCD_APTRA9E,category_PPR_PRODCD_B2KVCE7,category_PPR_PRODCD_EJ7YKFV,category_PPR_PRODCD_GLE2MHV,category_PPR_PRODCD_H6S21FA,category_PPR_PRODCD_ID7TAK9,category_PPR_PRODCD_KOFUYNN,category_PPR_PRODCD_KYXNF1V,category_PPR_PRODCD_OK3TM96,category_PPR_PRODCD_T8ONK55,category_PPR_PRODCD_V6HBYGK,category_PPR_PRODCD_W0F6GK1,category_PPR_PRODCD_XRWHUER
0,PID_EPZDSP8,AAG_AGCODE_APWOOPE,PCL_LOCATCODE_7SHK7I9,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,10,9,0,0,0,9,0,1,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0
1,PID_6M6G9IB,AAG_AGCODE_9Z3FBGA,PCL_LOCATCODE_7VFS3EQ,OCCUPATION_IKCIDKW,CATEGORY_R821UZV,2,8,0,0,0,0,0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PID_UL0F7LH,AAG_AGCODE_Y0LKFF0,PCL_LOCATCODE_SKPRCR4,OCCUPATION_NUJZA7T,CATEGORY_8DALFYO,34,8,273,1,0,31,0,3,0,0,0,0,0,0,0,17,0,0,0,0,17,0,0,0,0,0,0
3,PID_TRGUBTU,AAG_AGCODE_1OCF2N0,PCL_LOCATCODE_SPQHMX5,OCCUPATION_W9XA3KX,CATEGORY_LXSLG6M,10,4,0,0,0,9,0,1,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0
4,PID_TODLPIB,AAG_AGCODE_E31VV8B,PCL_LOCATCODE_0T6GYGX,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,12,12,0,0,0,11,0,1,0,0,0,0,0,0,0,6,0,0,0,0,6,0,0,0,0,0,0


## Create train and test df

In [26]:
ss = pd.read_csv('/content/drive/My Drive/The Zimnat Insurance Assurance Challenge by #ZindiWeekendz/sample_sub.csv')
ss.head(1)

Unnamed: 0,Policy ID,Lapse
0,PID_4928TWH,0


In [0]:
train = pd.merge(policy_data,targets[targets.Lapse=='1'],on='Policy ID',how="left")
train['Lapse'] = train.Lapse.fillna(0)
train.drop('Lapse Year',axis=1,inplace=True)

In [28]:
test = pd.DataFrame()
test['Policy ID']= ss['Policy ID']
test = pd.merge(test,policy_data,how='left',on='Policy ID')
test.head()

Unnamed: 0,Policy ID,AAG_AGCODE,PCL_LOCATCODE,OCCUPATION,CATEGORY,count,monthOfPolicy,diffMaxMinDate,BOOLdiffMaxMinDate,category_NLO_TYPE_4V30T9N,category_NLO_TYPE_DPBHSAH,category_NLO_TYPE_T3W5UZB,category_NLO_TYPE_XAJI0Y6,category_NLO_TYPE_XTHV3A3,category_NLO_TYPE_ZMF8MDD,category_PPR_PRODCD_165U4LY,category_PPR_PRODCD_64QNIHM,category_PPR_PRODCD_6J9HUC7,category_PPR_PRODCD_8ZHO2CE,category_PPR_PRODCD_APTRA9E,category_PPR_PRODCD_B2KVCE7,category_PPR_PRODCD_EJ7YKFV,category_PPR_PRODCD_GLE2MHV,category_PPR_PRODCD_H6S21FA,category_PPR_PRODCD_ID7TAK9,category_PPR_PRODCD_KOFUYNN,category_PPR_PRODCD_KYXNF1V,category_PPR_PRODCD_OK3TM96,category_PPR_PRODCD_T8ONK55,category_PPR_PRODCD_V6HBYGK,category_PPR_PRODCD_W0F6GK1,category_PPR_PRODCD_XRWHUER
0,PID_4928TWH,AAG_AGCODE_KZITWY3,PCL_LOCATCODE_0T6GYGX,OCCUPATION_9DOM5IG,CATEGORY_8DALFYO,1,8,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
1,PID_KBLLEGK,AAG_AGCODE_HG2GBMQ,PCL_LOCATCODE_PI2W0SA,OCCUPATION_8DRC11E,CATEGORY_LXSLG6M,2,5,0,0,0,0,0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PID_90F0QA3,AAG_AGCODE_C8COEA8,PCL_LOCATCODE_O6OBSFL,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,2,9,0,0,0,0,0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,PID_18F3NHF,AAG_AGCODE_2ZOT2W3,PCL_LOCATCODE_295LOO6,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,1,12,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
4,PID_SX4QUVO,AAG_AGCODE_L1YVKZO,PCL_LOCATCODE_PEU5TF2,OCCUPATION_NNHJ7XV,CATEGORY_GWW4FYB,1,3,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


In [0]:
le = LabelEncoder()
for i in ['PCL_LOCATCODE','OCCUPATION','CATEGORY','AAG_AGCODE']:
  le.fit(policy_data[i])
  train[i] = le.transform(train[i])
  test[i] = le.transform(test[i])

In [30]:
train.head()

Unnamed: 0,Policy ID,AAG_AGCODE,PCL_LOCATCODE,OCCUPATION,CATEGORY,count,monthOfPolicy,diffMaxMinDate,BOOLdiffMaxMinDate,category_NLO_TYPE_4V30T9N,category_NLO_TYPE_DPBHSAH,category_NLO_TYPE_T3W5UZB,category_NLO_TYPE_XAJI0Y6,category_NLO_TYPE_XTHV3A3,category_NLO_TYPE_ZMF8MDD,category_PPR_PRODCD_165U4LY,category_PPR_PRODCD_64QNIHM,category_PPR_PRODCD_6J9HUC7,category_PPR_PRODCD_8ZHO2CE,category_PPR_PRODCD_APTRA9E,category_PPR_PRODCD_B2KVCE7,category_PPR_PRODCD_EJ7YKFV,category_PPR_PRODCD_GLE2MHV,category_PPR_PRODCD_H6S21FA,category_PPR_PRODCD_ID7TAK9,category_PPR_PRODCD_KOFUYNN,category_PPR_PRODCD_KYXNF1V,category_PPR_PRODCD_OK3TM96,category_PPR_PRODCD_T8ONK55,category_PPR_PRODCD_V6HBYGK,category_PPR_PRODCD_W0F6GK1,category_PPR_PRODCD_XRWHUER,Lapse
0,PID_EPZDSP8,181,5,144,2,10,9,0,0,0,9,0,1,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0,0
1,PID_6M6G9IB,170,6,110,5,2,8,0,0,0,0,0,1,1,0,0,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,PID_UL0F7LH,550,12,145,1,34,8,273,1,0,31,0,3,0,0,0,0,0,0,0,17,0,0,0,0,17,0,0,0,0,0,0,0
3,PID_TRGUBTU,34,13,192,3,10,4,0,0,0,9,0,1,0,0,0,0,0,0,0,5,0,0,0,0,5,0,0,0,0,0,0,0
4,PID_TODLPIB,225,1,144,2,12,12,0,0,0,11,0,1,0,0,0,0,0,0,0,6,0,0,0,0,6,0,0,0,0,0,0,0


In [0]:
train.rename(columns={'Lapse':'target'},inplace=True)

In [0]:
train['target']=train['target'].astype('int')


In [0]:
train.drop('Policy ID',axis=1,inplace=True)
test.drop('Policy ID',axis=1,inplace=True)


In [34]:
len(train),len(test)

(51685, 43707)

# Modeling

In [0]:
X = train.drop(['target'],axis=1)
y = train['target']

In [0]:
X_train,X_test,y_train,y_test = train_test_split(X,y,stratify=y,random_state = SEED_VAL)

## Random Forest

In [37]:
rf = RandomForestClassifier(n_estimators=400,
                      
                      max_depth = 13,
                      random_state=SEED_VAL)
rf.fit(X_train,y_train)
pred = rf.predict_proba(X_test)[:,1]
print(log_loss(y_test,pred))

0.32432570921819254


In [38]:
rf = RandomForestClassifier(n_estimators=400,
                      
                      max_depth = 13,
                      random_state=SEED_VAL)
rf.fit(X,y)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=13, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=400,
                       n_jobs=None, oob_score=False, random_state=1000,
                       verbose=0, warm_start=False)

In [0]:
preds_rf = rf.predict_proba(test)[:,1]

## XGB

In [0]:
xgb_model  = XGBClassifier(
                       n_estimators = 700,
                       learning_rate=0.075,
                       max_depth=3,
                       min_child_weight=12,
                       colsample_by_tree=0.7,
                       seed=SEED_VAL,
                       subsample=1,
                       
                      )


In [41]:
xgb_model.fit(X_train,y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_by_tree=0.7,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              gamma=0, learning_rate=0.075, max_delta_step=0, max_depth=3,
              min_child_weight=12, missing=None, n_estimators=700, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=1000,
              silent=None, subsample=1, verbosity=1)

In [0]:
pred_xgb = np.array(xgb_model.predict_proba(X_test))[:,1]

In [43]:
log_loss(y_test,pred_xgb)

0.32026504531254707

In [0]:
xgb_model.fit(X,y)
preds_xgb = xgb_model.predict_proba(test)[:,1]

## Catboost


In [0]:
model = CatBoostClassifier(iterations=900, 
                           learning_rate=0.1,
                           loss_function='Logloss',
                           random_seed = SEED_VAL,
                           verbose=100) 

In [46]:
model.fit(X_train,y_train)

0:	learn: 0.5479885	total: 70.4ms	remaining: 1m 3s
100:	learn: 0.3216773	total: 2.13s	remaining: 16.8s
200:	learn: 0.3098372	total: 4.17s	remaining: 14.5s
300:	learn: 0.3013640	total: 6.21s	remaining: 12.4s
400:	learn: 0.2945252	total: 8.26s	remaining: 10.3s
500:	learn: 0.2883160	total: 10.3s	remaining: 8.22s
600:	learn: 0.2837161	total: 12.4s	remaining: 6.18s
700:	learn: 0.2792504	total: 14.5s	remaining: 4.12s
800:	learn: 0.2749949	total: 16.6s	remaining: 2.05s
899:	learn: 0.2709167	total: 18.6s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f6be870dc18>

In [0]:
pred = np.array(model.predict_proba(X_test))[:,1]

In [48]:
log_loss(y_test,pred)

0.3166625310104639

In [49]:
model.fit(X,y)

0:	learn: 0.5437310	total: 25.6ms	remaining: 23s
100:	learn: 0.3218548	total: 2.64s	remaining: 20.9s
200:	learn: 0.3124369	total: 5.17s	remaining: 18s
300:	learn: 0.3044347	total: 7.74s	remaining: 15.4s
400:	learn: 0.2988292	total: 10.3s	remaining: 12.8s
500:	learn: 0.2938678	total: 12.9s	remaining: 10.3s
600:	learn: 0.2897955	total: 15.4s	remaining: 7.68s
700:	learn: 0.2861176	total: 18s	remaining: 5.11s
800:	learn: 0.2823689	total: 20.6s	remaining: 2.54s
899:	learn: 0.2791929	total: 23.1s	remaining: 0us


<catboost.core.CatBoostClassifier at 0x7f6be870dc18>

In [0]:
preds_cat = model.predict_proba(test)[:,1]

## LGB

In [0]:
d_train = lgb.Dataset(X_train, label=y_train)
# parameters for LightGBMClassifier
params = {
    'objective' :'binary',
    'learning_rate' : 0.06,
    'num_leaves' : 50,
    'feature_fraction': 0.5, 
    'bagging_fraction': 0.85, 
    'bagging_freq':1,
    'boosting_type' : 'gbdt',
    'metric': 'binary_logloss',
    'max_depth':260,
    'seed':SEED_VAL
}
n_estimators = 200
model_lgb = lgb.train(params, d_train, n_estimators, verbose_eval=1)
d_train = 0


In [0]:
pred_prob_lgbm = model_lgb.predict(X_test)

In [53]:
log_loss(y_test,pred_prob_lgbm)

0.31519373350099567

In [0]:
d_train = lgb.Dataset(X, label=y)
model_lgb = lgb.train(params, d_train, n_estimators, verbose_eval=1)
preds_lgb = model_lgb.predict(test)

## NGboost

In [0]:
ngb_model  = NGBClassifier(Dist=Bernoulli,
                           random_state=SEED_VAL,
                           n_estimators=700,
                           learning_rate=0.01)


In [56]:
ngb_model.fit(np.array(X_train),np.array(y_train))

[iter 0] loss=0.4302 val_loss=0.0000 scale=2.0000 norm=4.0000
[iter 100] loss=0.3457 val_loss=0.0000 scale=2.0000 norm=3.2476
[iter 200] loss=0.3381 val_loss=0.0000 scale=2.0000 norm=3.2088
[iter 300] loss=0.3370 val_loss=0.0000 scale=2.0000 norm=3.2087
[iter 400] loss=0.3365 val_loss=0.0000 scale=8.0000 norm=12.8339
[iter 500] loss=0.3365 val_loss=0.0000 scale=8.0000 norm=12.8337
[iter 600] loss=0.3365 val_loss=0.0000 scale=8.0000 norm=12.8337


NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=3,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=None, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=700,
              natural_gradient=True,
              random_state=RandomState(MT1

In [0]:
pred = ngb_model.pred_dist(np.array(X_test))

In [58]:
log_loss(y_test,pred.probs[1])

0.3358886732837496

In [59]:
ngb_model.fit(np.array(X),np.array(y))

[iter 0] loss=0.3364 val_loss=0.0000 scale=2.0000 norm=3.2100
[iter 100] loss=0.3362 val_loss=0.0000 scale=8.0000 norm=12.8273
[iter 200] loss=0.3362 val_loss=0.0000 scale=8.0000 norm=12.8273
[iter 300] loss=0.3362 val_loss=0.0000 scale=8.0000 norm=12.8273
[iter 400] loss=0.3362 val_loss=0.0000 scale=0.0002 norm=0.0004
[iter 500] loss=0.3362 val_loss=0.0000 scale=0.0002 norm=0.0004
[iter 600] loss=0.3362 val_loss=0.0000 scale=0.0002 norm=0.0004


NGBClassifier(Base=DecisionTreeRegressor(ccp_alpha=0.0,
                                         criterion='friedman_mse', max_depth=3,
                                         max_features=None, max_leaf_nodes=None,
                                         min_impurity_decrease=0.0,
                                         min_impurity_split=None,
                                         min_samples_leaf=1,
                                         min_samples_split=2,
                                         min_weight_fraction_leaf=0.0,
                                         presort='deprecated',
                                         random_state=None, splitter='best'),
              Dist=<class 'ngboost.distns.categorical.k_categorical.<locals>.Categorical'>,
              Score=<class 'ngboost.scores.LogScore'>, col_sample=1.0,
              learning_rate=0.01, minibatch_frac=1.0, n_estimators=700,
              natural_gradient=True,
              random_state=RandomState(MT1

In [0]:
pred_ngb = ngb_model.pred_dist(np.array(test)).probs[1]

# Sub

In [61]:
sub=pd.DataFrame()
sub['Policy ID'] = ss['Policy ID']
sub['Lapse'] =  (((preds_xgb+preds_cat+preds_cat)/3)*0.2+preds_rf*0.1 + pred_ngb*0.7)*0.75
sub.head()

Unnamed: 0,Policy ID,Lapse
0,PID_4928TWH,0.244966
1,PID_KBLLEGK,6.2e-05
2,PID_90F0QA3,5.7e-05
3,PID_18F3NHF,0.067835
4,PID_SX4QUVO,0.204525


In [0]:
sub.to_csv('FinalCheck.csv',index=False)
#0.2442192983426