In [26]:
import pandas as pd
import numpy as np

# NOTE(review): wildcard import kept because other cells may rely on names it
# exposes; the visible cells only use StratifiedShuffleSplit from it.
from sklearn.model_selection import *
from sklearn.metrics import accuracy_score,roc_auc_score
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

import warnings

# `display` and `HTML` live in the public IPython.display module; importing
# `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Single LabelEncoder instance; the preprocessing cell refits it on each
# categorical column in turn.
le = preprocessing.LabelEncoder()

# Suppress every warning for the rest of the session (keeps cell output short,
# but also hides deprecation and convergence warnings).
warnings.filterwarnings("ignore")

# Stretch the classic-notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Never truncate rows or columns when rendering DataFrames.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [27]:
# Load the raw train / test tables from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Sales channels that occur in BOTH files. A set is used for the membership
# test below so each row costs a hash lookup instead of a linear list scan.
policy_set = set(train['Policy_Sales_Channel']).intersection(set(test['Policy_Sales_Channel']))
# `policy` is kept as a list, as before, for any later cell that reads it.
policy = list(policy_set)

# Channels seen in only one of the two files are collapsed into the single
# placeholder value 'Missing'.
train['Policy_Sales_Channel'] = train['Policy_Sales_Channel'].apply(
    lambda x: x if x in policy_set else 'Missing')
test['Policy_Sales_Channel'] = test['Policy_Sales_Channel'].apply(
    lambda x: x if x in policy_set else 'Missing')

In [28]:
# Preview the first rows of the training frame (after the channel replacement above).
train.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152,39,0


In [29]:
# Stack train and test so each categorical column is label-encoded once over
# the combined rows, then discard the row identifier.
data = pd.concat([train, test])
data = data.drop(columns=['id'])

cat_col = [
    'Gender', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel',
]
for col in cat_col:
    # Cast to string first so mixed-type columns become one comparable type,
    # then map each distinct string to an integer code.
    data[col] = le.fit_transform(data[col].astype(str))

# Rows with a Response value form the training part; rows without one are the
# test part.
is_labeled = data['Response'].notna()
train2 = data[is_labeled]
test2 = data[~is_labeled]

# Policy_Sales_Channel is encoded above but left out of the model features.
y = train2['Response']
X = train2.drop(columns=['Response', 'Policy_Sales_Channel'])
X_test = test2[X.columns]

# Categorical feature names handed to the model (Policy_Sales_Channel excluded).
cat_col = [
    'Gender', 'Driving_License', 'Region_Code', 'Previously_Insured',
    'Vehicle_Age', 'Vehicle_Damage',
]

In [30]:
from deeptables.models.deeptable import DeepTable, ModelConfig
from deeptables.datasets import dsutils
from deeptables.models import deeptable
from tensorflow import keras
from tensorflow.keras.callbacks import LearningRateScheduler , EarlyStopping

In [31]:
def build_model():
    """Create a new, unfitted DeepTable using the notebook's fixed configuration.

    The configuration selects the 'pnn_nets' network, combines net outputs with
    the 'add' stacking op, uses a fixed embedding size of 10, tracks accuracy,
    and disables class weighting. Categorical columns are read from the
    module-level ``cat_col`` list at call time.

    Returns:
        DeepTable: a model object built from the configuration above.
    """
    # Each hidden_units entry is presumably (units, dropout rate, batch-norm
    # flag) -- TODO confirm against the DeepTables documentation.
    dense_layers = (
        (128, 0.2, True),
        (128, 0.2, True),
    )
    dnn_settings = {
        'hidden_units': dense_layers,
        'dnn_activation': 'relu',
    }

    model_config = ModelConfig(
        categorical_columns=cat_col,
        nets=['pnn_nets'],
        stacking_op='add',
        dnn_params=dnn_settings,
        fixed_embedding_dim=True,
        embeddings_output_dim=10,
        metrics=['accuracy'],
        apply_class_weight=False,
    )
    return DeepTable(config=model_config)

In [32]:
# Training settings shared by every cross-validation split.
batch_size = 2000
epochs = 100  # upper bound; early stopping may end training sooner
seeds = [32, 432, 73]  # not referenced by the cells visible in this notebook section

# Learning-rate schedule: 1e-3 scaled by 0.95 raised to the value Keras passes
# to the schedule function (the epoch index).
annealer = LearningRateScheduler(lambda epoch: 1e-3 * 0.95 ** epoch)

# Stop once val_loss has gone 5 epochs without improving and roll the model
# back to its best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

In [33]:
# Repeated stratified 80/20 holdout: train a fresh DeepTable per split, record
# its holdout accuracy, and accumulate its test-set probabilities.
scores = []
X_train_cv, y_train_cv = X.copy(), y.copy()
# Running SUM of test predictions over all splits; it must be divided by the
# number of splits before use.
pred = np.zeros((len(test2), 1))
sssf = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=1)
for i, (idxT, idxV) in enumerate(sssf.split(X_train_cv, y_train_cv)):

    # Slice each split once and reuse the frames below instead of re-indexing
    # with .iloc for every call.
    X_fit, y_fit = X_train_cv.iloc[idxT], y_train_cv.iloc[idxT]
    X_val, y_val = X_train_cv.iloc[idxV], y_train_cv.iloc[idxV]

    # Whole batches only: the floor division ignores any trailing partial batch.
    steps_per_epoch = len(X_fit) // batch_size
    validation_steps = len(X_val) // batch_size

    print('Fold', i)
    print(' rows of train =', len(idxT), 'rows of holdout =', len(idxV))

    dt_cv = build_model()
    model_dnn_cv, history_cv = dt_cv.fit(X_fit, y_fit,
                                         validation_data=(X_val, y_val),
                                         steps_per_epoch=steps_per_epoch,
                                         validation_steps=validation_steps,
                                         batch_size=batch_size, epochs=epochs,
                                         verbose=0, callbacks=[early_stop, annealer])

    val_stats = dt_cv.evaluate(X_val, y_val, batch_size=batch_size, verbose=0)
    pred += dt_cv.predict_proba(X_test)

    acc = val_stats['accuracy'] * 100
    scores.append(acc)

    # The model is a DeepTable, not LightGBM, so the label says so.
    print('DeepTable Val accuracy (%) =', acc)
    print('_' * 130)
    print('\n')

# Summarise the per-split scores that were collected above.
print('Mean holdout accuracy (%) =', np.mean(scores))

Fold 0
 rows of train = 304887 rows of holdout = 76222
2 class detected, {0.0, 1.0}, so inferred as a [binary classification] task
Preparing features cost:0.023377180099487305
Imputation cost:0.11058306694030762
Categorical encoding cost:0.1085209846496582
fit_transform cost:0.2991211414337158
transform_X cost:2.570528030395508
transform_y cost:0.0017039775848388672
>>>>>>>>>>>>>>>>>>>>>> Model Desc <<<<<<<<<<<<<<<<<<<<<<< 
---------------------------------------------------------
inputs:
---------------------------------------------------------
['all_categorical_vars: (6)', 'input_continuous_all: (3)']
---------------------------------------------------------
embeddings:
---------------------------------------------------------
input_dims: [4, 4, 55, 4, 5, 4]
output_dims: [10, 10, 10, 10, 10, 10]
dropout: 0.3
---------------------------------------------------------
dense: dropout: 0
batch_normalization: False
---------------------------------------------------------
concat_embed_dens

Preparing features cost:0.018023014068603516
Imputation cost:0.0950620174407959
Categorical encoding cost:0.10158205032348633
fit_transform cost:0.2695589065551758
transform_X cost:2.6078779697418213
transform_y cost:0.0015380382537841797
>>>>>>>>>>>>>>>>>>>>>> Model Desc <<<<<<<<<<<<<<<<<<<<<<< 
---------------------------------------------------------
inputs:
---------------------------------------------------------
['all_categorical_vars: (6)', 'input_continuous_all: (3)']
---------------------------------------------------------
embeddings:
---------------------------------------------------------
input_dims: [4, 4, 55, 4, 5, 4]
output_dims: [10, 10, 10, 10, 10, 10]
dropout: 0.3
---------------------------------------------------------
dense: dropout: 0
batch_normalization: False
---------------------------------------------------------
concat_embed_dense: shape: (None, 63)
---------------------------------------------------------
nets: ['pnn_nets']
--------------------------------

In [35]:
# Average the summed predictions over the number of splits actually configured
# on the splitter, rather than a hard-coded 5. `pred` itself is left untouched
# so re-running this cell does not divide the predictions a second time.
n_splits_used = sssf.get_n_splits()
pred_mean = pred / n_splits_used

# One probability per test row, indexed by the original test ids.
sub = pd.DataFrame(
    {'Response': pred_mean.ravel()},
    index=pd.Index(test['id'], name='id'),
)
sub.to_csv('DeepTable.csv')
sub.head()

Unnamed: 0_level_0,Response
id,Unnamed: 1_level_1
381110,0.000155
381111,0.072189
381112,0.066493
381113,0.001948
381114,0.000113
