<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#loading-data" data-toc-modified-id="loading-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>loading data</a></span></li><li><span><a href="#feature-engineering" data-toc-modified-id="feature-engineering-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>feature engineering</a></span></li><li><span><a href="#lgb-Model" data-toc-modified-id="lgb-Model-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>lgb Model</a></span></li><li><span><a href="#cat-Model" data-toc-modified-id="cat-Model-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>cat Model</a></span></li><li><span><a href="#xgb-Model" data-toc-modified-id="xgb-Model-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>xgb Model</a></span></li><li><span><a href="#ensemble-Model" data-toc-modified-id="ensemble-Model-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>ensemble Model</a></span></li></ul></div>

In [11]:
%load_ext autoreload
%autoreload 2

import sys
import pandas as pd
import numpy as np
from pathlib import Path

import os
import gc

import matplotlib.pyplot as plt
from tqdm.notebook import tqdm as tqdm

# Use fully qualified option names: the bare "max_columns" / "max_rows" patterns
# are ambiguous on pandas versions that also define styler.render.* options.
# None (rather than the deprecated -1 sentinel) disables cell-content truncation.
pd.set_option("display.max_colwidth", None)
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

# Plot styling: FiveThirtyEight theme with enlarged axis and tick labels.
plt.style.use('fivethirtyeight')
plt.rcParams.update({
    "axes.labelsize": 16,
    "xtick.labelsize": 14,
    "ytick.labelsize": 14,
})

%matplotlib inline
import config as config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### loading data

In [12]:
# Read the training file from the configured data directory, report its shape
# and preview the first rows transposed (one column per record).
train = pd.read_csv(Path(config.DATA_DIR) / config.TRAIN_FILE)
print(train.shape)
train.head(5).transpose()

(381109, 12)


Unnamed: 0,0,1,2,3,4
id,1,2,3,4,5
Gender,Male,Male,Male,Male,Female
Age,44,76,47,21,29
Driving_License,1,1,1,1,1
Region_Code,28,3,28,11,41
Previously_Insured,0,0,0,1,1
Vehicle_Age,> 2 Years,1-2 Year,> 2 Years,< 1 Year,< 1 Year
Vehicle_Damage,Yes,No,Yes,No,No
Annual_Premium,40454,33536,38294,28619,27496
Policy_Sales_Channel,26,26,26,152,152


In [13]:
# Read the test file from the configured data directory, report its shape
# and preview the first rows transposed (one column per record).
test = pd.read_csv(Path(config.DATA_DIR) / config.TEST_FILE)
print(test.shape)
test.head(5).transpose()

(127037, 11)


Unnamed: 0,0,1,2,3,4
id,381110,381111,381112,381113,381114
Gender,Male,Male,Male,Male,Male
Age,25,40,47,24,27
Driving_License,1,1,1,1,1
Region_Code,11,28,28,27,28
Previously_Insured,1,0,0,1,1
Vehicle_Age,< 1 Year,1-2 Year,1-2 Year,< 1 Year,< 1 Year
Vehicle_Damage,No,Yes,Yes,Yes,No
Annual_Premium,35786,33762,40050,37356,59097
Policy_Sales_Channel,152,7,124,152,152


In [14]:
# Column roles: target and date columns come from the project config, the
# categorical columns are listed by hand.
target = config.TARGET
date_cols = config.DATE_COLS
cat_cols = [
    'Gender',
    'Driving_License',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Region_Code',
    'Policy_Sales_Channel',
]
print(f"cat cols {cat_cols}")

# Whatever is left after removing the target, categorical and date columns.
num_cols = [
    col
    for col in train.columns
    if col != target and col not in cat_cols and col not in date_cols
]
print(f"num cols {num_cols}")

cat cols ['Gender', 'Driving_License', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Region_Code', 'Policy_Sales_Channel']
num cols ['id', 'Age', 'Annual_Premium', 'Vintage']


In [15]:
# Size of the test set expressed as a percentage of the training set.
print(f"ratio of size of test to train {len(test) / len(train) * 100:0.2f}%")

ratio of size of test to train 33.33%


### feature engineering

In [16]:
def feature_engineering(df):
    """Add age-group, damage-flag, vintage-bucket and group-statistic features.

    The input frame is left untouched: all work happens on a copy, so calling
    this function repeatedly on the same frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``Age``, ``Gender``, ``Vehicle_Damage``
        (compared against the string ``'Yes'``) and ``Vintage``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same rows in the same order (fresh 0..n-1 index)
        and these changes:

        * ``Age_group``: ``'0-group'`` for Age <= 25, ``'1-group'`` for
          25 < Age <= 65, ``'2-group'`` for Age > 65.
        * ``Vehicle_Damage``: 1 where the value was ``'Yes'``, otherwise 0.
        * ``Vintage_halfyearly``: ``Vintage / 180`` rounded to 0 decimals.
        * ``Gender_Vehicle_Damage_{median,std}`` and
          ``Age_group_Vehicle_Damage_{median,std}``: per-group statistics of
          the binary damage flag; an undefined std is stored as 0.
          Rows whose grouping key is missing are kept and get NaN statistics.
    """
    # Copy first: the column assignments below would otherwise modify the
    # caller's frame, and a second call would then see an already-encoded
    # Vehicle_Damage column and turn every value into 0.
    df = df.copy()

    bins = np.array([25, 65])
    age_map = {0: '0-group', 1: '1-group', 2: '2-group'}
    df['Age_group'] = np.digitize(df['Age'], bins, right=True)
    df['Age_group'] = df['Age_group'].map(age_map)

    df['Vehicle_Damage'] = np.where(df['Vehicle_Damage'] == 'Yes', 1, 0)
    df['Vintage_halfyearly'] = np.round(df['Vintage'] / 180, 0)

    for key in ('Gender', 'Age_group'):
        stats = df.groupby(key).agg({'Vehicle_Damage': ["median", "std"]})
        stats.columns = [key + "_" + "_".join(col) for col in stats.columns]
        stats = stats.reset_index().fillna(0)

        # A left join keeps every input row in its original order; the default
        # inner join drops rows whose key is absent from the statistics table.
        df = pd.merge(df, stats, on=key, how='left')

    return df

# Register the engineered categorical features only if they are not listed yet,
# so re-running this cell does not append duplicate names to cat_cols.
cat_cols += [col for col in ['Age_group', 'Vintage_halfyearly'] if col not in cat_cols]

In [17]:
# Stack train and test so the engineered features are derived from both frames
# together. pd.concat replaces DataFrame.append, which is deprecated and no
# longer available in pandas 2.x; ignore_index avoids duplicated row labels in
# the stacked frame.
overall = pd.concat([train, test], sort=False, ignore_index=True)
overall = feature_engineering(overall)

# Either one-hot encode the categorical columns or mark them as pandas
# categoricals, depending on the project configuration.
if config.OHE:
    overall = pd.get_dummies(overall, columns=cat_cols)
else:
    overall[cat_cols] = overall[cat_cols].astype('category')

# Split back into train / test by id; both halves are sorted by id and re-indexed.
train_data = overall[overall['id'].isin(train['id'])].sort_values('id').reset_index(drop=True)
test_data = overall[overall['id'].isin(test['id'])].sort_values('id').reset_index(drop=True)

del overall
gc.collect()

train_data.shape, test_data.shape

((381109, 18), (127037, 18))

In [18]:
# Columns excluded from the model inputs: the raw Vintage column plus the
# identifier columns named in the config.
drop_cols = ['Vintage'] + list(config.ID_COLS)

In [19]:
from sklearn.model_selection import train_test_split

# Stratified 70/30 hold-out split of the training data with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data.drop(columns=[target] + drop_cols),
    train_data[target],
    test_size=0.3,
    random_state=34343434,
    stratify=train_data[target],
)


### lgb Model

In [20]:
import lightgbm as lgb

# Give LightGBM a large tree budget and let early stopping on the hold-out
# split decide how many boosting rounds are actually useful.
clf = lgb.LGBMClassifier(
    verbose=0,
    n_estimators=10000,
    class_weight='balanced',
    colsample_bytree=0.5,
    learning_rate=0.01,
    importance_type='gain',
    lambda_l1=0.1,
    lambda_l2=0,
    metric=config.EVAL_METRIC,
)
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric=config.EVAL_METRIC,
    early_stopping_rounds=500,
    verbose=500,
)

Training until validation scores don't improve for 500 rounds
[500]	valid_0's auc: 0.858977
[1000]	valid_0's auc: 0.859087
Early stopping, best iteration is:
[733]	valid_0's auc: 0.859095


LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.5, importance_type='gain', lambda_l1=0.1,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=10000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
               verbose=0)

In [21]:
# Refit on the complete training data with 10% more boosting rounds than the
# best iteration found by early stopping on the hold-out split.
clf_final = lgb.LGBMClassifier(
    verbose=0,
    n_estimators=int(clf.best_iteration_ * 1.1),
    class_weight='balanced',
    colsample_bytree=0.5,
    learning_rate=0.01,
    importance_type='gain',
    lambda_l1=0.1,
    lambda_l2=0,
    metric=config.EVAL_METRIC,
)
clf_final.fit(train_data.drop(columns=[target] + drop_cols), train_data[target])

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.5, importance_type='gain', lambda_l1=0.1,
               lambda_l2=0, learning_rate=0.01, max_depth=-1, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=806, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0,
               verbose=0)

In [22]:
# Keep column 1 of predict_proba (the probability of the second class) per test row.
prediction = clf_final.predict_proba(
    test_data.drop(columns=[target] + drop_cols)
)[:, 1]

In [23]:
# Write the LightGBM submission file.
# `prediction` follows the row order of `test_data` (sorted by id), so the id
# columns are taken from `test_data` rather than from the raw `test` frame; this
# keeps ids and scores aligned even when the test file is not sorted by id.
if len(config.SUBMISSION_COLS):
    sub = pd.DataFrame(prediction, columns=config.SUBMISSION_COLS)
else:
    sub = pd.DataFrame({id_col: test_data[id_col].values for id_col in config.ID_COLS})
    sub[target] = prediction

sub.to_csv(Path(config.DATA_DIR, "lgb_fulldata.csv"), index=False)

sub.head()

Unnamed: 0,id,Response
0,381110,0.006658
1,381111,0.792396
2,381112,0.750607
3,381113,0.039712
4,381114,0.004605


In [24]:
# Drop the fitted LightGBM estimators; the names clf / clf_final are reused by
# the CatBoost section below.
del clf, clf_final

### cat Model

In [25]:
import catboost as cat
# CatBoost with a large tree budget; early stopping on the hold-out split
# selects the number of iterations. learning_rate is not passed, so CatBoost
# uses its own default.
clf = cat.CatBoostClassifier(
    verbose=0,
    n_estimators=10000,
    eval_metric='AUC',
    cat_features=cat_cols,
)
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    early_stopping_rounds=100,
    verbose=500,
)

Learning rate set to 0.05236
0:	test: 0.8239293	best: 0.8239293 (0)	total: 243ms	remaining: 40m 33s
500:	test: 0.8588514	best: 0.8588692 (469)	total: 2m 13s	remaining: 42m 18s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8589042268
bestIteration = 579

Shrink model to first 580 iterations.


<catboost.core.CatBoostClassifier at 0x1a16de08d0>

In [27]:
# Refit on the complete training data with 10% more iterations than the best
# iteration found by early stopping on the hold-out split.
clf_final = cat.CatBoostClassifier(
    verbose=0,
    n_estimators=int(clf.best_iteration_ * 1.1),
    eval_metric='AUC',
    cat_features=cat_cols,
)
clf_final.fit(train_data.drop(columns=[target] + drop_cols), train_data[target])

<catboost.core.CatBoostClassifier at 0x1a1b761160>

In [28]:
# Keep column 1 of predict_proba (the probability of the second class) per test row.
prediction = clf_final.predict_proba(
    test_data.drop(columns=[target] + drop_cols)
)[:, 1]

In [29]:
# Write the CatBoost submission file.
# `prediction` follows the row order of `test_data` (sorted by id), so the id
# columns are taken from `test_data` rather than from the raw `test` frame; this
# keeps ids and scores aligned even when the test file is not sorted by id.
if len(config.SUBMISSION_COLS):
    sub = pd.DataFrame(prediction, columns=config.SUBMISSION_COLS)
else:
    sub = pd.DataFrame({id_col: test_data[id_col].values for id_col in config.ID_COLS})
    sub[target] = prediction

sub.to_csv(Path(config.DATA_DIR, "cb_fulldata.csv"), index=False)

sub.head()

Unnamed: 0,id,Response
0,381110,0.000486
1,381111,0.306712
2,381112,0.297098
3,381113,0.00804
4,381114,0.000487


In [30]:
# Drop the fitted CatBoost estimators; the names clf / clf_final are reused by
# the XGBoost section below.
del clf, clf_final

### xgb Model

In [31]:
# Target Encoding

import category_encoders as ce

# Fit the target encoder on the training split only, then apply the fitted
# encoder to the validation split.
# NOTE(review): X_train / X_valid are overwritten here, so re-running this cell
# (or the LightGBM / CatBoost cells after it) operates on already-encoded data.
encoder = ce.target_encoder.TargetEncoder(cols=cat_cols)
X_train = encoder.fit_transform(X_train, y_train)
X_valid = encoder.transform(X_valid)

In [32]:
%%time

import xgboost as xgb

# XGBoost on the target-encoded split: large tree budget, with early stopping
# on the hold-out split choosing the number of boosting rounds.
clf = xgb.XGBClassifier(
    n_jobs=-1,
    n_estimators=10000,
    max_depth=7,
    learning_rate=0.01,
    colsample_bytree=0.5,
)

clf.fit(
    X_train,
    y_train,
    eval_set=[(X_valid, y_valid)],
    eval_metric=config.EVAL_METRIC,
    early_stopping_rounds=500,
    verbose=500,
)

[0]	validation_0-auc:0.821926
Will train until validation_0-auc hasn't improved in 500 rounds.
[500]	validation_0-auc:0.858146
[1000]	validation_0-auc:0.858855
[1500]	validation_0-auc:0.858804
Stopping. Best iteration:
[1121]	validation_0-auc:0.858926

CPU times: user 54min 9s, sys: 2min 3s, total: 56min 13s
Wall time: 6min 4s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.5, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=7,
              min_child_weight=1, missing=None, n_estimators=10000, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [33]:
# Target Encoding Complete data

# Refit the target encoder on the complete training data and apply it to the
# test data.
# NOTE(review): train_data / test_data are overwritten in place and still carry
# the target and id columns; only cols=cat_cols are requested for encoding —
# confirm the encoder leaves the remaining columns untouched.
encoder = ce.target_encoder.TargetEncoder(cols=cat_cols)
train_data = encoder.fit_transform(train_data, train_data[target])
test_data = encoder.transform(test_data)

In [34]:
# Refit XGBoost on the complete training data with 10% more boosting rounds
# than the best iteration found by early stopping on the hold-out split.
clf_final = xgb.XGBClassifier(n_jobs=-1,
                              n_estimators=int(clf.best_iteration*1.1),
                              max_depth=7,
                              learning_rate=0.01,
                              colsample_bytree=0.5)

clf_final.fit(train_data.drop([target]+drop_cols, axis=1), train_data[target])
prediction = clf_final.predict_proba(test_data.drop([target]+drop_cols, axis=1))[:, 1]

# Write the XGBoost submission file.
# `prediction` follows the row order of `test_data` (sorted by id), so the id
# columns are taken from `test_data` rather than from the raw `test` frame; this
# keeps ids and scores aligned even when the test file is not sorted by id.
if len(config.SUBMISSION_COLS):
    sub = pd.DataFrame(prediction, columns=config.SUBMISSION_COLS)
else:
    sub = pd.DataFrame({id_col: test_data[id_col].values for id_col in config.ID_COLS})
    sub[target] = prediction

sub.to_csv(Path(config.DATA_DIR, "xgb_fulldata.csv"), index=False)

sub.head()

Unnamed: 0,id,Response
0,381110,0.000858
1,381111,0.315897
2,381112,0.302546
3,381113,0.008342
4,381114,0.000447


### ensemble Model

In [35]:
import pandas as pd
import numpy as np
import config as config
from pathlib import Path

In [36]:
# Take the target column name from the shared config (as the modelling
# sections do) instead of repeating the literal here.
target = config.TARGET

In [37]:
# Load the per-model submissions from config.DATA_DIR, the directory the
# modelling sections write them to (not the current working directory).
sub1 = pd.read_csv(Path(config.DATA_DIR, "lgb_fulldata.csv"), index_col=None)  # LightGBM
sub2 = pd.read_csv(Path(config.DATA_DIR, "cb_fulldata.csv"), index_col=None)   # CatBoost
sub3 = pd.read_csv(Path(config.DATA_DIR, "xgb_fulldata.csv"), index_col=None)  # XGBoost

In [38]:
# Preview the predictions loaded from lgb_fulldata.csv.
sub1.head()

Unnamed: 0,id,Response
0,381110,0.006658
1,381111,0.792396
2,381112,0.750607
3,381113,0.039712
4,381114,0.004605


In [39]:
# Preview the predictions loaded from cb_fulldata.csv.
sub2.head()

Unnamed: 0,id,Response
0,381110,0.000486
1,381111,0.306712
2,381112,0.297098
3,381113,0.00804
4,381114,0.000487


In [40]:
# Preview the predictions loaded from xgb_fulldata.csv.
sub3.head()

Unnamed: 0,id,Response
0,381110,0.000858
1,381111,0.315897
2,381112,0.302546
3,381113,0.008342
4,381114,0.000447


In [41]:
# Correlation matrix between the lgb_fulldata.csv and cb_fulldata.csv predictions.
np.corrcoef(sub1[target], sub2[target])

array([[1.        , 0.95230542],
       [0.95230542, 1.        ]])

In [42]:
# Correlation matrix between the lgb_fulldata.csv and xgb_fulldata.csv predictions.
np.corrcoef(sub1[target], sub3[target])

array([[1.        , 0.96140514],
       [0.96140514, 1.        ]])

In [43]:
# Blend weights for sub1, sub2 and sub3 (in that order), rescaled so that they
# sum to one.
w1, w2, w3 = 0.4, 0.4, 0.2
total = w1 + w2 + w3
w1, w2, w3 = w1 / total, w2 / total, w3 / total

print(w1, w2, w3)

# Weighted average of the three prediction vectors.
final_pred = sum(
    weight * frame[target].values
    for weight, frame in zip((w1, w2, w3), (sub1, sub2, sub3))
)

0.4 0.4 0.2


In [44]:
# Assemble the blended submission: either the configured submission columns
# or the id columns copied from sub1 followed by the blended prediction.
if len(config.SUBMISSION_COLS):
    sub = pd.DataFrame(final_pred, columns=config.SUBMISSION_COLS)
else:
    sub = pd.DataFrame()
    for id_col in config.ID_COLS:
        sub[id_col] = sub1[id_col]
    sub[target] = final_pred

# Both branches write to the same file, so the write happens once here.
sub.to_csv(Path(config.DATA_DIR, "ensemble_fulldata.csv"), index=None)

sub.head()

Unnamed: 0,id,Response
0,381110,0.003029
1,381111,0.502822
2,381112,0.479591
3,381113,0.020769
4,381114,0.002126


In [45]:
# Summary statistics of the blended submission.
sub.describe()

Unnamed: 0,id,Response
count,127037.0,127037.0
mean,444628.0,0.206121
std,36672.567411,0.221492
min,381110.0,0.000627
25%,412869.0,0.002177
50%,444628.0,0.089587
75%,476387.0,0.436876
max,508146.0,0.856853
