In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
import lightgbm
import xgboost as xgb
from catboost import CatBoostClassifier


In [2]:
train = pd.read_csv('/SharedHDD/Code/Cross_sell/train.csv')
test = pd.read_csv('/SharedHDD/Code/Cross_sell/test.csv')
submission = pd.read_csv('/SharedHDD/Code/Cross_sell/sample-submission.csv')

In [3]:
df=pd.concat([train,test],axis=0)

# Feature Engineering & EDA

In [4]:
le = LabelEncoder()
for col in df.select_dtypes(include='object').columns:
    df[col] = le.fit_transform(df[col])

In [5]:
# creating new features from the existing features
df['Region_population'] = df.groupby(['Region_Code'])['id'].transform('nunique')
df['Region_safety'] = round(df.groupby(['Region_Code'])['Vehicle_Damage'].transform('mean')*100).astype('int')
df['Channel_popularity'] =  round(df.groupby(['Region_Code'])['Policy_Sales_Channel'].transform('nunique')/df['Region_population']*100).astype('int')
df.loc[df['Channel_popularity']==8,'Channel_popularity'] = 7
df['Channel_mean_time'] = round(df.groupby(['Policy_Sales_Channel'])['Vintage'].transform('mean')).astype('int')


In [15]:
df['Region_population'] = pd.cut(df.Region_population,labels=np.arange(1,16), bins=15, right=True)
df['Region_safety'] = pd.cut(df['Region_safety'], bins=5,labels=np.arange(1,6), right=True)

In [16]:
def getdummies(df):
    cat_columns = ["Region_Code", 'Vehicle_Age', 'Policy_Sales_Channel', 'Channel_popularity', 'Region_population','Region_safety']
    df_without = df.drop(cat_columns, axis=1)
    for i in cat_columns:
        df[i] = df[i].astype('category')
    df = pd.get_dummies(df[cat_columns])
    df = pd.concat([df,df_without], axis=1)
    return df

In [40]:
df

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,Region_population,Region_safety,Channel_popularity,Channel_mean_time
0,1,1,44,1,28.0,0,2,1,40454.0,26.0,217,1.0,15,5,0,154
1,2,1,76,1,3.0,0,0,0,33536.0,26.0,183,0.0,2,3,1,154
2,3,1,47,1,28.0,0,2,1,38294.0,26.0,27,1.0,15,5,0,154
3,4,1,21,1,11.0,1,1,0,28619.0,152.0,203,0.0,2,2,0,155
4,5,0,29,1,41.0,1,1,0,27496.0,152.0,39,0.0,3,2,0,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127032,508142,0,26,1,37.0,1,1,0,30867.0,152.0,56,,1,4,1,155
127033,508143,0,38,1,28.0,0,0,1,28700.0,122.0,165,,15,5,0,153
127034,508144,1,21,1,46.0,1,1,0,29802.0,152.0,74,,3,3,0,155
127035,508145,1,71,1,28.0,1,0,0,62875.0,26.0,265,,15,5,0,154


In [41]:
df_onehot = getdummies(df)

In [42]:
# fixit

X=df[df['Response'].isnull()==False].drop(['id', 'Response'],axis=1)
X_onehot=df_onehot[df_onehot['Response'].isnull()==False].drop(['id', 'Response'],axis=1)
y=df[df['Response'].isnull()==False]['Response']
y=pd.DataFrame(y,columns=['Response'])
X_main_test=df[df['Response'].isnull()==True].drop(['id','Response'],axis=1)
X_main_test_onehot=df_onehot[df_onehot['Response'].isnull()==True].drop(['id','Response'],axis=1)


In [55]:
def make_integer():
    for col in X.select_dtypes(include=['float', 'category']).columns:
        X[col] = X[col].astype('int')
        X_main_test[col] = X_main_test[col].astype('int')
    return X, X_main_test
        

In [59]:
cat_features =['Region_population', 'Region_safety', 'Channel_popularity', 'Driving_License', 'Gender', 'Vehicle_Age', 'Vehicle_Damage','Policy_Sales_Channel', 'Region_Code', 'Previously_Insured']

## Model

In [60]:
 def lgb_train(X, y, test_X, params):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    pred_test = 0
    pred_train = 0
    for dev_index, val_index in kf.split(X):
        train_x, valid_x = X.iloc[dev_index,:], X.iloc[val_index,:]
        train_y, valid_y = y[dev_index], y[val_index]
        lgtrain = lightgbm.Dataset(train_x, train_y,categorical_feature=cat_features)
        lgvalid = lightgbm.Dataset(valid_x, valid_y,categorical_feature=cat_features)
        model = lightgbm.train(params, lgtrain, valid_sets=[lgvalid],  verbose_eval=100)
        pred_test_iter = model.predict(test_X, num_iteration=model.best_iteration)
        pred_test_iter[pred_test_iter<0]=0
        pred_test+=pred_test_iter
        pred_train_iter = model.predict(X, num_iteration=model.best_iteration)
        pred_train_iter[pred_train_iter<0]=0
        pred_train+=pred_train_iter
    pred_test /= 5.
    pred_train  /= 5.
    return pred_test, pred_train

In [61]:
lightgbm_params = {
    'n_estimators':500,
    'learning_rate':0.1,
    'num_leaves':123,
    'colsample_bytree':0.8,
    'subsample':0.9,
    'max_depth':8,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
    'min_split_gain':0.01,
    'min_child_weight':2,  
    'metric' : 'binary_error'
}

In [62]:
sub_lgb_test, sub_lgb_train = lgb_train(X, y['Response'].astype('int'), X_main_test, lightgbm_params)



You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 832
[LightGBM] [Info] Number of data points in the train set: 304887, number of used features: 14
[LightGBM] [Info] Start training from score 0.122665




[100]	valid_0's binary_error: 0.122104


[200]	valid_0's binary_error: 0.122314




[300]	valid_0's binary_error: 0.122314




[400]	valid_0's binary_error: 0.122314




[500]	valid_0's binary_error: 0.122314
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 828
[LightGBM] [Info] Number of data points in the train set: 304887, number of used features: 14
[LightGBM] [Info] Start training from score 0.122570


[100]	valid_0's binary_error: 0.122668


[200]	valid_0's binary_error: 0.122773




[300]	valid_0's binary_error: 0.122773


[400]	valid_0's binary_error: 0.122773




[500]	valid_0's binary_error: 0.122773


You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 831
[LightGBM] [Info] Number of data points in the train set: 304887, number of used features: 14
[LightGBM] [Info] Start training from score 0.122524
[100]	valid_0's binary_error: 0.122406


[200]	valid_0's binary_error: 0.122537




[300]	valid_0's binary_error: 0.122537




[400]	valid_0's binary_error: 0.122537




[500]	valid_0's binary_error: 0.122537
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 837
[LightGBM] [Info] Number of data points in the train set: 304887, number of used features: 14
[LightGBM] [Info] Start training from score 0.122534
[100]	valid_0's binary_error: 0.122419




[200]	valid_0's binary_error: 0.122563




[300]	valid_0's binary_error: 0.122563




[400]	valid_0's binary_error: 0.122563




[500]	valid_0's binary_error: 0.122563
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 833
[LightGBM] [Info] Number of data points in the train set: 304888, number of used features: 14
[LightGBM] [Info] Start training from score 0.122524
[100]	valid_0's binary_error: 0.122683


[200]	valid_0's binary_error: 0.122801




[300]	valid_0's binary_error: 0.122801




[400]	valid_0's binary_error: 0.122801




[500]	valid_0's binary_error: 0.122801


In [63]:
def xgb_train(X, y, test_X, params):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    pred_test_xgb = 0
    pred_train_xgb = 0
    for dev_index, val_index in kf.split(X):
        train_x, valid_x = X.loc[dev_index,:], X.loc[val_index,:]
        train_y, valid_y = y[dev_index], y[val_index]
        xgb_train_data = xgb.DMatrix(train_x, train_y)
        xgb_val_data = xgb.DMatrix(valid_x, valid_y)
        xgb_submit_data = xgb.DMatrix(test_X)
        xgb_submit_data_train = xgb.DMatrix(X)
        xgb_model = xgb.train(params, xgb_train_data, 
                          num_boost_round=500, 
                          evals= [(xgb_train_data, 'train'), (xgb_val_data, 'valid')], 
                          verbose_eval=100
                         )
        pred_test = xgb_model.predict(xgb_submit_data, ntree_limit=xgb_model.best_ntree_limit)
        pred_train = xgb_model.predict(xgb_submit_data_train, ntree_limit=xgb_model.best_ntree_limit)
        pred_test[pred_test<0]=0
        pred_train[pred_train<0]=0
        pred_test_xgb += pred_test
        pred_train_xgb += pred_train
    pred_test_xgb /= 5.
    pred_train_xgb /= 5.
    return pred_test_xgb, pred_train_xgb

In [64]:
xgb_params = {
    'seed': 0,
    'colsample_bytree': 0.7,
    'silent': 1,
    'subsample': 0.7,
    'learning_rate': 0.075,
    'objective': 'binary:logistic',
    'max_depth': 6,
    'num_parallel_tree': 1,
    'min_child_weight': 1,
    'nrounds': 500
}

In [65]:
sub_xgb_test, sub_xgb_train = xgb_train(X_onehot, y['Response'], X_main_test_onehot, xgb_params)

Parameters: { nrounds, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-error:0.12267	valid-error:0.12218
[100]	train-error:0.12255	valid-error:0.12210
[200]	train-error:0.12191	valid-error:0.12208
[300]	train-error:0.12107	valid-error:0.12226
[400]	train-error:0.12042	valid-error:0.12241
[499]	train-error:0.11980	valid-error:0.12242
Parameters: { nrounds, silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-error:0.12256	valid-error:0.12263
[100]	train-error:0.12253	valid-error:0.12248
[200]	train-error:0.12195	valid-error:0.12246
[300]	tr

In [66]:
def cat_train(X, y, test_X):
    kf = KFold(n_splits=5, shuffle=True, random_state=2017)
    pred_test_cat = 0
    pred_train_cat = 0
    for dev_index, val_index in kf.split(X):
        train_x, valid_x = X.loc[dev_index,:], X.loc[val_index,:]
        train_y, valid_y = y[dev_index], y[val_index]
        model = CatBoostClassifier(iterations=1000,
                            learning_rate = 0.1, l2_leaf_reg = 20, depth=8, eval_metric='Accuracy', early_stopping_rounds = 200, bootstrap_type = 'Bernoulli')
        model.fit(train_x, train_y, eval_set=(valid_x, valid_y),use_best_model=True,verbose=50, 
                  cat_features= cat_features)
        pred_test = model.predict_proba(test_X)
        pred_test[pred_test<0]=0
        pred_test_cat += pred_test
        pred_train = model.predict_proba(X)
        pred_train[pred_train<0]=0
        pred_train_cat += pred_train
    pred_test_cat /= 5.
    pred_train_cat /= 5.
    return pred_test_cat, pred_train_cat

In [67]:
sub_cat_test, sub_cat_train = cat_train(X, y['Response'], X_main_test)

0:	learn: 0.8773349	test: 0.8778437	best: 0.8778437 (0)	total: 184ms	remaining: 3m 4s
50:	learn: 0.8774989	test: 0.8779224	best: 0.8779355 (45)	total: 5.42s	remaining: 1m 40s
100:	learn: 0.8780794	test: 0.8778043	best: 0.8780142 (65)	total: 10.5s	remaining: 1m 33s
150:	learn: 0.8784205	test: 0.8775682	best: 0.8780142 (65)	total: 16s	remaining: 1m 30s
200:	learn: 0.8788502	test: 0.8776600	best: 0.8780142 (65)	total: 21.9s	remaining: 1m 27s
250:	learn: 0.8793487	test: 0.8776075	best: 0.8780142 (65)	total: 28s	remaining: 1m 23s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8780142216
bestIteration = 65

Shrink model to first 66 iterations.
0:	learn: 0.8774300	test: 0.8774632	best: 0.8774632 (0)	total: 79.2ms	remaining: 1m 19s
50:	learn: 0.8775645	test: 0.8774107	best: 0.8774632 (0)	total: 5.38s	remaining: 1m 40s
100:	learn: 0.8778925	test: 0.8774763	best: 0.8775157 (81)	total: 10.8s	remaining: 1m 36s
150:	learn: 0.8782828	test: 0.8775026	best: 0.8776075 (134)	total:

In [68]:
last_test = pd.DataFrame()
last_test['lgbm'] = sub_lgb_test
last_test['xbm'] = sub_xgb_test
last_test['cat'] = sub_cat_test[:,1]


In [69]:
prob = (last_test['cat']*0.5 + last_test['lgbm']*0.125+ last_test['xbm']*0.375)

In [70]:
test['Response'] = prob

In [71]:
test = test[submission.columns]
test.to_csv('/SharedHDD/Code/Cross_sell/output.csv', index = False)