In [1]:
import pandas as pd
import numpy as np

import lightgbm as lgb
import catboost as cat
import xgboost as xgb

from sklearn.metrics import *
from sklearn.model_selection import *

from sklearn import preprocessing
# Single shared encoder instance. The preprocessing cell re-fits it once per categorical
# column, so afterwards it only holds the classes of the last column it was fitted on.
le = preprocessing.LabelEncoder()
import matplotlib.pyplot as plt
import seaborn as sns

# `display` is taken from the public IPython.display module: importing it from
# IPython.core.display has been deprecated since IPython 7.14.
from IPython.display import display, HTML
# Inject CSS so the notebook container spans the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import warnings
# NOTE(review): this silences *every* warning for the rest of the session, including
# pandas chained-assignment and library deprecation warnings.
warnings.filterwarnings("ignore")

# Lift pandas' display truncation so frames are rendered with all rows and columns.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [2]:
# Read the two CSV files from the current working directory; the preprocessing cell
# later separates them again by whether 'Response' is present.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [3]:
# Peek at the first five rows to check the column layout.
train.head(5)

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,1,Male,44,1,28.0,0,> 2 Years,Yes,40454.0,26.0,217,1
1,2,Male,76,1,3.0,0,1-2 Year,No,33536.0,26.0,183,0
2,3,Male,47,1,28.0,0,> 2 Years,Yes,38294.0,26.0,27,1
3,4,Male,21,1,11.0,1,< 1 Year,No,28619.0,152.0,203,0
4,5,Female,29,1,41.0,1,< 1 Year,No,27496.0,152.0,39,0


In [26]:
# Stack train and test so each categorical column is label-encoded with one mapping
# covering both sets; the 'id' column is dropped so it is not used as a feature.
data = pd.concat([train, test]).drop(columns=['id'])

cat_col = [
    'Gender',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
]
for col in cat_col:
    # Cast to str first so numeric-coded and text columns are encoded the same way.
    data[col] = le.fit_transform(data[col].astype(str))

# Split the stacked frame back apart: rows with a Response value form the training
# set, rows where Response is NaN (presumably the unlabelled test rows) form the test set.
has_label = data['Response'].notna()
train2 = data[has_label]
test2 = data[~has_label]

y = train2['Response']
X_train = train2.drop(columns=['Response'])
X_test = test2[X_train.columns]
# Positional indices of the categorical columns within X_train, handed to the models.
cat_cols = [X_train.columns.get_loc(name) for name in cat_col]

In [27]:
# LightGBM configuration. scale_pos_weight=7 up-weights the positive class
# (presumably to offset class imbalance - confirm against the label distribution).
lgb_params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'n_estimators': 500,
    'learning_rate': 0.1,
    'reg_alpha': 3,
    'categorical_feature': cat_cols,
    'scale_pos_weight': 7,
    'random_state': 101,
}
clf_lgb = lgb.LGBMClassifier(**lgb_params)

# CatBoost configuration with the same tree count, learning rate and class weight.
# No seed is passed here, unlike the LightGBM model above.
cat_params = {
    'learning_rate': 0.1,
    'n_estimators': 500,
    'scale_pos_weight': 7,
    'cat_features': cat_cols,
    'verbose': False,
}
clf_cat = cat.CatBoostClassifier(**cat_params)

In [28]:
# Stratified K-fold cross-validation of the LightGBM model.
# For every fold: fit on the training part, print the validation ROC-AUC (rounded to
# two decimals), and add the fold's test-set probabilities to a running sum that is
# averaged once all folds are done.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)
# Take the divisor from the splitter itself so the average stays correct if n_splits changes.
n_folds = skf.get_n_splits()
pred_test_lgb = np.zeros((len(X_test), 1))
# Running *sum* of per-fold feature importances (it is not averaged in this cell).
feat_imp = np.zeros((len(X_test.columns), 1))

for train_index, test_index in skf.split(X_train, y):
    x_train, x_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    # clf is an alias of clf_lgb, so after the loop clf_lgb holds the last fold's fit.
    clf = clf_lgb
    clf.fit(x_train, y_train)
    print(np.round(roc_auc_score(y_val, clf.predict_proba(x_val)[:, 1]), 2), end=',')
    # Column 1 of predict_proba is the positive-class probability; keep the (n, 1) shape.
    pred_test_lgb += clf.predict_proba(X_test)[:, 1].reshape(-1, 1)
    feat_imp += clf.feature_importances_.reshape(-1, 1)

pred_test_lgb = pred_test_lgb / n_folds

0.86,0.86,0.85,0.85,0.85,

In [29]:
# Save the fold-averaged LightGBM probabilities to 'lgb.csv', indexed by the test ids.
pred = pred_test_lgb
sub = pd.DataFrame({'Response': pred.ravel()})
sub.index = test['id']
sub.to_csv('lgb.csv')
sub.head()

Unnamed: 0_level_0,Response
id,Unnamed: 1_level_1
381110,0.002123
381111,0.820021
381112,0.743223
381113,0.018233
381114,0.000814


In [30]:
# Stratified K-fold cross-validation of the CatBoost model.
# The splitter uses the same n_splits/shuffle/seed settings as the LightGBM run.
# Each fold's rounded validation ROC-AUC is stored in cv_score and printed, and the
# fold's test-set probabilities are summed and then averaged after the loop.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)
# Take the divisor from the splitter itself so the average stays correct if n_splits changes.
n_folds = skf.get_n_splits()
cv_score = []
pred_test_cat = np.zeros((len(X_test), 1))

for train_index, test_index in skf.split(X_train, y):
    x_train, x_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    # clf is an alias of clf_cat, so after the loop clf_cat holds the last fold's fit.
    clf = clf_cat
    clf.fit(x_train, y_train)
    score = round(roc_auc_score(y_val, clf.predict_proba(x_val)[:, 1]), 2)
    cv_score.append(score)
    print(score, end=",")
    # Column 1 of predict_proba is the positive-class probability; keep the (n, 1) shape.
    pred_test_cat += clf.predict_proba(X_test)[:, 1].reshape(-1, 1)

pred_test_cat = pred_test_cat / n_folds

0.86,0.86,0.86,0.86,0.86,

In [31]:
# Save the fold-averaged CatBoost probabilities to 'cat.csv', indexed by the test ids.
pred = pred_test_cat
sub = pd.DataFrame({'Response': pred.ravel()})
sub.index = test['id']
sub.to_csv('cat.csv')
sub.head()

Unnamed: 0_level_0,Response
id,Unnamed: 1_level_1
381110,0.00321
381111,0.771082
381112,0.750622
381113,0.053375
381114,0.002245


In [None]:
# Keep only the Policy_Sales_Channel codes that occur in BOTH train and test; any code
# seen in just one of them is replaced by the literal 'Missing' bucket.
# NOTE(review): this overwrites X_train / X_test in place, so the column can end up
# holding a mix of integer codes and the string 'Missing'. Cells that consume these
# frames with this column declared as a categorical index should be run before this one.
policy = list(set(X_train['Policy_Sales_Channel']).intersection(set(X_test['Policy_Sales_Channel'])))
# Set membership is O(1) per row; testing against the list would rescan it for every row.
_shared_channels = set(policy)
X_train['Policy_Sales_Channel'] = X_train['Policy_Sales_Channel'].apply(
    lambda x: x if x in _shared_channels else 'Missing'
)
X_test['Policy_Sales_Channel'] = X_test['Policy_Sales_Channel'].apply(
    lambda x: x if x in _shared_channels else 'Missing'
)

In [None]:
# One-hot encode the categorical columns for the XGBoost model.
df_train = pd.get_dummies(data=X_train, columns=cat_col)
df_test = pd.get_dummies(data=X_test, columns=cat_col)
# The two frames are encoded independently, so a category present in only one of them
# produces a dummy column the other lacks, and column order may differ as well. Force the
# test frame onto the training layout: missing dummies become all-zero columns and
# test-only dummies are dropped, so the model always sees the features it was fitted on.
df_test = df_test.reindex(columns=df_train.columns, fill_value=0)

clf_xgb = xgb.XGBClassifier(
    objective='binary:logistic',
    n_estimators=100,
    learning_rate=0.1,
    scale_pos_weight=7,
    random_state=101,
)

# Stratified K-fold cross-validation: store and print each fold's rounded validation
# ROC-AUC and accumulate the fold's test-set probabilities for averaging.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)
# Take the divisor from the splitter itself so the average stays correct if n_splits changes.
n_folds = skf.get_n_splits()
cv_score = []
pred_test_xgb = np.zeros((len(X_test), 1))

for train_index, test_index in skf.split(df_train, y):
    x_train, x_val = df_train.iloc[train_index], df_train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    # clf is an alias of clf_xgb, so after the loop clf_xgb holds the last fold's fit.
    clf = clf_xgb
    clf.fit(x_train, y_train)
    score = round(roc_auc_score(y_val, clf.predict_proba(x_val)[:, 1]), 2)
    cv_score.append(score)
    print(score, end=",")
    # Column 1 of predict_proba is the positive-class probability; keep the (n, 1) shape.
    pred_test_xgb += clf.predict_proba(df_test)[:, 1].reshape(-1, 1)

# Average the XGBoost accumulator. The original divided an undefined `pred_test_lr`,
# which raised NameError and left pred_test_xgb as an un-averaged sum of the folds.
pred_test_xgb = pred_test_xgb / n_folds

In [None]:
# Save the XGBoost test-set predictions to 'try1.csv', indexed by the test ids.
pred = pred_test_xgb
sub = pd.DataFrame({'Response': pred.ravel()})
sub.index = test['id']
sub.to_csv('try1.csv')
sub.head()