In [88]:
# Notebook-wide imports and display configuration.
import pandas as pd
import numpy as np

# Gradient-boosting libraries.
import lightgbm as lgb
import catboost as cat

# NOTE(review): wildcard imports hide where names such as roc_auc_score and
# StratifiedKFold come from; consider importing them explicitly.
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn import preprocessing
# Single module-level LabelEncoder instance; each fit_transform call refits it,
# so after a loop it only remembers the classes of the last column it saw.
le = preprocessing.LabelEncoder()
# NOTE(review): scipy.misc.derivative was deprecated in SciPy 1.10 and removed
# in SciPy 1.12, so this import fails on newer SciPy installs.
from scipy.misc import derivative

# NOTE(review): importing display from IPython.core.display is deprecated in
# favour of IPython.display — confirm against the installed IPython version.
from IPython.core.display import display, HTML
# Stretch the classic-notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))
import warnings
# Silences every warning for the rest of the session (including numerical
# warnings such as overflow in np.exp and library deprecation notices).
warnings.filterwarnings("ignore")

# Show all rows and columns when rendering DataFrames.
pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)

In [89]:
# Read the training and test CSVs from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [90]:
# Stack train and test so every categorical column is encoded with one shared
# mapping, and discard the row identifier.
data = pd.concat([train, test]).drop(columns=['id'])

# Columns treated as categorical; each is replaced by integer codes.
cat_col = [
    'Gender',
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Policy_Sales_Channel',
]
for col in cat_col:
    encoded = le.fit_transform(data[col])
    data[col] = encoded

# Rows with a non-null Response are taken as the training part; rows where
# Response is null are taken as the part to be scored.
has_response = data['Response'].notna()
train2 = data[has_response]
test2 = data[~has_response]

y = train2['Response']
X_train = train2.drop(columns=['Response'])
X_test = test2[X_train.columns]

# Positional indices of the categorical columns inside X_train.
cat_cols = list(map(X_train.columns.get_loc, cat_col))

In [93]:
def focal_loss_lgb_sk(y_true, y_pred, alpha, gamma):
    """Gradient and Hessian of the binary focal loss w.r.t. the raw score.

    The loss for a target ``t`` and raw (pre-sigmoid) score ``x`` is, with
    ``p = sigmoid(x)``::

        -(alpha*t + (1-alpha)*(1-t))
            * (1 - (t*p + (1-t)*(1-p)))**gamma
            * (t*log(p) + (1-t)*log(1-p))

    Both derivatives are taken numerically with 3-point central differences
    (step 1e-6), i.e. the same stencils ``scipy.misc.derivative`` applied with
    its default order. They are written out in NumPy so the function does not
    depend on that helper, which newer SciPy releases no longer ship.

    Parameters
    ----------
    y_true : array-like
        Targets ``t``.
    y_pred : array-like
        Raw scores ``x`` (not probabilities).
    alpha : float
        Weight of the positive class; negatives get ``1 - alpha``.
    gamma : float
        Focusing exponent applied to ``1 - p_t``.

    Returns
    -------
    (grad, hess) : tuple of numpy.ndarray
        Element-wise first and second derivative of the loss at ``y_pred``.
    """
    a, g = alpha, gamma
    t = np.asarray(y_true, dtype=float)
    x = np.asarray(y_pred, dtype=float)

    def fl(z):
        # log(sigmoid(z)) and log(1 - sigmoid(z)) computed through logaddexp:
        # they stay finite when the sigmoid saturates, whereas np.log(p) /
        # np.log(1 - p) would give -inf and then 0 * -inf = nan.
        log_p = -np.logaddexp(0.0, -z)
        log_not_p = -np.logaddexp(0.0, z)
        p = np.exp(log_p)
        class_weight = a * t + (1 - a) * (1 - t)
        modulating = (1 - (t * p + (1 - t) * (1 - p))) ** g
        return -class_weight * modulating * (t * log_p + (1 - t) * log_not_p)

    dx = 1e-6
    f_minus, f_mid, f_plus = fl(x - dx), fl(x), fl(x + dx)
    grad = (f_plus - f_minus) / (2.0 * dx)
    hess = (f_plus - 2.0 * f_mid + f_minus) / (dx * dx)
    return grad, hess

def focal_loss_lgb_eval_error_sk(y_true, y_pred, alpha, gamma):
    """Mean binary focal loss of raw scores, in LightGBM eval-metric form.

    Parameters
    ----------
    y_true : array-like
        Targets ``t``.
    y_pred : array-like
        Raw (pre-sigmoid) scores.
    alpha : float
        Weight of the positive class; negatives get ``1 - alpha``.
    gamma : float
        Focusing exponent applied to ``1 - p_t``.

    Returns
    -------
    tuple
        ``('focal_loss', mean_loss, False)``; the trailing ``False`` is the
        "higher is better" flag.
    """
    a, g = alpha, gamma
    t = np.asarray(y_true, dtype=float)
    x = np.asarray(y_pred, dtype=float)
    # Stable log(sigmoid(x)) / log(1 - sigmoid(x)): with np.log(p) a saturated
    # sigmoid gives -inf, and 0 * -inf = nan would poison the whole mean.
    log_p = -np.logaddexp(0.0, -x)
    log_not_p = -np.logaddexp(0.0, x)
    p = np.exp(log_p)
    class_weight = a * t + (1 - a) * (1 - t)
    modulating = (1 - (t * p + (1 - t) * (1 - p))) ** g
    loss = -class_weight * modulating * (t * log_p + (1 - t) * log_not_p)
    return 'focal_loss', np.mean(loss), False

def focal_loss(y_true, y_pred):
    """Two-argument objective: focal loss gradient/Hessian with alpha=0.25, gamma=5."""
    return focal_loss_lgb_sk(y_true, y_pred, 0.25, 5)


def eval_error(y_true, y_pred):
    """Two-argument metric: mean focal loss with alpha=0.25, gamma=5."""
    return focal_loss_lgb_eval_error_sk(y_true, y_pred, 0.25, 5)


# LightGBM classifier trained with the custom focal-loss objective.
# NOTE(review): scale_pos_weight is documented for the built-in binary
# objective — confirm whether it has any effect with a callable objective.
clf_lgb = lgb.LGBMClassifier(
    objective=focal_loss,
    n_estimators=500,
    learning_rate=0.1,
    scale_pos_weight=3,
    random_state=101,
)

In [94]:
# Stratified K-fold cross-validation of clf_lgb: each fold is scored by ROC AUC
# on its held-out part, and test-set probabilities are averaged over the folds.
skf = StratifiedKFold(n_splits=5,shuffle=True,random_state=101)
n_folds = skf.get_n_splits()  # single source of truth for the fold count
cv_score = []  # unrounded per-fold AUCs
pred_test_lgb = np.zeros((len(X_test), 1))

for train_index, test_index in skf.split(X_train, y):
    x_train, x_val = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train, y_val = y.iloc[train_index], y.iloc[test_index]
    clf = clf_lgb
    clf.fit(x_train, y_train)

    # The model is trained with a custom objective, so it produces raw scores.
    # Ask for them explicitly instead of relying on what predict/predict_proba
    # fall back to for callable objectives, then apply the sigmoid ourselves.
    pred_val = np.asarray(clf.predict(x_val, raw_score=True))
    pred_val = 1 / (1 + np.exp(-pred_val))  # convert to probability

    # Keep full precision for the average; round only for display.
    score = roc_auc_score(y_val, pred_val)
    cv_score.append(score)
    print(round(score, 2), end=",")

    # Accumulate this fold's test-set probabilities.
    test_pred = np.asarray(clf.predict(X_test, raw_score=True))
    test_pred = 1 / (1 + np.exp(-test_pred))
    pred_test_lgb += test_pred.reshape(-1, 1)

# Average over however many folds were actually run.
pred_test_lgb = pred_test_lgb / n_folds

print(np.mean(cv_score))

0.86,0.86,0.86,0.86,0.86,0.86


In [95]:
# Build the submission: one averaged probability per test row, indexed by the
# id column of the test file, and write it to disk.
sub = pd.DataFrame({'Response': pred_test_lgb.ravel()})
sub.index = test['id']
sub.to_csv('Focal.csv')
sub.head()

Unnamed: 0_level_0,Response
id,Unnamed: 1_level_1
381110,0.155613
381111,0.419332
381112,0.413537
381113,0.253055
381114,0.135821
