In [30]:
from sklearn.metrics import roc_curve, auc

def get_roc(df,score,target,title,plot=1):
    df1 = df[[score,target]].dropna()
    fpr, tpr, thresholds = roc_curve(df1[target], df1[score])
    ks=np.abs(tpr-fpr)
    if plot==1:
    # Plot ROC curve
        plt.figure(figsize=(6,4))
        plt.plot(fpr, tpr, label='AUC=%0.2f KS=%0.2f' %(auc(fpr, tpr),ks.max()))
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.grid(b=True, which='both', color='0.65',linestyle='-')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(title+'Receiver Operating Characteristic')
        plt.legend(loc="lower right")
    return auc(fpr, tpr),np.max(np.abs(tpr-fpr)),thresholds[ks.argmax()]


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
print(sklearn.__version__)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
import os,sys
os.chdir('/data/arpit.goel/30_ClaimPrediction')

0.18.1


In [57]:
train_data = pd.read_csv('01.RawData/train.csv', na_values='-1')
test_data = pd.read_csv('01.RawData/test.csv', na_values='-1')
missing_vals=train_data.median() 
cat_cols = sorted([x for x in train_data.columns if 'cat' in x])

def clean_data(df1):
    df=df1.copy()
    df['any_miss_value']=df.isnull().sum(axis=1).map(lambda x: 1 if x>0 else 0)
    df=df.fillna(missing_vals)
    df[cat_cols]=df[cat_cols].apply(lambda x: x.astype('category'))
    for column in cat_cols:
        dummies = pd.get_dummies(df[column],prefix=column)
        df = pd.concat([df,dummies],axis =1)
        df.drop([column],axis=1,inplace= True)
    return df

train_clean=clean_data(train_data)
test_clean=clean_data(test_data)

ins=train_clean.sample(frac=0.8,random_state=200)
oos=train_clean.drop(ins.index)
oot=clean_data(test_data)

idv_vars=list(set(train_clean.columns).intersection(set(test_clean.columns)))

In [58]:
clf= RandomForestClassifier(n_estimators=200,oob_score=True,random_state=13,n_jobs = -1, min_samples_leaf = 100)
clf.fit(ins[idv_vars], ins['target'])
ins['prediction']=clf.predict_proba(ins[idv_vars])[:,1]
oos['prediction']=clf.predict_proba(oos[idv_vars])[:,1]
#oot['target']=clf.predict_proba(oot[idv_vars])[:,1]
print get_roc(ins,'prediction','target','ROC',0)[0], get_roc(oos,'prediction','target','ROC',0)[0]


0.818320582441 0.629129794589


In [59]:
clf= RandomForestClassifier(n_estimators=1000,oob_score=True,random_state=13,n_jobs = -1, min_samples_leaf = 100)
clf.fit(ins[idv_vars], ins['target'])
ins['prediction']=clf.predict_proba(ins[idv_vars])[:,1]
var_imp_global=pd.Series(clf.feature_importances_,index=idv_vars).sort_values(ascending=False).cumsum()
imp_vars=list(var_imp_global[var_imp_global<0.95].index)
print len(imp_vars), imp_vars

66 ['ps_car_13', 'ps_reg_03', 'id', 'ps_ind_03', 'ps_car_14', 'ps_reg_02', 'ps_ind_15', 'ps_car_12', 'ps_car_15', 'ps_reg_01', 'ps_calc_03', 'ps_calc_01', 'ps_calc_02', 'ps_calc_10', 'ps_calc_14', 'ps_ind_17_bin', 'ps_ind_01', 'ps_calc_11', 'ps_ind_05_cat_0.0', 'ps_calc_13', 'ps_calc_08', 'ps_calc_07', 'ps_ind_07_bin', 'ps_calc_09', 'ps_calc_06', 'ps_ind_16_bin', 'ps_calc_04', 'ps_calc_12', 'ps_calc_05', 'ps_ind_06_bin', 'ps_car_04_cat_0', 'ps_car_11', 'ps_car_01_cat_7.0', 'any_miss_value', 'ps_calc_17_bin', 'ps_car_02_cat_0.0', 'ps_car_01_cat_11.0', 'ps_car_02_cat_1.0', 'ps_car_09_cat_1.0', 'ps_calc_16_bin', 'ps_ind_05_cat_6.0', 'ps_ind_04_cat_0.0', 'ps_ind_04_cat_1.0', 'ps_calc_19_bin', 'ps_car_09_cat_2.0', 'ps_car_09_cat_0.0', 'ps_ind_02_cat_1.0', 'ps_ind_08_bin', 'ps_calc_18_bin', 'ps_car_05_cat_0.0', 'ps_car_05_cat_1.0', 'ps_ind_09_bin', 'ps_car_08_cat_0', 'ps_car_07_cat_0.0', 'ps_car_08_cat_1', 'ps_ind_02_cat_2.0', 'ps_car_07_cat_1.0', 'ps_car_06_cat_11', 'ps_ind_18_bin', 'ps_car

In [47]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import roc_curve, auc

def custom_auc(ground_truth, predictions):
    fpr, tpr, _ = roc_curve(ground_truth, predictions[:, 1], pos_label=1)    
    return auc(fpr, tpr)

my_auc = make_scorer(custom_auc, greater_is_better=True, needs_proba=True)
parameters = {'n_estimators':2**np.arange(7,10),'max_depth':np.arange(10,18,3)}
rfc=RandomForestClassifier(min_samples_leaf=20,n_jobs=1,oob_score=True)
clf = GridSearchCV(rfc, parameters,scoring = my_auc, n_jobs=-1)
ins_sample=ins.sample(frac=0.2,random_state=200)
clf.fit(ins_sample[imp_vars],ins_sample['target'])
clf.grid_scores_

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=20,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=True, random_state=None,
            verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'n_estimators': array([128, 256, 512]), 'max_depth': array([10, 13, 16])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=make_scorer(custom_auc, needs_proba=True), verbose=0)

In [60]:
rfc=RandomForestClassifier(n_estimators=250,max_depth=10,min_samples_leaf=100,n_jobs=-1)
rfc.fit(ins[imp_vars],ins['target'])

imp_vars1=pd.Series(rfc.feature_importances_,index=imp_vars).sort_values(ascending=False).cumsum()
imp_vars2=list(imp_vars1[imp_vars1<0.80].index)

rfc=RandomForestClassifier(n_estimators=30,max_depth=10,min_samples_leaf=100,n_jobs=-1)
rfc.fit(ins[imp_vars2],ins['target'])

ins['prediction']=rfc.predict_proba(ins[imp_vars2])[:,1]
oos['prediction']=rfc.predict_proba(oos[imp_vars2])[:,1]
oot['target']=rfc.predict_proba(oot[imp_vars2])[:,1]

print get_roc(ins,'prediction','target','ROC',0)[0], get_roc(oos,'prediction','target','ROC',0)[0]
oot[['id','target']].to_csv('04.Results/2.RF.csv',index=False)

0.682000663857 0.62387964648


In [51]:
parameters = {'n_estimators':2**np.arange(3,7),'max_depth':[10]}
rfc=RandomForestClassifier(min_samples_leaf=100,n_jobs=1,oob_score=True)
clf = GridSearchCV(rfc, parameters,scoring = my_auc, n_jobs=-1)
clf.fit(ins[imp_vars2],ins['target'])
pd.DataFrame(clf.grid_scores_)



Unnamed: 0,parameters,mean_validation_score,cv_validation_scores
0,"{u'n_estimators': 8, u'max_depth': 10}",0.612923,"[0.616728274116, 0.611533550369, 0.610508642033]"
1,"{u'n_estimators': 16, u'max_depth': 10}",0.620743,"[0.625848356835, 0.619262839752, 0.617119129256]"
2,"{u'n_estimators': 32, u'max_depth': 10}",0.625537,"[0.630999617971, 0.624136930882, 0.621474178682]"
3,"{u'n_estimators': 64, u'max_depth': 10}",0.62704,"[0.63214964351, 0.624964711746, 0.624006440652]"


In [56]:
pd.crosstab(train_data.isnull().sum(axis=1).map(lambda x: 1 if x>0 else 0),train_data.target)


target,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,119261,5670
1,454257,16024
