In [1]:
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing

import xgboost as xgb
import cPickle as pickle


############## HYPEROPT ########################
from hyperopt import hp

from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
#####################################################

In [2]:
# Load the training features.
train_set = pd.read_csv('../data/X_train.csv')
# `tolist` has to be called: without the parentheses this stored the bound
# method object instead of the list of column names.
all_columns = train_set.columns.tolist()

# Attach the response column, which lives in a separate file.
# NOTE(review): the assignment aligns on the index, so this assumes both CSVs
# share the same row order / default index -- confirm.
Y_train = pd.read_csv('../data/Y_train.csv')
train_set['Converted'] = Y_train.Converted

In [3]:
# Per-row fold assignments; the `fold` column is compared against `fold_value`
# further down to separate the training rows from the validation rows.
split_kfold = pd.read_csv('../data/fold_train.csv')

In [4]:
# Categorical predictors. They are later routed to one-hot or hash encoding
# depending on how many distinct values they hold.
# 'SCID' is included and gets collapsed into a few buckets by simplify_scid.
# 'SelectedPackage' is constant, so it is left out.
categorical_var = [
    'SCID',
    'FirstDriverDrivingLicenseNumberY',
    'FirstDriverMaritalStatus',
    'CarUsageId',
    'CarParkingTypeId',
    'FirstDriverDrivingLicenceType',
    'CarDrivingEntitlement',
    'CarMakeId',
    'NameOfPolicyProduct',
    'AffinityCodeId',
]

# Columns the author treats as already binary; they are fed to the model as-is.
already_binary = ['CoverIsNoClaimDiscountSelected', 'IsPolicyholderAHomeowner']

# Columns shifted down by one (the name suggests a 1/2 coding turned into 0/1
# -- TODO confirm). Not idempotent: re-running this cell shifts them again.
to_0_1 = ['CarFuelId', 'CarTransmissionId']
train_set[to_0_1] = train_set[to_0_1] - 1

# Row identifier column; not used as a feature.
row_id = [u'Unnamed: 0']

continuous_var = [
    'CarAnnualMileage',
    'FirstDriverAge',
    'CarInsuredValue',
    'CarAge',
    'VoluntaryExcess',
    'PolicyHolderNoClaimDiscountYears',
    'PolicyHolderResidencyArea',
    'AllDriversNbConvictions',
    'RatedDriverNumber',
    'SocioDemographicId',
    'DaysSinceCarPurchase',
]

# Binary and shifted columns are handled like the continuous ones.
continuous_var = continuous_var + already_binary + to_0_1

# Columns deliberately excluded from the feature set.
out_var = ['ReceivedDateTime', 'TodayDate']

# Column whose usefulness is undecided; also excluded.
dunno_var = ['CustomerMD5Key']

response = "Converted"

total_var = categorical_var + continuous_var + [response]

# Take an explicit copy: the next cells assign into this frame, and assigning
# into a bare column selection is what triggers pandas' SettingWithCopy warning.
train_set = train_set[total_var].copy()

In [5]:
### FILL NA
# Missing values in these two columns are replaced by the sentinel -1.
for na_column in ('IsPolicyholderAHomeowner', 'SCID') :
    train_set[na_column] = train_set[na_column].fillna(-1)

## Simplify categories

In [6]:
def simplify_scid(x) :
    """Collapse an SCID code into an integer bucket.

    The ten codes listed below map to 0..9 in that order; every other value
    (including the -1 used to fill missing SCIDs) lands in bucket 10.
    """
    known_codes = (
        'B01851', 'A04402', 'A10099', 'B02196', 'A08213',
        'A03440', 'A04955', 'A09963', 'B01604', 'A06439',
    )
    for bucket, code in enumerate(known_codes) :
        if x == code :
            return bucket
    return 10
    
def simplify_affinity(x) :
    """Collapse an affinity code into four buckets.

    0 -> 0, 31 -> 1, 39 -> 2 and anything else -> 3.
    """
    for bucket, code in enumerate((0, 31, 39)) :
        if x == code :
            return bucket
    return 3
    
def simplify_DrivingLicense(x) :
    """Bucket a value by ``x % 10``: 0 or 1 -> 0, 5 or 6 -> 1, otherwise 2."""
    last_digit = x % 10
    if last_digit in (0, 1) :
        return 0
    if last_digit in (5, 6) :
        return 1
    return 2

In [7]:
# Replace the raw category values with their simplified integer buckets.
train_set['SCID'] = train_set['SCID'].apply(simplify_scid)
train_set['AffinityCodeId'] = train_set['AffinityCodeId'].apply(simplify_affinity)
train_set['FirstDriverDrivingLicenseNumberY'] = (
    train_set['FirstDriverDrivingLicenseNumberY'].apply(simplify_DrivingLicense)
)
print('Done!')

Done!


In [8]:
## Features Antoine

In [9]:
# Precomputed extra features, read from a zipped CSV; only the columns named
# below are appended to the training frame and treated as continuous.
augmented_features_antoine = pd.read_csv('./AugmentedFeaturesTrain.zip')
feat_name = [
    u'VariationCarUsageId',
    u'VariationFirstDriverDrivingLicenseNumberY',
    u'VariationCoverIsNoClaimDiscountSelected',
    u'VariationIsPolicyholderAHomeowner',
    u'isMostFrequentCarMakeId',
    u'VariationVoluntaryExcess',
    u'VariationPolicyHolderNoClaimDiscountYears',
    u'VariationCarAge',
    u'VariationCarMakeId',
    u'rowNumberForUser',
    u'VariationCarInsuredValue',
    u'VariationCarParkingTypeId',
    u'isMostFrequentFirstDriverDrivingLicenseNumberY',
    u'VariationCarAnnualMileage',
    u'driverPerCar',
    u'VariationDaysSinceCarPurchase',
    u'VariationCarDrivingEntitlement',
]

continuous_var = continuous_var + feat_name

# Column-wise concatenation aligns on the index.
# NOTE(review): assumes both frames carry the same default row index -- confirm.
antoine_columns = augmented_features_antoine[feat_name]
train_set = pd.concat([train_set, antoine_columns], axis=1)

In [10]:
## Features Alex

In [11]:
# Second set of precomputed features; every column of the file is used.
# NOTE(review): if the CSV was written together with its index, an
# 'Unnamed: 0' column would be picked up as a feature as well -- confirm.
augmented_features_alex = pd.read_csv('./lda_features_5_train_topics_df.csv')
feature_name_alex = list(augmented_features_alex.columns)

continuous_var = continuous_var + feature_name_alex

alex_columns = augmented_features_alex[feature_name_alex]
train_set = pd.concat([train_set, alex_columns], axis=1)

In [12]:

#####################################################
########## Encoding functions #######################
def OHEvsHash(df, factors, thresh ) :
    """
    Split categorical factors between one-hot encoding and hashing.

    df : dataset,
    factors : categorical factors,
    thresh : number of distinct values above which a factor is considered
             too big for OHE (missing values count as a distinct value)

    Returns (OHE_factors, Hash_factors), each in the order of `factors`.
    """
    OHE_factors, Hash_factors = [], []
    for name in factors :
        cardinality = len(df[name].unique())
        bucket = Hash_factors if cardinality > thresh else OHE_factors
        bucket.append(name)
    return OHE_factors, Hash_factors

def encoder (b, encoder) :
    """
    Encode a single value, falling back to -1 on failure.

    b : value to encode,
    encoder : object exposing `transform`; its result is passed to `float`

    Returns the transformed value as a float, or -1 when `transform` or the
    float conversion raises.
    """
    try :
        return float(encoder.transform(b))
    except Exception :
        # Deliberate best-effort fallback. Catching `Exception` instead of
        # using a bare `except` lets KeyboardInterrupt / SystemExit propagate.
        return -1

def create_hash_dict(df,Hash_factors) :
    """
    Build one lookup table per hashed factor.

    df : dataset,
    Hash_factors : categorical factors to index

    Returns {factor: {value: position}} where position is the rank of the
    value in `df[factor].unique()` (order of first appearance).
    """
    return {
        factor : {value : code for code, value in enumerate(df[factor].unique())}
        for factor in Hash_factors
    }
#####################################################


print('Looking at categorical factors...')
# Factors with more than `thresh` distinct values are hashed instead of one-hot encoded.
thresh = 50
print("The threshold is set to %d" % thresh)
OHE_factors, Hash_factors = OHEvsHash(train_set, categorical_var, thresh)
print('... Done!\n')

print('Creating hash tables...')
Hashing_dict = create_hash_dict(train_set, Hash_factors)
print('... Done!\n')





def preprocess_training_data( df, continuous_factors, OHE_factors, Hashing_dict, Hash_factors,  response ) :
    """
    Assemble the model matrix, the label vector and the ordered feature names.

    df : dataset (left unmodified),
    continuous_factors : columns copied through unchanged,
    OHE_factors : categorical columns expanded with `pd.get_dummies`
                  (missing values become their own 'null' category),
    Hashing_dict : {factor: {category: code}} lookup tables,
    Hash_factors : categorical columns replaced by their code from `Hashing_dict`,
    response : name of the target column

    Returns (X, y, final_factors): X is an `np.matrix` whose columns follow
    `final_factors` (dummy columns, then hashed factors, then continuous
    factors) and y is the flattened response array.

    Raises KeyError when a hashed factor holds a category that is absent from
    its lookup table.
    """
    final_factors = []
    # Collected column blocks, concatenated once at the end instead of growing
    # a frame inside the loops.
    pieces = []

    print('Creating Dummies...')
    for factor in OHE_factors :
        print(factor)
        # Fill on a new Series so the caller's frame is not modified.
        filled = df[factor].fillna('null')
        dummies = pd.get_dummies(filled, prefix = factor)
        pieces.append(dummies)
        # Use the names pandas generated rather than rebuilding them by hand,
        # so names and column order always agree with the dummy columns.
        final_factors.extend(dummies.columns.tolist())
    print('...Done for Dummies!\n')

    print('Hashing Factors...')
    for f in Hash_factors :
        print(f)
        table = Hashing_dict[f]
        # Plain indexing (not dict.get) so an unknown category fails loudly.
        pieces.append(df[f].map(lambda x : table[x]))
        final_factors.append(f)
    print("...Done!\n")

    print('Continuous Factors...')
    final_factors.extend(continuous_factors)
    pieces.append(df[continuous_factors])
    res_data = pd.concat(pieces, axis = 1)
    # Missing values are intentionally left as they are (no global fill).
    res_data = res_data[final_factors]
    print("...Done for continuous factors!\n")
    return (np.matrix(res_data.values), np.array(df[response]).reshape(-1), final_factors)



Looking at categorical factors...
The threshold is set to 50
... Done!

Creating hash tables...
... Done!



In [13]:
#selected_features = pickle.load(open('./Results/selected_featv6.pic', 'rb'))
# Build the design matrix, the labels and the ordered feature names.
train_matrix, Y, final_factors = preprocess_training_data(
    train_set,
    continuous_var,
    OHE_factors,
    Hashing_dict,
    Hash_factors,
    response,
)


Creating Dummies...
SCID
FirstDriverDrivingLicenseNumberY
FirstDriverMaritalStatus
CarUsageId
CarParkingTypeId
FirstDriverDrivingLicenceType
CarDrivingEntitlement
CarMakeId
NameOfPolicyProduct
AffinityCodeId
...Done for Dummies!

Hashing Factors...
...Done!

Continuous Factors...
...Done for continuous factors!



In [14]:
# Alternative kept for reference (random 75/25 split):
#X_train, X_check, y_train, y_check = train_test_split(train_matrix, Y,  test_size = 0.25, random_state = 42)

fold_value = 0

# Rows whose fold id equals `fold_value` form the validation ("check") set;
# every other row is used for training.
train_mask = (split_kfold.fold != fold_value).values
check_mask = (split_kfold.fold == fold_value).values

X_train, y_train = train_matrix[train_mask, :], Y[train_mask]
X_check, y_check = train_matrix[check_mask, :], Y[check_mask]

In [15]:
# Class counts used to weight the positive class.
# NOTE(review): counted over all of Y_train, not only the training folds -- confirm this is intended.
sum_wneg = (Y_train.Converted == 0).sum()
sum_wpos = (Y_train.Converted == 1).sum()

# XGBoost settings. 'n_estimators' is read as the number of boosting rounds by
# the training cell. Trailing comments appear to record other values that were
# tried (presumably -- confirm).
params = {
    'n_estimators': 2500,
    'eta': 0.1,  # 0.1
    'max_depth': 9,
    'min_child_weight': 3,  # 50
    'subsample': 0.85,
    'colsample_bytree': 0.75,
    'lambda': 10,  # 100
    'alpha': 3,  # 100
    'gamma': 0.2,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'max_delta_step': 0,
    'nthread': 14,
    'silent': 1,
    'scale_pos_weight': sum_wneg / float(sum_wpos),
    'booster': 'gbtree',
    'seed': 43,
}

In [16]:
print("Training with params : ")
print(params)

# Separate the number of boosting rounds from the booster parameters without
# mutating `params`: deleting the key in place made this cell fail with a
# KeyError as soon as it was run a second time.
num_round = int(params['n_estimators'])
train_params = dict((key, value) for key, value in params.items() if key != 'n_estimators')

dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_check, label=y_check)
# The last entry of the watchlist ('eval') drives early stopping.
watchlist = [(dtrain, 'train'),(dvalid, 'eval')]


model = xgb.train(train_params, dtrain, num_round, evals=watchlist, early_stopping_rounds=20,  verbose_eval=10)  #feval=logloss,
# NOTE(review): predict is called without a tree limit, so it presumably uses
# every tree built, including rounds after the best early-stopping iteration
# -- confirm this is intended.
predictions = model.predict(dvalid)
train_pred = model.predict(dtrain)


Training with params : 
{'colsample_bytree': 0.75, 'silent': 1, 'eval_metric': 'logloss', 'scale_pos_weight': 127.25189427312775, 'max_delta_step': 0, 'nthread': 14, 'min_child_weight': 3, 'n_estimators': 2500, 'subsample': 0.85, 'eta': 0.1, 'objective': 'binary:logistic', 'alpha': 3, 'booster': 'gbtree', 'seed': 43, 'max_depth': 9, 'gamma': 0.2, 'lambda': 10}
[0]	train-logloss:0.630886	eval-logloss:0.631441
Multiple eval metrics have been passed: 'eval-logloss' will be used for early stopping.

Will train until eval-logloss hasn't improved in 20 rounds.
[10]	train-logloss:0.204908	eval-logloss:0.20539
[20]	train-logloss:0.086535	eval-logloss:0.086941
[30]	train-logloss:0.042528	eval-logloss:0.042843
[40]	train-logloss:0.026315	eval-logloss:0.026607
[50]	train-logloss:0.018658	eval-logloss:0.018961
[60]	train-logloss:0.015524	eval-logloss:0.015876
[70]	train-logloss:0.01369	eval-logloss:0.014157
[80]	train-logloss:0.012555	eval-logloss:0.013084
[90]	train-logloss:0.011773	eval-logloss:

In [17]:
import cPickle as pickle

def save_model( file_name, model ,continuous_factors, OHE_factors , Hashing_dict, Hash_factors,  training_factors ) :
    """
    Pickle the model together with the preprocessing metadata.

    file_name : destination path (opened in binary write mode, overwritten),
    model : trained model object,
    continuous_factors, OHE_factors, Hash_factors : factor lists used for preprocessing,
    Hashing_dict : lookup tables of the hashed factors,
    training_factors : ordered feature names of the training matrix

    Everything is stored in a single dict under the keys "model",
    'continuous_factors', 'OHE_factors', "Hashing_dict", "Hash_factors" and
    'training_factors'. Returns None.
    """
    dico = {"model" : model, 'continuous_factors' : continuous_factors, 'OHE_factors' :  OHE_factors, "Hashing_dict" : Hashing_dict, "Hash_factors" : Hash_factors, 'training_factors' : training_factors}
    # Context manager guarantees the file is flushed and closed, even if
    # pickling raises.
    with open( file_name, "wb" ) as handle :
        pickle.dump(dico, handle)
    print('Model saved!')

In [18]:
# Persist the fold-0 model along with everything needed to rebuild its inputs.
file_path = './Results/model_v7_fold0.pic'
save_model(
    file_path,
    model,
    continuous_var,
    OHE_factors,
    Hashing_dict,
    Hash_factors,
    final_factors,
)

Model saved!
