In [1]:
# import packages
#from __future__ import division, print_function

import numpy as np
import pandas as pd
import xgboost as xgb

from datetime import datetime
from sklearn.metrics import mean_absolute_error
from sklearn.cross_validation import KFold
from scipy.stats import skew, boxcox
from sklearn import preprocessing
import itertools

In [2]:
# The engineered features were created in an earlier step, so we load the
# already-transformed data here.
# (The transformed CSV files are too large to upload to GitHub.)
train = pd.read_csv('train_clear.csv')
test = pd.read_csv('test_clear.csv')
n_train = train.shape[0]

# Stack train on top of test so the min-max scaling below is computed over
# the combined rows of both sets.
train_test = pd.concat((train, test)).reset_index(drop=True)

# Additional continuous-feature transforms, following Ali:
# https://www.kaggle.com/aliajouz/allstate-claims-severity/singel-model-lb-1117/run/413221/code

# sqrt of the min-max scaled value
for col in ("cont1", "cont4", "cont5", "cont8", "cont10", "cont11", "cont12"):
    train_test[col] = np.sqrt(preprocessing.minmax_scale(train_test[col]))

# log of the min-max scaled value; the 0.1 offset (spelled 0000.1 in the
# referenced kernel) keeps log() finite where the scaled value is 0
for col in ("cont6", "cont7", "cont9", "cont13"):
    train_test[col] = np.log(preprocessing.minmax_scale(train_test[col]) + 0.1)

# cont14: shift, clip at zero, rescale, then take the fourth root
train_test["cont14"] = (np.maximum(train_test["cont14"] - 0.179722, 0) / 0.665122) ** 0.25

# Split the combined frame back into its train / test parts by row position.
train = train_test.iloc[:n_train, :]
test = train_test.iloc[n_train:, :]

# Model inputs: every column except the row id and the target.
X_train = train.drop(['id', 'loss'], axis=1)
X_test = test.drop(['id', 'loss'], axis=1)
y_train = train['loss']
ids = test['id'].values

# del train
# del test
# del train_test

In [3]:
# Offset added to the loss before taking the log; the same value is subtracted
# again after np.exp() wherever predictions are mapped back to the loss scale.
shift = 200

# Derive the log-target from the untouched `loss` column rather than from
# `y_train` itself, so that re-running this cell does not apply the log twice.
y_train = np.log(train['loss'] + shift)

# define objective function and evaluation metric

# fair_constant = 0.7
# def fair_obj(preds, dtrain):
#     labels = dtrain.get_label()
#     x = (preds - labels)
#     den = abs(x) + fair_constant
#     grad = fair_constant * x / (den)
#     hess = fair_constant * fair_constant / (den * den)
#     return grad, hess

def xg_eval_mae(yhat, dtrain):
    """Custom xgboost evaluation metric: MAE after undoing the log transform.

    Both the labels stored in ``dtrain`` and the predictions ``yhat`` are
    mapped back with ``exp(.) - shift`` (``shift`` is the module-level offset)
    before the mean absolute error is computed.

    Returns:
        A ``(name, value)`` tuple, with name ``'mae'``.
    """
    true_loss = np.exp(dtrain.get_label()) - shift
    pred_loss = np.exp(yhat) - shift
    return 'mae', mean_absolute_error(true_loss, pred_loss)

def timer(start_time=None):
    """Start a timer, or report the time elapsed since it was started.

    Args:
        start_time: Falsy (the default ``None``) to start timing; otherwise
            the ``datetime`` previously returned by this function.

    Returns:
        The current ``datetime`` when starting; ``None`` when reporting, in
        which case the elapsed minutes and seconds are printed instead.
    """
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    minutes, seconds = divmod(elapsed_seconds, 60)
    print(' Time taken: %i minutes and %s seconds.' %
          (minutes, round(seconds, 2)))

In [None]:
# fit the model: cross-validation settings and shared accumulators.
# These module-level values become the default arguments of xgb_model below.
n_folds, early_stopping = 5, 100
cv_sum = 0
fred, xgb_rounds = [], []

#d_train_full = xgb.DMatrix(X_train, label=y_train)
#d_train_full = xgb.DMatrix(X_train.iloc[0:5000], label=y_train.iloc[0:5000])
d_test = xgb.DMatrix(X_test)

# define the parameters for xgboost
params = {
    'booster': 'gbtree',
    'objective': "reg:linear",
    'eta': 0.03,
    'gamma': 2,
    'min_child_weight': 10,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'max_depth': 13,
    'silent': 1,
    'eval_metric': 'mae',
    #'random_state': 1989,  # remove the random state to make each fold more different
    #'base_score': 2,
}

# define a function so that the training set can be changed for trial runs.


# In the training, implement k-folds cross validation.
def _fair_obj(preds, dtrain, fair_constant=0.7):
    """Fair-loss objective for xgboost.

    Same formula and constant as the ``fair_obj`` definition that is
    commented out earlier in this notebook; it is defined here so that
    ``xgb_model`` does not depend on a name that no longer exists.

    Args:
        preds: Current predictions.
        dtrain: ``xgb.DMatrix`` holding the labels.
        fair_constant: Constant ``c`` of the fair loss.

    Returns:
        ``(grad, hess)``: first and second derivative of the loss with
        respect to the predictions, one entry per row.
    """
    residual = preds - dtrain.get_label()
    den = np.abs(residual) + fair_constant
    grad = fair_constant * residual / den
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess


def _submission_frame(pred, ids):
    """Build the submission frame: one ``loss`` column indexed by ``id``."""
    result = pd.DataFrame(pred, columns=['loss'])
    result["id"] = ids
    return result.set_index("id")


def _save_submission(result, sub_file):
    """Announce and write the submission frame to ``sub_file`` as CSV."""
    print("\n Writing submission: %s" % sub_file)
    result.to_csv(sub_file, index=True, index_label='id')


def xgb_model(X_train, y_train=y_train, ids=ids, n_folds=n_folds, cv_sum=cv_sum, early_stopping=early_stopping,
              fred=fred, xgb_rounds=xgb_rounds, params=params):
    """Train xgboost with the fair objective and write a submission CSV.

    With ``n_folds > 1`` a k-fold cross validation is run: each fold trains on
    k-1 parts, early-stops on the remaining part, and predicts the module-level
    ``d_test``; the per-fold test predictions are averaged. With
    ``n_folds == 1`` a single model is trained on all of ``X_train`` for a
    fixed 1010 rounds, without a validation set.

    Args:
        X_train: Training features (positional ``.iloc`` indexing is used).
        y_train: Training target on the ``log(loss + shift)`` scale.
        ids: Row ids of the test set, written as the submission index.
        n_folds: Number of CV folds; 1 means "train on the full data".
        cv_sum: Start value of the running sum of per-fold MAE scores.
        early_stopping: Early-stopping patience (rounds) used in CV mode.
        fred: Unused; kept for backward compatibility.
        xgb_rounds: List that receives each fold's best iteration. It is
            mutated in place, so by default the module-level list grows
            across calls.
        params: xgboost parameter dict.

    Returns:
        None. Progress is printed and the submission is written to disk.

    Raises:
        ValueError: if ``n_folds`` is smaller than 1.
    """
    if n_folds < 1:
        # Previously this case fell through both branches and silently did nothing.
        raise ValueError("n_folds must be >= 1, got %r" % (n_folds,))

    start_time = timer(None)

    # when input folds is greater than 1, implement k-folds cross validation
    # each time use the k-1 folds data to train model, and the rest 1 fold to validation
    # repeat the process for each fold.
    if n_folds > 1:
        kf = KFold(X_train.shape[0], n_folds=n_folds, random_state=1989)
        pred_sum = None  # running sum of the per-fold test predictions
        for i, (train_index, test_index) in enumerate(kf):
            print('\n Fold %d\n' % (i+1))
            X_tra, X_val = X_train.iloc[train_index], X_train.iloc[test_index]
            y_tra, y_val = y_train.iloc[train_index], y_train.iloc[test_index]

            d_train = xgb.DMatrix(X_tra, label=y_tra)
            d_valid = xgb.DMatrix(X_val, label=y_val)
            watchlist = [(d_train, 'train'), (d_valid, 'eval')]

            clf = xgb.train(params,
                            d_train,
                            3000,
                            watchlist,
                            verbose_eval=50,
                            obj=_fair_obj,
                            feval=xg_eval_mae,
                            early_stopping_rounds=early_stopping)

            xgb_rounds.append(clf.best_iteration)
            score_val = clf.predict(d_valid, ntree_limit=clf.best_ntree_limit)
            # The shift cancels in the difference, so it is not subtracted here.
            cv_score = mean_absolute_error(np.exp(y_val), np.exp(score_val))
            print('eval-MAE: %.6f' % cv_score)

            # use the model of each fold to predict the test set.
            y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift

            pred_sum = y_pred if pred_sum is None else pred_sum + y_pred
            cv_sum = cv_sum + cv_score

        # compute the average prediction of k folds models.
        mpred = pred_sum / n_folds  # k-folds average prediction of test data
        mean_score = cv_sum / n_folds
        print('\n Average eval-MAE: %.6f' % mean_score)

        timer(start_time)

        ## write results to file
        print("#\n Writing results")
        result = _submission_frame(mpred, ids)
        print("\n %d-fold average prediction:\n" % n_folds)
        print(result.head())

        now = datetime.now()
        score = str(round(mean_score, 6))
        sub_file = 'submission_' + str(n_folds) + 'fold-average-xgb_' + str(score) + '_' + str(
            now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
        _save_submission(result, sub_file)

    # if there is no k-folds, training on full data set directly.
    else:
        # Built here from the function's own inputs; the module-level
        # d_train_full this branch used to rely on is commented out.
        d_train_full = xgb.DMatrix(X_train, label=y_train)
        watchlist = [(d_train_full, 'train_full')]
        clf = xgb.train(params,
                        d_train_full,
                        1010,
                        watchlist,
                        verbose_eval=50,
                        obj=_fair_obj,
                        feval=xg_eval_mae)
                        #early_stopping_rounds = early_stopping)
        y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift

        timer(start_time)
        now = datetime.now()
        result = _submission_frame(y_pred, ids)

        sub_file = 'submission_' + str(n_folds) + 'fold-average-xgb_' + '_' + str(
            now.strftime("%Y-%m-%d-%H-%M")) + '.csv'
        _save_submission(result, sub_file)

In [5]:
# Run training with the default arguments captured when xgb_model was defined
# (n_folds = 5 in the settings cell above) on the full training features.
xgb_model(X_train)


 Fold 1

[0]	train-rmse:4.88939	eval-rmse:4.88568	train-mae:3218.75	eval-mae:3212.62
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 100 rounds.
[50]	train-rmse:0.550661	eval-rmse:0.553594	train-mae:1316.01	eval-mae:1319.5
[100]	train-rmse:0.479979	eval-rmse:0.489293	train-mae:1148.86	eval-mae:1171.56
[150]	train-rmse:0.471719	eval-rmse:0.483724	train-mae:1123.73	eval-mae:1155.52
[200]	train-rmse:0.46798	eval-rmse:0.481857	train-mae:1111.77	eval-mae:1149.8
[250]	train-rmse:0.465553	eval-rmse:0.480749	train-mae:1103.92	eval-mae:1146.3
[300]	train-rmse:0.463417	eval-rmse:0.479888	train-mae:1097.29	eval-mae:1143.76
[350]	train-rmse:0.461777	eval-rmse:0.479287	train-mae:1092.11	eval-mae:1141.85
[400]	train-rmse:0.460228	eval-rmse:0.478797	train-mae:1087.37	eval-mae:1140.2
[450]	train-rmse:0.459082	eval-rmse:0.478459	train-mae:1083.79	eval-mae:1139.29
[500]	train-rmse:0.458068	eval-rmse:0.47819	train-mae:1080