In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import skew, boxcox
from math import exp, log
import xgboost as xgb
import itertools

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        tmin, tsec = divmod((datetime.now() - start_time).total_seconds(), 60)
        print(' Time taken: %i minutes and %s seconds.' %
              (tmin, round(tsec, 2)))


def scale_data(X, scaler=None):
    if not scaler:
        scaler = StandardScaler()
        scaler.fit(X)
    X = scaler.transform(X) # after scaler, dataframe is converted into numpy array
    return X, scaler

DATA_TRAIN_PATH = 'train.csv'
DATA_TEST_PATH = 'test.csv'

# add a shift on y
shift = 200



In [2]:
# these features are selected by the importance of xgboost. Use them to create interaction terms.
COMB_FEATURE = 'cat80,cat87,cat57,cat12,cat79,cat10,cat7,cat89,cat2,cat72,' \
               'cat81,cat11,cat1,cat13,cat9,cat3,cat16,cat90,cat23,cat36,' \
               'cat73,cat103,cat40,cat28,cat111,cat6,cat76,cat50,cat5,' \
               'cat4,cat14,cat38,cat24,cat82,cat25'.split(',')

# create a new object function. see https://www.kaggle.com/c/allstate-claims-severity/forums/t/24520/effect-of-mae
def fair_obj(preds, dtrain):
    fair_constant = 2
    
    labels = dtrain.get_label()
    x = (preds - labels)
    den = abs(x) + fair_constant
    grad = fair_constant * x / (den)
    hess = fair_constant * fair_constant / (den * den)
    return grad, hess

# customize evaluation function. Just compute the mae of original data, not the logarithm-transformed data
def xg_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-shift, np.exp(yhat)-shift)    

In [54]:
def load_data(path_train=DATA_TRAIN_PATH, path_test=DATA_TEST_PATH, comb_feature=COMB_FEATURE):
    train_loader = pd.read_csv(path_train, dtype={'id': np.int32})
    train = train_loader.drop(['id', 'loss'], axis=1)
    test_loader = pd.read_csv(path_test, dtype={'id': np.int32})
    
    test = test_loader.drop(['id'], axis=1)
    ntrain = train.shape[0]
    ntest = test.shape[0]
    train_test = pd.concat((train, test)).reset_index(drop=True)
    numeric_feats = train_test.dtypes[train_test.dtypes != "object"].index # the index of .dtypes is actually the column names 
        
    # compute skew and do Box-Cox transformation
    skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna()))
    # transform features with skew > 0.25 (this can be varied to find optimal value)
    skewed_feats = skewed_feats[skewed_feats > 0.25]
    skewed_feats = skewed_feats.index
    for feats in skewed_feats:
        train_test[feats] = train_test[feats] + 1
        train_test[feats], lam = boxcox(train_test[feats])
    
    
    # create interaction of selected categorical variables
    print('')
    for comb in itertools.combinations(COMB_FEATURE, 2):
        feat = comb[0] + "_" + comb[1]
        train_test[feat] = train_test[comb[0]] + train_test[comb[1]]
        print('Analyzing Columns:', feat)
    
    # factorize categorical features
    features = train_test.columns
    cats = [feat for feat in features if 'cat' in feat] # extract the features whose name contain string 'cat'. catgorical vars
    
    for feat in cats:
        train_test[feat] = pd.factorize(train_test[feat], sort=True)[0] #.factorize() returns a tuple, [0] is array, [1] is index
    x_train = train_test.iloc[:ntrain, :]
    x_test = train_test.iloc[ntrain:, :]
    train_test_scaled, scaler = scale_data(train_test)
    train, _ = scale_data(x_train, scaler)
    test, _ = scale_data(x_test, scaler)

    train_labels = np.log(np.array(train_loader['loss'] + shift))
    train_ids = train_loader['id'].values.astype(np.int32)
    test_ids = test_loader['id'].values.astype(np.int32)

    return train, train_labels, test, train_ids, test_ids

In [55]:
train, target, test, _, ids = load_data()


Analyzing Columns: cat80_cat87
Analyzing Columns: cat80_cat57
Analyzing Columns: cat80_cat12
Analyzing Columns: cat80_cat79
Analyzing Columns: cat80_cat10
Analyzing Columns: cat80_cat7
Analyzing Columns: cat80_cat89
Analyzing Columns: cat80_cat2
Analyzing Columns: cat80_cat72
Analyzing Columns: cat80_cat81
Analyzing Columns: cat80_cat11
Analyzing Columns: cat80_cat1
Analyzing Columns: cat80_cat13
Analyzing Columns: cat80_cat9
Analyzing Columns: cat80_cat3
Analyzing Columns: cat80_cat16
Analyzing Columns: cat80_cat90
Analyzing Columns: cat80_cat23
Analyzing Columns: cat80_cat36
Analyzing Columns: cat80_cat73
Analyzing Columns: cat80_cat103
Analyzing Columns: cat80_cat40
Analyzing Columns: cat80_cat28
Analyzing Columns: cat80_cat111
Analyzing Columns: cat80_cat6
Analyzing Columns: cat80_cat76
Analyzing Columns: cat80_cat50
Analyzing Columns: cat80_cat5
Analyzing Columns: cat80_cat4
Analyzing Columns: cat80_cat14
Analyzing Columns: cat80_cat38
Analyzing Columns: cat80_cat24
Analyzing Col

KeyboardInterrupt: 

In [56]:
# write the processed data into files
train_clear = np.savetxt('train_clear.csv', train, delimiter=',')
target_clear = np.savetxt('target_clear.csv', delimiter=',')
test_clear = np.savetxt('test_clear.csv', delimiter=',')
ids = np.savetxt('ids.csv', delimiter=',')

TypeError: savetxt() missing 1 required positional argument: 'X'

In [4]:
#---------- read data, convert to DMatrix --------------
#train, target, test, _, ids = load_data()
#d_train_full = xgb.DMatrix(train, label=target)
d_train_full = xgb.DMatrix(train[0:1000, ], label=target[0:1000])
d_test = xgb.DMatrix(test)

In [5]:
folds = 1
cv_sum = 0
early_stopping = 25
start_time = timer(None)

params = {}
params['booster'] = 'gbtree'
params['objective'] = "reg:linear"
#params['eval_metric'] = 'mae'
params['eta'] = 0.03
params['gamma'] = 1
params['min_child_weight'] = 1
params['colsample_bytree'] = 0.5
params['subsample'] = 0.8
params['max_depth'] = 12
params['max_delta_step'] = 0
params['silent'] = 1
params['random_state'] = 1989
params['alpha'] = 1
params['base_score'] = 2

####################################
#  Build Model
####################################
watchlist = [(d_train_full, 'train_full')]

clf = xgb.train(params,
                d_train_full,
                #2230,
                10000,
                watchlist,
                verbose_eval = 30,
                obj = fair_obj,
                feval = xg_eval_mae
                )
                # early_stopping_rounds=early_stopping
                # I didn't use early_stopping here, as I tuned the best # iteration

        
y_pred = np.exp(clf.predict(d_test, ntree_limit=clf.best_ntree_limit)) - shift # remember to subtract shift
      
timer(start_time)

[0]	train_full-mae:7.2602
[20]	train_full-mae:5.94074
[40]	train_full-mae:4.8621
[60]	train_full-mae:3.97928
[80]	train_full-mae:3.2567
[100]	train_full-mae:2.6653
[120]	train_full-mae:2.18234
[140]	train_full-mae:1.78877
[160]	train_full-mae:1.46677
[180]	train_full-mae:1.20465
[200]	train_full-mae:0.99698
[220]	train_full-mae:0.833686
[240]	train_full-mae:0.708855
[260]	train_full-mae:0.614384
[280]	train_full-mae:0.544842
[300]	train_full-mae:0.49228
[320]	train_full-mae:0.453677
[340]	train_full-mae:0.42418
[360]	train_full-mae:0.401247
[380]	train_full-mae:0.38363
[400]	train_full-mae:0.370777
[420]	train_full-mae:0.360782
[440]	train_full-mae:0.352322
[460]	train_full-mae:0.345314
[480]	train_full-mae:0.339754
[500]	train_full-mae:0.335127
[520]	train_full-mae:0.33159
[540]	train_full-mae:0.328304
[560]	train_full-mae:0.325516
[580]	train_full-mae:0.323034
[600]	train_full-mae:0.32129
[620]	train_full-mae:0.319595
[640]	train_full-mae:0.317618
[660]	train_full-mae:0.316227
[680]	

In [13]:
clf_cv['test-mae-mean']

0    7.260458
1    7.187947
2    7.116305
3    7.045448
4    6.975157
5    6.905592
6    6.836659
7    6.768558
8    6.701061
9    6.634346
Name: test-mae-mean, dtype: float64

In [9]:
# xgb.cv returns the list of [test-mae-mean, test-mae-std, train-mae-mean, train-mae-std]
clf_cv = xgb.cv(params,
                d_train_full,
                verbose_eval = 1,
                early_stopping_rounds=early_stopping)
                # I didn't use early_stopping here, as I tuned the best # iteration

[0]	train-mae:7.2604+0.00348015	test-mae:7.26046+0.0069431
[1]	train-mae:7.188+0.00335175	test-mae:7.18795+0.00690906
[2]	train-mae:7.1163+0.00341142	test-mae:7.1163+0.0068158
[3]	train-mae:7.04533+0.00339857	test-mae:7.04545+0.006766
[4]	train-mae:6.97504+0.00336983	test-mae:6.97516+0.00678723
[5]	train-mae:6.90543+0.00331434	test-mae:6.90559+0.0067916
[6]	train-mae:6.83643+0.00347342	test-mae:6.83666+0.0066061
[7]	train-mae:6.76829+0.00353212	test-mae:6.76856+0.00650229
[8]	train-mae:6.70073+0.00342147	test-mae:6.70106+0.00657101
[9]	train-mae:6.63395+0.00339524	test-mae:6.63435+0.00656128


In [33]:
now = datetime.now()

result = pd.DataFrame(y_pred, columns=['loss'])
result["id"] = ids
result = result.set_index("id")
sub_file = 'submission_' + str(folds) +'fold-average-xgb_' + '_' + str(
    now.strftime("%Y-%m-%d-%H-%M")) + '.csv'

print("\n Writing submission: %s" % sub_file)
result.to_csv(sub_file, index=True, index_label='id')


 Writing submission: submission_1fold-average-xgb__2016-11-21-20-30.csv
