In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn import preprocessing
from scipy.stats import skew, boxcox
from IPython.core.pylabtools import figsize
import xgboost as xgb
%matplotlib inline

from os import path
def to_filename(name):
    """Return the relative path ``../data/allstate/<name>.csv`` for a dataset name."""
    return path.join("..", "data", "allstate", name + ".csv")

import seaborn as sns
sns.set_style("whitegrid")



In [2]:
# Read both CSV files, using their first column as the row index.
train, test = (
    pd.read_csv(to_filename(dataset), index_col=0) for dataset in ("train", "test")
)
print("shape: train {}, test {}".format(train.shape, test.shape))
print(train.head(2))

shape: train (188318, 131), test (125546, 130)
   cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10   ...        cont6  \
id                                                      ...                
1     A    B    A    B    A    A    A    A    B     A   ...     0.718367   
2     A    B    A    A    A    A    A    A    B     B   ...     0.438917   

       cont7    cont8    cont9   cont10    cont11    cont12    cont13  \
id                                                                      
1   0.335060  0.30260  0.67135  0.83510  0.569745  0.594646  0.822493   
2   0.436585  0.60087  0.35127  0.43919  0.338312  0.366307  0.611431   

      cont14     loss  
id                     
1   0.714843  2213.18  
2   0.304496  1283.60  

[2 rows x 131 columns]


In [3]:
# Training target: natural log of the loss column (undone by restore_pred).
response = np.log(train["loss"])

def restore_pred(y):
    """Map log-scale values back to the original scale by exponentiating them."""
    original_scale = np.exp(y)
    return original_scale

In [4]:
# Columns whose name begins with "cat" are treated as the categorical features.
cat_features = [name for name in train.columns if name.startswith("cat")]
print("Categorical columns:", cat_features)

('Categorical columns:', ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'c

In [None]:
# Categorical features preprocessing
# Method 1: Encoding categorical features into int
# The encoder is fitted on the levels seen in either frame, so every value in
# train and test has an integer code.
for col in cat_features:
    all_levels = train[col].value_counts().index.union(test[col].value_counts().index)
    encd = preprocessing.LabelEncoder().fit(all_levels)
    train[col] = encd.transform(train[col])
    test[col] = encd.transform(test[col])

In [5]:
# Method 2: Using ordered features for categorical features
# Every category level is replaced by the mean of `response` over the training
# rows that carry that level.
# NOTE(review): the means are computed from all training rows, so validation
# folds created later have already contributed to their own encoded features.

# Sample of the first raw categorical column taken before it is overwritten;
# not read by any later visible cell.
test_col = train[cat_features[0]][:10].copy()
for col in cat_features:
    key_map = response.groupby(train[col]).mean().to_dict()
    # Series.map gives a float column directly and turns levels that are
    # missing from key_map (test-only levels) into NaN, so no manual fill of
    # the mapping is needed.
    train[col] = train[col].map(key_map)
    test[col] = test[col].map(key_map)

In [None]:
# preprocess numerical features
num_features = [name for name in train.columns if name.startswith("cont")]
print("Numerical columns:", num_features)

# Start from a copy so num_features keeps the full list, then take out the
# four columns that are excluded from the numeric preprocessing below.
selected_fea = num_features[:]
for excluded in ("cont1", "cont2", "cont13", "cont14"):
    selected_fea.remove(excluded)
print(selected_fea)

In [None]:
# First two rows of the selected numeric columns.
selected_preview = train[selected_fea].head(2)
print(selected_preview)

In [None]:
# Method 1: Standard Scaler
# The scaler for each column is fitted on the training rows only and then
# applied to both frames.
for feature in selected_fea:
    scaler = preprocessing.StandardScaler().fit(train[[feature]])
    train[feature] = scaler.transform(train[[feature]])
    test[feature] = scaler.transform(test[[feature]])

In [None]:
# First two rows of the selected numeric columns.
selected_preview = train[selected_fea].head(2)
print(selected_preview)

In [None]:
# study the skewness in the numerical features
# (measured on the training rows of the selected columns only)
selected_numeric = train[selected_fea]
skewed_feats = selected_numeric.skew()
print("Skew in numeric features:", skewed_feats)

In [None]:
# Keep the names of the features whose skewness is above 0.25.
is_skewed = skewed_feats > 0.25
selected_skewed_feats = skewed_feats.loc[is_skewed].index
print("selected skew feats", selected_skewed_feats)

In [None]:
# Method 2: Box-Cox transformation when numerical feature skewed
# The Box-Cox lambda is estimated on the training column and the same lambda is
# then applied to the test column, so both frames go through one identical
# transform (estimating it separately would put them on different scales).
# NOTE(review): boxcox needs strictly positive input; the +1 shift only
# guarantees that while the feature is > -1 -- confirm if the standard-scaling
# cell has been run first.
for feat in selected_skewed_feats:
    train[feat], lam = boxcox(train[feat] + 1.)
    # With lmbda given, boxcox returns only the transformed values.
    test[feat] = boxcox(test[feat] + 1., lmbda=lam)

In [6]:
# Training matrix: every column except the raw loss, labelled with the
# log-scale response. The axis is passed by keyword because the positional
# form of DataFrame.drop is not accepted by recent pandas releases.
dtrain = xgb.DMatrix(train.drop("loss", axis=1), label=response)
dtest = xgb.DMatrix(test)

In [19]:
# Booster parameters shared by xgb.cv and the K-fold training loop.
# 'silent' used to appear twice in this literal; only the last value (1) ever
# took effect, so a single entry with that value is kept.
params = {
    'objective': "reg:linear",
    'silent': 1,
    'max_depth': 7,
    'min_child_weight': 1,
    'colsample_bytree': .7,
    "subsample": .95,
    'eta': 0.1,
    'eval_metric': 'mae',
    "gamma": 0.55,
    "lambda": 1.,
}

In [18]:
# 4-fold cross-validation of `params` on the full training matrix.
cvresult = xgb.cv(
    params,
    dtrain,
    nfold=4,
    num_boost_round=50,
    early_stopping_rounds=20
)
print(cvresult)

    test-mae-mean  test-mae-std  train-mae-mean  train-mae-std
0        6.466958      0.003119        6.466966       0.001000
1        5.820601      0.003090        5.820514       0.000885
2        5.238770      0.003016        5.238672       0.000804
3        4.715121      0.003085        4.715063       0.000751
4        4.243845      0.003164        4.243813       0.000664
5        3.819759      0.003106        3.819721       0.000621
6        3.438110      0.003145        3.438035       0.000539
7        3.094651      0.003193        3.094550       0.000506
8        2.785572      0.003216        2.785474       0.000451
9        2.507509      0.003259        2.507439       0.000370
10       2.257462      0.003260        2.257330       0.000304
11       2.032702      0.003212        2.032519       0.000286
12       1.830921      0.003225        1.830647       0.000278
13       1.649882      0.003266        1.649576       0.000198
14       1.487857      0.003333        1.487416       0

In [20]:
folds = 5

# One column per fold model: predictions on the full training set and on the
# test set, plus the MAE of each model over the full training set.
y_pred_train = np.zeros((train.shape[0], folds))
y_pred = np.zeros((test.shape[0], folds))
score = np.zeros(folds)

kf = KFold(n_splits=folds)
for i, (train_index, test_index) in enumerate(kf.split(train)):
    # KFold yields positional indices; convert them to index labels so the
    # frame and the response series are selected with the same keys.
    train_pd_ind = train.index[train_index]
    test_pd_ind = train.index[test_index]
    # .loc replaces the removed .ix indexer (label-based selection).
    train_part, test_part = train.loc[train_pd_ind], train.loc[test_pd_ind]

    dtrain_part = xgb.DMatrix(train_part.drop("loss", axis=1), response.loc[train_pd_ind])
    dtest_part = xgb.DMatrix(test_part.drop("loss", axis=1), response.loc[test_pd_ind])
    # A different seed is used for every fold model.
    params['seed'] = i * 5 + 100
    clf = xgb.train(params, dtrain_part, num_boost_round=500,
                    evals=[(dtrain_part, "train"), (dtest_part, "eval")], early_stopping_rounds=20)

    # Use the same tree limit for every prediction made with this model.
    limit = clf.best_ntree_limit
    print("best ntree limit", i, limit)

    this_pred_train = clf.predict(dtrain, ntree_limit=limit)
    y_pred_train[:, i] = this_pred_train

    # Predictions are on the log scale; restore_pred converts them back before
    # they are compared with the raw loss.
    print("mae for part train",i, mean_absolute_error(
            train_part.loss, restore_pred(clf.predict(dtrain_part, ntree_limit=limit))))
    print("mae for part test",i, mean_absolute_error(
            test_part.loss, restore_pred(clf.predict(dtest_part, ntree_limit=limit))))

    # NOTE: this score covers all training rows, including the ones the model
    # was fitted on, so it is not an out-of-fold estimate.
    score[i] = mean_absolute_error(train.loss, restore_pred(this_pred_train))
    print("mae for all train", i, score[i])

    this_pred_test = clf.predict(dtest, ntree_limit=limit)
    y_pred[:, i] = this_pred_test
    

[0]	train-mae:6.46769	eval-mae:6.46361
Multiple eval metrics have been passed: 'eval-mae' will be used for early stopping.

Will train until eval-mae hasn't improved in 20 rounds.
[1]	train-mae:5.82114	eval-mae:5.81685
[2]	train-mae:5.23921	eval-mae:5.23479
[3]	train-mae:4.7154	eval-mae:4.71071
[4]	train-mae:4.24416	eval-mae:4.23934
[5]	train-mae:3.81994	eval-mae:3.81497
[6]	train-mae:3.43817	eval-mae:3.43315
[7]	train-mae:3.09465	eval-mae:3.08955
[8]	train-mae:2.78553	eval-mae:2.78032
[9]	train-mae:2.50742	eval-mae:2.50219
[10]	train-mae:2.25735	eval-mae:2.25214
[11]	train-mae:2.0325	eval-mae:2.02715
[12]	train-mae:1.83058	eval-mae:1.82507
[13]	train-mae:1.64947	eval-mae:1.64387
[14]	train-mae:1.4873	eval-mae:1.48166
[15]	train-mae:1.34249	eval-mae:1.33693
[16]	train-mae:1.21372	eval-mae:1.20851
[17]	train-mae:1.09978	eval-mae:1.09502
[18]	train-mae:0.999791	eval-mae:0.99561
[19]	train-mae:0.912561	eval-mae:0.909059
[20]	train-mae:0.83693	eval-mae:0.834041
[21]	train-mae:0.771915	eval

In [None]:
# Back-transform each fold model's training predictions, average them per row
# and score the average against the true loss.
ensemble_train_pred = restore_pred(y_pred_train).mean(axis=1)
print("mae final train", mean_absolute_error(train.loss, ensemble_train_pred))

In [None]:
import datetime
# Submission: the row-wise mean of the back-transformed fold predictions.
result = pd.DataFrame({"id": test.index, "loss": np.mean(restore_pred(y_pred), axis=1)})
# The timestamp uses zero-padded %M; the previous "%-M" is a platform-specific
# flag and produced ambiguous file names because the minute was not padded.
result.to_csv("result{:%Y%m%d%H%M}.csv".format(datetime.datetime.now()), index=False)

# Using XGBRegressor and important features

In [None]:
# GridSearchCV comes from sklearn.model_selection, the same module this
# notebook already imports KFold from; the old sklearn.grid_search module is gone.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [None]:
# Copy of the booster parameters without "eta", "eval_metric" and "lambda".
# NOTE(review): presumably removed because the XGBRegressor constructor does
# not take them under these names -- confirm.
params_reg = dict(params)
for dropped_key in ("eta", "eval_metric"):
    params_reg.pop(dropped_key)
params_reg.pop("lambda")

In [None]:
# Fit the sklearn-style regressor on every column except the target.
# Unlike the booster cells above, the label here is the raw loss, not the
# log-scale `response`.
reg = XGBRegressor(**params_reg)
# The axis is passed by keyword; the positional form of DataFrame.drop is not
# accepted by recent pandas releases.
reg.fit(train.drop("loss", axis=1), train.loss)

In [None]:
# score() needs the features and the true target; calling it without arguments
# raises a TypeError. Evaluate on the same data the regressor was fitted on.
train_predprob = reg.score(train.drop("loss", axis=1), train.loss)

In [None]:
# The accessor for the underlying Booster was renamed from booster() to
# get_booster() in later xgboost releases; use whichever this version offers.
reg_booster = reg.get_booster() if hasattr(reg, "get_booster") else reg.booster()

In [None]:
figsize(18, 5)
# Reuse the booster handle obtained in the previous cell instead of calling
# the accessor a second time; features are sorted by descending f-score.
feat_imp = pd.Series(reg_booster.get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')

In [None]:
# Keep the features whose f-score is greater than 4.
above_threshold = feat_imp[feat_imp > 4]
important_features = above_threshold.index.tolist()
print("important features:", important_features)

In [None]:
# Cross-validate `params` on the important features only; the label is the raw
# loss column.
dtrain_imp = xgb.DMatrix(train[important_features], label=train["loss"])
cvresult = xgb.cv(
    params,
    dtrain_imp,
    nfold=4,
    num_boost_round=50
)
print(cvresult)

In [None]:
# Second parameter set, trained on the important-features matrix.
# NOTE(review): the number of boosting rounds is given by num_boost_round
# below; 'n_estimators' in this dict is presumably not used by xgb.train --
# confirm.
params2 = dict(
    base_score=0.1,
    colsample_bytree=0.9,
    eta=0.3,
    eval_metric='mae',
    max_depth=7,
    min_child_weight=3,
    n_estimators=10,
    objective='reg:linear',
    seed=1,
    silent=True
)
train_watchlist = [(dtrain_imp, "train")]
regb = xgb.train(params2, dtrain_imp, num_boost_round=50, evals=train_watchlist)

In [None]:
# Grid over tree depth (3, 5, 7, 9) and minimum child weight (1, 3, 5).
param_test1 = dict(
    max_depth=range(3, 10, 2),
    min_child_weight=range(1, 6, 2)
)
# NOTE(review): `iid` was dropped from newer scikit-learn releases -- confirm
# against the installed version before running.
gsearch1 = GridSearchCV(
    estimator=reg,
    param_grid=param_test1,
    scoring='neg_mean_squared_error',
    n_jobs=4,
    iid=False,
    cv=5
)

In [None]:
# Run the grid search on every column except the target; the axis is passed by
# keyword because recent pandas releases reject the positional form.
gsearch1.fit(train.drop("loss", axis=1), train.loss)