In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn import preprocessing
from IPython.core.pylabtools import figsize
import xgboost as xgb
%matplotlib inline

from os import path
def to_filename(name):
    """Return the relative path of the Allstate CSV called ``name``.

    The result is ``../data/allstate/<name>.csv``, joined with the
    platform's path separator.
    """
    return path.join("..", "data", "allstate", name + ".csv")



In [2]:
# Read both CSV splits, using the first column of each file as the index.
train_path, test_path = to_filename("train"), to_filename("test")
train = pd.read_csv(train_path, index_col=0)
test = pd.read_csv(test_path, index_col=0)

# Quick sanity check: frame shapes and the first two training rows.
shapes = (train.shape, test.shape)
print("shape: train {}, test {}".format(*shapes))
print(train.head(2))

shape: train (188318, 131), test (125546, 130)
   cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10   ...        cont6  \
id                                                      ...                
1     A    B    A    B    A    A    A    A    B     A   ...     0.718367   
2     A    B    A    A    A    A    A    A    B     B   ...     0.438917   

       cont7    cont8    cont9   cont10    cont11    cont12    cont13  \
id                                                                      
1   0.335060  0.30260  0.67135  0.83510  0.569745  0.594646  0.822493   
2   0.436585  0.60087  0.35127  0.43919  0.338312  0.366307  0.611431   

      cont14     loss  
id                     
1   0.714843  2213.18  
2   0.304496  1283.60  

[2 rows x 131 columns]


In [3]:
# Integer-encode the categorical ("cat*") columns in place.
# Each encoder is fitted on the levels observed in train and test together,
# so a level that appears in only one of the two frames still gets a code.
cat_features = [col for col in train.columns if col.startswith("cat")]
print("Categorical columns:", cat_features)

for col in cat_features:
    train_levels = train[col].value_counts().index
    test_levels = test[col].value_counts().index
    encoder = preprocessing.LabelEncoder().fit(train_levels.union(test_levels))
    train[col] = encoder.transform(train[col])
    test[col] = encoder.transform(test[col])

('Categorical columns:', ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'c

In [4]:
# Standardise the continuous ("cont*") columns in place.
# Each column's scaler is fitted on the training frame only and then applied
# unchanged to the test frame.
num_features = [col for col in train.columns if col.startswith("cont")]
print("Numerical columns:", num_features)

for col in num_features:
    scaler = preprocessing.StandardScaler().fit(train[[col]])
    train[col] = scaler.transform(train[[col]])
    test[col] = scaler.transform(test[[col]])

('Numerical columns:', ['cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7', 'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14'])


In [5]:
# The model target is log(loss); its mean and standard deviation are kept so
# the target can be standardised for training and predictions mapped back.
response = np.log(train["loss"])
mean_resp, std_resp = np.mean(response), np.std(response)

In [7]:
# Training matrix: every column except the target, labelled with the
# standardised log-loss. `axis` is passed by keyword because positional
# `axis` in DataFrame.drop is not accepted by recent pandas releases.
dtrain = xgb.DMatrix(train.drop("loss", axis=1), label=(response - mean_resp) / std_resp)
# Test matrix carries features only (no label).
dtest = xgb.DMatrix(test)

def restore_pred(y):
    """Map standardised log-loss predictions back to the original loss scale.

    Undoes the ``(response - mean_resp) / std_resp`` standardisation using the
    module-level ``mean_resp`` / ``std_resp`` and then exponentiates.
    """
    log_loss = y * std_resp + mean_resp
    return np.exp(log_loss)

In [57]:
# Booster settings passed to the xgb.cv / xgb.train calls below and later
# copied as the basis for the XGBRegressor parameters.
# NOTE(review): the native calls receive num_boost_round separately, so
# "n_estimators" presumably only matters for the XGBRegressor copy - confirm.
params = {
    'objective': "reg:linear",
    'silent': True,
    'max_depth': 7,
    'min_child_weight': 1,
    'colsample_bytree': 0.8,
    "subsample": 1.0,
    'eta': 0.5,
    'eval_metric': 'mae',
    "n_estimators": 20,
    "gamma": 0.2,
    "lambda": 0.8,
}

In [56]:
# 3-fold cross-validation of `params` over 50 boosting rounds; the printed
# table has one row per round.
cvresult = xgb.cv(
    params,
    dtrain,
    num_boost_round=50,
    nfold=3,
)
print(cvresult)

    test-mae-mean  test-mae-std  train-mae-mean  train-mae-std
0        0.698381      0.003329        0.696349       0.001023
1        0.619613      0.002812        0.615646       0.001200
2        0.583490      0.002358        0.577761       0.000464
3        0.563530      0.002273        0.556600       0.000272
4        0.554749      0.002368        0.545802       0.000574
5        0.547769      0.001749        0.537187       0.000420
6        0.542026      0.001499        0.529935       0.000045
7        0.538884      0.001951        0.525427       0.000735
8        0.537015      0.001656        0.522186       0.000948
9        0.535380      0.001730        0.519022       0.001042
10       0.534170      0.001853        0.516274       0.001045
11       0.533040      0.001691        0.514070       0.000798
12       0.532012      0.001590        0.511809       0.000566
13       0.531247      0.001732        0.509839       0.000223
14       0.530372      0.002088        0.507654       0

In [58]:
# Train the same configuration under several seeds and accumulate the
# predictions (on the original loss scale); the sums are divided by
# `num_runs` wherever an average is needed.
num_runs = 4
pred_train = 0.
pred_test = 0.

for run in range(num_runs):
    # Seeds are 0, 6, 12, 18. The seed is written into the shared `params`
    # dict, so it stays set after the loop.
    params['seed'] = 6 * run
    clf = xgb.train(params, dtrain, num_boost_round=33, evals=[(dtrain, "train")])

    run_pred_train = restore_pred(clf.predict(dtrain))
    # MAE on the training data itself, not on held-out rows.
    print("mae",run, mean_absolute_error(train.loss, run_pred_train))

    pred_train += run_pred_train
    pred_test += restore_pred(clf.predict(dtest))

avg_pred_train = pred_train / num_runs
print("mae final",mean_absolute_error(train.loss, avg_pred_train))

[0]	train-mae:0.689675
[1]	train-mae:0.611886
[2]	train-mae:0.576594
[3]	train-mae:0.556592
[4]	train-mae:0.545518
[5]	train-mae:0.538351
[6]	train-mae:0.531078
[7]	train-mae:0.527172
[8]	train-mae:0.523557
[9]	train-mae:0.521493
[10]	train-mae:0.519291
[11]	train-mae:0.517039
[12]	train-mae:0.514944
[13]	train-mae:0.512976
[14]	train-mae:0.511437
[15]	train-mae:0.509878
[16]	train-mae:0.50721
[17]	train-mae:0.506471
[18]	train-mae:0.505359
[19]	train-mae:0.504768
[20]	train-mae:0.503266
[21]	train-mae:0.502134
[22]	train-mae:0.501179
[23]	train-mae:0.500095
[24]	train-mae:0.499169
[25]	train-mae:0.498381
[26]	train-mae:0.497359
[27]	train-mae:0.496509
[28]	train-mae:0.495828
[29]	train-mae:0.49524
[30]	train-mae:0.494205
[31]	train-mae:0.493548
[32]	train-mae:0.493289
('mae', 0, 1092.7710720260943)
[0]	train-mae:0.70352
[1]	train-mae:0.608753
[2]	train-mae:0.573976
[3]	train-mae:0.555471
[4]	train-mae:0.545287
[5]	train-mae:0.538528
[6]	train-mae:0.532541
[7]	train-mae:0.529008
[8]	tr

In [59]:
import datetime

# Average the accumulated test predictions over the seeded runs and pair them
# with the test ids.
result = pd.DataFrame({"id": test.index, "loss": pred_test / num_runs})

# The output file name embeds the current date and hour; the frame's own
# index is not written.
out_name = "result{:%Y%m%d%H}.csv".format(datetime.datetime.now())
result.to_csv(out_name, index=None)

# Using XGBRegressor and important features

In [None]:
# GridSearchCV is imported from sklearn.model_selection (the module this
# notebook already uses for KFold); the old sklearn.grid_search module is no
# longer shipped with scikit-learn.
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

In [None]:
# Build the XGBRegressor keyword arguments from a copy of the native booster
# params (the original `params` dict is left untouched).
params_reg = dict(params)
# Translate native parameter names to the scikit-learn wrapper's names so the
# tuned values are kept rather than dropped or rejected as unknown keywords.
params_reg["learning_rate"] = params_reg.pop("eta")
params_reg["reg_lambda"] = params_reg.pop("lambda")
# The evaluation metric is not passed to the regressor constructor.
del params_reg["eval_metric"]

In [None]:
# Use as many trees as the cross-validation table has rows. The value must be
# written to `params_reg` (the dict actually passed to XGBRegressor); writing
# it to `params` would have no effect because `params_reg` is a separate copy.
params_reg["n_estimators"] = cvresult.shape[0]
reg = XGBRegressor(**params_reg)
# Fit on every column except the target, against the raw (untransformed) loss.
# `axis` is passed by keyword because recent pandas rejects it positionally.
reg.fit(train.drop("loss", axis=1), train.loss)

In [None]:
# Call `score` on the training data instead of binding the method object.
# NOTE(review): the variable name suggests predictions may have been intended
# (`reg.predict(...)`); confirm which quantity is wanted here.
train_predprob = reg.score(train.drop("loss", axis=1), train.loss)

In [None]:
# Fetch the underlying Booster. Newer xgboost releases expose it through
# `get_booster()`, older ones through `booster()`; support both.
reg_booster = reg.get_booster() if hasattr(reg, "get_booster") else reg.booster()

In [None]:
# Bar chart of per-feature f-scores from the fitted booster, largest first.
figsize(15, 5)
fscores = reg_booster.get_fscore()
feat_imp = pd.Series(fscores).sort_values(ascending=False)
feat_imp.plot.bar(title='Feature Importances')

In [None]:
# Keep only the features whose f-score is strictly greater than 10.
is_important = feat_imp > 10
important_features = feat_imp[is_important].index.tolist()
print("important features:", important_features)

In [None]:
# 5-fold cross-validation restricted to the important features.
# The label here is the raw `train.loss`, whereas `dtrain` was labelled with
# the standardised log-loss, so the metric values are on a different scale.
imp_frame = train[important_features]
dtrain_imp = xgb.DMatrix(imp_frame, label=train.loss)
cvresult = xgb.cv(params, dtrain_imp, nfold=5)
print(cvresult)

In [None]:
# Second booster configuration, used for training on the important features.
# NOTE(review): the number of rounds is given to xgb.train separately, so
# 'n_estimators' presumably has no effect there - confirm.
params2 = {
    'base_score': 0.1,
    'colsample_bytree': 0.9,
    'eta': 0.3,
    'eval_metric': 'mae',
    'max_depth': 7,
    'min_child_weight': 3,
    'n_estimators': 10,
    'objective': 'reg:linear',
    'seed': 1,
    'silent': True,
}
# Train for 50 rounds on the important-feature matrix; the metric is
# evaluated on that same training matrix after each round.
regb = xgb.train(
    params2,
    dtrain_imp,
    num_boost_round=50,
    evals=[(dtrain_imp, "train")],
)

In [None]:
params = 