In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import KFold
from sklearn import preprocessing
import xgboost as xgb
%matplotlib inline

from os import path
to_filename = lambda name: path.join("..", "data", "allstate", name +".csv")



In [2]:
train = pd.read_csv(to_filename("train"), index_col=0)
test = pd.read_csv(to_filename("test"), index_col=0)
print("shape: train {}, test {}".format(train.shape, test.shape))
print(train.head(2))

shape: train (188318, 131), test (125546, 130)
   cat1 cat2 cat3 cat4 cat5 cat6 cat7 cat8 cat9 cat10   ...        cont6  \
id                                                      ...                
1     A    B    A    B    A    A    A    A    B     A   ...     0.718367   
2     A    B    A    A    A    A    A    A    B     B   ...     0.438917   

       cont7    cont8    cont9   cont10    cont11    cont12    cont13  \
id                                                                      
1   0.335060  0.30260  0.67135  0.83510  0.569745  0.594646  0.822493   
2   0.436585  0.60087  0.35127  0.43919  0.338312  0.366307  0.611431   

      cont14     loss  
id                     
1   0.714843  2213.18  
2   0.304496  1283.60  

[2 rows x 131 columns]


In [3]:
cat_features = [col for col in train.columns if col.startswith("cat")]
print("Categorical columns:", cat_features)

('Categorical columns:', ['cat1', 'cat2', 'cat3', 'cat4', 'cat5', 'cat6', 'cat7', 'cat8', 'cat9', 'cat10', 'cat11', 'cat12', 'cat13', 'cat14', 'cat15', 'cat16', 'cat17', 'cat18', 'cat19', 'cat20', 'cat21', 'cat22', 'cat23', 'cat24', 'cat25', 'cat26', 'cat27', 'cat28', 'cat29', 'cat30', 'cat31', 'cat32', 'cat33', 'cat34', 'cat35', 'cat36', 'cat37', 'cat38', 'cat39', 'cat40', 'cat41', 'cat42', 'cat43', 'cat44', 'cat45', 'cat46', 'cat47', 'cat48', 'cat49', 'cat50', 'cat51', 'cat52', 'cat53', 'cat54', 'cat55', 'cat56', 'cat57', 'cat58', 'cat59', 'cat60', 'cat61', 'cat62', 'cat63', 'cat64', 'cat65', 'cat66', 'cat67', 'cat68', 'cat69', 'cat70', 'cat71', 'cat72', 'cat73', 'cat74', 'cat75', 'cat76', 'cat77', 'cat78', 'cat79', 'cat80', 'cat81', 'cat82', 'cat83', 'cat84', 'cat85', 'cat86', 'cat87', 'cat88', 'cat89', 'cat90', 'cat91', 'cat92', 'cat93', 'cat94', 'cat95', 'cat96', 'cat97', 'cat98', 'cat99', 'cat100', 'cat101', 'cat102', 'cat103', 'cat104', 'cat105', 'cat106', 'cat107', 'cat108', 'c

In [4]:
# Encode the labels to int
for col in cat_features:
    encd = preprocessing.LabelEncoder()
    encd.fit(train[col].value_counts().index.union(test[col].value_counts().index))
    train[col] = encd.transform(train[col])
    test[col] = encd.transform(test[col])

In [5]:
params = {'objective':"reg:linear", 'silent': True}
params['eta'] = 0.075
params['max_depth'] = 6
params['subsample'] = 0.7
params['colsample_bytree'] = 0.7
params['min_child_weight'] = 100
params['silent'] = True
params['eval_metric'] = "mae"

In [6]:
dtrain = xgb.DMatrix(train.drop("loss", 1), train.loss)

In [8]:
cvresult = xgb.cv(params, dtrain, nfold=5)

In [12]:
print("optimum number of trees required", cvresult.shape[0])
params["n_estimators"] = cvresult.shape[0]

('optimum number of trees required', 10)


In [13]:
cvresult

Unnamed: 0,test-mae-mean,test-mae-std,train-mae-mean,train-mae-std
0,2809.273828,19.78577,2809.191553,4.75288
1,2600.46206,19.445231,2600.376465,4.462088
2,2410.862793,19.189851,2410.604004,4.213221
3,2240.740918,18.738684,2240.278027,3.816485
4,2091.144971,17.594359,2090.551807,4.000692
5,1960.201196,16.846711,1959.540991,3.529448
6,1847.905737,15.858236,1846.928052,3.600533
7,1751.788843,14.658559,1750.625342,3.533084
8,1670.096362,13.763266,1668.526489,3.575291
9,1602.080762,13.155365,1600.030469,3.510591


In [17]:
params['seed'] = 1
clf = xgb.train(params, dtrain, num_boost_round=50, evals=[(dtrain, "train")])

[0]	train-mae:2809.31
[1]	train-mae:2600.53
[2]	train-mae:2411.12
[3]	train-mae:2241.31
[4]	train-mae:2092.39
[5]	train-mae:1961.44
[6]	train-mae:1847.86
[7]	train-mae:1751.61
[8]	train-mae:1670.94
[9]	train-mae:1601.78
[10]	train-mae:1542
[11]	train-mae:1491.98
[12]	train-mae:1450.66
[13]	train-mae:1416.39
[14]	train-mae:1387.53
[15]	train-mae:1363.64
[16]	train-mae:1344.22
[17]	train-mae:1327.29
[18]	train-mae:1313.75
[19]	train-mae:1301.8
[20]	train-mae:1292.35
[21]	train-mae:1284.34
[22]	train-mae:1277.39
[23]	train-mae:1271.21
[24]	train-mae:1266.5
[25]	train-mae:1261.55
[26]	train-mae:1259.07
[27]	train-mae:1257.25
[28]	train-mae:1253.29
[29]	train-mae:1251.68
[30]	train-mae:1249.83
[31]	train-mae:1248.25
[32]	train-mae:1247.05
[33]	train-mae:1245.69
[34]	train-mae:1245.14
[35]	train-mae:1243.58
[36]	train-mae:1242.53
[37]	train-mae:1240.93
[38]	train-mae:1240.18
[39]	train-mae:1239.75
[40]	train-mae:1239.03
[41]	train-mae:1238.58
[42]	train-mae:1236.74
[43]	train-mae:1236.6
[44]

In [20]:
dtest = xgb.DMatrix(test)
pred_test = clf.predict(dtest)

In [32]:
import datetime
result = pd.DataFrame({"id": test.index, "loss": pred_test})
result.to_csv("result{:%Y%m%d%H}.csv".format(datetime.datetime.now()), index=None)