# XGBoost - quickest ways to win data science competitions

Here is an example of how a single non-ensembled model can achieve high ranking scores using XGBoost, which is an optimized distributed gradient boosting library designed to be highly efficient, flexible and portable.

In [1]:
using XGBoost

### To add mushroom example

In [2]:
dtrain=readcsv("../data/higgs/training.csv",header=true)

(
250000x33 Array{Any,2}:
 100000   138.47    51.655   97.827  …    -2.475  113.497  0.00265331  "s"
 100001   160.937   68.768  103.235     -999.0     46.226  2.23358     "b"
 100002  -999.0    162.172  125.953     -999.0     44.251  2.34739     "b"
 100003   143.905   81.417   80.943     -999.0     -0.0    5.44638     "b"
 100004   175.864   16.915  134.805     -999.0      0.0    6.24533     "b"
 100005    89.744   13.55    59.149  …     3.106  193.66   0.083414    "b"
 100006   148.754   28.862  107.782       -2.767  179.877  0.00265331  "s"
 100007   154.916   10.418   94.714     -999.0     30.638  0.0186361   "s"
 100008   105.594   50.559  100.989     -999.0      0.0    5.296       "b"
 100009   128.053   88.941   69.272     -999.0    167.735  0.00150187  "s"
 100010  -999.0     86.24    79.692  …  -999.0      0.0    2.2995      "b"
 100011   114.744   10.286   75.712       -2.079  165.64   0.30717     "b"
 100012   145.297   64.234  103.565     -999.0     93.117  1.68161     "b"

In [3]:
data=dtrain[1][:,2:31]


250000x30 Array{Any,2}:
  138.47    51.655   97.827   27.98   …     1.24     -2.475  113.497
  160.937   68.768  103.235   48.146     -999.0    -999.0     46.226
 -999.0    162.172  125.953   35.635     -999.0    -999.0     44.251
  143.905   81.417   80.943    0.414     -999.0    -999.0     -0.0  
  175.864   16.915  134.805   16.405     -999.0    -999.0      0.0  
   89.744   13.55    59.149  116.344  …     0.224     3.106  193.66 
  148.754   28.862  107.782  106.13         0.131    -2.767  179.877
  154.916   10.418   94.714   29.169     -999.0    -999.0     30.638
  105.594   50.559  100.989    4.288     -999.0    -999.0      0.0  
  128.053   88.941   69.272  193.392     -999.0    -999.0    167.735
 -999.0     86.24    79.692   27.201  …  -999.0    -999.0      0.0  
  114.744   10.286   75.712   30.816        1.773    -2.079  165.64 
  145.297   64.234  103.565  106.999     -999.0    -999.0     93.117
    ⋮                                 ⋱                             
   89.509 

In [4]:
labels  = dtrain[1][:,33]

250000-element Array{Any,1}:
 "s"
 "b"
 "b"
 "b"
 "b"
 "b"
 "s"
 "s"
 "b"
 "s"
 "b"
 "b"
 "b"
 ⋮  
 "b"
 "b"
 "b"
 "s"
 "b"
 "s"
 "b"
 "b"
 "b"
 "s"
 "b"
 "b"

In [7]:
label=zeros(length(labels))
for i=1:length(labels)
    if labels[i]=="s"
        label[i]=1.0
    end
end

In [5]:
weight=dtrain[1][:,32].*550000/250000

250000-element Array{Any,1}:
  0.00583728
  4.91389   
  5.16426   
 11.982     
 13.7397    
  0.183511  
  0.00583728
  0.0409995 
 11.6512    
  0.00330411
  5.05891   
  0.675773  
  3.69955   
  ⋮         
  4.46369   
  0.140934  
 13.9633    
  0.00330411
  1.63692   
  0.00330595
  0.140934  
  9.91118   
  5.49397   
  0.0409995 
  3.69955   
  4.13044   

In [8]:
sum_wpos=0
for i=1:length(label)
    if label[i]==1.0
        sum_wpos=sum_wpos+weight[i]
    end
end

In [9]:
sum_wneg=0
for i=1:length(label)
    if label[i]==0.0
        sum_wneg=sum_wneg+weight[i]
    end
end

In [10]:
data=convert(Array{Float64,2},data)

250000x30 Array{Float64,2}:
  138.47    51.655   97.827   27.98   …     1.24     -2.475  113.497
  160.937   68.768  103.235   48.146     -999.0    -999.0     46.226
 -999.0    162.172  125.953   35.635     -999.0    -999.0     44.251
  143.905   81.417   80.943    0.414     -999.0    -999.0     -0.0  
  175.864   16.915  134.805   16.405     -999.0    -999.0      0.0  
   89.744   13.55    59.149  116.344  …     0.224     3.106  193.66 
  148.754   28.862  107.782  106.13         0.131    -2.767  179.877
  154.916   10.418   94.714   29.169     -999.0    -999.0     30.638
  105.594   50.559  100.989    4.288     -999.0    -999.0      0.0  
  128.053   88.941   69.272  193.392     -999.0    -999.0    167.735
 -999.0     86.24    79.692   27.201  …  -999.0    -999.0      0.0  
  114.744   10.286   75.712   30.816        1.773    -2.079  165.64 
  145.297   64.234  103.565  106.999     -999.0    -999.0     93.117
    ⋮                                 ⋱                             
   89.

In [11]:
idx0 = findin(data,0.0)
data[idx0]=eps()
idx9=findin(data, -999.0)
data[idx9]=0.0


0.0

In [12]:
datas=sparse(data)

250000x30 sparse matrix with 5890031 Float64 entries:
	[1     ,      1]  =  138.47
	[2     ,      1]  =  160.937
	[4     ,      1]  =  143.905
	[5     ,      1]  =  175.864
	[6     ,      1]  =  89.744
	[7     ,      1]  =  148.754
	[8     ,      1]  =  154.916
	[9     ,      1]  =  105.594
	[10    ,      1]  =  128.053
	[12    ,      1]  =  114.744
	⋮
	[249988,     30]  =  36.544
	[249989,     30]  =  61.554
	[249990,     30]  =  78.984
	[249992,     30]  =  70.969
	[249993,     30]  =  203.569
	[249994,     30]  =  546.066
	[249995,     30]  =  174.176
	[249996,     30]  =  2.22045e-16
	[249998,     30]  =  41.992
	[249999,     30]  =  2.22045e-16
	[250000,     30]  =  2.22045e-16

In [13]:
param = ["objective"=>"binary:logitraw", "scale_pos_weight"=>sum_wneg/sum_wpos, "eta"=>0.1,"max_depth"=>6,
    "eval_metric"=>"auc","silent"=>1,"nthread"=>16];
num_round = 120


Use "Dict(a=>b, ...)" instead.


120

In [14]:
xgmat = DMatrix(datas,label=label)

XGBoost.DMatrix(Ptr{Void} @0x00000000195f1560,_setinfo)

In [23]:
@time bst = xgboost(xgmat, num_round, param=param)

[1]	train-auc:0.844962
[2]	train-auc:0.849924
[3]	train-auc:0.851364
[4]	train-auc:0.854743
[5]	train-auc:0.856286
[6]	train-auc:0.858303
[7]	train-auc:0.859944
[8]	train-auc:0.860584
[9]	train-auc:0.860700
[10]	train-auc:0.861261
[11]	train-auc:0.867643
[12]	train-auc:0.868419
[13]	train-auc:0.868212
[14]	train-auc:0.869332
[15]	train-auc:0.869887
[16]	train-auc:0.869945
[17]	train-auc:0.870510
[18]	train-auc:0.870935
[19]	train-auc:0.871397
[20]	train-auc:0.871656
[21]	train-auc:0.871444
[22]	train-auc:0.872232
[23]	train-auc:0.872552
[24]	train-auc:0.873087
[25]	train-auc:0.873273
[26]	train-auc:0.873448
[27]	train-auc:0.874152
[28]	train-auc:0.874713
[29]	train-auc:0.875216
[30]	train-auc:0.876123
[31]	train-auc:0.876455
[32]	train-auc:0.876750
[33]	train-auc:0.876988
[34]	train-auc:0.877253
[35]	train-auc:0.877925
[36]	train-auc:0.878788
[37]	train-auc:0.879686
[38]	train-auc:0.880313
[39]	train-auc:0.881699
[40]	train-auc:0.882595
[41]	train-auc:0.883466
[42]	train-auc:0.884447
[

 93.868597 seconds (21.32 k allocations: 1.036 MB)


XGBoost.Booster(Ptr{Void} @0x00000000425f0360)

Testing the model

In [16]:
#preds = predict(bst, test_X)
dtest=readcsv("../data/higgs/test.csv",header=true)


(
550000x31 Array{Float64,2}:
 350000.0  -999.0     79.589   23.916  …  -999.0    -999.0     -0.0  
 350001.0   106.398   67.49    87.949     -999.0    -999.0     47.575
 350002.0   117.794   56.226   96.358     -999.0    -999.0      0.0  
 350003.0   135.861   30.604   97.288     -999.0    -999.0      0.0  
 350004.0    74.159   82.772   58.731        0.335     2.587  599.213
 350005.0    95.709   94.168   66.28   …  -999.0    -999.0      0.0  
 350006.0    85.798   49.059   66.131     -999.0    -999.0     36.99 
 350007.0   429.273   75.057  234.61         1.833     2.788  179.059
 350008.0    70.958   66.329   60.95      -999.0    -999.0     -0.0  
 350009.0   110.539    2.82    62.136        2.397     1.479  156.294
 350010.0    89.921   75.566   73.543  …  -999.0    -999.0     45.078
 350011.0   143.089  105.195  106.832     -999.0    -999.0      0.0  
 350012.0    96.033   43.116   59.27      -999.0    -999.0      0.0  
      ⋮                                ⋱                    

In [17]:
datatest=dtest[1][:,2:31]

550000x30 Array{Float64,2}:
 -999.0     79.589   23.916    3.036  …  -999.0    -999.0     -0.0  
  106.398   67.49    87.949   49.994     -999.0    -999.0     47.575
  117.794   56.226   96.358    4.137     -999.0    -999.0      0.0  
  135.861   30.604   97.288    9.104     -999.0    -999.0      0.0  
   74.159   82.772   58.731   89.646        0.335     2.587  599.213
   95.709   94.168   66.28    14.719  …  -999.0    -999.0      0.0  
   85.798   49.059   66.131   37.074     -999.0    -999.0     36.99 
  429.273   75.057  234.61    71.019        1.833     2.788  179.059
   70.958   66.329   60.95     0.758     -999.0    -999.0     -0.0  
  110.539    2.82    62.136  179.753        2.397     1.479  156.294
   89.921   75.566   73.543   69.708  …  -999.0    -999.0     45.078
  143.089  105.195  106.832   23.149     -999.0    -999.0      0.0  
   96.033   43.116   59.27     5.141     -999.0    -999.0      0.0  
    ⋮                                 ⋱                             
  101.

In [18]:
typeof(datatest)

Array{Float64,2}

In [19]:
idx0 = findin(datatest,0.0)
datatest[idx0]=eps()
idx9=findin(datatest, -999.0)
datatest[idx9]=0.0


0.0

In [20]:
test = sparse(datatest)

550000x30 sparse matrix with 12960567 Float64 entries:
	[2     ,      1]  =  106.398
	[3     ,      1]  =  117.794
	[4     ,      1]  =  135.861
	[5     ,      1]  =  74.159
	[6     ,      1]  =  95.709
	[7     ,      1]  =  85.798
	[8     ,      1]  =  429.273
	[9     ,      1]  =  70.958
	[10    ,      1]  =  110.539
	[11    ,      1]  =  89.921
	⋮
	[549987,     30]  =  2.22045e-16
	[549989,     30]  =  246.129
	[549990,     30]  =  2.22045e-16
	[549991,     30]  =  2.22045e-16
	[549992,     30]  =  78.396
	[549993,     30]  =  2.22045e-16
	[549994,     30]  =  110.327
	[549995,     30]  =  98.494
	[549996,     30]  =  97.451
	[549999,     30]  =  43.099
	[550000,     30]  =  49.231

In [22]:
preds = predict(bst, test)

550000-element Array{Float32,1}:
  0.547127
  4.5685  
  6.3327  
  7.91321 
  0.815153
  3.26169 
  4.59102 
 -2.47866 
  2.41904 
  7.78133 
  3.20014 
  4.57928 
  6.2027  
  ⋮       
  6.15356 
  2.82534 
  4.23187 
  2.89811 
  7.60106 
  0.589281
 -0.208251
  3.11921 
  6.17333 
  3.85057 
  6.20555 
  5.32127 