#### Summary

Build and explore gradient boosting models using the xgboost library.

In [1]:
import ast
import operator
import sys
import time as time

import numpy as np
import pandas as pd
import xgboost as xgb

sys.path.append('../../common_routines/')

from relevant_functions import (get_train_data,
                                get_test_data,
                                get_all_predictor_cols,
                                get_rel_cols)

In [2]:
# Directory handed to the data-loading helpers (get_train_data / get_test_data),
# given relative to this notebook's location.
INPUT_DIR = '../../input/'

In [3]:
# Load the training data and report how long the load took (seconds).
ts = time.time()
train = get_train_data(INPUT_DIR)
time.time() - ts

5.336509943008423

#### Build a basic model using xgboost

In [9]:
# Predictor columns as selected by the project helper, and the name of the
# response column used for every model in this notebook.
# NOTE(review): 'log_target' is presumably a log-transform of the raw target
# added by get_train_data -- confirm in relevant_functions.
X_COLUMNS = get_all_predictor_cols(train)
Y_COLUMN = 'log_target'

In [10]:
# Extract the underlying arrays: X holds the predictor columns, Y the single
# response column (selected with a list, so it stays two-dimensional).
X = train[X_COLUMNS].values
Y = train[[Y_COLUMN]].values

In [11]:
# Wrap predictors and response in xgboost's DMatrix container, keeping the
# original column names as feature names.
xgb_complete_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)

In [14]:
# Booster parameters shared by every xgboost run below: learning rate (eta)
# of 0.01, evaluated with RMSE. All other parameters keep xgboost's defaults.
xgb_params = {'eta':0.01, 'eval_metric':'rmse'}

In [15]:
# 5-fold cross-validation over all predictors: up to 1000 boosting rounds,
# stopping once the test RMSE has not improved for 5 consecutive rounds.
# Per-round progress is printed through the `verbose_eval` / `show_stdv`
# keyword arguments instead of `xgb.callback.print_evaluation`, which is
# deprecated (and removed in newer xgboost releases); the keywords give the
# same per-round log without the standard deviation.
# The cell's value is the elapsed wall-clock time in seconds.
ts = time.time()
model_xgboost = xgb.cv(params=xgb_params,
                       dtrain=xgb_complete_data,
                       num_boost_round=1000,
                       nfold=5,
                       early_stopping_rounds=5,
                       verbose_eval=True,
                       show_stdv=False)
time.time() - ts

[0]	train-rmse:13.9605	test-rmse:13.9604
[1]	train-rmse:13.823	test-rmse:13.8231
[2]	train-rmse:13.6869	test-rmse:13.6872
[3]	train-rmse:13.5522	test-rmse:13.5527
[4]	train-rmse:13.4189	test-rmse:13.4195
[5]	train-rmse:13.2869	test-rmse:13.2876
[6]	train-rmse:13.1562	test-rmse:13.1572
[7]	train-rmse:13.0269	test-rmse:13.0279
[8]	train-rmse:12.8989	test-rmse:12.9001
[9]	train-rmse:12.7721	test-rmse:12.7735
[10]	train-rmse:12.6467	test-rmse:12.6481
[11]	train-rmse:12.5225	test-rmse:12.5241
[12]	train-rmse:12.3996	test-rmse:12.4013
[13]	train-rmse:12.2779	test-rmse:12.2797
[14]	train-rmse:12.1574	test-rmse:12.1594
[15]	train-rmse:12.0382	test-rmse:12.0402
[16]	train-rmse:11.9202	test-rmse:11.9224
[17]	train-rmse:11.8033	test-rmse:11.8056
[18]	train-rmse:11.6877	test-rmse:11.6902
[19]	train-rmse:11.5732	test-rmse:11.5758
[20]	train-rmse:11.4599	test-rmse:11.4626
[21]	train-rmse:11.3478	test-rmse:11.3508
[22]	train-rmse:11.2368	test-rmse:11.2398
[23]	train-rmse:11.1269	test-rmse:11.1301
[24

[195]	train-rmse:2.40365	test-rmse:2.49789
[196]	train-rmse:2.38695	test-rmse:2.48218
[197]	train-rmse:2.37038	test-rmse:2.46679
[198]	train-rmse:2.35415	test-rmse:2.4515
[199]	train-rmse:2.33815	test-rmse:2.43636
[200]	train-rmse:2.3222	test-rmse:2.42133
[201]	train-rmse:2.30648	test-rmse:2.40654
[202]	train-rmse:2.2909	test-rmse:2.39201
[203]	train-rmse:2.27567	test-rmse:2.37782
[204]	train-rmse:2.26054	test-rmse:2.36372
[205]	train-rmse:2.24565	test-rmse:2.3498
[206]	train-rmse:2.23088	test-rmse:2.33612
[207]	train-rmse:2.21628	test-rmse:2.32269
[208]	train-rmse:2.20198	test-rmse:2.3095
[209]	train-rmse:2.18779	test-rmse:2.29621
[210]	train-rmse:2.17383	test-rmse:2.28336
[211]	train-rmse:2.16005	test-rmse:2.27061
[212]	train-rmse:2.14642	test-rmse:2.25796
[213]	train-rmse:2.1329	test-rmse:2.24542
[214]	train-rmse:2.11962	test-rmse:2.23304
[215]	train-rmse:2.10655	test-rmse:2.22096
[216]	train-rmse:2.09371	test-rmse:2.20905
[217]	train-rmse:2.08093	test-rmse:2.19735
[218]	train-rmse:

[387]	train-rmse:1.27073	test-rmse:1.51976
[388]	train-rmse:1.26965	test-rmse:1.51917
[389]	train-rmse:1.26855	test-rmse:1.5186
[390]	train-rmse:1.26746	test-rmse:1.51795
[391]	train-rmse:1.26646	test-rmse:1.51739
[392]	train-rmse:1.26544	test-rmse:1.51678
[393]	train-rmse:1.26447	test-rmse:1.51623
[394]	train-rmse:1.26339	test-rmse:1.51568
[395]	train-rmse:1.26243	test-rmse:1.51509
[396]	train-rmse:1.26149	test-rmse:1.51454
[397]	train-rmse:1.26062	test-rmse:1.51405
[398]	train-rmse:1.25963	test-rmse:1.51351
[399]	train-rmse:1.25881	test-rmse:1.51301
[400]	train-rmse:1.25791	test-rmse:1.51251
[401]	train-rmse:1.25695	test-rmse:1.512
[402]	train-rmse:1.256	test-rmse:1.51156
[403]	train-rmse:1.25511	test-rmse:1.51107
[404]	train-rmse:1.25427	test-rmse:1.51062
[405]	train-rmse:1.25342	test-rmse:1.51009
[406]	train-rmse:1.25258	test-rmse:1.5097
[407]	train-rmse:1.25177	test-rmse:1.50926
[408]	train-rmse:1.25106	test-rmse:1.50887
[409]	train-rmse:1.25028	test-rmse:1.50842
[410]	train-rmse:

[579]	train-rmse:1.17718	test-rmse:1.48359
[580]	train-rmse:1.17695	test-rmse:1.48358
[581]	train-rmse:1.17673	test-rmse:1.48356
[582]	train-rmse:1.17653	test-rmse:1.48352
[583]	train-rmse:1.17627	test-rmse:1.48348
[584]	train-rmse:1.17602	test-rmse:1.48344
[585]	train-rmse:1.17575	test-rmse:1.48335
[586]	train-rmse:1.17548	test-rmse:1.48329
[587]	train-rmse:1.17523	test-rmse:1.48325
[588]	train-rmse:1.17499	test-rmse:1.48317
[589]	train-rmse:1.17466	test-rmse:1.48313
[590]	train-rmse:1.17444	test-rmse:1.48316
[591]	train-rmse:1.1742	test-rmse:1.48313
[592]	train-rmse:1.17396	test-rmse:1.48307
[593]	train-rmse:1.17371	test-rmse:1.48305
[594]	train-rmse:1.17345	test-rmse:1.48306
[595]	train-rmse:1.17318	test-rmse:1.48298
[596]	train-rmse:1.17287	test-rmse:1.48293
[597]	train-rmse:1.17268	test-rmse:1.48293
[598]	train-rmse:1.17241	test-rmse:1.48289
[599]	train-rmse:1.17212	test-rmse:1.48287
[600]	train-rmse:1.17189	test-rmse:1.48281
[601]	train-rmse:1.17167	test-rmse:1.4828
[602]	train-r

1194.2121109962463

In [17]:
# Lowest mean test RMSE seen during cross-validation, shown together with the
# row label (boosting round) at which it occurs.
# The column key must be exactly 'test-rmse-mean'; a trailing space in the key
# raises KeyError.
model_xgboost[model_xgboost['test-rmse-mean'] == model_xgboost['test-rmse-mean'].min()]['test-rmse-mean']

609    1.482555
Name: test-rmse-mean, dtype: float64

In [18]:
# Row label (zero-based boosting round) of the first row with the lowest mean test RMSE.
model_xgboost['test-rmse-mean'].idxmin()

609

In [21]:
# Short 10-round cross-validation run with the same parameters and data, used
# to inspect the shape of the frame returned by xgb.cv.
# NOTE(review): despite the name, `model_lgb` holds xgboost CV results, not a
# LightGBM model; the name is kept because later cells refer to it.
model_lgb = xgb.cv(params=xgb_params,
                   dtrain=xgb_complete_data,
                   num_boost_round=10,
                   nfold=5,
                   early_stopping_rounds=5)


In [22]:
# Display the per-round CV results (mean and std of train/test RMSE).
model_lgb

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,13.960468,0.015485,13.960437,0.062727
1,13.823003,0.015311,13.823118,0.062874
2,13.686912,0.015134,13.687187,0.062999
3,13.552211,0.014974,13.552674,0.063234
4,13.41887,0.014796,13.419506,0.063392
5,13.286883,0.014638,13.287629,0.06362
6,13.156218,0.01447,13.15718,0.063787
7,13.026886,0.01431,13.027913,0.06397
8,12.898854,0.014157,12.900069,0.064153
9,12.772122,0.014017,12.773477,0.064264


In [33]:
# Row label (zero-based boosting round) of the first row with the lowest mean test RMSE.
model_lgb['test-rmse-mean'].idxmin()

9

### Let us enhance the model by using only dense predictors

Since the results look encouraging, let us try including only dense predictors.

In [35]:
def get_prediction_score(columns, data=train, params=xgb_params):
    """Cross-validate an xgboost model that uses only ``columns`` as predictors.

    Parameters
    ----------
    columns : list of str
        Predictor column names selected from ``data``; also passed to xgboost
        as the feature names.
    data : DataFrame-like
        Frame holding the predictor columns and a ``'log_target'`` response
        column. Defaults to the notebook's ``train`` frame (bound when this
        cell is executed).
    params : dict
        Booster parameters forwarded to ``xgb.cv``. Defaults to the notebook's
        ``xgb_params`` (bound when this cell is executed). The result column
        ``'test-rmse-mean'`` is read below, so the parameters are expected to
        use the RMSE evaluation metric.

    Returns
    -------
    tuple
        ``(min_test_error, num_rounds)``: the lowest mean test RMSE across the
        cross-validation rounds, and the zero-based row label of the first
        round at which it occurs (i.e. one less than the number of boosting
        rounds performed up to that point).
    """
    # Use the arguments supplied by the caller rather than the notebook-level
    # globals, so that a different frame or parameter set is actually honoured.
    X = data[columns]
    Y = data[['log_target']]

    xgb_complete_data = xgb.DMatrix(X, Y, feature_names=columns)

    # 5-fold CV, at most 1000 rounds, stopping after 5 rounds without
    # improvement in the test metric.
    model_xgb = xgb.cv(params=params,
                       dtrain=xgb_complete_data,
                       num_boost_round=1000,
                       nfold=5,
                       early_stopping_rounds=5)

    test_rmse = model_xgb['test-rmse-mean']
    num_rounds = test_rmse.idxmin()
    min_test_error = test_rmse.loc[num_rounds]

    return (min_test_error, num_rounds)

In [38]:
def get_model_with_predictors_beyond_given_threshold(percent_threshold=10, data=train, params=xgb_params):
    """Score a model built on the columns chosen for ``percent_threshold``.

    The column subset comes from the project helper ``get_rel_cols`` and is
    then cross-validated with ``get_prediction_score``.
    NOTE(review): ``get_rel_cols`` presumably keeps columns whose share of
    populated values exceeds the threshold -- confirm in relevant_functions.

    Returns
    -------
    tuple
        ``(best_score, num_iterations, num_columns)`` -- the two values
        returned by ``get_prediction_score`` followed by the number of
        columns that were selected.
    """
    selected_columns = get_rel_cols(percent_threshold, data)
    cv_error, best_round = get_prediction_score(selected_columns, data, params)
    return (cv_error, best_round, len(selected_columns))
    

In [None]:
# Thresholds to sweep, plus one result mapping per quantity recorded for each
# threshold (CV score, number of predictors, best boosting round).
PERCENTAGES_TO_CHECK = [35, 30, 25, 20, 15, 10, 5]
percentage_to_prediction_score = {}
percentage_to_num_predictors = {}
percentage_to_num_model_iterations = {}

In [None]:
# Cross-validate one model per threshold and record its score, best round and
# predictor count, keyed by threshold. The cell's value is the elapsed time.
ts = time.time()
for percent in PERCENTAGES_TO_CHECK:
    (best_score, num_iterations, num_predictors) = (
        get_model_with_predictors_beyond_given_threshold(percent_threshold=percent)
    )
    percentage_to_num_predictors[percent] = num_predictors
    percentage_to_num_model_iterations[percent] = num_iterations
    percentage_to_prediction_score[percent] = best_score
time.time() - ts

In [None]:
# Finer sweep: every integer threshold from 11 to 19 inclusive.
PERCENTAGES_TO_CHECK = np.arange(11, 20)

In [None]:
# Repeat the sweep for the finer threshold grid; results are added to the same
# mappings (an already-present threshold is simply overwritten).
ts = time.time()
for percent in PERCENTAGES_TO_CHECK:
    (best_score, num_iterations, num_predictors) = (
        get_model_with_predictors_beyond_given_threshold(percent_threshold=percent)
    )
    percentage_to_num_predictors[percent] = num_predictors
    percentage_to_num_model_iterations[percent] = num_iterations
    percentage_to_prediction_score[percent] = best_score
time.time() - ts

### Finding the best subset of columns

Now, let us find the subset of columns that minimises the cross-validation error (RMSE) within this superset of 40 columns. We do this with a greedy method: at each step, add the single column that gives the lowest cross-validation error.

In [107]:
# Candidate pool for the greedy search: the columns returned by get_rel_cols
# for a threshold of 17 on the training data.
FULL_REL_COLS_LIST = get_rel_cols(17, train)

In [109]:
def get_prediction_scores_after_column_addition(given_list, data):
    """Score every one-column extension of ``given_list``.

    Each column of ``FULL_REL_COLS_LIST`` that is not already in
    ``given_list`` is appended in turn, and the extended list is
    cross-validated with ``get_prediction_score``.

    Parameters
    ----------
    given_list : list of str
        Columns already selected.
    data : DataFrame-like
        Data forwarded to ``get_prediction_score``.

    Returns
    -------
    list of (str, float)
        ``(str(extended_column_list), cv_score)`` pairs sorted by ascending
        score, so the best candidate comes first.
    """
    candidate_scores = {
        str(given_list + [col]): get_prediction_score(given_list + [col], data)[0]
        for col in FULL_REL_COLS_LIST
        if col not in given_list
    }
    return sorted(candidate_scores.items(), key=operator.itemgetter(1))
    

In [110]:
# Greedy forward selection: at each step keep the one-column extension with
# the lowest CV score. MAX_NUM_PREDICTORS is computed once from the empty
# starting list, so it equals 1 and the loop performs a single step here.
ts = time.time()

columns_list = []
MAX_NUM_PREDICTORS = len(columns_list) + 1
max_columns_list_to_score = {}
while len(columns_list) < MAX_NUM_PREDICTORS:
    col_list_to_score = get_prediction_scores_after_column_addition(columns_list, train)
    # Entry 0 is the best (lowest-score) candidate; record its (key, score) pair.
    max_columns_list_to_score.update([col_list_to_score[0]])
    # The key is the string form of the column list; parse it back into a list.
    columns_list = ast.literal_eval(col_list_to_score[0][0])
time.time() - ts

37.065032958984375

In [111]:
# Candidates from the last greedy step, best (lowest score) first.
col_list_to_score

[("['f190486d6']", 1.6449866458904614),
 ("['eeb9cd3aa']", 1.6549665060753382),
 ("['c47340d97']", 1.6564248164011501),
 ("['c5a231d81']", 1.6597331712390317),
 ("['66ace2992']", 1.6597427000135743),
 ("['58e2e02e6']", 1.662740227415354),
 ("['adb64ff71']", 1.6637054978615513),
 ("['491b9ee45']", 1.6641022211337322),
 ("['e176a204a']", 1.6641756120490236),
 ("['0572565c2']", 1.6648001845404812),
 ("['1db387535']", 1.6660756055977954),
 ("['1931ccfdd']", 1.667217694693769),
 ("['15ace8c9f']", 1.667416285644309),
 ("['024c577b9']", 1.6674215815208133),
 ("['91f701ba2']", 1.669414765433305),
 ("['0ff32eb98']", 1.6698816518903408),
 ("['26fc93eb7']", 1.6702095430990647),
 ("['963a49cdc']", 1.6710982011036655),
 ("['70feb1494']", 1.671840619636297),
 ("['6619d81fc']", 1.6726918675576397),
 ("['9fd594eec']", 1.6730804789897662),
 ("['703885424']", 1.6736472114556256),
 ("['23310aa6f']", 1.6741381954941077),
 ("['f74e8f13d']", 1.67481747766058),
 ("['58232a6fb']", 1.6761855988254997),
 ("['5c

In [112]:
# Best column list chosen at each greedy step, mapped to its CV score.
max_columns_list_to_score

{"['f190486d6']": 1.6449866458904614}

### A good portion of the complete algorithm was run as a Kaggle script (https://www.kaggle.com/babinu/xgboost-non-sparse-columns-shrink-for-kaggle)

Below is the subset of columns found by that script, which had a very low cross-validation error.

In [40]:
# Predictor subset pasted from the external run of the greedy search.
best_pred_cols = [
    'f190486d6', 'eeb9cd3aa', 'c47340d97', '024c577b9', '2288333b4',
    'adb64ff71', '58e2e02e6', '58e056e12', 'd6bb78916', 'fb0f5dbfe',
    '66ace2992', '1702b5bf0', '77deffdf0', '15ace8c9f', '122c135ed',
    'e176a204a', '6eef030c1', 'bc70cbc26', '1184df5c2', '1fe5d56b9',
    '190db8488', '20aa07010', 'e222309b0', 'ad009c8b9', 'ef30f6be5',
    'c0d2348b7', 'c10f31664', '935ca66a9', 'c8d582dd2', '939f628a7',
]

In [41]:
# Sanity check: number of predictors in the pasted subset.
len(best_pred_cols)

30

### Make predictions on the test set.

Let us make predictions over the test set with our best model.

In [42]:
# Rebuild the training matrix restricted to the selected predictors.
# Note: this rebinds the notebook-level X and Y defined earlier.
X = train[best_pred_cols]
Y = train[['log_target']]

xgb_full_data = xgb.DMatrix(X, Y, feature_names=best_pred_cols)

In [43]:
# Cross-validate the selected predictors to obtain both the CV score and the
# boosting round at which it was reached, so that the final model can be
# trained on the complete data for a matching number of rounds before
# predicting on the test set.
# `num_iterations` is the row label returned by get_prediction_score.
ts = time.time()
(score, num_iterations) = get_prediction_score(best_pred_cols, train, xgb_params)
print(score, num_iterations)
time.time() - ts

1.4431472000000003 872


16.58401608467102

In [46]:
# Train the final model on the complete training data.
# `num_iterations` is the zero-based row label of the best cross-validation
# round (the CV result frame starts at row 0 for the first boosting round), so
# reproducing that round requires `num_iterations + 1` boosting rounds.
# The cell's value is the elapsed wall-clock time in seconds.
ts = time.time()
model_xgb = xgb.train(params=xgb_params,
                      dtrain=xgb_full_data,
                      num_boost_round=num_iterations + 1)
time.time() - ts

3.5956008434295654

In [47]:
# Load the test data and report how long the load took (seconds).
ts = time.time()
test = get_test_data(INPUT_DIR)
time.time() - ts

74.03381705284119

In [50]:
# Predict on the test set with the final model and convert the predictions
# back from the log scale.
ts = time.time()
new_X = test[best_pred_cols]
xgb_test_X = xgb.DMatrix(new_X, feature_names=best_pred_cols)
test_log_predictions = model_xgb.predict(xgb_test_X)
# Clamp negative log-scale predictions to 0 so the back-transformed target
# below is never negative.
test_log_predictions[test_log_predictions <0] = 0
# Back-transform with exp(x) - 1 and store the result as a new 'target'
# column on `test` (modifies the frame in place).
# NOTE(review): this presumes log_target was built as log(target + 1) -- confirm.
test['target'] = np.exp(test_log_predictions) - 1.0
time.time() - ts

0.44350218772888184

In [51]:
# Write the ID / predicted target pairs to a CSV file in the current working
# directory, without the frame's index column.
test[['ID', 'target']].to_csv('submission_gradient_boosting_xgboost.csv', index=False)