## Summary

The ideas is to apply gradient boosting only on those columns which are relatively non sparse and see how the resultant model performs.

In [1]:
import pandas as pd
import numpy as np
import time as time
import xgboost as xgb

In [2]:
INPUT_DIR = '../input/'

In [3]:
ts = time.time()
train = pd.read_csv(INPUT_DIR + 'train.csv')
time.time() - ts

5.774972915649414

In [4]:
ts = time.time()
test = pd.read_csv(INPUT_DIR + 'test.csv')
time.time() - ts

83.50776696205139

In [5]:
train['new_target'] = np.log(train['target'] + 1.0)

### Finding relatively non sparse columns

In [6]:
cols_apart_from_id_and_target = [col for col in train.columns if col not in ['ID', 'target', 'new_target']]

In [7]:
non_zero_percent_for_col = np.zeros(len(cols_apart_from_id_and_target))

In [8]:
for i in range(len(cols_apart_from_id_and_target)):
    non_zero_percent_for_col[i] = \
        100.0* len(train[cols_apart_from_id_and_target[i]].to_numpy().nonzero()[0])/len(train[cols_apart_from_id_and_target[i]].to_numpy())

In [9]:
np.argsort(non_zero_percent_for_col)[::-1]

array([2050, 4554, 3088, ..., 2035, 2032, 2427])

In [10]:
np.sort(non_zero_percent_for_col)[::-1][0:50]

array([35.09755551, 35.09755551, 34.98542274, 34.87328998, 34.69387755,
       34.69387755, 34.62659789, 34.51446513, 34.42475891, 34.37990581,
       34.35747926, 34.35747926, 34.26777304, 34.20049338, 34.15564028,
       34.11078717, 34.08836062, 34.04350751, 33.93137475, 33.93137475,
       33.90894819, 33.84166854, 33.72953577, 33.63982956, 33.52769679,
       33.50527024, 33.50527024, 33.46041713, 33.30343126, 33.30343126,
       33.2361516 , 33.12401884, 33.12401884, 32.98945952, 32.8100471 ,
       32.76519399, 32.63063467, 32.24938327, 32.22695672, 31.59901323,
       20.81184122, 20.78941467, 20.6324288 , 20.6324288 , 20.25117739,
       20.22875084, 20.20632429, 20.02691186, 20.02691186, 20.00448531])

### Get all columns that have percentage of non zero entries beyond a threshold.

In [11]:
PERCENT_THRESHOLD = 10

In [12]:
num_rel_entries = np.sum(non_zero_percent_for_col > PERCENT_THRESHOLD)

In [13]:
rel_col_indices = np.argsort(non_zero_percent_for_col)[::-1][0:num_rel_entries]

In [14]:
rel_cols = [cols_apart_from_id_and_target[col] for col in rel_col_indices]

In [15]:
rel_cols

['0ff32eb98',
 'c5a231d81',
 '91f701ba2',
 'c47340d97',
 '0572565c2',
 'adb64ff71',
 'f190486d6',
 '5c6487af1',
 'e176a204a',
 '6619d81fc',
 '70feb1494',
 '23310aa6f',
 '190db8488',
 '1db387535',
 '491b9ee45',
 '66ace2992',
 '9fd594eec',
 'fc99f9426',
 '58e2e02e6',
 '703885424',
 'eeb9cd3aa',
 '1931ccfdd',
 '324921c7b',
 '1702b5bf0',
 'fb0f5dbfe',
 'f74e8f13d',
 '20aa07010',
 '26fc93eb7',
 '58232a6fb',
 '15ace8c9f',
 'fb49e4212',
 '2ec5b290f',
 '62e59a501',
 '963a49cdc',
 '58e056e12',
 '241f0f867',
 '6eef030c1',
 'b43a7cfd5',
 'd6bb78916',
 '024c577b9',
 '11e12dbe8',
 '166008929',
 '861076e21',
 'f02ecb19c',
 'f97d9431e',
 '9de83dc23',
 'ca2b906e8',
 '935ca66a9',
 'a09a238d0',
 '77deffdf0',
 'bb0ce54e9',
 'c8d582dd2',
 'c0d2348b7',
 '8781e4b91',
 'bd6da0cca',
 'cbb673163',
 '68a945b18',
 'c10f31664',
 'ad009c8b9',
 '62fb56487',
 'ea772e115',
 'bc70cbc26',
 '939f628a7',
 '4bcf15776',
 '1fe5d56b9',
 '7e814a30d',
 '5d3b81ef8',
 'aca228668',
 'ef30f6be5',
 '070f95c99',
 '4da206d28',
 'b7c9

### Now, let us train gradient boosting model using xgboost

In [16]:
X_COLUMNS = rel_cols
Y_COLUMN = 'new_target'

In [17]:
X = train[X_COLUMNS]
Y = train[[Y_COLUMN]]

In [18]:
xgb_complete_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)
xgb_params = {'eta' : 0.01, 'eval_metric' : 'rmse'}

In [19]:
ts = time.time()
model_xgboost = xgb.cv(params=xgb_params,
                       dtrain=xgb_complete_data,
                       num_boost_round=1000,
                       nfold=5,
                       early_stopping_rounds=5,
                       callbacks=[xgb.callback.print_evaluation(show_stdv=False)])
time.time() - ts

[0]	train-rmse:13.9605	test-rmse:13.9603
[1]	train-rmse:13.823	test-rmse:13.8229
[2]	train-rmse:13.6869	test-rmse:13.6869
[3]	train-rmse:13.5521	test-rmse:13.5522
[4]	train-rmse:13.4188	test-rmse:13.4189
[5]	train-rmse:13.2868	test-rmse:13.2869
[6]	train-rmse:13.1561	test-rmse:13.1564
[7]	train-rmse:13.0268	test-rmse:13.0271
[8]	train-rmse:12.8987	test-rmse:12.8991
[9]	train-rmse:12.772	test-rmse:12.7725
[10]	train-rmse:12.6465	test-rmse:12.6471
[11]	train-rmse:12.5223	test-rmse:12.5229
[12]	train-rmse:12.3994	test-rmse:12.4001
[13]	train-rmse:12.2778	test-rmse:12.2785
[14]	train-rmse:12.1573	test-rmse:12.1581
[15]	train-rmse:12.0381	test-rmse:12.039
[16]	train-rmse:11.9201	test-rmse:11.9211
[17]	train-rmse:11.8032	test-rmse:11.8043
[18]	train-rmse:11.6876	test-rmse:11.6888
[19]	train-rmse:11.5732	test-rmse:11.5744
[20]	train-rmse:11.4599	test-rmse:11.4612
[21]	train-rmse:11.3477	test-rmse:11.3492
[22]	train-rmse:11.2367	test-rmse:11.2383
[23]	train-rmse:11.1268	test-rmse:11.1286
[24]	

[194]	train-rmse:2.43186	test-rmse:2.51909
[195]	train-rmse:2.41498	test-rmse:2.50326
[196]	train-rmse:2.39838	test-rmse:2.48753
[197]	train-rmse:2.38187	test-rmse:2.47206
[198]	train-rmse:2.36557	test-rmse:2.45683
[199]	train-rmse:2.34944	test-rmse:2.44181
[200]	train-rmse:2.33357	test-rmse:2.42691
[201]	train-rmse:2.31804	test-rmse:2.41228
[202]	train-rmse:2.30262	test-rmse:2.39776
[203]	train-rmse:2.28733	test-rmse:2.38349
[204]	train-rmse:2.27227	test-rmse:2.36942
[205]	train-rmse:2.25745	test-rmse:2.3556
[206]	train-rmse:2.24284	test-rmse:2.34198
[207]	train-rmse:2.22832	test-rmse:2.32844
[208]	train-rmse:2.21411	test-rmse:2.31516
[209]	train-rmse:2.19993	test-rmse:2.30209
[210]	train-rmse:2.1859	test-rmse:2.28914
[211]	train-rmse:2.17228	test-rmse:2.27652
[212]	train-rmse:2.15865	test-rmse:2.26387
[213]	train-rmse:2.14528	test-rmse:2.25142
[214]	train-rmse:2.132	test-rmse:2.2391
[215]	train-rmse:2.11887	test-rmse:2.22704
[216]	train-rmse:2.10598	test-rmse:2.21523
[217]	train-rmse

[386]	train-rmse:1.27015	test-rmse:1.51825
[387]	train-rmse:1.26899	test-rmse:1.51755
[388]	train-rmse:1.26785	test-rmse:1.51692
[389]	train-rmse:1.26648	test-rmse:1.51621
[390]	train-rmse:1.26531	test-rmse:1.51555
[391]	train-rmse:1.26416	test-rmse:1.51495
[392]	train-rmse:1.263	test-rmse:1.51431
[393]	train-rmse:1.26184	test-rmse:1.51371
[394]	train-rmse:1.26068	test-rmse:1.51312
[395]	train-rmse:1.25951	test-rmse:1.51249
[396]	train-rmse:1.2584	test-rmse:1.51188
[397]	train-rmse:1.25722	test-rmse:1.5113
[398]	train-rmse:1.25617	test-rmse:1.5108
[399]	train-rmse:1.25507	test-rmse:1.51026
[400]	train-rmse:1.25412	test-rmse:1.50975
[401]	train-rmse:1.25293	test-rmse:1.5092
[402]	train-rmse:1.2519	test-rmse:1.50868
[403]	train-rmse:1.25086	test-rmse:1.50816
[404]	train-rmse:1.24993	test-rmse:1.50764
[405]	train-rmse:1.24881	test-rmse:1.50709
[406]	train-rmse:1.24787	test-rmse:1.50663
[407]	train-rmse:1.24686	test-rmse:1.50617
[408]	train-rmse:1.2459	test-rmse:1.50566
[409]	train-rmse:1.

[578]	train-rmse:1.1483	test-rmse:1.47393
[579]	train-rmse:1.14796	test-rmse:1.47389
[580]	train-rmse:1.14749	test-rmse:1.47378
[581]	train-rmse:1.14714	test-rmse:1.47373
[582]	train-rmse:1.14672	test-rmse:1.47364
[583]	train-rmse:1.1464	test-rmse:1.47358
[584]	train-rmse:1.14604	test-rmse:1.47352
[585]	train-rmse:1.14571	test-rmse:1.47348
[586]	train-rmse:1.14533	test-rmse:1.4735
[587]	train-rmse:1.14495	test-rmse:1.47344
[588]	train-rmse:1.1446	test-rmse:1.4734
[589]	train-rmse:1.14419	test-rmse:1.47337
[590]	train-rmse:1.14389	test-rmse:1.47335
[591]	train-rmse:1.1436	test-rmse:1.47329
[592]	train-rmse:1.14325	test-rmse:1.47328
[593]	train-rmse:1.14285	test-rmse:1.47321
[594]	train-rmse:1.14254	test-rmse:1.47315
[595]	train-rmse:1.1423	test-rmse:1.47306
[596]	train-rmse:1.14197	test-rmse:1.47303
[597]	train-rmse:1.14152	test-rmse:1.47299
[598]	train-rmse:1.1411	test-rmse:1.47296
[599]	train-rmse:1.14066	test-rmse:1.4729
[600]	train-rmse:1.14035	test-rmse:1.47283
[601]	train-rmse:1.1

[770]	train-rmse:1.08474	test-rmse:1.46651
[771]	train-rmse:1.08435	test-rmse:1.46647
[772]	train-rmse:1.08407	test-rmse:1.46644
[773]	train-rmse:1.08389	test-rmse:1.46642
[774]	train-rmse:1.08363	test-rmse:1.46638
[775]	train-rmse:1.08327	test-rmse:1.46632
[776]	train-rmse:1.08292	test-rmse:1.46633
[777]	train-rmse:1.08264	test-rmse:1.46633
[778]	train-rmse:1.08234	test-rmse:1.46632
[779]	train-rmse:1.082	test-rmse:1.46627
[780]	train-rmse:1.08173	test-rmse:1.46629
[781]	train-rmse:1.08137	test-rmse:1.46622
[782]	train-rmse:1.08111	test-rmse:1.46621
[783]	train-rmse:1.08072	test-rmse:1.46621
[784]	train-rmse:1.08046	test-rmse:1.46619
[785]	train-rmse:1.08017	test-rmse:1.46617
[786]	train-rmse:1.07994	test-rmse:1.46613
[787]	train-rmse:1.07968	test-rmse:1.46612
[788]	train-rmse:1.07948	test-rmse:1.46609
[789]	train-rmse:1.07919	test-rmse:1.46613
[790]	train-rmse:1.07885	test-rmse:1.46606
[791]	train-rmse:1.07853	test-rmse:1.46603
[792]	train-rmse:1.07827	test-rmse:1.46601
[793]	train-r

149.77410578727722

In [20]:
model_xgboost['test-rmse-mean'].min()

1.4647958

In [None]:
import sys
sys.path.append('../common_routines/')

In [None]:
from relevant_functions import get_rel_cols

### Formalizing the code to a routine.

In [41]:
 def get_model_with_predictors_beyond_given_non_zero_threshold(percent_threshold=10, 
                                                              show_evaluation = True,
                                                              data=train):
    
    rel_cols = get_rel_cols(percent_threshold, data)

    ### Now, let us train gradient boosting model using xgboost

    X_COLUMNS = rel_cols
    Y_COLUMN = 'new_target'

    X = train[X_COLUMNS]
    Y = train[[Y_COLUMN]]

    xgb_complete_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)
    xgb_params = {'eta' : 0.01, 'eval_metric' : 'rmse'}

    model_xgboost = xgb.cv(params=xgb_params,
                           dtrain=xgb_complete_data,
                           num_boost_round=1000,
                           nfold=5,
                           early_stopping_rounds=5)

    return (len(rel_cols), model_xgboost)


In [22]:
PERCENTAGES_TO_CHECK = [35, 30, 25, 20, 15, 10, 5]
percentage_to_prediction_score = dict()
percentage_to_num_predictors = dict()

for percent in PERCENTAGES_TO_CHECK:
    (num_predictors, model_output) = \
        get_model_with_predictors_beyond_given_non_zero_threshold(percent_threshold=percent)
    percentage_to_prediction_score[percent] = model_output['test-rmse-mean'].min()
    percentage_to_num_predictors[percent] = num_predictors
    
    

In [23]:
percentage_to_prediction_score

{35: 1.63959,
 30: 1.4655642,
 25: 1.4655642,
 20: 1.467795,
 15: 1.4614864,
 10: 1.4647958,
 5: 1.4656516}

In [24]:
percentage_to_num_predictors

{35: 2, 30: 40, 25: 40, 20: 50, 15: 191, 10: 376, 5: 997}

### Let us drill down on a finer percent.

In [25]:
np.arange(11, 20, 1)

array([11, 12, 13, 14, 15, 16, 17, 18, 19])

In [26]:
for percent in np.arange(11, 20, 1):
    (num_predictors, model_output) = \
        get_model_with_predictors_beyond_given_non_zero_threshold(percent_threshold=percent)
    percentage_to_prediction_score[percent] = model_output['test-rmse-mean'].min()
    percentage_to_num_predictors[percent] = num_predictors


In [27]:
percentage_to_prediction_score

{35: 1.63959,
 30: 1.4655642,
 25: 1.4655642,
 20: 1.467795,
 15: 1.4614864,
 10: 1.4647958,
 5: 1.4656516,
 11: 1.463816,
 12: 1.4669808,
 13: 1.4644094,
 14: 1.4682358,
 16: 1.464707,
 17: 1.4535566000000002,
 18: 1.4639058,
 19: 1.4709160000000001}

In [28]:
percentage_to_num_predictors

{35: 2,
 30: 40,
 25: 40,
 20: 50,
 15: 191,
 10: 376,
 5: 997,
 11: 337,
 12: 316,
 13: 276,
 14: 244,
 16: 137,
 17: 93,
 18: 69,
 19: 58}

## Find the optimum percentage value and train the corresponding model over  complete data.

In [29]:
optim_percent = min(percentage_to_prediction_score, key=percentage_to_prediction_score.get)

In [30]:
optim_percent

17

In [31]:
(num_predictors, model_output) = \
    get_model_with_predictors_beyond_given_non_zero_threshold(percent_threshold=optim_percent)


In [32]:
model_output

Unnamed: 0,train-rmse-mean,train-rmse-std,test-rmse-mean,test-rmse-std
0,13.960452,0.015496,13.960341,0.062699
1,13.822960,0.015330,13.822884,0.062829
2,13.686862,0.015166,13.686893,0.062894
3,13.552143,0.015000,13.552225,0.063054
4,13.418788,0.014842,13.418927,0.063162
...,...,...,...,...
767,1.105753,0.012836,1.453671,0.029961
768,1.105526,0.012757,1.453624,0.029975
769,1.105135,0.012705,1.453569,0.029961
770,1.104931,0.012629,1.453570,0.029963


In [33]:
model_output.index.max()

771

In [34]:
model_over_complete_data = xgb.train(params=xgb_params,
                                     dtrain=xgb_complete_data,
                                     num_boost_round=model_output.index.max())

In [35]:
model_over_complete_data.best_ntree_limit 

771

## Generate predictions for test data

In [None]:
rel_cols = get_rel_cols(optim_percent, train)

In [None]:
xgb_test_data = xgb.DMatrix(test[rel_cols], feature_names=rel_cols)

In [None]:
test_predictions = np.exp(model_over_complete_data.predict(xgb_test_data)) - 1.0

In [None]:
test['target'] = test_predictions

In [None]:
test[['ID', 'target']].to_csv('submission_xgboost_sparse.csv', index=False)