## Summary

This notebook applies the gradient boosting technique from the scikit-learn package to the problem of predicting customer value.

In [16]:
import time as time

import sys
sys.path.append('../../common_routines/')

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from relevant_functions import (get_train_data,
                                get_test_data,
                                get_rel_cols,
                                get_all_predictor_cols)

import numpy as np

In [5]:
# Relative path handed to the project's data-loading helpers.
INPUT_DIR = "../../input/"

In [6]:
# Read the training data via the project helper; the last expression is the
# elapsed wall-clock time in seconds, shown as the cell output.
load_started_at = time.time()
train = get_train_data(INPUT_DIR)
time.time() - load_started_at

5.279732942581177

### Model using all available predictors

Let us build a model using all predictors and see how well it performs.

In [7]:
# Predictor selection is delegated to the project helper. The result is used
# later in this notebook to index both the training and the test frame.
all_predictor_cols = get_all_predictor_cols(train)

In [12]:
# Response: the 'log_target' column flattened to a 1-D array, as expected by
# the scikit-learn estimators used below.
Y = np.ravel(train[['log_target']].to_numpy())
# Design matrix: every predictor column of the training frame.
X = train[all_predictor_cols]

In [13]:
# 5-fold cross-validation of a 100-tree model with learning rate 0.1.
# Scores are negated MSEs; the cell output is the elapsed time in seconds.
cv_started_at = time.time()
gbr_candidate = GradientBoostingRegressor(n_estimators=100,
                                          learning_rate=0.1,
                                          random_state=0)
cross_val_scores = cross_val_score(gbr_candidate, X, Y,
                                   scoring='neg_mean_squared_error', cv=5)
time.time() - cv_started_at

147.63075709342957

In [18]:
# cross_val_score returns negated MSEs: flip the sign once, then report the
# per-fold RMSE followed by the RMSE of the mean MSE across folds.
fold_mse = -cross_val_scores
print(np.sqrt(fold_mse))
print(np.sqrt(fold_mse.mean()))

[1.46833663 1.53454225 1.44316053 1.48606831 1.58876612]
1.5050662104157906


In [19]:
# 5-fold cross-validation of a 200-tree model with learning rate 0.1.
# Scores are negated MSEs; the cell output is the elapsed time in seconds.
cv_started_at = time.time()
gbr_candidate = GradientBoostingRegressor(n_estimators=200,
                                          learning_rate=0.1,
                                          random_state=0)
cross_val_scores = cross_val_score(gbr_candidate, X, Y,
                                   scoring='neg_mean_squared_error', cv=5)
time.time() - cv_started_at

268.5531258583069

In [20]:
# cross_val_score returns negated MSEs: flip the sign once, then report the
# per-fold RMSE followed by the RMSE of the mean MSE across folds.
fold_mse = -cross_val_scores
print(np.sqrt(fold_mse))
print(np.sqrt(fold_mse.mean()))

[1.4690199  1.52943539 1.43917139 1.48067777 1.58690963]
1.5019384249554357


In [21]:
# 5-fold cross-validation of a 200-tree model with a smaller learning rate
# (0.01). Scores are negated MSEs; the cell output is the elapsed seconds.
cv_started_at = time.time()
gbr_candidate = GradientBoostingRegressor(n_estimators=200,
                                          learning_rate=0.01,
                                          random_state=0)
cross_val_scores = cross_val_score(gbr_candidate, X, Y,
                                   scoring='neg_mean_squared_error', cv=5)
time.time() - cv_started_at

380.2964279651642

In [22]:
# cross_val_score returns negated MSEs: flip the sign once, then report the
# per-fold RMSE followed by the RMSE of the mean MSE across folds.
fold_mse = -cross_val_scores
print(np.sqrt(fold_mse))
print(np.sqrt(fold_mse.mean()))

[1.52914944 1.59711899 1.52405807 1.55364761 1.68480377]
1.5788745799930106


In [23]:
# 5-fold cross-validation of a 300-tree model with learning rate 0.1.
# Scores are negated MSEs; the cell output is the elapsed time in seconds.
cv_started_at = time.time()
gbr_candidate = GradientBoostingRegressor(n_estimators=300,
                                          learning_rate=0.1,
                                          random_state=0)
cross_val_scores = cross_val_score(gbr_candidate, X, Y,
                                   scoring='neg_mean_squared_error', cv=5)
time.time() - cv_started_at

399.2518820762634

In [24]:
# cross_val_score returns negated MSEs: flip the sign once, then report the
# per-fold RMSE followed by the RMSE of the mean MSE across folds.
fold_mse = -cross_val_scores
print(np.sqrt(fold_mse))
print(np.sqrt(fold_mse.mean()))

[1.47522469 1.53654334 1.4428019  1.48449103 1.58492203]
1.505630905692091


### Conclusion

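We do not see much of an improvement from increasing the number of estimators: with a learning rate of 0.1 the cross-validated RMSE stays at about 1.50 for 100, 200 and 300 estimators, while lowering the learning rate to 0.01 (with 200 estimators) makes it worse (about 1.58). Hence, let us fit the model over the entire training data and generate predictions over the test set.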

In [27]:
# Fit the final model on the full training set; the cell output is the
# elapsed wall-clock time in seconds.
ts = time.time()
# Rebuild X and Y here so this cell does not depend on whatever an earlier
# cell last left in these names.
X = train[all_predictor_cols]
Y = train[['log_target']].values.ravel()
# Use the configuration with the lowest cross-validated RMSE recorded above:
# learning_rate=0.1 with n_estimators=200 scored 1.5019, whereas 300
# estimators scored marginally worse (1.5056) and took longer to train.
my_model = GradientBoostingRegressor(random_state=0, learning_rate=0.1, n_estimators=200)
my_model.fit(X, Y)
time.time() - ts

95.1397271156311

In [28]:
# Read the test data via the project helper; the last expression is the
# elapsed wall-clock time in seconds, shown as the cell output.
load_started_at = time.time()
test = get_test_data(INPUT_DIR)
time.time() - load_started_at

73.24436402320862

In [29]:
# Predict on the test set and map the predictions back from the log scale;
# the cell output is the elapsed wall-clock time in seconds.
ts = time.time()
new_X = test[all_predictor_cols]
# Negative log-scale predictions would back-transform to negative targets,
# so floor them at zero. np.clip returns a new array instead of masking the
# prediction buffer in place.
test_log_predictions = np.clip(my_model.predict(new_X), 0, None)
# expm1(x) is exp(x) - 1 evaluated without the cancellation error that the
# explicit subtraction suffers for predictions close to zero.
test['target'] = np.expm1(test_log_predictions)
time.time() - ts

7.9863831996917725

In [30]:
# Write only the identifier and the predicted target, without the row index.
submission_columns = ['ID', 'target']
submission = test[submission_columns]
submission.to_csv('submission_gradient_boosting_sklearn.csv', index=False)