### Summary

The idea here is to apply xgboost techniques on those predictors that are selected by lasso method. This way, we combine dimensionality reduction as well as non parametric techniques.

In [1]:
import pandas as pd
import numpy as np
import time as time
import xgboost as xgb

In [2]:
import sys
sys.path.append('../common_routines/')
from relevant_functions import fit_pipeline_and_cross_validate

In [3]:
INPUT_DIR = '../input/'

In [4]:
ts = time.time()
train = pd.read_csv(INPUT_DIR + 'train.csv')
time.time() - ts

5.350039958953857

In [5]:
ts = time.time()
test = pd.read_csv(INPUT_DIR + 'test.csv')
time.time() - ts

70.34377908706665

In [6]:
train['new_target'] = np.log(train['target'] + 1.0)

# Lasso section
In this section we construct a lasso model.

#### A pipeline for impelmenting lasso model with normalization.

In [8]:
def get_lasso_pipe_with_scaling(alpha=2000):
    my_pipe = make_pipeline(StandardScaler(),linear_model.Lasso(alpha=alpha))
    return my_pipe

In [9]:
X_COLUMNS = [col for col in train.columns if col not in ['ID', 'target', 'new_target']]
Y_COLUMN = 'new_target'

In [10]:
# We are already aware of the data conversion to float and hence we suppress warnings regarding the same.
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [58]:
alpha_to_cross_val_score = dict()

In [59]:
alpha_to_corr_pipe = dict()

In [60]:
ALPHAS = np.concatenate([np.arange(0.07, 0.1, 0.01), np.arange(0.1, 0.4, 0.05)])

In [61]:
ts = time.time()
for alpha_val in ALPHAS:
    (my_pipe, cross_val_score1) = fit_pipeline_and_cross_validate(
        get_lasso_pipe_with_scaling(alpha=alpha_val), 
        train, 
        X_COLUMNS,
        'new_target')
    alpha_to_cross_val_score[alpha_val] = cross_val_score1
    alpha_to_corr_pipe[alpha_val] = my_pipe
time.time() -ts

85.30825328826904

In [62]:
alpha_to_cross_val_score

{0.07: 1.7103005886854181,
 0.08: 1.6763484889195612,
 0.09: 1.6792880196613027,
 0.1: 1.6826951971543447,
 0.15000000000000002: 1.6987425429799081,
 0.20000000000000004: 1.7097112765431706,
 0.25000000000000006: 1.7234586102207547,
 0.30000000000000004: 1.7384425526123928,
 0.3500000000000001: 1.7487191694414956,
 0.40000000000000013: 1.7499973439733552}

In [63]:
optim_alpha = min(alpha_to_cross_val_score, key=alpha_to_cross_val_score.get)

In [64]:
optim_alpha

0.08

In [66]:
optim_lass_model = alpha_to_corr_pipe.get(optim_alpha)

In [67]:
optim_lass_model.named_steps['lasso'].coef_!=0


array([False, False, False, ..., False, False, False])

In [68]:
np.sum(optim_lass_model.named_steps['lasso'].coef_!=0)


74

In [69]:
np.nonzero(optim_lass_model.named_steps['lasso'].coef_)[0]

array([   8,  118,  168,  225,  424,  435,  493,  651,  693,  787,  878,
       1031, 1044, 1142, 1276, 1530, 1652, 1681, 1699, 1836, 1887, 1894,
       1916, 1942, 1990, 2037, 2166, 2235, 2248, 2424, 2435, 2500, 2594,
       2755, 2757, 2888, 3023, 3029, 3066, 3086, 3094, 3183, 3217, 3323,
       3358, 3397, 3428, 3471, 3660, 3664, 3753, 3771, 3836, 3866, 3928,
       4066, 4084, 4091, 4103, 4138, 4243, 4268, 4307, 4358, 4414, 4434,
       4466, 4507, 4552, 4572, 4581, 4656, 4660, 4818])

# Xgboost section
In this section we construct an xgboost model on the predictors obtained using lasso regression.

In [70]:
X_COLUMNS = [X_COLUMNS[col] for col in np.nonzero(optim_lass_model.named_steps['lasso'].coef_)[0]]
Y_COLUMN = 'new_target'

In [71]:
X = train[X_COLUMNS].values
Y = train[[Y_COLUMN]].values

In [72]:
np.shape(X)

(4459, 74)

In [73]:
xgb_complete_data = xgb.DMatrix(X, Y, feature_names=X_COLUMNS)

In [74]:
xgb_params = {'eta':0.01, 'eval_metric':'rmse'}

In [75]:
ts = time.time()
model_xgboost = xgb.cv(params=xgb_params,
                       dtrain=xgb_complete_data,
                       num_boost_round=1000,
                       nfold=5,
                       early_stopping_rounds=5,
                       callbacks=[xgb.callback.print_evaluation(show_stdv=False)]) 
time.time() - ts

[0]	train-rmse:13.9605	test-rmse:13.9604
[1]	train-rmse:13.823	test-rmse:13.823
[2]	train-rmse:13.6869	test-rmse:13.687
[3]	train-rmse:13.5522	test-rmse:13.5524
[4]	train-rmse:13.4189	test-rmse:13.4191
[5]	train-rmse:13.2869	test-rmse:13.2872
[6]	train-rmse:13.1562	test-rmse:13.1566
[7]	train-rmse:13.0269	test-rmse:13.0274
[8]	train-rmse:12.8989	test-rmse:12.8994
[9]	train-rmse:12.7721	test-rmse:12.7728
[10]	train-rmse:12.6467	test-rmse:12.6474
[11]	train-rmse:12.5225	test-rmse:12.5233
[12]	train-rmse:12.3996	test-rmse:12.4004
[13]	train-rmse:12.2779	test-rmse:12.2789
[14]	train-rmse:12.1574	test-rmse:12.1585
[15]	train-rmse:12.0382	test-rmse:12.0393
[16]	train-rmse:11.9202	test-rmse:11.9215
[17]	train-rmse:11.8034	test-rmse:11.8047
[18]	train-rmse:11.6877	test-rmse:11.6891
[19]	train-rmse:11.5733	test-rmse:11.5748
[20]	train-rmse:11.46	test-rmse:11.4616
[21]	train-rmse:11.3478	test-rmse:11.3496
[22]	train-rmse:11.2368	test-rmse:11.2386
[23]	train-rmse:11.127	test-rmse:11.1289
[24]	tra

[195]	train-rmse:2.43551	test-rmse:2.49705
[196]	train-rmse:2.41925	test-rmse:2.48144
[197]	train-rmse:2.40313	test-rmse:2.46594
[198]	train-rmse:2.38711	test-rmse:2.45065
[199]	train-rmse:2.37137	test-rmse:2.43561
[200]	train-rmse:2.35575	test-rmse:2.42072
[201]	train-rmse:2.34045	test-rmse:2.40611
[202]	train-rmse:2.32528	test-rmse:2.39169
[203]	train-rmse:2.31028	test-rmse:2.37736
[204]	train-rmse:2.29559	test-rmse:2.36334
[205]	train-rmse:2.28102	test-rmse:2.34942
[206]	train-rmse:2.26671	test-rmse:2.33578
[207]	train-rmse:2.25249	test-rmse:2.32223
[208]	train-rmse:2.23848	test-rmse:2.30893
[209]	train-rmse:2.22462	test-rmse:2.29577
[210]	train-rmse:2.21099	test-rmse:2.28288
[211]	train-rmse:2.19753	test-rmse:2.27016
[212]	train-rmse:2.18416	test-rmse:2.25753
[213]	train-rmse:2.17115	test-rmse:2.2452
[214]	train-rmse:2.15829	test-rmse:2.23308
[215]	train-rmse:2.14548	test-rmse:2.22095
[216]	train-rmse:2.13285	test-rmse:2.20904
[217]	train-rmse:2.12048	test-rmse:2.19735
[218]	train-

[387]	train-rmse:1.33535	test-rmse:1.51598
[388]	train-rmse:1.33425	test-rmse:1.51538
[389]	train-rmse:1.33315	test-rmse:1.5147
[390]	train-rmse:1.33215	test-rmse:1.51402
[391]	train-rmse:1.33111	test-rmse:1.51334
[392]	train-rmse:1.32996	test-rmse:1.51265
[393]	train-rmse:1.32902	test-rmse:1.51205
[394]	train-rmse:1.32817	test-rmse:1.51149
[395]	train-rmse:1.32716	test-rmse:1.51087
[396]	train-rmse:1.32616	test-rmse:1.51026
[397]	train-rmse:1.32514	test-rmse:1.5096
[398]	train-rmse:1.32414	test-rmse:1.50894
[399]	train-rmse:1.32318	test-rmse:1.5083
[400]	train-rmse:1.32231	test-rmse:1.50775
[401]	train-rmse:1.32138	test-rmse:1.50717
[402]	train-rmse:1.32047	test-rmse:1.50667
[403]	train-rmse:1.31959	test-rmse:1.50617
[404]	train-rmse:1.31876	test-rmse:1.50572
[405]	train-rmse:1.31786	test-rmse:1.50519
[406]	train-rmse:1.31702	test-rmse:1.50469
[407]	train-rmse:1.31615	test-rmse:1.5042
[408]	train-rmse:1.31533	test-rmse:1.5037
[409]	train-rmse:1.31454	test-rmse:1.50322
[410]	train-rmse

[579]	train-rmse:1.23026	test-rmse:1.47065
[580]	train-rmse:1.22992	test-rmse:1.47053
[581]	train-rmse:1.22969	test-rmse:1.47051
[582]	train-rmse:1.22931	test-rmse:1.47047
[583]	train-rmse:1.22897	test-rmse:1.47038
[584]	train-rmse:1.22873	test-rmse:1.47032
[585]	train-rmse:1.22836	test-rmse:1.47025
[586]	train-rmse:1.22819	test-rmse:1.47019
[587]	train-rmse:1.22779	test-rmse:1.47015
[588]	train-rmse:1.22751	test-rmse:1.47005
[589]	train-rmse:1.22722	test-rmse:1.47001
[590]	train-rmse:1.22697	test-rmse:1.46996
[591]	train-rmse:1.22672	test-rmse:1.46989
[592]	train-rmse:1.22627	test-rmse:1.46985
[593]	train-rmse:1.22599	test-rmse:1.46978
[594]	train-rmse:1.22559	test-rmse:1.46972
[595]	train-rmse:1.22527	test-rmse:1.46966
[596]	train-rmse:1.22497	test-rmse:1.46962
[597]	train-rmse:1.22473	test-rmse:1.46958
[598]	train-rmse:1.2244	test-rmse:1.46952
[599]	train-rmse:1.22413	test-rmse:1.46946
[600]	train-rmse:1.22385	test-rmse:1.46935
[601]	train-rmse:1.22366	test-rmse:1.46931
[602]	train-

[771]	train-rmse:1.1845	test-rmse:1.46262
[772]	train-rmse:1.18435	test-rmse:1.46259
[773]	train-rmse:1.1842	test-rmse:1.4626
[774]	train-rmse:1.18404	test-rmse:1.46258
[775]	train-rmse:1.18391	test-rmse:1.46258
[776]	train-rmse:1.18371	test-rmse:1.46254
[777]	train-rmse:1.18356	test-rmse:1.4625
[778]	train-rmse:1.18329	test-rmse:1.46249
[779]	train-rmse:1.18306	test-rmse:1.46245
[780]	train-rmse:1.18291	test-rmse:1.46238
[781]	train-rmse:1.18272	test-rmse:1.46238
[782]	train-rmse:1.18251	test-rmse:1.46236
[783]	train-rmse:1.18235	test-rmse:1.46232
[784]	train-rmse:1.18214	test-rmse:1.46228
[785]	train-rmse:1.18201	test-rmse:1.46228
[786]	train-rmse:1.18181	test-rmse:1.46222
[787]	train-rmse:1.18163	test-rmse:1.46218
[788]	train-rmse:1.18146	test-rmse:1.46212
[789]	train-rmse:1.18123	test-rmse:1.4621
[790]	train-rmse:1.1811	test-rmse:1.46211
[791]	train-rmse:1.18098	test-rmse:1.46211
[792]	train-rmse:1.18081	test-rmse:1.46211
[793]	train-rmse:1.1806	test-rmse:1.4621
[794]	train-rmse:1.

36.52139115333557

In [77]:
model_xgboost[model_xgboost['test-rmse-mean'] == model_xgboost['test-rmse-mean'].min()]['test-rmse-mean']

840    1.460938
Name: test-rmse-mean, dtype: float64

In [80]:
model_xgboost[model_xgboost['test-rmse-mean'] == model_xgboost['test-rmse-mean'].min()].index[0]

840

### The score looks to have increased considerably. Let us generate predictions for the test data.

In [81]:
model_full_train_data = xgb.train(params=xgb_params,
                                  dtrain=xgb_complete_data,
                                  num_boost_round=model_xgboost[model_xgboost['test-rmse-mean'] == model_xgboost['test-rmse-mean'].min()].index[0])

In [82]:
model_full_train_data.best_ntree_limit

840

In [85]:
xgb_test_data = xgb.DMatrix(test[X_COLUMNS], feature_names=X_COLUMNS)

In [87]:
test_predictions = np.exp(model_full_train_data.predict(xgb_test_data)) -1.0

In [89]:
test['target'] = test_predictions

In [90]:
test[['ID', 'target']].to_csv('submission_lasso_xgboost.csv', index=False)