## Summary

This notebook applies lasso regression to the problem of predicting customer value.

In [27]:
import time

import sys
sys.path.append('../../common_routines/')

from relevant_functions import (get_train_data,
                                get_test_data,
                                get_all_predictor_cols,
                                get_rel_cols,
                                fit_pipeline_and_cross_validate)

from sklearn.pipeline import Pipeline
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score

import numpy as np

In [2]:
# Relative path of the directory that the train/test data loaders read from.
INPUT_DIR = '../../input/'

In [3]:
# Load the training data; the cell's value is the elapsed load time in seconds.
load_started_at = time.time()
train = get_train_data(INPUT_DIR)
time.time() - load_started_at

5.154975891113281

### Construct a lasso pipeline

Let us construct a pipeline that standardizes the predictors and then fits a lasso model, and use it for cross-validation.

In [4]:
def get_lasso_pipeline(alpha, max_iter=1000):
    """Build an unfitted standardize-then-lasso pipeline.

    Parameters
    ----------
    alpha : float
        Regularization strength passed to ``Lasso``.
    max_iter : int, default 1000
        Iteration cap passed to ``Lasso``. The default matches the value
        scikit-learn reports for a ``Lasso`` built without this argument, so
        existing callers get the same model. Raise it when a fit emits a
        ConvergenceWarning (typically at small ``alpha``).

    Returns
    -------
    sklearn.pipeline.Pipeline
        Two steps: ``'standardize'`` (``StandardScaler``) followed by
        ``'lasso'`` (``Lasso``).
    """
    return Pipeline([('standardize', StandardScaler()),
                     ('lasso', Lasso(alpha=alpha, max_iter=max_iter))])

In [5]:
# Column labels used as predictors, as returned by the shared helper for the training frame.
all_predictor_cols = get_all_predictor_cols(train)

In [30]:
def compute_lasso_cross_val_scores(X, Y, alpha):
    """Return the 5-fold cross-validated RMSE of the lasso pipeline.

    Each fold is scored with negated mean squared error. The fold scores are
    averaged, the sign is flipped back, and the square root of that mean is
    returned as a single number.
    """
    pipeline = get_lasso_pipeline(alpha)
    neg_mse_per_fold = cross_val_score(pipeline, X, Y,
                                       scoring='neg_mean_squared_error',
                                       cv=5)
    mean_mse = -neg_mse_per_fold.mean()
    return np.sqrt(mean_mse)


In [41]:
# Design matrix: the original (untransformed) predictor columns.
X = train[all_predictor_cols]
# Target is selected with a list of one label, so Y is a single-column DataFrame.
Y = train[['log_target']]

In [42]:
# Maps each tried alpha to its cross-validation score on the original predictors.
alpha_to_cross_val_score = {}

In [43]:
# Coarse probe: range(1, 2) yields only the integer 1, so this scores alpha = 1.
alpha_to_cross_val_score.update(
    {alpha_int: compute_lasso_cross_val_scores(X, Y, alpha=alpha_int)
     for alpha_int in range(1, 2)}
)

In [44]:
# Score one more alpha and record the result under that alpha.
alpha_val = 0.9
score_at_alpha = compute_lasso_cross_val_scores(X, Y, alpha=alpha_val)
alpha_to_cross_val_score[alpha_val] = score_at_alpha

In [45]:
# Score one more alpha and record the result under that alpha.
alpha_val = 0.8
score_at_alpha = compute_lasso_cross_val_scores(X, Y, alpha=alpha_val)
alpha_to_cross_val_score[alpha_val] = score_at_alpha

In [46]:
# Display the scores collected so far, keyed by alpha.
alpha_to_cross_val_score

{1: 1.7514306049469586, 0.9: 1.7514306049469586, 0.8: 1.7514306049469586}

In [47]:
# Fine sweep over alpha = 0.01, 0.02, ..., 0.20; the cell's value is the elapsed time in seconds.
sweep_started_at = time.time()
fine_alphas = [num * 0.01 for num in range(1, 21)]
for alpha_val in fine_alphas:
    alpha_to_cross_val_score[alpha_val] = compute_lasso_cross_val_scores(X, Y, alpha=alpha_val)
time.time() - sweep_started_at

  positive)


184.7812840938568

In [48]:
# Display all scores on the original predictors, including the fine sweep.
alpha_to_cross_val_score

{1: 1.7514306049469586,
 0.9: 1.7514306049469586,
 0.8: 1.7514306049469586,
 0.01: 784.8663261220631,
 0.02: 574.591286309348,
 0.03: 388.2168158058309,
 0.04: 204.68925146617278,
 0.05: 52.55413038151087,
 0.06: 1.8679788109110822,
 0.07: 1.712354970192694,
 0.08: 1.6769576437419382,
 0.09: 1.6798809949740479,
 0.1: 1.683298721314856,
 0.11: 1.6870429817463448,
 0.12: 1.6907825438745814,
 0.13: 1.6939645937925927,
 0.14: 1.696822668124505,
 0.15: 1.6992788482682242,
 0.16: 1.7014638774568045,
 0.17: 1.7034399969174876,
 0.18: 1.7055648349841093,
 0.19: 1.7078122663509916,
 0.2: 1.7102115401579214}

### Running the same experiment on log transformed predictors

Let us repeat the same procedure, this time on log transformed predictors as opposed to the original ones.

In [49]:
# Work on a copy so that the extra columns added below do not end up in `train`.
train_new = train.copy()

In [50]:
# Add a log(1 + x) version of every predictor under a 'log_' prefix.
# np.log1p is the numerically accurate form of np.log(x + 1.0) when x is small.
for col in all_predictor_cols:
    train_new['log_' + col] = np.log1p(train_new[col])

In [51]:
# Names of the log-transformed columns: each predictor label with a 'log_' prefix.
all_log_predictor_cols = ['log_' + col for col in all_predictor_cols ]

In [73]:
# Maps each tried alpha to its cross-validation score on the log-transformed predictors.
alpha_to_log_cross_val_score = {}

In [78]:
# Re-point X at the log-transformed predictor columns; Y is read from the same copied frame.
# NOTE(review): this cell's execution count (78) is higher than that of the cells
# that follow it (74-77), so their recorded outputs may have been produced before
# this reassignment took effect. Confirm by re-running the notebook in order.
X = train_new[all_log_predictor_cols]
Y = train_new[['log_target']]

In [74]:
# Score a single alpha and record the result under that alpha.
alpha_val = 1.0
score_at_alpha = compute_lasso_cross_val_scores(X, Y, alpha=alpha_val)
alpha_to_log_cross_val_score[alpha_val] = score_at_alpha

In [75]:
# Display the scores collected so far for the log-transformed predictors.
alpha_to_log_cross_val_score

{1.0: 1.7514306049469586}

In [76]:
# Score a single alpha and record the result under that alpha.
alpha_val = 0.5
score_at_alpha = compute_lasso_cross_val_scores(X, Y, alpha=alpha_val)
alpha_to_log_cross_val_score[alpha_val] = score_at_alpha

In [77]:
# Display the scores collected so far for the log-transformed predictors.
alpha_to_log_cross_val_score

{1.0: 1.7514306049469586, 0.5: 1.7514306049469586}

In [79]:
# Score a single alpha and record the result under that alpha.
alpha_val = 0.1
score_at_alpha = compute_lasso_cross_val_scores(X, Y, alpha=alpha_val)
alpha_to_log_cross_val_score[alpha_val] = score_at_alpha

In [60]:
# Display the scores collected so far for the log-transformed predictors.
# NOTE(review): this cell's execution count (60) is lower than its neighbours'
# (79, 82), so the output recorded under it may be stale. Re-run to confirm.
alpha_to_log_cross_val_score

{1.0: 1.7514306049469586, 0.5: 1.7514306049469586, 0.1: 1.683298721314856}

In [82]:
# Fine sweep over alpha = 0.01, 0.02, ..., 0.09; the cell's value is the elapsed time in seconds.
sweep_started_at = time.time()
for alpha_val in (step * 0.01 for step in range(1, 10)):
    alpha_to_log_cross_val_score[alpha_val] = compute_lasso_cross_val_scores(X, Y, alpha=alpha_val)
time.time() - sweep_started_at

  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


219.24669003486633

In [83]:
# Display all scores for the log-transformed predictors, including the fine sweep.
alpha_to_log_cross_val_score

{1.0: 1.7514306049469586,
 0.5: 1.7514306049469586,
 0.1: 1.716458653632676,
 0.01: 1.9019334658311804,
 0.02: 1.737851454379462,
 0.03: 1.6931295778618805,
 0.04: 1.6822058995410223,
 0.05: 1.6848812744533253,
 0.06: 1.69098874902219,
 0.07: 1.697134497509347,
 0.08: 1.7039296691328047,
 0.09: 1.710245272412486}

### Conclusion

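Log-transforming the predictors does not appear to give any benefit: the best cross-validated RMSE is about 1.682 (alpha = 0.04) with log-transformed predictors, versus about 1.677 (alpha = 0.08) with the original ones. Hence, we proceed with the best model obtained using the original predictors.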

In [84]:
# Load the test data; the cell's value is the elapsed load time in seconds.
load_started_at = time.time()
test = get_test_data(INPUT_DIR)
time.time() - load_started_at

68.43389010429382

In [85]:
# Pick the alpha with the lowest cross-validated score on the original
# predictors instead of hard-coding it, so the final model always follows the
# sweep results gathered above. min() raises ValueError if no scores were recorded.
best_alpha = min(alpha_to_cross_val_score, key=alpha_to_cross_val_score.get)
my_pipe = get_lasso_pipeline(alpha=best_alpha)
# X and Y were re-pointed at the log-transformed frame earlier in the notebook;
# reset them to the original predictors before the final fit.
X = train[all_predictor_cols]
Y = train[['log_target']]
my_pipe.fit(X, Y)

Pipeline(memory=None,
         steps=[('standardize',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=0.08, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [86]:
new_X = test[all_predictor_cols]
# Floor negative predictions at 0 so that the back-transformed target is never negative.
test_log_predictions = np.maximum(my_pipe.predict(new_X), 0)

# Invert the log scale. np.expm1(x) is the numerically accurate form of
# np.exp(x) - 1.0 for small x.
test['target'] = np.expm1(test_log_predictions)

In [87]:
# Write the ID and predicted target columns to the submission file, without the index column.
test[['ID', 'target']].to_csv('submission_lasso.csv', index=False)