#### Summary 

The idea here is to exploit some of the work done in earlier projects to obtain an optimal lasso model for this problem.

In [3]:
# Required imports
import pandas as pd
import numpy as np
import time as time

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Imputer
from sklearn import linear_model
from sklearn.preprocessing import StandardScaler


###### We define many model evaluation helper functions here (almost all of them are self-explanatory).
###### They have been borrowed from the relevant_functions module, which was written for the housing prediction problem (https://www.kaggle.com/c/house-prices-advanced-regression-techniques)

In [4]:
def evaluate_model_score(my_model, X, Y):
    """Score a fitted model against known targets.

    Calls ``my_model.predict(X)`` and hands the predictions together with ``Y``
    to ``evaluate_model_score_given_predictions``, returning that result.

    The ``(estimator, X, y)`` signature is what allows this function to be
    passed as the ``scoring`` callable to ``cross_val_score`` in this file.
    """
    return evaluate_model_score_given_predictions(my_model.predict(X), Y)

def make_predictions(my_model, X):
    """Return whatever ``my_model.predict(X)`` produces, unchanged."""
    return my_model.predict(X)

def evaluate_model_score_given_predictions(predictions, Y):
    """Root mean squared error between log|Y + 1| and log|predictions + 1|.

    Both inputs are shifted by one, passed through ``np.abs`` (so a negative
    shifted value does not reach ``np.log``), log-transformed, and compared
    with ``mean_squared_error``; the square root of that value is returned.
    Lower is better.
    """
    log_actual = np.log(np.abs(Y + 1))
    log_predicted = np.log(np.abs(predictions + 1))
    return np.sqrt(mean_squared_error(log_actual, log_predicted))

def evaluate_neg_model_score(my_model, X, Y):
    """Return the negation of ``evaluate_model_score(my_model, X, Y)``.

    Negating the error turns it into a "higher is better" quantity.
    """
    return -evaluate_model_score(my_model, X, Y)


def cross_val_scores_given_model(my_model, X, Y, cv=5):
    """Return the per-fold scores from ``cross_val_score``.

    ``evaluate_model_score`` is used as the scoring callable, so each entry of
    the returned array is that function's value on one held-out fold.

    Parameters
    ----------
    my_model : estimator passed straight to ``cross_val_score``.
    X, Y : features and targets passed straight to ``cross_val_score``.
    cv : forwarded as ``cross_val_score``'s ``cv`` argument (default 5).
    """
    return cross_val_score(my_model, X, Y, scoring=evaluate_model_score, cv=cv)

def cross_val_score_given_model(my_model, X, Y, cv=5):
    """Return the mean of the per-fold cross-validation scores.

    Delegates to ``cross_val_scores_given_model`` so the scoring callable and
    the ``cross_val_score`` call are defined in exactly one place, then
    averages the resulting array with ``.mean()``.

    Parameters
    ----------
    my_model : estimator to cross-validate.
    X, Y : features and targets.
    cv : forwarded to ``cross_val_scores_given_model`` (default 5).

    Returns
    -------
    The mean of the per-fold ``evaluate_model_score`` values (lower is better,
    since that score is an error measure).
    """
    return cross_val_scores_given_model(my_model, X, Y, cv=cv).mean()

def fit_pipeline_and_cross_validate(my_pipeline,
                                    train_data,
                                    X_columns,
                                    Y_column='target'):
    """Fit ``my_pipeline`` on the training frame and cross-validate it.

    Parameters
    ----------
    my_pipeline : object with ``fit``; it is fitted in place on all rows.
    train_data : frame indexed by column name (``train_data[X_columns]``).
    X_columns : columns used as features.
    Y_column : name of the target column (default ``'target'``).

    Returns
    -------
    tuple ``(my_pipeline, mean_cv_score)`` where ``mean_cv_score`` is the value
    of ``cross_val_score_given_model`` on the same features and target.
    """
    features = train_data[X_columns]
    # Select as a one-column frame, then flatten to a 1-D array of targets.
    target = train_data[[Y_column]].values.ravel()
    my_pipeline.fit(features, target)
    mean_cv_score = cross_val_score_given_model(my_pipeline, features, target)
    return (my_pipeline, mean_cv_score)


In [5]:
# Directory (relative path) from which train.csv and test.csv are read.
INPUT_DIR = '../input/'

In [6]:
# Load the training set; the cell's displayed value is the load time in seconds.
ts = time.time()
train_data = pd.read_csv(INPUT_DIR + 'train.csv')
time.time() - ts

5.071911096572876

In [7]:
# Load the test set; the cell's displayed value is the load time in seconds.
ts = time.time()
test_data = pd.read_csv(INPUT_DIR + 'test.csv')
time.time() - ts

68.35545420646667

#### A pipeline for implementing a lasso model with normalization.

In [8]:
def get_lasso_pipe_with_scaling(alpha=2000):
    """Build an unfitted pipeline: ``StandardScaler`` followed by ``Lasso``.

    Parameters
    ----------
    alpha : value passed as ``Lasso``'s ``alpha`` argument (default 2000).
    """
    scaler = StandardScaler()
    lasso = linear_model.Lasso(alpha=alpha)
    return make_pipeline(scaler, lasso)

In [9]:
# Name of the column the models are fitted to predict.
Y_COLUMN = 'target'
# Every training column other than 'ID' and the target is used as a feature.
NON_FEATURE_COLUMNS = ('ID', Y_COLUMN)
X_COLUMNS = [col for col in train_data.columns if col not in NON_FEATURE_COLUMNS]

In [10]:
# The conversion of the data to float is expected here, so the
# DataConversionWarning it triggers is silenced via the global warnings filter.
# NOTE(review): these imports would conventionally sit in the top import cell.
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings(action='ignore', category=DataConversionWarning)

In [15]:
# Candidate regularisation strengths: a coarse grid (100k..1.4M in steps of
# 100k) followed by a finer grid (50k..90k in steps of 10k).
COARSE_ALPHAS = np.arange(100000, 1500000, 100000)
FINE_ALPHAS = np.arange(50000, 100000, 10000)
ALPHAS = np.concatenate([COARSE_ALPHAS, FINE_ALPHAS])

In [33]:
# Maps each alpha tried to its mean cross-validation score; filled by the sweep.
alpha_to_cross_val_score = {}

In [34]:
# Sweep every candidate alpha and record its mean cross-validation score.
#
# The feature/target slices do not change between iterations, so they are
# taken once up front. Each pipeline is only cross-validated:
# cross_val_score fits clones of the estimator on every fold, so fitting the
# pipeline on the full training set first does not influence the score and
# the fitted object was not used afterwards.
#
# The cell's displayed value is the elapsed time of the sweep in seconds.
ts = time.time()
sweep_X = train_data[X_COLUMNS]
sweep_Y = train_data[[Y_COLUMN]].values.ravel()
for alpha_val in ALPHAS:
    alpha_to_cross_val_score[alpha_val] = cross_val_score_given_model(
        get_lasso_pipe_with_scaling(alpha=alpha_val),
        sweep_X,
        sweep_Y)
time.time() - ts

360.959095954895

In [35]:
# Show the mean cross-validation score recorded for each alpha. The score is an
# error measure, so the smallest value marks the best alpha.
alpha_to_cross_val_score

{50000: 2.049342716648238,
 60000: 2.0198157269999446,
 70000: 2.002283987637136,
 80000: 1.977493900486174,
 90000: 1.9698114242428697,
 100000: 1.9618027160556892,
 200000: 1.9544845895654,
 300000: 1.96405332141402,
 400000: 1.9758551317784125,
 500000: 1.9850499883352337,
 600000: 1.9918723217614354,
 700000: 1.9977906171437056,
 800000: 2.0032383786330477,
 900000: 2.0083959792986263,
 1000000: 2.013499222571732,
 1100000: 2.0185609929491095,
 1200000: 2.023471759265643,
 1300000: 2.028317006226269,
 1400000: 2.03311781069742}

#### Make predictions on test data with the model constructed using the optimal value of alpha (the one with the lowest cross-validation score)

In [27]:
def fit_pipeline_and_make_predictions_on_test_set(my_pipeline,
                                                  train_data,
                                                  test_data,
                                                  X_columns,
                                                  Y_column='target'):
    """Fit ``my_pipeline`` on the training frame and predict on the test frame.

    Parameters
    ----------
    my_pipeline : object with ``fit``/``predict``; it is fitted in place.
    train_data : frame providing ``X_columns`` and ``Y_column``.
    test_data : frame providing ``X_columns``.
    X_columns : columns used as features in both frames.
    Y_column : name of the target column in ``train_data`` (default ``'target'``).

    Returns
    -------
    tuple ``(my_pipeline, predictions)`` where ``predictions`` is the result of
    ``make_predictions`` on the test features.
    """
    train_features = train_data[X_columns]
    # Select as a one-column frame, then flatten to a 1-D array of targets.
    train_target = train_data[[Y_column]].values.ravel()
    my_pipeline.fit(train_features, train_target)

    test_predictions = make_predictions(my_pipeline, test_data[X_columns])
    return (my_pipeline, test_predictions)

In [28]:
# Pick the alpha whose mean cross-validation score is lowest, refit a pipeline
# with it on the full training set, and predict on the test set.
best_alpha = min(alpha_to_cross_val_score, key=alpha_to_cross_val_score.get)
(final_pipeline, predictions_on_test_data) = \
    fit_pipeline_and_make_predictions_on_test_set(
        get_lasso_pipe_with_scaling(best_alpha),
        train_data,
        test_data,
        X_COLUMNS)

In [29]:
# Store the absolute value of every prediction as the submission target, so the
# submitted column is never negative. np.abs is applied to the whole
# prediction array at once rather than element by element in a Python loop.
test_data['target'] = np.abs(predictions_on_test_data)

In [30]:
# Write the submission file: the ID and predicted target columns only, without
# the DataFrame index.
test_data[['ID', 'target']].to_csv('submission_lasso_sklearn2.csv', index=False)