In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
import pickle
import csv
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
np.random.seed(42)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline



# Importing my already scaled data

In [2]:
X_train_sc = pd.read_csv('../data/X_train_sc.csv')
X_test_sc = pd.read_csv('../data/X_test_sc.csv')
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

# Setting my pipeline and parameters for grid search

In [3]:
params = {
    'kbest__k': [10, 20, 25],
    'lasso__alpha': np.logspace(-3, 5, 10)
}

In [4]:
pipe = Pipeline([
    ('ss', StandardScaler()),
    ('kbest', SelectKBest(f_regression, k=5)),
    ('lasso', Lasso())
])

# Instantiating and fitting my model

In [5]:
gs = GridSearchCV(pipe, params)

In [6]:
gs.fit(X_train.select_dtypes(exclude = 'object'), y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kbest', SelectKBest(k=5, score_func=<function f_regression at 0x1a0c8a30d0>)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'kbest__k': [10, 20, 25], 'lasso__alpha': array([1.00000e-03, 7.74264e-03, 5.99484e-02, 4.64159e-01, 3.59381e+00,
       2.78256e+01, 2.15443e+02, 1.66810e+03, 1.29155e+04, 1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

# Scoring my model

In [7]:
gs.score(X_train.select_dtypes(exclude = 'object'),y_train)

0.8314301862714326

In [8]:
gs.score(X_test.select_dtypes(exclude = 'object'), y_test)

0.8827466336923878

# Calculating RMSE for my model

In [9]:
y_hat_train = gs.predict(X_train_sc)
y_hat_test = gs.predict(X_test_sc)

In [10]:
mean_squared_error(y_train, y_hat_train) ** .5

971886.4442767567

In [11]:
mean_squared_error(y_test, y_hat_test) ** .5

972389.1492045288

In [12]:
with open('../assets/columns.pkl', 'rb') as f:
    columns = pickle.load(f)

# Finding the best parameters and the coefficients

In [13]:
gs.best_params_

{'kbest__k': 25, 'lasso__alpha': 1668.1005372000557}

In [14]:
gs.best_estimator_.steps[-1][1].coef_

array([ 1.84318361e+04,  5.79918024e+03,  4.36635874e+03,  2.48365037e+03,
        3.85720763e+03,  3.33790011e+03,  1.77008335e+04,  0.00000000e+00,
        0.00000000e+00,  6.49090731e+03,  0.00000000e+00,  1.79091905e+03,
        2.90898026e+03, -1.56487032e+03,  0.00000000e+00,  7.09503902e+03,
       -0.00000000e+00,  4.33317241e+03,  2.37580055e+03, -0.00000000e+00,
        9.03220181e+03, -1.06121628e+03, -4.19471594e+02, -3.02967656e-13,
        9.08989525e+03])

# Saving my model for future use

In [18]:
with open('../assets/gs.pkl', 'wb+') as f:
    pickle.dump(gs, f)