In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error
import pickle
import csv
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
np.random.seed(42)
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Importing my already scaled data

In [13]:
X_train_sc = pd.read_csv('../data/X_train_sc.csv')
X_test_sc = pd.read_csv('../data/X_test_sc.csv')
X_train = pd.read_csv('../data/X_train.csv')
X_test = pd.read_csv('../data/X_test.csv')
y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

# Setting my pipeline and parameters for grid search

In [25]:
params = {
    'var_thresh__threshold': [0.05, .05, .08, .25, .5],
    'kbest__k': [10, 20, 25],
    'lasso__alpha': np.logspace(-3, 5, 10)
}

In [26]:
pipe = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest(f_regression, k=5)),
    ('lasso', Lasso())
])

# Instantiating and fitting my model

In [27]:
gs = GridSearchCV(pipe, params)

In [28]:
gs.fit(X_train.select_dtypes(exclude = 'object'), y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('var_thresh', VarianceThreshold(threshold=0.05)), ('ss', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kbest', SelectKBest(k=5, score_func=<function f_regression at 0x1a11fdd0d0>)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'var_thresh__threshold': [0.05, 0.05, 0.08, 0.25, 0.5], 'kbest__k': [10, 20, 25], 'lasso__alpha': array([1.00000e-03, 7.74264e-03, 5.99484e-02, 4.64159e-01, 3.59381e+00,
       2.78256e+01, 2.15443e+02, 1.66810e+03, 1.29155e+04, 1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

# Scoring my model

In [30]:
gs.score(X_train.select_dtypes(exclude = 'object'),y_train)

0.8177044151138267

In [32]:
gs.score(X_test.select_dtypes(exclude = 'object'), y_test)

0.8719347241983019

# Calculating RMSE for my model

In [33]:
y_hat_train = gs.predict(X_train_sc)
y_hat_test = gs.predict(X_test_sc)

In [34]:
mean_squared_error(y_train, y_hat_train) ** .5

1227277.6098747193

In [35]:
mean_squared_error(y_test, y_hat_test) ** .5

1227233.053313346

In [36]:
with open('../assets/columns.pkl', 'rb') as f:
    columns = pickle.load(f)

# Finding the best parameters and the coefficients

In [37]:
gs.best_params_

{'kbest__k': 25,
 'lasso__alpha': 1668.1005372000557,
 'var_thresh__threshold': 0.05}

In [14]:
gs.best_estimator_.steps[-1][1].coef_

array([  3712.76453104,   7323.17313878,   4955.63421369,   2697.58710097,
         5353.50394965,  -2475.11246872,  19344.74751301,   8442.75487109,
         1473.95234006, -34279.70753421,   -648.68902803,   6082.1011013 ,
            0.        ,   -920.82204518,    650.07271525,   2484.23973744,
         -768.79169358,   7715.0310829 ,  11928.26517321,   5453.55414395,
        -3296.59167583,    239.94650662,  -1183.62275812,   1001.92903881,
         9645.62623752,  15041.76821344,  10458.83647863,   2130.96334071,
         3592.80091288,   2937.82144208,    890.88721427,   6057.2181359 ,
         5143.59925866,   2202.85230195,  -1654.26104666,  -2117.54155132,
        -9589.37325374,   4101.81445204,  -2501.69305143, -38586.28152927,
       -35005.72193445,   -497.92421635, -30157.03498455,   6819.32190284,
         3687.38075067,    482.8306009 ,  -2206.44249684,   1427.21013933,
         6034.3830733 ,   2331.59627691])

In [64]:
v  = gs.best_estimator_.steps[-2][1]

v.get_support()


array([False, False, False,  True,  True,  True,  True, False, False,
        True,  True, False, False,  True, False, False,  True, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False,

In [55]:
#coef = pd.DataFrame(gs.best_estimator_.steps[-1][1].coef_, index = columns, columns=['weight'])
#coef.sort_values(by='weight', ascending=False).head(20)

ValueError: Shape of passed values is (1, 50), indices imply (1, 400)

In [14]:
#coef = pd.DataFrame(gs.coef_, index = columns, columns=['weight'])
#coef.sort_values(by='weight', ascending=True).head(20)

AttributeError: 'GridSearchCV' object has no attribute 'coef_'

# Saving my model for future use

In [41]:
with open('../assets/gs.pkl', 'wb+') as f:
    pickle.dump(gs, f)