In [329]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import SelectKBest, VarianceThreshold, f_regression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import r2_score, mean_squared_error

%matplotlib inline

In [330]:
# import the data 
df = pd.read_csv('clean_data.csv')

In [331]:
# subset data into train set  and call it modeling_df and make appropriate changes
train = df[df['set']== 'train']


# define y 
y = train['target']

#remove the target columns and the set column from train data and create a list of features
train.drop(columns = ['target', 'set'], axis = 1,  inplace = True )
features = list(train.columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [332]:
# define X
X = train[features]

In [333]:
# instantiate linear regression, lasso and ridge models
lr = LinearRegression()
lasso = LassoCV()
ridge = RidgeCV()

# Baseline Scores 

### The Simplest Model 

Let's get the cross val score for the simplest possible model: predicting SalePrice with just the square footage of the house. This model is going to be highly biased. 

In [334]:
# set variables to save the baseline scores 
lr_scores = cross_val_score(lr, X[['sf']], y).mean()
lasso_scores = cross_val_score(lasso, X[['sf']], y).mean()
ridge_scores = cross_val_score(ridge, X[['sf']], y).mean()

print(lr_scores)
print(lasso_scores)
print(ridge_scores)

0.6223348295629084
0.6223382472874212
0.6223348310507872




### The Most Complex Model 

And here let's get the cross val scores of the most complex possible model: this model includes all of the features in the dataset and therefore has very high variance. 

In [335]:
# set variables to save the baseline scores 
lr_scores = cross_val_score(lr, X, y).mean()
lasso_scores = cross_val_score(lasso, X, y).mean()
ridge_scores = cross_val_score(ridge, X, y).mean()

print(lr_scores)
print(lasso_scores)
print(ridge_scores)



0.8922682498109796
0.7894966055375604
0.8918778618743297




So my goal then is to beat the simplest model and to come as high in train and test scores as possible to be close to or over the high variance model's score while eliminating most of the irrelevant features. In order to achieve such a score I will build a pipeline and will test models manually with a few different tweaks first. After that I will runa . GridSearch over my models (one for linear regression, lasso and ridge each) and tune the hyperparameters to see if I can get an even better score.

# Simple Pipeline

In [336]:
# train_test_split 
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state = 42)

## Model Variation 1: Does Not Include Engineered Features

All features passed to the models after VarianceThreshold, PowerTransformer, StandardScaler and SelectKBest transformers have been applied to data with defualt parameters.

### Lasso

In [337]:
pipe_lasso = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    #('polyfeatures', PolynomialFeatures(interaction_only = True)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest()),
    ('lasso', LassoCV())
])

pipe_lasso.fit(X_train, y_train)
train_score = pipe_lasso.score(X_train, y_train)
print('train cv score: ' , train_score)
y_pred = pipe_lasso.predict(X_test)
print('test_score: ',r2_score(y_test, y_pred))

  x = um.multiply(x, x, out=x)


train cv score:  0.7889120772530963
test_score:  0.7824722324709801




### Linear Regression

In [338]:
pipe_lr = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest(f_regression, k=5)),
    ('lr', LinearRegression())
])

pipe_lr.fit(X_train, y_train)
train_score = pipe_lr.score(X_train, y_train)
print('train cv score: ' , train_score)
y_pred = pipe_lr.predict(X_test)
print('test_score: ',r2_score(y_test, y_pred))

  x = um.multiply(x, x, out=x)


train cv score:  0.7731626053121392
test_score:  0.7684631936663632


### Ridge 

In [339]:
pipe_ridge = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest(f_regression, k=5)),
    ('ridge', RidgeCV())
])

pipe_ridge.fit(X_train, y_train)
train_score = pipe_ridge.score(X_train, y_train)
print('ridge train cv score: ' , train_score)
y_pred = pipe_ridge.predict(X_test)
print('ridge test_score: ',r2_score(y_test, y_pred))

  x = um.multiply(x, x, out=x)


ridge train cv score:  0.7731425563774031
ridge test_score:  0.768028812625693


## Model  Variation 2: Includes Engineered Features

Engeneered features that I found were useful during the exploratory data analyses phase are added in addition to all other features. Then the same transforms as in **Model 1** are applied. 

In [340]:
# engineering some features  
train['year'] = train['Year Built']*train['Year Remod/Add']
train['garage'] = train['Garage Area'] * train['Garage Cars']
train['outside'] = train['Open Porch SF'] * train['Wood Deck SF']

# redefining X and y
X_eng = train
y_eng = y


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [341]:
# train_test_split 

X_eng_train, X_eng_test, y_eng_train, y_eng_test = train_test_split(X_eng, y_eng, random_state = 42)

### Lasso

In [342]:
pipe_lasso = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest()),
    ('lasso', LassoCV())
])

pipe_lasso.fit(X_eng_train, y_eng_train)
train_score = pipe_lasso.score(X_eng_train, y_eng_train)
print('train cv score: ' , train_score)
y_pred = pipe_lasso.predict(X_eng_test)
print('test_score: ',r2_score(y_eng_test, y_pred))

  x = um.multiply(x, x, out=x)


train cv score:  0.794417071282924
test_score:  0.7858093188850566




### Ridge

In [343]:
pipe_ridge = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest()),
    ('ridge', RidgeCV())
])

pipe_ridge.fit(X_eng_train, y_eng_train)
train_score = pipe_ridge.score(X_eng_train, y_eng_train)
print('train cv score: ' , train_score)
y_pred = pipe_ridge.predict(X_eng_test)
print('test_score: ',r2_score(y_eng_test, y_pred))

  x = um.multiply(x, x, out=x)


train cv score:  0.794488984901996
test_score:  0.7859650842909908


### Linear Regression 

In [344]:
pipe_lr = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest()),
    ('lr', LinearRegression())
])

pipe_lr.fit(X_eng_train, y_eng_train)
train_score = pipe_lr.score(X_eng_train, y_eng_train)
print('train cv score: ' , train_score)
y_pred = pipe_lr.predict(X_eng_test)
print('test_score: ',r2_score(y_eng_test, y_pred))

  x = um.multiply(x, x, out=x)


train cv score:  0.7944909146073097
test_score:  0.7860189458082409


So it looks like inclusion of engineered features does increase the training and testing accuracy scores so I will add the engineered features into my GridSearch. 

# Hyperparamter Tuning with GridSearchCV

I will run the GridSearch over Ridge, Lasso and Linear Regression using **model variation 2** from above with a variety of hyperparameters to tune over for each. 

## Model Variation 2


In the GridSearch Above, I set the parameter k_best = all so that I can next sort the features by their largest positive and largest negative weights and rerun a subset of those in the model again. 

### Linear Regression 

In [308]:
pipe_lr = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest(f_regression)),
    ('lr', LinearRegression())
])

params = {
    'var_thresh__threshold': [0, .05, .1, .25, .75],
    'power_t__method': ['yeo-johnson'],
    'ss__with_mean': [True, False],
    'kbest__k': [25],
    'lr__fit_intercept': [True, False] 
}


gs = GridSearchCV(pipe_lr, params)
gs.fit(X_eng_train, y_eng_train)

gs.fit(X_eng_train, y_eng_train)
y_pred = gs.predict(X_eng_test)
print('grid search train score: ',gs.score(X_eng_train, y_eng_train))
print('grid search test score: ',gs.score(X_eng_test, y_eng_test))

print(gs.best_params_)

print('coef :',gs.best_estimator_.named_steps['lr'].coef_)

print('intercept :',gs.best_estimator_.named_steps['lr'].intercept_)

columns = X_eng_train.columns
columns = columns[gs.best_estimator_.named_steps['var_thresh'].get_support()]
columns = columns[gs.best_estimator_.named_steps['kbest'].get_support()]

feature_weights = pd.DataFrame(gs.best_estimator_.named_steps['lr'].coef_, 
             index = columns, 
             columns=['weight'])



  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x =

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x =

  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x =

grid search train score:  0.8372824481424403
grid search test score:  0.8191960393681387
{'kbest__k': 25, 'lr__fit_intercept': True, 'power_t__method': 'yeo-johnson', 'ss__with_mean': True, 'var_thresh__threshold': 0.75}
coef : [   -334.69449781    1481.50742134    3484.36899966   -3443.6875362
    4458.97889702    1625.29234448    1783.43817728    8139.90004588
    4084.98528718    -289.54105671  -33374.22851836    4864.33797238
     906.99992186   -2491.13964781    2588.64411626    1318.27963431
    3895.44859773  -35721.51548428    1070.81296127   93814.64553194
   55228.78553221   81759.69295977 -114698.90424859    4191.14282533
    1076.354504  ]
intercept : 171449.84321745057


  x = um.multiply(x, x, out=x)


### Lasso 

In [324]:
pipe_lasso = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest(f_regression)),
    ('lasso', LassoCV(n_jobs = -1))
])

params = {
    'var_thresh__threshold': [0, .05, .1, .25, .75],
    'power_t__method': ['yeo-johnson'],
    'ss__with_mean': [True, False],
    'kbest__k': [30],
    'lasso__n_alphas': [1, 5, 10] 
}


gs = GridSearchCV(pipe_lasso, params)
gs.fit(X_eng_train, y_eng_train)

gs.fit(X_eng_train, y_eng_train)
y_pred = gs.predict(X_eng_test)
print('grid search train score: ',gs.score(X_eng_train, y_eng_train))
print('grid search test score: ',gs.score(X_eng_test, y_eng_test))

print(gs.best_params_)

print('coef :',gs.best_estimator_.named_steps['lasso'].coef_)

print('intercept :',gs.best_estimator_.named_steps['lasso'].intercept_)

columns = X_eng_train.columns
columns = columns[gs.best_estimator_.named_steps['var_thresh'].get_support()]
columns = columns[gs.best_estimator_.named_steps['kbest'].get_support()]

feature_weights = pd.DataFrame(gs.best_estimator_.named_steps['lasso'].coef_, 
             index = columns, 
             columns=['weight'])



  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


  x = um.multiply(x, x, out=x)


grid search train score:  0.8432153140547486
grid search test score:  0.8135502207454691
{'kbest__k': 30, 'lasso__n_alphas': 1, 'power_t__method': 'yeo-johnson', 'ss__with_mean': True, 'var_thresh__threshold': 0.25}
coef : [   346.78683095   3495.59752257  -2534.75209503   6288.11745983
   5410.07143086   8263.10560605  -1011.49004965   -485.60490129
     -0.          -2402.53842633   2775.16674603   2435.59917846
 -22416.84504249   1928.23115822   3162.87081719   6548.51705177
   1635.82931842   2942.66620818    638.07222764   7910.0974103
    773.56677189  -1661.12156438 -28252.06216379    809.87390338
   6597.46378999   4989.64291558  63100.88040821     -0.
   9226.68442864     -0.        ]
intercept : 171449.84321745057


### Ridge 

In [345]:
pipe_ridge = Pipeline([
    ('var_thresh', VarianceThreshold(.05)),
    ('power_t', PowerTransformer()),
    ('ss', StandardScaler()),
    ('kbest', SelectKBest(f_regression)),
    ('ridge', RidgeCV())
])

params = {
    'var_thresh__threshold': [0, .05, .1, .25, .75],
    'power_t__method': ['yeo-johnson'],
    'ss__with_mean': [True, False],
    'kbest__k': [20],
    'ridge__alphas': [(0.1, 1.0, 10)] 
}


gs = GridSearchCV(pipe_ridge, params)
gs.fit(X_eng_train, y_eng_train)

gs.fit(X_eng_train, y_eng_train)
y_pred = gs.predict(X_eng_test)
print('grid search train score: ',gs.score(X_eng_train, y_eng_train))
print('grid search test score: ',gs.score(X_eng_test, y_eng_test))

print(gs.best_params_)

print('coef :',gs.best_estimator_.named_steps['ridge'].coef_)

print('intercept :',gs.best_estimator_.named_steps['ridge'].intercept_)

columns = X_eng_train.columns
columns = columns[gs.best_estimator_.named_steps['var_thresh'].get_support()]
columns = columns[gs.best_estimator_.named_steps['kbest'].get_support()]

feature_weights = pd.DataFrame(gs.best_estimator_.named_steps['ridge'].coef_, 
             index = columns, 
             columns=['weight'])


  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x =

  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  ret = umr_sum(x, axis, dtype, out, keepdims)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)
  x = um.multiply(x, x, out=x)


grid search train score:  0.8309312502347093
grid search test score:  0.81455915704368
{'kbest__k': 20, 'power_t__method': 'yeo-johnson', 'ridge__alphas': (0.1, 1.0, 10), 'ss__with_mean': True, 'var_thresh__threshold': 0.75}
coef : [    566.79428841    3928.1607807     7812.60942631     792.70882549
    8626.68000754    4668.23528898    -312.71786737  -31026.4915334
    4996.91288743     863.37687374    2647.26120247    1135.68622167
  -36336.90394665     629.50198277   85329.01912229   51205.63433565
   77947.59505963 -103750.57986042    4137.59010253     882.52488817]
intercept : 171449.84321745057


  x = um.multiply(x, x, out=x)


In [348]:
print(list(feature_weights.sort_values(by = 'weight', ascending = False).index))

['Year Built', 'sf', 'Year Remod/Add', 'Fireplace Qu', 'BsmtFin SF 1', 'Lot Area', 'Garage Area', 'garage', 'Bsmt Exposure', 'Mas Vnr Area', 'Open Porch SF', 'outside', 'Lot Frontage', 'BsmtFin Type 1', 'Wood Deck SF', '1st Flr SF', 'Garage Yr Blt', 'Gr Liv Area', 'Total Bsmt SF', 'year']
