## Model Fitting on the Ames, Iowa Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_feature = pd.read_csv('../data/feature.csv',index_col=0)

In [3]:
target = pd.read_csv('../data/target.csv',header=None,index_col=0)

In [4]:
cat_df = df_feature.select_dtypes(include=['object'])

In [5]:
num_df = df_feature.select_dtypes(exclude=['object'])

In [15]:
columns_to_encode = cat_df.columns.values

In [25]:
columns_to_encode = ['alley', 'bldgtype', 'bsmtcond', 'bsmtexposure', 'bsmtfintype1',
                     'bsmtfintype2', 'bsmtqual', 'centralair', 'condition1',
                     'condition2', 'electrical', 'extercond', 'exterior1st',
                     'exterior2nd', 'exterqual', 'fence', 'fireplacequ', 'foundation',
                     'functional', 'garagecond', 'garagefinish', 'garagequal',
                     'garagetype', 'heating', 'heatingqc', 'housestyle', 'kitchenqual',
                     'landcontour', 'landslope', 'lotconfig', 'lotshape', 'masvnrtype',
                     'mszoning', 'neighborhood', 'paveddrive', 'roofmatl', 'roofstyle',
                     'salecondition', 'saletype', 'street', 'utilities']

In [17]:
columns_to_scale  = num_df.columns.values

In [24]:
columns_to_scale = ['1stFlrSF', '2ndFlrSF', '3SsnPorch', 'bedroomabvgr', 'bsmtfinsf1',
                    'bsmtfinsf2', 'bsmtfullbath', 'bsmthalfbath', 'bsmtunfsf',
                    'enclosedporch', 'fireplaces', 'fullbath', 'garagearea',
                    'garagecars', 'garageyrblt', 'grlivarea', 'halfbath',
                    'kitchenabvgr', 'lotarea', 'lotfrontage', 'lowqualfinsf',
                    'masvnrarea', 'miscval', 'mosold', 'mssubclass', 'openporchsf',
                    'overallcond', 'overallqual', 'poolarea', 'screenporch',
                    'totalbsmtsf', 'totrmsabvgrd', 'wooddecksf', 'yearbuilt',
                    'yearremodadd', 'yrsold']

In [39]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder()

# Scale and Encode Separate Columns
scaled_num  = scaler.fit_transform(df_feature[columns_to_scale]) 


In [47]:
le = LabelEncoder()
ohe_cat_df = cat_df.apply(le.fit_transform)


In [45]:
scaled_num_df = pd.DataFrame(scaled_num, columns= columns_to_scale)

In [48]:
features_dummies = pd.merge(scaled_num_df,ohe_cat_df, left_index=True, right_index=True)

In [49]:
features_dummies.head(3)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,bedroomabvgr,bsmtfinsf1,bsmtfinsf2,bsmtfullbath,bsmthalfbath,bsmtunfsf,enclosedporch,...,masvnrtype,mszoning,neighborhood,paveddrive,roofmatl,roofstyle,salecondition,saletype,street,utilities
0,-0.793434,1.161852,-0.116339,0.163779,0.575425,-0.288653,1.10781,-0.241061,-0.944591,-0.359325,...,1,3,5,2,1,1,4,8,1,0
1,0.25714,-0.795163,-0.116339,0.163779,1.171992,-0.288653,-0.819964,3.948809,-0.641228,-0.359325,...,2,3,24,2,1,1,4,8,1,0
2,-0.627826,1.189351,-0.116339,0.163779,0.092907,-0.288653,1.10781,-0.241061,-0.301643,-0.359325,...,1,3,5,2,1,1,4,8,1,0


In [50]:
target.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0,208500
1,181500
2,223500
3,140000
4,250000


### Import from sklearn & Training Test Split

In [51]:
#Import preprocessing from sklearn
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

#Import all models from sklearn
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [52]:
X_train, X_test, y_train, y_test = train_test_split(features_dummies, target,test_size=0.30, random_state=805)

In [53]:
X_train.shape

(1022, 77)

## 1. Fit a regression model using default settings with each of the following kinds of models:

   - ridge regression
   - lasso regression
   - knn
   - decision tree
   - support vector machines

#### Pipeline Scale & Ridge

In [54]:
ridge_pipe = Ridge()

In [55]:
ridge_pipe.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [56]:
print('Train score: {}'.format(ridge_pipe.score(X_train, y_train)))
print('Test score: {}'.format(ridge_pipe.score(X_test, y_test)))

Train score: 0.8664750058398232
Test score: 0.7745496507753106


#### Pipeline Scale  & Lasso

In [59]:
lasso_pipe =  Lasso(max_iter=100000)

In [60]:
lasso_pipe.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [61]:
print('Train score: {}'.format(lasso_pipe.score(X_train, y_train)))
print('Test score: {}'.format(lasso_pipe.score(X_test, y_test)))

Train score: 0.8667318560719903
Test score: 0.772242626303985


#### Pipeline Scale & knn

In [62]:
knn_pipe = KNeighborsRegressor()

In [63]:
knn_pipe.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [64]:
print('Train score: {}'.format(knn_pipe.score(X_train, y_train)))
print('Test score: {}'.format(knn_pipe.score(X_test, y_test)))

Train score: 0.8321076923924154
Test score: 0.7063282839757166


#### Decision tree

In [65]:
tree = DecisionTreeRegressor()

In [66]:
tree.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [67]:
print('Train score: {}'.format(tree.score(X_train, y_train)))
print('Test score: {}'.format(tree.score(X_test, y_test)))

Train score: 1.0
Test score: 0.810180542049804


#### Random Forest

In [68]:
rfr = RandomForestRegressor()

In [69]:
rfr.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [70]:
print('Train score: {}'.format(rfr.score(X_train, y_train)))
print('Test score: {}'.format(rfr.score(X_test, y_test)))

Train score: 0.9732599282618465
Test score: 0.8611669193242374


#### SVM

In [71]:
svr_pipe = LinearSVR()

In [74]:
svr_pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


LinearSVR(C=1.0, dual=True, epsilon=0.0, fit_intercept=True,
     intercept_scaling=1.0, loss='epsilon_insensitive', max_iter=1000,
     random_state=None, tol=0.0001, verbose=0)

In [75]:
print('Train score: {}'.format(svr_pipe.score(X_train, y_train)))
print('Test score: {}'.format(svr_pipe.score(X_test, y_test)))

Train score: -0.006969580425395305
Test score: 0.019025121925958177


## 2. Report your results. 

Discuss what your results tell you in terms of the underlying structure of the data. Remember that you can think about the ridge and lasso in terms of linearity, the decision tree and knn in terms of non-linearity, and knn and svm in terms of locality.

In [76]:
from sklearn import metrics

In [77]:
print('Ridge MAE:', metrics.mean_absolute_error(y_test, ridge_pipe.predict(X_test)))
print('Ridge MSE:', metrics.mean_squared_error(y_test, ridge_pipe.predict(X_test)))
print('Ridge RMSE:', np.sqrt(metrics.mean_squared_error(y_test, ridge_pipe.predict(X_test))))

Ridge MAE: 21542.817865131314
Ridge MSE: 1539597465.3033586
Ridge RMSE: 39237.70463856619


In [78]:
print('lasso MAE:', metrics.mean_absolute_error(y_test, lasso_pipe.predict(X_test)))
print('lasso MSE:', metrics.mean_squared_error(y_test, lasso_pipe.predict(X_test)))
print('lasso RMSE:', np.sqrt(metrics.mean_squared_error(y_test, lasso_pipe.predict(X_test))))

lasso MAE: 21714.367553257965
lasso MSE: 1555352105.030733
lasso RMSE: 39437.9525968417


In [79]:
print('knn MAE:', metrics.mean_absolute_error(y_test, knn_pipe.predict(X_test)))
print('knn MSE:', metrics.mean_squared_error(y_test, knn_pipe.predict(X_test)))
print('knn RMSE:', np.sqrt(metrics.mean_squared_error(y_test,knn_pipe.predict(X_test))))

knn MAE: 25469.851141552514
knn MSE: 2005480280.5901368
knn RMSE: 44782.58903402233


In [80]:
print('Tree MAE:', metrics.mean_absolute_error(y_test, tree.predict(X_test)))
print('Tree MSE:', metrics.mean_squared_error(y_test, tree.predict(X_test)))
print('Tree RMSE:', np.sqrt(metrics.mean_squared_error(y_test, tree.predict(X_test))))

Tree MAE: 24840.77397260274
Tree MSE: 1296274578.107306
Tree RMSE: 36003.81338285302


In [81]:
print('RandomForest MAE:', metrics.mean_absolute_error(y_test, rfr.predict(X_test)))
print('RandomForest MSE:', metrics.mean_squared_error(y_test, rfr.predict(X_test)))
print('RandomForest RMSE:', np.sqrt(metrics.mean_squared_error(y_test, rfr.predict(X_test))))

RandomForest MAE: 18817.629452054796
RandomForest MSE: 948089279.2746804
RandomForest RMSE: 30791.05843056845


In [82]:
print('SVR MAE:', metrics.mean_absolute_error(y_test, svr_pipe.predict(X_test)))
print('SVR MSE:', metrics.mean_squared_error(y_test, svr_pipe.predict(X_test)))
print('SVR RMSE:', np.sqrt(metrics.mean_squared_error(y_test, svr_pipe.predict(X_test))))

SVR MAE: 51843.388942690464
SVR MSE: 6699064521.314433
SVR RMSE: 81847.81317368493


## 3. Use a cross-validated grid search to refine three of the above models

### GridSearchCV on Lasso, Ridge & Random Forest

In [90]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [20,25,30,35,40],
              'n_estimators': range(10,30)
             }

forest_reg = RandomForestRegressor()

grid_search = GridSearchCV(forest_reg, param_grid, cv=5,
                           scoring='neg_mean_squared_error', return_train_score=True)
grid_search.fit(X_train, y_train)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


GridSearchCV(cv=5, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'max_depth': [20, 25, 30, 35, 40], 'n_estimators': range(10, 30)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [91]:
print('Train score: {}'.format(grid_search.score(X_train, y_train)))
print('Test score: {}'.format(grid_search.score(X_test, y_test)))

Train score: -123098989.07011195
Test score: -886076830.6499435


In [92]:
cvres = grid_search.cv_results_

In [93]:
for mean_score, params in zip(cvres["mean_test_score"],cvres["params"]):
    print(np.sqrt(-mean_score),params)

33693.592945999924 {'max_depth': 20, 'n_estimators': 10}
33051.030596179604 {'max_depth': 20, 'n_estimators': 11}
32002.79939467831 {'max_depth': 20, 'n_estimators': 12}
31450.57223785804 {'max_depth': 20, 'n_estimators': 13}
32288.09854714857 {'max_depth': 20, 'n_estimators': 14}
31565.057135191153 {'max_depth': 20, 'n_estimators': 15}
31227.62068305958 {'max_depth': 20, 'n_estimators': 16}
30843.901588104247 {'max_depth': 20, 'n_estimators': 17}
31679.347875483323 {'max_depth': 20, 'n_estimators': 18}
32079.80754201038 {'max_depth': 20, 'n_estimators': 19}
30717.75304965897 {'max_depth': 20, 'n_estimators': 20}
31652.56638192414 {'max_depth': 20, 'n_estimators': 21}
30338.194537833733 {'max_depth': 20, 'n_estimators': 22}
30963.109439375457 {'max_depth': 20, 'n_estimators': 23}
31873.060925465074 {'max_depth': 20, 'n_estimators': 24}
30332.359344785684 {'max_depth': 20, 'n_estimators': 25}
31512.43129039736 {'max_depth': 20, 'n_estimators': 26}
30351.894626065223 {'max_depth': 20, 'n

In [94]:
print("Best RMSE :",np.sqrt(-grid_search.best_score_))

Best RMSE : 29687.951131426056


In [95]:
imp_feat = grid_search.best_estimator_.feature_importances_
featimp_df = pd.DataFrame(sorted(zip(imp_feat,features_dummies.columns.values)), columns= ['feature_importance','features'])
featimp_df.sort_values(by='feature_importance', ascending=False).head(50)

Unnamed: 0,feature_importance,features
76,0.542543,overallqual
75,0.105345,grlivarea
74,0.045344,garagearea
73,0.033117,totalbsmtsf
72,0.027027,2ndFlrSF
71,0.025231,garagecars
70,0.024472,bsmtfinsf1
69,0.019045,1stFlrSF
68,0.014187,masvnrarea
67,0.012067,lotarea


#### GridSearch Ridge

In [97]:
ridge_param_grid = {'alpha': range(300,400)}

ridge_reg = Ridge()
ridge_grid_search = GridSearchCV(ridge_reg, param_grid = ridge_param_grid,return_train_score=True)
ridge_grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': range(300, 400)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [98]:
ridge_grid_search.best_params_

{'alpha': 390}

In [108]:
print('Train score: {}'.format(ridge_grid_search.score(X_train, y_train)))
print('Test score: {}'.format(ridge_grid_search.score(X_test, y_test)))

Train score: 0.8472570456533971
Test score: 0.7887862611959511


In [99]:
ridge_grid_search.best_score_

0.7929619703742068

In [100]:
pd.DataFrame(ridge_grid_search.cv_results_).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
mean_fit_time,0.00866723,0.00733368,0.00833376,0.00333357,0.00666666,0.00333333,0.00666674,0.00333325,0.00333341,0.00666674,...,0.00666698,0.00600036,0.00533374,0.00433358,0.00333341,0.0036668,0.00233332,0.00333341,0,0
mean_score_time,0.00133332,0.0013334,0.00100009,0,0,0,0,0,0,0,...,0.0013334,0.0013334,0.0013334,0.000666618,0.00100017,0.000333389,0.000666698,0,0,0.00333325
mean_test_score,0.792287,0.792303,0.792319,0.792335,0.792351,0.792366,0.792381,0.792396,0.792411,0.792425,...,0.792962,0.792962,0.792962,0.792962,0.792961,0.792961,0.79296,0.792959,0.792958,0.792957
mean_train_score,0.856724,0.856658,0.856593,0.856527,0.856461,0.856396,0.856331,0.856265,0.8562,0.856135,...,0.850969,0.850907,0.850844,0.850782,0.85072,0.850657,0.850595,0.850533,0.850471,0.850409
param_alpha,300,301,302,303,304,305,306,307,308,309,...,390,391,392,393,394,395,396,397,398,399
params,{'alpha': 300},{'alpha': 301},{'alpha': 302},{'alpha': 303},{'alpha': 304},{'alpha': 305},{'alpha': 306},{'alpha': 307},{'alpha': 308},{'alpha': 309},...,{'alpha': 390},{'alpha': 391},{'alpha': 392},{'alpha': 393},{'alpha': 394},{'alpha': 395},{'alpha': 396},{'alpha': 397},{'alpha': 398},{'alpha': 399}
rank_test_score,100,99,98,97,96,95,94,93,92,91,...,1,2,4,6,8,10,12,14,16,18
split0_test_score,0.837552,0.837486,0.83742,0.837354,0.837289,0.837223,0.837157,0.837091,0.837026,0.83696,...,0.831638,0.831572,0.831507,0.831441,0.831376,0.83131,0.831245,0.831179,0.831114,0.831048
split0_train_score,0.837017,0.836957,0.836896,0.836836,0.836776,0.836716,0.836655,0.836595,0.836535,0.836475,...,0.831723,0.831665,0.831608,0.831551,0.831494,0.831436,0.831379,0.831322,0.831265,0.831208
split1_test_score,0.677072,0.677172,0.677271,0.677371,0.67747,0.677568,0.677666,0.677764,0.677862,0.677959,...,0.68478,0.684853,0.684926,0.684998,0.68507,0.685142,0.685214,0.685286,0.685357,0.685428


### GridSearch Lasso

In [104]:
lasso_params = [
    { 'alpha': np.logspace(.1, 1, 12),
    'max_iter': [1000, 10000, 100000]},
    { 'alpha': np.logspace(-3, 3, 5),
    'max_iter': [1000, 10000, 100000]},
    
]
n_folds = 10

In [105]:
lasso_gs = GridSearchCV(Lasso(), param_grid= lasso_params, cv=n_folds, n_jobs=-1)

In [106]:
lasso_gs.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'alpha': array([ 1.25893,  1.51991,  1.835  ,  2.21541,  2.67469,  3.22917,
        3.8986 ,  4.70682,  5.68258,  6.86062,  8.28289, 10.     ]), 'max_iter': [1000, 10000, 100000]}, {'alpha': array([1.00000e-03, 3.16228e-02, 1.00000e+00, 3.16228e+01, 1.00000e+03]), 'max_iter': [1000, 10000, 100000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [107]:
lasso_gs.best_estimator_

Lasso(alpha=1000.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [111]:
print('Train score: {}'.format(lasso_gs.score(X_train, y_train)))
print('Test score: {}'.format(lasso_gs.score(X_test, y_test)))

Train score: 0.8503561774955546
Test score: 0.78989866042309


## 4. Prepare a complexity curve for at least one attribute for each of the three models that you are tuning.

## 5. (Optional) Use a principal component analysis on your original data* to create principal component features. Augment the original dataset with these principal component features and repeat steps 1 and 2.
## 6. (Optional) Submit your results to Kaggle.