## Model Fitting on the Ames, Iowa Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_feature = pd.read_csv('../data/feature.csv',index_col=0)

In [3]:
target = pd.read_csv('../data/target.csv',header=None,index_col=0)

In [4]:
cat_df = df_feature.select_dtypes(include=['object'])

In [5]:
num_df = df_feature.select_dtypes(exclude=['object'])

In [6]:
features_dummies = pd.get_dummies(cat_df)

In [7]:
features_dummies.head(3)

Unnamed: 0,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Street_Grvl,Street_Pave,Alley_Grvl,Alley_Pave,LotShape_IR1,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,0,0,0,1,0,0,1,1,0,0,...,0,0,0,1,0,0,0,0,1,0
1,0,0,0,1,0,0,1,1,0,0,...,0,0,0,1,0,0,0,0,1,0
2,0,0,0,1,0,0,1,1,0,1,...,0,0,0,1,0,0,0,0,1,0


In [8]:
features_dummies = pd.merge(num_df,features_dummies, left_index=True, right_index=True)

In [9]:
features_dummies.head(3)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,60,65.0,8450,7,5,2003,2003,196.0,706,0,...,0,0,0,1,0,0,0,0,1,0
1,20,80.0,9600,6,8,1976,1976,0.0,978,0,...,0,0,0,1,0,0,0,0,1,0
2,60,68.0,11250,7,5,2001,2002,162.0,486,0,...,0,0,0,1,0,0,0,0,1,0


In [10]:
target.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0,208500
1,181500
2,223500
3,140000
4,250000


### Import from sklearn & Training Test Split

In [11]:
#Import preprocessing from sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

#Import all models from sklearn
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

In [12]:
X_train, X_test, y_train, y_test = train_test_split(features_dummies, target, random_state = 42)

In [13]:
X_train.shape

(1095, 288)

## 1. Fit a regression model using default settings with each of the following kinds of models:

   - ridge regression
   - lasso regression
   - knn
   - decision tree
   - support vector machines

#### Pipeline Scale & Ridge

In [14]:
ridge_pipe = make_pipeline(StandardScaler(),
                           Ridge())

In [15]:
ridge_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('ridge', Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001))])

In [16]:
ridge_pipe.score(X_train, y_train)

0.9353681660736672

In [17]:
ridge_pipe.score(X_test, y_test)

0.8761095309925925

#### Pipeline Scale  & Lasso

In [18]:
lasso_pipe = make_pipeline(StandardScaler(),
                             Lasso(max_iter=100000))

In [19]:
lasso_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False))])

In [20]:
lasso_pipe.score(X_train, y_train)

0.9353704090830115

In [21]:
lasso_pipe.score(X_test, y_test)

0.8764157423938678

In [22]:
lasso_pipe.named_steps

{'lasso': Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
    normalize=False, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False),
 'standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True)}

In [23]:
lasso_pipe.steps

[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)),
 ('lasso', Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
     normalize=False, positive=False, precompute=False, random_state=None,
     selection='cyclic', tol=0.0001, warm_start=False))]

#### Pipeline Scale & knn

In [24]:
knn_pipe = make_pipeline(StandardScaler(),KNeighborsRegressor())

In [25]:
knn_pipe.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('kneighborsregressor', KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform'))])

In [26]:
knn_pipe.score(X_train, y_train)

0.8106337820254708

In [27]:
knn_pipe.score(X_test, y_test)

0.7497243298821902

#### Decision tree

In [28]:
tree = DecisionTreeRegressor()

In [29]:
tree.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [30]:
tree.score(X_train, y_train)

1.0

In [31]:
tree.score(X_test, y_test)

0.7216831516759988

#### Random Forest

In [32]:
rfr = RandomForestRegressor()

In [33]:
rfr.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [34]:
rfr.score(X_train, y_train)

0.9747874977105689

In [35]:
rfr.score(X_test, y_test)

0.8602934606551862

#### SVM

In [36]:
svr_pipe = make_pipeline(StandardScaler(),SVR())

In [37]:
svr_pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svr', SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False))])

## 2. Report your results. 

Discuss what your results tell you in terms of the underlying structure of the data. Remember that you can think about the ridge and lasso in terms of linearity, the decision tree and knn in terms of non-linearity, and knn and svm in terms of locality.

## 3. Use a cross-validated grid search to refine three of the above models

### GridSearchCV on Lasso, Random Forest & 

In [42]:
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.linear_model import ElasticNet
from sklearn.feature_selection import SelectFromModel, SelectKBest, f_regression

In [43]:
pipe_for_gs = Pipeline([
    ('skb', SelectKBest(score_func=f_regression, k=40)),
    ('scaler', StandardScaler()),
    ('sfm', SelectFromModel(Lasso())),
    ('regr', ElasticNet())
])

In [44]:
params = {
    'regr__l1_ratio':[.1,.3,.5,.7,.9],
    'regr__alpha':np.logspace(-3,3,7)
}

In [45]:
gspipe = GridSearchCV(pipe_for_gs,
                      param_grid=params,
                      n_jobs=-1,
                      cv=ShuffleSplit(n_splits=5, random_state=42))

In [46]:
gspipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
  corr /= X_norms
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


GridSearchCV(cv=ShuffleSplit(n_splits=5, random_state=42, test_size='default',
       train_size=None),
       error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('skb', SelectKBest(k=40, score_func=<function f_regression at 0x0000000012B3D2F0>)), ('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('sfm', SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, pr...alse, precompute=False,
      random_state=None, selection='cyclic', tol=0.0001, warm_start=False))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'regr__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9], 'regr__alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [47]:
gspipe.best_score_

0.6699199419760044

In [48]:
gspipe.best_estimator_.named_steps

{'regr': ElasticNet(alpha=10.0, copy_X=True, fit_intercept=True, l1_ratio=0.7,
       max_iter=1000, normalize=False, positive=False, precompute=False,
       random_state=None, selection='cyclic', tol=0.0001, warm_start=False),
 'scaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'sfm': SelectFromModel(estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
    normalize=False, positive=False, precompute=False, random_state=None,
    selection='cyclic', tol=0.0001, warm_start=False),
         norm_order=1, prefit=False, threshold=None),
 'skb': SelectKBest(k=40, score_func=<function f_regression at 0x0000000012B3D2F0>)}

## 4. Prepare a complexity curve for at least one attribute for each of the three models that you are tuning.

## 5. (Optional) Use a principal component analysis on your original data* to create principal component features. Augment the original dataset with these principal component features and repeat steps 1 and 2.
## 6. (Optional) Submit your results to Kaggle.