## Model Fitting on the Ames, Iowa Dataset

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
df_feature = pd.read_csv('../data/feature.csv',index_col=0)

In [4]:
target = pd.read_csv('../data/target.csv',header=None,index_col=0)

In [6]:
cat_df = df_feature.select_dtypes(include=['object'])

In [7]:
num_df = df_feature.select_dtypes(exclude=['object'])

In [8]:
columns_to_encode = cat_df.columns.values

In [10]:
columns_to_scale  = num_df.columns.values

In [11]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder()

# Scale and Encode Separate Columns
scaled_num  = scaler.fit_transform(df_feature[columns_to_scale]) 


In [12]:
le = LabelEncoder()
ohe_cat_df = cat_df.apply(le.fit_transform)


In [13]:
scaled_num_df = pd.DataFrame(scaled_num, columns= columns_to_scale)

In [14]:
features_dummies = pd.merge(scaled_num_df,ohe_cat_df, left_index=True, right_index=True)

In [15]:
features_dummies.head(3)

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,bedroomabvgr,bsmtfinsf1,bsmtfinsf2,bsmtfullbath,bsmthalfbath,bsmtunfsf,enclosedporch,...,masvnrtype,mszoning,neighborhood,paveddrive,roofmatl,roofstyle,salecondition,saletype,street,utilities
0,-0.793434,1.161852,-0.116339,0.163779,0.575425,-0.288653,1.10781,-0.241061,-0.944591,-0.359325,...,1,3,5,2,1,1,4,8,1,0
1,0.25714,-0.795163,-0.116339,0.163779,1.171992,-0.288653,-0.819964,3.948809,-0.641228,-0.359325,...,2,3,24,2,1,1,4,8,1,0
2,-0.627826,1.189351,-0.116339,0.163779,0.092907,-0.288653,1.10781,-0.241061,-0.301643,-0.359325,...,1,3,5,2,1,1,4,8,1,0


In [16]:
target.head()

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
0,208500
1,181500
2,223500
3,140000
4,250000


### Import from sklearn & Training Test Split

In [17]:
#Import preprocessing from sklearn
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split

#Import all models from sklearn
from sklearn.linear_model import Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [26]:
X_train, X_test, y_train, y_test = train_test_split(features_dummies, target,test_size=0.30)

In [27]:
X_train.shape

(1022, 77)

## 1. Fit a regression model using default settings with each of the following kinds of models:

   - ridge regression
   - lasso regression
   - knn
   - decision tree
   - support vector machines

#### Pipeline Scale & Ridge

In [28]:
ridge_pipe = Ridge()

In [29]:
ridge_pipe.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [30]:
print('Train score: {}'.format(ridge_pipe.score(X_train, y_train)))
print('Test score: {}'.format(ridge_pipe.score(X_test, y_test)))

Train score: 0.8688957252174139
Test score: 0.8017328865175014


#### Pipeline Scale  & Lasso

In [31]:
lasso_pipe =  Lasso(max_iter=100000)

In [32]:
lasso_pipe.fit(X_train, y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=100000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [33]:
print('Train score: {}'.format(lasso_pipe.score(X_train, y_train)))
print('Test score: {}'.format(lasso_pipe.score(X_test, y_test)))

Train score: 0.8690587616025282
Test score: 0.8012198677401239


#### Pipeline Scale & knn

In [34]:
knn_pipe = KNeighborsRegressor()

In [35]:
knn_pipe.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
          metric_params=None, n_jobs=1, n_neighbors=5, p=2,
          weights='uniform')

In [36]:
print('Train score: {}'.format(knn_pipe.score(X_train, y_train)))
print('Test score: {}'.format(knn_pipe.score(X_test, y_test)))

Train score: 0.8349987408150237
Test score: 0.6908390088924001


#### Decision tree

In [37]:
tree = DecisionTreeRegressor()

In [38]:
tree.fit(X_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [39]:
print('Train score: {}'.format(tree.score(X_train, y_train)))
print('Test score: {}'.format(tree.score(X_test, y_test)))

Train score: 1.0
Test score: 0.7840144907325658


#### Random Forest

In [40]:
rfr = RandomForestRegressor()

In [41]:
rfr.fit(X_train, y_train)

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [42]:
print('Train score: {}'.format(rfr.score(X_train, y_train)))
print('Test score: {}'.format(rfr.score(X_test, y_test)))

Train score: 0.9659048608965343
Test score: 0.8435520090572739


#### SVM

In [43]:
svr_pipe = SVR(kernel='linear')

In [44]:
svr_pipe.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [45]:
print('Train score: {}'.format(svr_pipe.score(X_train, y_train)))
print('Test score: {}'.format(svr_pipe.score(X_test, y_test)))

Train score: 0.1452443841550879
Test score: 0.11634887847627184


## 3. Use a cross-validated grid search to refine three of the above models

### GridSearchCV on Lasso, Ridge & Random Forest

#### GridSearch Ridge

In [47]:
ridge_param_grid = {'alpha': range(200,500)}

ridge_reg = Ridge()
ridge_grid_search = GridSearchCV(ridge_reg, param_grid = ridge_param_grid,return_train_score=True)
ridge_grid_search.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'alpha': range(200, 500)}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring=None, verbose=0)

In [48]:
ridge_grid_search.best_params_

{'alpha': 438}

In [49]:
print('Train score: {}'.format(ridge_grid_search.score(X_train, y_train)))
print('Test score: {}'.format(ridge_grid_search.score(X_test, y_test)))

Train score: 0.8486762795245364
Test score: 0.7981213091818835


In [50]:
ridge_grid_search.best_score_

0.8009431883809843

### GridSearch Lasso

In [51]:
lasso_params = { 'alpha': [.0001, .001, .01, .1, 1, 10, 100, 1000, 2000, 5000, 10000],
                'max_iter': [100, 1000, 10000, 100000]}

In [54]:
lasso_gs = GridSearchCV(Lasso(), param_grid= lasso_params, n_jobs=-1)

In [55]:
lasso_gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 2000, 5000, 10000], 'max_iter': [100, 1000, 10000, 100000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [56]:
lasso_gs.best_estimator_

Lasso(alpha=5000, copy_X=True, fit_intercept=True, max_iter=100,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [57]:
print('Train score: {}'.format(lasso_gs.score(X_train, y_train)))
print('Test score: {}'.format(lasso_gs.score(X_test, y_test)))

Train score: 0.8139883846267704
Test score: 0.7610933845984743


### Use Random Forest to Predict Kaggle Result

In [119]:
kaggle_df = pd.read_csv('../data/kaggle.csv',header=0,index_col=0)

In [120]:
kaggle_cat_df = kaggle_df.select_dtypes(include=['object'])

In [121]:
kaggle_num_df = kaggle_df.select_dtypes(exclude=['object'])

In [116]:
columns_to_encode = kaggle_cat_df.columns.values

In [117]:
columns_to_scale  = kaggle_num_df.columns.values

In [105]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Instantiate encoder/scaler
scaler = StandardScaler()
ohe    = OneHotEncoder()

# Scale and Encode Separate Columns
scaled_num  = scaler.fit_transform(kaggle_num_df[columns_to_scale]) 


In [126]:
le = LabelEncoder()
ohe_cat_df = kaggle_cat_df.apply(le.fit_transform)


In [127]:
scaled_num_df = pd.DataFrame(scaled_num, columns= columns_to_scale)

In [152]:
kaggle_features_dummies = pd.merge(scaled_num_df,ohe_cat_df,left_index=True, right_index=True)

In [198]:
fea = kaggle_features_dummies.columns.values

In [207]:
kaggle_features_dummies= kaggle_features_dummies.drop(['Bsmtcond'], axis=1)

In [209]:
predicted = rfr.predict(kaggle_features_dummies)

In [211]:
result_df = pd.DataFrame(predicted)

In [212]:
result_df

Unnamed: 0,0
0,290843.3
1,175650.0
2,317436.6
3,305681.1
4,329223.3
5,308423.3
6,231287.8
7,336138.7
8,339916.6
9,320636.6


In [213]:
result_df.to_csv('../data/test_result.csv')