<span style="color:darkblue"><font size="5"> DeCockHousePrice Dataset: SalePrice Prediction</font></span> 
    
**Linear model, SVM, Trees, Ensembles All together**

In [142]:
import pandas as pd
import seaborn as sns; sns.set(color_codes=True)
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
import warnings; warnings.simplefilter('ignore')
import numpy as np
np.random.seed(10)

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split

# Data

In [261]:
# Load the house-price table; the 'SalePrice' column is used as the target below.
df = pd.read_csv('house_price.csv')

In [206]:
# Feature matrix: every column except the target, as a plain NumPy array.
X = df.drop(columns='SalePrice').values

# Target: natural log of the sale price (all scores below are on the log scale).
y = np.log(df['SalePrice'].values)
y[0]

# Broad Model Selection

In [17]:
# Estimators and model-selection utilities used by the cells below
# (each name is imported exactly once).
from pipelinehelper import PipelineHelper
from sklearn.ensemble import (
    AdaBoostRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

In [208]:
# Hold out a test set using train_test_split's default proportions; the
# remaining "trainval" part is what every cross-validation below works on.
# No random_state is passed here, so the split follows NumPy's global RNG state.
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y)
print(X_trainval.shape)
y_trainval[0]

In [79]:
seedNum = 42

# Candidate tree-based regressors, all with default hyper-parameters and a
# shared random_state so the comparison is repeatable.
models = [
    (label, regressor_cls(random_state=seedNum))
    for label, regressor_cls in (
        ('tree', DecisionTreeRegressor),
        ('abr', AdaBoostRegressor),
        ('rfr', RandomForestRegressor),
        ('etr', ExtraTreesRegressor),
        ('xgbr', XGBRegressor),
        ('gbr', GradientBoostingRegressor),
    )
]

In [182]:
# Compare every candidate with 5-fold cross-validation on the trainval part.
# cross_val_score is called without a `scoring` argument, so each estimator's
# default scorer is used.
results = []   # per-fold scores of each model
names = []     # model labels, aligned index-by-index with `results` / `metrics`
metrics = []   # mean CV score of each model
for name, model in models:
    cv_results = cross_val_score(model, X_trainval, y_trainval, cv=5)
    results.append(cv_results)
    names.append(name)
    metrics.append(cv_results.mean())

    print('%s: %f\n'%(name,cv_results.mean()))

# Pick the (label, estimator) pair with the highest mean score.
best_model = models[metrics.index(max(metrics))]
print(f'The Best Model is:\n {best_model}')

tree: 0.717777

abr: 0.814987

rfr: 0.858804

etr: 0.866519

xgbr: 0.860045

gbr: 0.882266

The Best Model is:
 ('gbr', GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False))


In [81]:
# GradientBoostingRegressor and ExtraTreesRegressor performed similarly in the
# comparison above, so both are tuned together in a single grid search.
# Each grid below supplies a concrete estimator for the 'regressor' step and
# then that estimator's own hyper-parameters via the 'regressor__' prefix.
pipe = Pipeline([('regressor',
                  PipelineHelper([('etr', ExtraTreesRegressor()),
                                  ('gbr', GradientBoostingRegressor())]))])
search_space = [
    {'regressor': [GradientBoostingRegressor()],
     'regressor__n_estimators': [10, 100, 300, 500],
     'regressor__learning_rate': [0.01, 0.1, 1],
     'regressor__min_samples_split': [2, 4, 6],
     'regressor__min_samples_leaf': [1, 2, 4]},
    {'regressor': [ExtraTreesRegressor()],
     'regressor__max_depth': [10, 30, 50, 70],
     'regressor__min_samples_leaf': [1, 2, 4],
     'regressor__n_estimators': [10, 100, 500],
     'regressor__max_features': [1, 2, 3]},
]
# Create grid search
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)
# From here on `best_model` is bound to the result of clf.fit (it no longer
# refers to the (label, estimator) tuple of the comparison cell).
best_model = clf.fit(X_trainval, y_trainval)

In [82]:
# Show the winning estimator that ended up in the pipeline's 'regressor' step.
best_model.best_estimator_.named_steps['regressor']

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=4,
                          min_weight_fraction_leaf=0.0, n_estimators=300,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [100]:
# Mean cross-validated score of the best parameter combination found by the grid search.
print(f'Average R2 score of Cross Validation: {best_model.best_score_}')

Average R2 score of Cross Validation: 0.881784383115401


In [147]:
# Score of the grid-search result on the held-out test split, which took no part in the search.
print(f'Performance on unseen test set: {best_model.score(X_test,y_test)}')

Performance on unseen test set: 0.8993592559773289


# Bias and Variance Analysis

In [198]:
def model_bias_variance(X, y, model, test_size=0.33, random_state=123):
    """Print a bias-variance decomposition of ``model`` under squared-error loss.

    The data is split once into a train and a test part, then mlxtend's
    ``bias_variance_decomp`` is run with ``loss='mse'`` and its three results
    are printed with three decimals.

    Parameters
    ----------
    X, y
        Features and target, handed to ``train_test_split``.
    model
        Estimator passed unchanged to ``bias_variance_decomp``.
    test_size : float, default 0.33
        Fraction of rows held out for the decomposition.
    random_state : int, default 123
        Seed used both for the split and as ``random_seed`` of the
        decomposition.

    Returns
    -------
    None
        The results are only printed.
    """
    from mlxtend.evaluate import bias_variance_decomp

    X_train, X_test, y_train, y_test = train_test_split(
        X, y,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
    )
    avg_expected_loss, avg_bias, avg_var = bias_variance_decomp(
        model, X_train, y_train, X_test, y_test,
        loss='mse',
        random_seed=random_state,
    )

    # NOTE(review): with loss='mse' the "bias" value is presumably the squared
    # bias term of the decomposition -- confirm against the mlxtend docs.
    print('Average expected loss: %.3f' % avg_expected_loss)
    print('Average bias: %.3f' % avg_bias)
    print('Average variance: %.3f' % avg_var)

In [256]:
# Re-create the GradientBoostingRegressor configuration reported by the grid
# search (300 trees, min_samples_split=4).
# `loss` is deliberately left at its default (least squares): the explicit
# spelling loss='ls' is not accepted by newer scikit-learn releases, while the
# default means the same loss in old and new versions.
model = GradientBoostingRegressor(learning_rate=0.1, max_depth=3,
                                  min_samples_leaf=1, min_samples_split=4,
                                  min_weight_fraction_leaf=0.0, n_estimators=300)

print('Bias and Variance of GBR model:')
model_bias_variance(X, y, model=model)

Bias and Variance of GBR model:
Average expected loss: 0.387
Average bias: 0.361
Average variance: 0.026


# Compared with the other models
Performance of **GradientBoostingRegressor** on unseen data:
    $R^2  score=0.8993$

Performance of **SVM** on unseen data: 
    $R^2  score=0.8227$

Performance of **Ridge** on unseen data: 
   $R^2  score=0.8241$

- **GradientBoostingRegressor does much better than the linear model and the SVM**


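- **But bias still exists: the bias term (0.361) makes up almost all of the expected loss (0.387), while the variance is only 0.026**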

# VotingRegressor

   **Could a VotingRegressor do better than GradientBoostingRegressor on its own?** 

In [197]:
from sklearn.ensemble import VotingRegressor

## Searching for best parameters for each model

In [212]:
# Tune ExtraTreesRegressor on its own with a 5-fold grid search.
etr = ExtraTreesRegressor()
param_grid = dict(
    max_depth=[30, 70, 100, None],
    min_samples_leaf=[1, 2, 4],
    n_estimators=[100, 200, 300],
)
grid_search = GridSearchCV(etr, param_grid, cv=5)
grid_search.fit(X_trainval, y_trainval)
grid_search.best_params_

{'max_depth': 100, 'min_samples_leaf': 1, 'n_estimators': 200}

In [193]:
# Tune RandomForestRegressor with a 5-fold grid search.
# Note that `grid_search` is rebound here, replacing the ExtraTrees search object.
rfr = RandomForestRegressor()
param_grid = dict(
    max_depth=[30, 70, 100],
    min_samples_leaf=[1, 2, 4],
    n_estimators=[100, 200, 300],
)
grid_search = GridSearchCV(rfr, param_grid, cv=5)
grid_search.fit(X_trainval, y_trainval)
grid_search.best_score_

0.8596234829469539

In [189]:
# NOTE(review): the output recorded under this cell lists max_depth=50, a value
# that is not in the RandomForest grid above ([30, 70, 100]) -- it appears to be
# left over from an earlier run with a different grid.
grid_search.best_params_

{'max_depth': 50, 'min_samples_leaf': 1, 'n_estimators': 200}

In [194]:
# Best parameter combination of whichever search `grid_search` currently refers
# to -- presumably the RandomForestRegressor search, given the cell order.
grid_search.best_params_

{'max_depth': 70, 'min_samples_leaf': 1, 'n_estimators': 300}

In [195]:
# Tune XGBRegressor with a 5-fold grid search, run on 4 parallel jobs.
# `grid_search` is rebound again, replacing the RandomForest search object.
xgbr = XGBRegressor()
param_grid = dict(
    max_depth=[30, 70, 100],
    learning_rate=[0.01, 0.1, 1],
    n_estimators=[100, 200, 300],
)
grid_search = GridSearchCV(xgbr, param_grid, cv=5, n_jobs=4)
grid_search.fit(X_trainval, y_trainval)
grid_search.best_score_

0.8752651555217407

In [196]:
# Best parameter combination of whichever search `grid_search` currently refers
# to -- presumably the XGBRegressor search, given the cell order.
grid_search.best_params_

{'learning_rate': 0.1, 'max_depth': 70, 'n_estimators': 100}

## Voting regression model training and evaluation

In [214]:
# Ensemble members, each created with explicitly chosen hyper-parameters.
estimators = [
    ('rfr', RandomForestRegressor(max_depth=70,
                                  min_samples_leaf=1,
                                  n_estimators=300)),
    ('xgbr', XGBRegressor(learning_rate=0.1,
                          max_depth=70,
                          n_estimators=100)),
    ('etr', ExtraTreesRegressor(max_depth=100,
                                min_samples_leaf=1,
                                n_estimators=200)),
    ('gbr', GradientBoostingRegressor(n_estimators=300,
                                      learning_rate=0.1,
                                      min_samples_leaf=1,
                                      min_samples_split=4)),
]
# The members are combined by a VotingRegressor, using 4 parallel jobs.
vc = VotingRegressor(estimators=estimators, n_jobs=4)

In [220]:
# Train the ensemble on the trainval part, then score it on the held-out test set.
vc.fit(X_trainval, y_trainval)
vc.score(X_test, y_test)

0.8839619384444075

In [219]:
# Score every fitted ensemble member separately on the held-out test set.
# vc.estimators holds the (label, estimator) pairs given to the constructor and
# vc.estimators_ the fitted members; the two are paired positionally.
for (label, unfitted), fitted in zip(vc.estimators, vc.estimators_):
    print(label, fitted.score(X_test, y_test))

rfr 0.8615918272599266
xgbr 0.8723926126035061
etr 0.8728964317403988
gbr 0.8833058205245841


## Comparing VotingRegressor with GradientBoostingRegressor
Performance of **GradientBoostingRegressor** on unseen data:
    $R^2  score=0.8993$
    
    
Performance of **VotingRegressor** on unseen data:
    $R^2  score=0.8839$
   

<span style="color:darkblue"><font size="6"> Could I do better ??</font></span>


**solution:**
- Try AdaBoostRegressor with MARS (Multivariate Adaptive Regression Splines) as the base estimator instead of a decision tree
- MARS can be viewed as a modification of the CART tree method. It is well suited to high-dimensional problems

# AdaBoostRegressor
    (base_estimator="Multivariate Adaptive Regression Splines")

In [223]:
# Load the table used for the MARS experiments.
df_mars = pd.read_csv('house-price-mars.csv')

In [224]:
# Peek at the first five rows.
df_mars.head(5)

Unnamed: 0,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterCond,BsmtCond,...,MiscFeature_TenC,SaleType_COD,SaleType_New,SaleType_Other,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal
0,14.056118,9.084455,5,7,5,2003,2003,20.18696,3,3,...,0,0,0,0,1,0,0,0,0,1
1,15.771653,9.213244,7,6,8,1976,1976,0.0,3,3,...,0,0,0,0,1,0,0,0,0,1
2,14.413847,9.373359,5,7,5,2001,2002,18.427529,3,3,...,0,0,0,0,1,0,0,0,0,1
3,13.441145,9.207973,6,7,5,1915,1970,0.0,3,4,...,0,0,0,0,1,1,0,0,0,0
4,16.201359,9.612758,10,8,5,2000,2000,26.525806,3,3,...,0,0,0,0,1,0,0,0,0,1


In [225]:
# MARS can be viewed as a combination of a linear model and a tree model, so it
# can benefit from data scaling and feature selection.
import statsmodels.regression.linear_model as sm
from sklearn.preprocessing import MinMaxScaler

Xm = df_mars.drop('SalePrice', axis=1).values
ym = df_mars.loc[:, 'SalePrice'].values

# NOTE(review): the scaler and the p-value based selection below are fitted on
# all rows, i.e. before any train/test split of Xm/ym is made, so rows that are
# later held out still influence both steps.
scaler = MinMaxScaler()
Xm = scaler.fit_transform(Xm)
ym = np.log(ym)

# Backward elimination: refit OLS on the remaining columns, drop the single
# column with the largest p-value, and repeat until every remaining p-value is
# at or below the threshold. Every column (including the last one) is eligible,
# exactly one column is removed per fit, and at least one column is always kept.
P_VALUE_THRESHOLD = 0.05
while Xm.shape[1] > 1:
    regressor_OLS = sm.OLS(ym, Xm).fit()
    pvalues = np.asarray(regressor_OLS.pvalues, dtype=float)
    # NaN p-values are ignored when looking for the worst column; if nothing
    # but NaN is left there is no basis for a further decision.
    if np.all(np.isnan(pvalues)):
        break
    worst = int(np.nanargmax(pvalues))
    if pvalues[worst] <= P_VALUE_THRESHOLD:
        break
    Xm = np.delete(Xm, worst, axis=1)

In [259]:
# Shape of the selected feature matrix and the first (log) target value, one per line.
print(Xm.shape, ym[0], sep='\n')

(1449, 121)
12.247694320220994


In [245]:
# Seeded 70/30 split of the MARS feature matrix and log target.
Xm_trainval, Xm_test, ym_trainval, ym_test = train_test_split(
    Xm, ym, test_size=0.3, random_state=123, shuffle=True
)

In [244]:
from pyearth import Earth

# MARS on its own already does reasonably well: mean score over 5 CV folds.
mars = Earth(max_degree=1, penalty=0.1, endspan=1000)
cross_val_score(mars, Xm_trainval, ym_trainval, cv=5).mean()

0.9070371189213573

In [247]:
# Search for the best AdaBoostRegressor parameters, boosting the MARS model
# defined above, with a 5-fold grid search.
abr = AdaBoostRegressor(base_estimator=mars)
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1, 0.5],
    n_estimators=[15, 25, 35],
    loss=['linear', 'square', 'exponential'],
)
grid_search = GridSearchCV(abr, param_grid, cv=5)

grid_search.fit(Xm_trainval, ym_trainval)
grid_search.best_params_

{'learning_rate': 0.01, 'loss': 'exponential', 'n_estimators': 25}

In [248]:
# Performance on unseen data: score of the tuned model on the held-out MARS test split.
grid_search.score(Xm_test, ym_test)

0.9160350939035947

In [253]:
# Root-mean-squared error on the held-out rows (targets are log prices).
ym_pred = grid_search.predict(Xm_test)
mse = mean_squared_error(ym_test, ym_pred)
rmse = np.sqrt(mse)
# This score is around the top 20% level in Kaggle
rmse

0.11650711319447676

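**Ranking of all the models applied to this dataset so far** (note: the AdaBoostRegressor(mars) score comes from `house-price-mars.csv` with its own 70/30 split, so it was not measured on exactly the same test rows as the GradientBoostingRegressor score)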

Performance of **AdaBoostRegressor(mars)** on unseen data:
    $R^2  score=0.9160$
    
Performance of **GradientBoostingRegressor(tree)** on unseen data:
    $R^2  score=0.8993$

Performance of **SVM** on unseen data: 
    $R^2  score=0.8227$

Performance of **Ridge** on unseen data: 
   $R^2  score=0.8241$