<span style="color:darkblue"><font size="5"> DeCockHousePrice Dataset: SalePrice Prediction</font></span> 
    
  MARS

In [175]:
import pandas as pd
import seaborn as sns; sns.set(color_codes=True)
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
%matplotlib inline
import warnings; warnings.simplefilter('ignore')
import numpy as np
np.random.seed(10)
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import (RandomTreesEmbedding, RandomForestClassifier,
                              GradientBoostingClassifier)
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.pipeline import make_pipeline

In [176]:
df=pd.read_csv('data/train.csv')

# Data cleaning

In [177]:

# Outliers
df = df.drop(df[(df['OverallQual'] < 5)
                                  & (df['SalePrice'] > 200000)].index)
df = df.drop(df[(df['GrLivArea'] > 4000)
                                  & (df['SalePrice'] < 200000)].index)
df = df.drop(df[(df['GarageArea'] > 1200)
                                  & (df['SalePrice'] < 200000)].index)
df = df.drop(df[(df['TotalBsmtSF'] > 3000)
                                  & (df['SalePrice'] > 320000)].index)
df = df.drop(df[(df['1stFlrSF'] < 3000)
                                  & (df['SalePrice'] > 600000)].index)
df = df.drop(df[(df['1stFlrSF'] > 3000)
                                  & (df['SalePrice'] < 200000)].index)

In [178]:
cg_cols = ['Alley', 'PoolQC', 'MiscFeature', 'Fence', 'FireplaceQu', 'GarageType',
    'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtQual', 'BsmtCond',
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'MasVnrType']
dc_cols = ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath',
    'BsmtHalfBath', 'GarageYrBlt', 'GarageArea', 'GarageCars', 'MasVnrArea']
ot_cols = ['Electrical', 'Exterior1st', 'Exterior2nd', 'Functional', 'KitchenQual',
    'SaleType', 'Utilities']
for col in dc_cols:
    df[col].replace(np.nan, 0, inplace=True)

for col in cg_cols:
    df[col].replace(np.nan, 'None', inplace=True)

for col in ot_cols:
    df[col].replace(np.nan, df[col].mode()[0], inplace=True)
    
# Filling MSZoning according to MSSubClass
df['MSZoning'] = df.groupby('MSSubClass')['MSZoning'].apply(
    lambda x: x.fillna(x.mode()[0]))

# Filling LotFrontage according to Neighborhood

df['LotFrontage']=df.groupby(['Neighborhood'])['LotFrontage'].apply(lambda x: x.fillna(x.median()))

In [179]:
df.drop('Id',axis=1,inplace=True)

#binning with rare values
col_rare = ['Condition1', 'Condition2', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
    'Heating', 'Electrical', 'Functional', 'SaleType']

for col in col_rare:
    mask = df[col].isin(
        df[col].value_counts()[df[col].value_counts() < 10].index)
    df[col][mask] = 'Other'

# Features which numerical on data but should be treated as category.
df['MSSubClass'] = df['MSSubClass'].astype(str)

# cyclical feature transformation
from math import pi
df['MoSold']=2*pi*df['MoSold']/df['MoSold'].max()
df["cos_MoSold"] = np.cos(df["MoSold"])
df["sin_MoSold"] = np.sin(df["MoSold"])

In [180]:
from scipy.special import boxcox1p

possible_skewed = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea',
    'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
    'ScreenPorch', 'PoolArea', 'LowQualFinSF', 'MiscVal'
]

# Finding skewness of the numerical features

skew_features = np.abs(df[possible_skewed].apply(lambda x: skew(x)).sort_values(
    ascending=False))

# Filtering skewed features

high_skew = skew_features[skew_features > 0.3]

# Taking indexes of high skew

skew_index = high_skew.index

# Applying boxcox transformation to fix skewness

for i in skew_index:
    df[i] = boxcox1p(df[i], boxcox_normmax(df[i] + 1))

In [181]:
#log transform skewed numeric features:
from scipy.stats import skew,boxcox_normmax
# numeric_feats = df.dtypes[df.dtypes != "object"].index

# skewed_feats = df[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
# skewed_feats = skewed_feats[skewed_feats > 0.75]
# skewed_feats = skewed_feats.index

# df[skewed_feats] = np.log1p(df[skewed_feats])

In [182]:
neigh_map={'MeadowV': 1,'IDOTRR': 1,
    'BrDale': 1, 'BrkSide': 2,
    'OldTown': 2, 'Edwards': 2,
    'Sawyer': 3,    'Blueste': 3,
    'SWISU': 3,    'NPkVill': 3,
    'NAmes': 3,    'Mitchel': 4,
    'SawyerW': 5,    'NWAmes': 5,
    'Gilbert': 5,    'Blmngtn': 5,
    'CollgCr': 5,    'ClearCr': 6,
    'Crawfor': 6,    'Veenker': 7,
    'Somerst': 7,    'Timber': 8,
    'StoneBr': 9,    'NridgHt': 10,
    'NoRidge': 10}
    
df['Neighborhood'] = df['Neighborhood'].map(neigh_map).astype('int')

ext_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['ExterCond'] = df['ExterCond'].map(ext_map).astype('int')
df['ExterQual'] = df['ExterQual'].map(ext_map).astype('int')

bsm_map = {'None': 0, 'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['BsmtQual'] = df['BsmtQual'].map(bsm_map).astype('int')
df['BsmtCond'] = df['BsmtCond'].map(bsm_map).astype('int')
bsmf_map = {'None': 0, 'Unf': 1,'LwQ': 2,'Rec': 3, 'BLQ': 4, 'ALQ': 5,'GLQ': 6}

df['BsmtFinType1'] = df['BsmtFinType1'].map(bsmf_map).astype('int')
df['BsmtFinType2'] = df['BsmtFinType2'].map(bsmf_map).astype('int')
heat_map = {'Po': 1, 'Fa': 2, 'TA': 3, 'Gd': 4, 'Ex': 5}
df['HeatingQC'] = df['HeatingQC'].map(heat_map).astype('int')
df['KitchenQual'] = df['KitchenQual'].map(heat_map).astype('int')
df['FireplaceQu'] = df['FireplaceQu'].map(bsm_map).astype('int')
df['GarageCond'] = df['GarageCond'].map(bsm_map).astype('int')
df['GarageQual'] = df['GarageQual'].map(bsm_map).astype('int')

# Getting dummy variables for nominal categorical features
df = pd.get_dummies(data=df)

In [183]:
df.columns

Index(['LotFrontage', 'LotArea', 'Neighborhood', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond',
       ...
       'SaleType_COD', 'SaleType_New', 'SaleType_Other', 'SaleType_WD',
       'SaleCondition_Abnorml', 'SaleCondition_AdjLand',
       'SaleCondition_Alloca', 'SaleCondition_Family', 'SaleCondition_Normal',
       'SaleCondition_Partial'],
      dtype='object', length=214)

In [184]:
# Features to drop

to_drop = [
    'YrSold',
    'MoSold',
    'ExterQual',
    'BsmtQual',
    'GarageQual',
    'KitchenQual',
    'HeatingQC',
]

# Dropping ML-irrelevant features

df.drop(columns=to_drop, inplace=True)

In [185]:
df.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterCond,BsmtCond,...,SaleType_COD,SaleType_New,SaleType_Other,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,14.056118,9.084455,5,7,5,2003,2003,20.18696,3,3,...,0,0,0,1,0,0,0,0,1,0
1,15.771653,9.213244,7,6,8,1976,1976,0.0,3,3,...,0,0,0,1,0,0,0,0,1,0
2,14.413847,9.373359,5,7,5,2001,2002,18.427529,3,3,...,0,0,0,1,0,0,0,0,1,0
3,13.441145,9.207973,6,7,5,1915,1970,0.0,3,4,...,0,0,0,1,1,0,0,0,0,0
4,16.201359,9.612758,10,8,5,2000,2000,26.525806,3,3,...,0,0,0,1,0,0,0,0,1,0


In [186]:
corr = df.corr()
#Remove one of two features that have a correlation higher than 0.9
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    for j in range(i+1, corr.shape[0]):
        if corr.iloc[i,j] >= 0.9:
            if columns[j]:
                columns[j] = False
selected_columns = df.columns[columns]

df = df[selected_columns]

In [344]:
df.to_csv('house-price-mars.csv',index=False)
df=pd.read_csv('house-price-mars.csv')

In [345]:
df.head()

Unnamed: 0,LotFrontage,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterCond,BsmtCond,...,MiscFeature_TenC,SaleType_COD,SaleType_New,SaleType_Other,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal
0,14.056118,9.084455,5,7,5,2003,2003,20.18696,3,3,...,0,0,0,0,1,0,0,0,0,1
1,15.771653,9.213244,7,6,8,1976,1976,0.0,3,3,...,0,0,0,0,1,0,0,0,0,1
2,14.413847,9.373359,5,7,5,2001,2002,18.427529,3,3,...,0,0,0,0,1,0,0,0,0,1
3,13.441145,9.207973,6,7,5,1915,1970,0.0,3,4,...,0,0,0,0,1,1,0,0,0,0
4,16.201359,9.612758,10,8,5,2000,2000,26.525806,3,3,...,0,0,0,0,1,0,0,0,0,1


In [187]:
import statsmodels.regression.linear_model as sm
X=df.drop('SalePrice',axis=1).values
y=df.loc[:,'SalePrice'].values
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
y=np.log(y)

numVars = X.shape[1]-1
for i in range(0, numVars):
    regressor_OLS = sm.OLS(y, X).fit()
    maxVar = max(regressor_OLS.pvalues).astype(float)
    if maxVar > 0.05:
        for j in range(0, numVars - i):
            if (regressor_OLS.pvalues[j].astype(float) == maxVar):
                X = np.delete(X, j, 1)

In [342]:
X.shape

(1449, 121)

In [189]:
y

array([12.24769432, 12.10901093, 12.31716669, ..., 12.49312952,
       11.86446223, 11.90158345])

# Data transformation

In [174]:
from sklearn.decomposition import PCA
pca = PCA(n_components=60)
X_ = pca.fit_transform(X)
pca.explained_variance_ratio_.sum()

0.9655702524818541

# Model training and evaluation
    SVR versus Ridge

In [252]:
grid_search.best_score_

AttributeError: 'GridSearchCV' object has no attribute 'best_score_'

In [524]:
grid_search.score(X_test,y_test)

0.8227719769328563

In [525]:
y_pred=grid_search.predict(X_test)
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, y_pred))

0.16380566128834084

In [251]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score


# Bias and Variance Analysis and Conclusions

In [100]:
X_[0][0:10]

array([ 1.13516161, -0.75166162,  0.06820775, -1.22487572, -0.67611433,
       -0.36734264, -0.15909281, -0.1327051 ,  0.01944202,  0.11268019])

In [101]:
y[0]

12.247694320220994

In [367]:
from pyearth import Earth

mars = Earth(max_degree=1, penalty=0.1, endspan=5)

In [118]:


from sklearn.model_selection import cross_val_score,GridSearchCV

In [348]:
X_trainval, X_test, y_trainval, y_test = train_test_split(X, y,
                                                    test_size=0.3,
                                                    random_state=123,
                                                    shuffle=True)

In [None]:
grid_search.fit(X_train,y_train)
grid_search.best_params_

In [268]:
grid_search.best_score_

0.911422585988517

In [351]:
abr=AdaBoostRegressor(base_estimator=Earth(max_degree=1, penalty=0.1, endspan=1000))

param_grid = {'learning_rate': [0.01,0.05,0.1,0.5],'n_estimators':[15,25,35],'loss': ['linear','square','exponential']}
grid_search=GridSearchCV(abr,param_grid,cv=5)

grid_search.fit(X_trainval,y_trainval)
grid_search.best_params_

In [353]:
grid_search.best_score_

0.9106686018185044

In [355]:
grid_search.score(X_test,y_test)

0.9155385364876967

In [368]:
# gbrmars=AdaBoostRegressor(base_estimator=mars,)

cross_val_score(mars,X_trainval,y_trainval,cv=5).mean()

# boosted_model = AdaBoostRegressor(base_estimator=mars, n_estimators=25,
#                                   learning_rate=0.1, loss="exponential")

0.9072079977069205

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt
X_trainval,X_

rms = sqrt(mean_squared_error(y_actual, y_predicted))

In [332]:
param_grid = {'penalty': [0.1, 0.5,1,5,10,100],'endspan': [10,20,50,100,1000]}
grid_search_=GridSearchCV(AdaBoostRegressor(),param_grid,cv=5)

grid_search_.fit(X_trainval,y_trainval)

ValueError: Invalid parameter endspan for estimator AdaBoostRegressor(base_estimator=None, learning_rate=1.0, loss='linear',
                  n_estimators=50, random_state=None). Check the list of available parameters with `estimator.get_params().keys()`.

In [260]:
cross_val_score(boosted_model,X_trainval,y_trainval,cv=5).mean()


KeyboardInterrupt: 

In [328]:
boosted_model.fit(X_trainval,y_trainval)

AdaBoostRegressor(base_estimator=Earth(allow_linear=None, allow_missing=False,
                                       check_every=None, enable_pruning=True,
                                       endspan=1000, endspan_alpha=None,
                                       fast_K=None, fast_h=None,
                                       feature_importance_type=None,
                                       max_degree=1, max_terms=None,
                                       min_search_points=None, minspan=None,
                                       minspan_alpha=None, penalty=0.1,
                                       smooth=None, thresh=None, use_fast=None,
                                       verbose=0, zero_tol=None),
                  learning_rate=0.1, loss='exponential', n_estimators=25,
                  random_state=None)

In [360]:
mars.fit(X_trainval,y_trainval)

Earth(allow_linear=None, allow_missing=False, check_every=None,
      enable_pruning=True, endspan=1000, endspan_alpha=None, fast_K=None,
      fast_h=None, feature_importance_type=None, max_degree=1, max_terms=None,
      min_search_points=None, minspan=None, minspan_alpha=None, penalty=0.1,
      smooth=None, thresh=None, use_fast=None, verbose=0, zero_tol=None)

In [361]:
mars.score(X_trainval,y_trainval)

0.919927568573073

In [362]:
mars.score(X_test,y_test)

0.9124554399281934

In [363]:
y_pred=mars.predict(X_test)

In [364]:
from sklearn.metrics import mean_squared_error
rmse=np.sqrt(mean_squared_error(y_test,y_pred))
rmse

0.11896470184567015

In [366]:
y_trainval[0]

12.182804018590819