### House Prices: Advanced Regression Techniques

Predict sales prices and practice feature engineering, RFs, and gradient boosting

In [25]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv('train.csv')
train = train.set_index('Id')
test = pd.read_csv('test.csv')
test = test.set_index('Id')

In [3]:
train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


### Cleaning Dataset and Removing Missingness

In [4]:
#Putting training and testing dataset together to evaluate NA's and Engineer Features
test['SalePrice'] = -1
full_set = pd.concat([train,test],axis=0)
full_set = full_set[train.columns]

In [5]:
#Columns with at least one NA value
nas_train = np.sum(train.isna()).reset_index()
nas_train.columns = ['feature', 'NAs_Train']
nas_train.set_index('feature', inplace=True)

nas_test = np.sum(test.isna()).reset_index()
nas_test.columns = ['feature', 'NAs_Test']
nas_test.set_index('feature', inplace=True)

nas_total = pd.concat([nas_train, nas_test], axis='columns')
nas_total['Total'] = nas_total['NAs_Test']+nas_total['NAs_Train']
nas_total = nas_total[nas_total['Total']>0].sort_values('Total',ascending=False)
print(nas_total)

              NAs_Train  NAs_Test  Total
feature                                 
PoolQC             1453      1456   2909
MiscFeature        1406      1408   2814
Alley              1369      1352   2721
Fence              1179      1169   2348
FireplaceQu         690       730   1420
LotFrontage         259       227    486
GarageFinish         81        78    159
GarageQual           81        78    159
GarageCond           81        78    159
GarageYrBlt          81        78    159
GarageType           81        76    157
BsmtExposure         38        44     82
BsmtCond             37        45     82
BsmtQual             37        44     81
BsmtFinType2         38        42     80
BsmtFinType1         37        42     79
MasVnrType            8        16     24
MasVnrArea            8        15     23
MSZoning              0         4      4
BsmtFullBath          0         2      2
BsmtHalfBath          0         2      2
Functional            0         2      2
Utilities       

I will not use columns with plenty of NA's since they don't provide a lot of information.
Variables with a very small proportion, specially in the testing dataset I will fill them up with the most common value (median).

In [6]:
columns_to_remove = nas_total.index[0:6]
columns_to_fill = nas_total.index[6:]
print("Columns to Remove: ",columns_to_remove)
print("\nColumns to Fill: ",columns_to_fill)

Columns to Remove:  Index(['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu',
       'LotFrontage'],
      dtype='object', name='feature')

Columns to Fill:  Index(['GarageFinish', 'GarageQual', 'GarageCond', 'GarageYrBlt', 'GarageType',
       'BsmtExposure', 'BsmtCond', 'BsmtQual', 'BsmtFinType2', 'BsmtFinType1',
       'MasVnrType', 'MasVnrArea', 'MSZoning', 'BsmtFullBath', 'BsmtHalfBath',
       'Functional', 'Utilities', 'GarageArea', 'GarageCars', 'Electrical',
       'KitchenQual', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1',
       'Exterior2nd', 'Exterior1st', 'SaleType'],
      dtype='object', name='feature')


__Final dataframe with cleaned data.__

In [7]:
clean_df = full_set.drop(columns_to_remove, axis=1)
for feature in columns_to_fill:
    mode_value = clean_df[feature].mode()[0]
    clean_df[feature].fillna(value=mode_value,inplace=True)
print(clean_df.shape)
print(full_set.shape)

(2919, 74)
(2919, 80)


__Final dataframe with cleaned data and hot encoding.__

In [8]:
numeric_columns = ['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','BsmtFinSF1','BsmtFinSF2','BsmtUnfSF','TotalBsmtSF','1stFlrSF','2ndFlrSF','LowQualFinSF','GrLivArea','GarageYrBlt','GarageArea','WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MiscVal']
categorical_columns = ['MSSubClass','MSZoning','Street','LotShape','LandContour','Utilities','LotConfig','LandSlope','Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','Exterior1st','Exterior2nd','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2','Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','GarageType','GarageFinish','GarageQual','GarageCond','PavedDrive','SaleType','SaleCondition']
data = pd.get_dummies(clean_df, prefix_sep='_', columns=categorical_columns, drop_first=False)

### Training Set and Testing Set

In [9]:
yTr = data.SalePrice[data['SalePrice']>=0]
xTr = data[data['SalePrice']>=0].drop(['SalePrice'], axis=1)
xTe = data[data['SalePrice']==-1].drop(['SalePrice'], axis=1)

#x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state = 0)

### 1. Basic Linear Regression Model

In [12]:
ols = linear_model.LinearRegression()
area_columns = ['GrLivArea','LotArea','TotalBsmtSF','OverallQual']
x = xTr[area_columns]
y = yTr[:]
ols.fit(x, y)
print("Accuracy: ",round(ols.score(x, y),3))
print("MSE: ",round(mean_squared_error(ols.predict(x),y)**.5))
ols_scores = -cross_val_score(ols, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Cross-Val Score =",round(np.mean(ols_scores**.5),3))

Accuracy:  0.748
MSE:  39856.0
Cross-Val Score = 39534.779


### 2. Saturated Linear Regression Model

In [13]:
ols = linear_model.LinearRegression()
x = xTr
y = yTr[:]
ols.fit(x, y)
ols_scores = -cross_val_score(ols, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(ols.score(x, y),3))
print("MSE: ",round(mean_squared_error(ols.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(ols_scores**.5),3))

Accuracy:  0.931
MSE:  20914.0
Cross-Val Score = 32552.147


### 3. Ridge Regression Model

In [16]:
x = xTr[:]
y = yTr[:]
ridge = linear_model.RidgeCV(alphas=np.arange(0.05,4,0.01),normalize=True)
ridge.fit(x, y)
print("Best alpha = ",ridge.alpha_)
ridge = linear_model.Ridge(ridge.alpha_,normalize=True)
ridge.fit(x, y)
ridge_scores = -cross_val_score(ridge, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(ridge.score(x, y),3))
print("MSE: ",round(mean_squared_error(ridge.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(ridge_scores**.5),3))

Best alpha =  0.44000000000000006
Accuracy:  0.91
MSE:  23799.0
Cross-Val Score = 29129.06


### 4. Lasso Regression Model

In [17]:
x = xTr[:]
y = yTr[:]

lasso = linear_model.LassoCV(eps=0.001, n_alphas=100, cv=5, normalize=True)

lasso.fit(x, y)
print("Best alpha = ",lasso.alpha_)
alpha=lasso.alpha_
lasso = linear_model.Lasso(alpha=lasso.alpha_, normalize=True)
lasso.fit(x, y)
lasso_scores = -cross_val_score(lasso, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(lasso.score(x, y),3))
print("MSE: ",round(mean_squared_error(lasso.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(lasso_scores**.5),3))

Best alpha =  23.3028608239031
Accuracy:  0.916
MSE:  23076.0
Cross-Val Score = 29633.715


### 5. CART Regression Tree

In [444]:
x = xTr[:]
y = yTr[:]

In [20]:
for i in range(3,16):
    tree = DecisionTreeRegressor(max_depth=i)
    tree.fit(x, y)
    tree_scores = -cross_val_score(tree, x, y, scoring='neg_mean_squared_error', cv = 10)
    print("Tree Depth = ",i,"\tAccuracy: ",round(tree.score(x, y),3),"\tMSE: ",round(mean_squared_error(tree.predict(x),y)**.5),"\tCross-Val Score =",round(np.mean(tree_scores**.5),3))

Tree Depth =  3 	Accuracy:  0.744 	MSE:  40142.0 	Cross-Val Score = 42869.114
Tree Depth =  4 	Accuracy:  0.81 	MSE:  34632.0 	Cross-Val Score = 40761.703
Tree Depth =  5 	Accuracy:  0.862 	MSE:  29523.0 	Cross-Val Score = 39361.282
Tree Depth =  6 	Accuracy:  0.903 	MSE:  24705.0 	Cross-Val Score = 38161.829
Tree Depth =  7 	Accuracy:  0.937 	MSE:  19998.0 	Cross-Val Score = 36656.998
Tree Depth =  8 	Accuracy:  0.959 	MSE:  16099.0 	Cross-Val Score = 38449.188
Tree Depth =  9 	Accuracy:  0.973 	MSE:  13014.0 	Cross-Val Score = 37791.391
Tree Depth =  10 	Accuracy:  0.983 	MSE:  10265.0 	Cross-Val Score = 36411.123
Tree Depth =  11 	Accuracy:  0.989 	MSE:  8237.0 	Cross-Val Score = 39434.644
Tree Depth =  12 	Accuracy:  0.993 	MSE:  6726.0 	Cross-Val Score = 39371.547
Tree Depth =  13 	Accuracy:  0.995 	MSE:  5422.0 	Cross-Val Score = 38475.852
Tree Depth =  14 	Accuracy:  0.997 	MSE:  4306.0 	Cross-Val Score = 38411.309
Tree Depth =  15 	Accuracy:  0.998 	MSE:  3217.0 	Cross-Val Scor

### 6. Gradient Boosting Regression

In [38]:
x = xTr[:]
y = yTr[:]
boost = GradientBoostingRegressor(n_estimators=500)
boost.fit(x, y)
boost_scores = -cross_val_score(boost, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(boost.score(x, y),3))
print("MSE: ",round(mean_squared_error(boost.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(boost_scores**.5),3))

Accuracy:  0.992
MSE:  6974.0
Cross-Val Score = 25309.239


__GridSearch for Random Forest Regressor__

In [42]:
param_grid = [
    {'learning_rate':[0.05,0.1],'max_depth':[3,4,5,6,7,8]}
            ]
boost = GradientBoostingRegressor()
grid_search = GridSearchCV(boost,param_grid, cv=5, scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(x,y)
print(grid_search.best_params_)

{'learning_rate': 0.1, 'max_depth': 3}


In [43]:
#Best parameters
grid_search.best_estimator_

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [44]:
cvres = grid_search.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print((-mean_score)**.5,params)

28388.14259399001 {'learning_rate': 0.05, 'max_depth': 3}
27562.427075590207 {'learning_rate': 0.05, 'max_depth': 4}
28813.062190788824 {'learning_rate': 0.05, 'max_depth': 5}
29661.471357600894 {'learning_rate': 0.05, 'max_depth': 6}
31394.432766104946 {'learning_rate': 0.05, 'max_depth': 7}
32458.05765940475 {'learning_rate': 0.05, 'max_depth': 8}
26608.927362337923 {'learning_rate': 0.1, 'max_depth': 3}
26711.93213425203 {'learning_rate': 0.1, 'max_depth': 4}
28899.555772452735 {'learning_rate': 0.1, 'max_depth': 5}
28487.703691805953 {'learning_rate': 0.1, 'max_depth': 6}
31752.34034280946 {'learning_rate': 0.1, 'max_depth': 7}
32425.004676546087 {'learning_rate': 0.1, 'max_depth': 8}


### 7. Random Forest Regressor

In [28]:
x = xTr[:]
y = yTr[:]
forest = RandomForestRegressor(n_estimators=100)
forest.fit(x, y)
forest_scores = -cross_val_score(forest, x, y, scoring='neg_mean_squared_error', cv = 10)
print("Accuracy: ",round(forest.score(x, y),3))
print("MSE: ",round(mean_squared_error(forest.predict(x),y)**.5))
print("Cross-Val Score =",round(np.mean(forest_scores**.5),3))

Accuracy:  0.977
MSE:  11921.0
Cross-Val Score = 29292.802


__GridSearch for Random Forest Regressor__

In [35]:
param_grid = [
    {'n_estimators':[25,30,35,40,45,50],'max_features':[6,8,10,12,14,16,18,20]},
    {'bootstrap':[False],'n_estimators':[30,35,40,45,50,55,60,65,70],'max_features':[5,7,9,11,13,15,17,19]}
            ]
forest = RandomForestRegressor()
grid_search = GridSearchCV(forest,param_grid, cv=5, scoring='neg_mean_squared_error',return_train_score=True)
grid_search.fit(x,y)
print(grid_search.best_params_)

{'bootstrap': False, 'max_features': 17, 'n_estimators': 60}

In [36]:
#Best parameters
grid_search.best_estimator_

RandomForestRegressor(bootstrap=False, criterion='mse', max_depth=None,
                      max_features=17, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=60,
                      n_jobs=None, oob_score=False, random_state=None,
                      verbose=0, warm_start=False)

In [37]:
cvres = grid_search.cv_results_
for mean_score,params in zip(cvres["mean_test_score"],cvres["params"]):
    print((-mean_score)**.5,params)

34449.577630774496 {'max_features': 6, 'n_estimators': 25}
34460.817556293616 {'max_features': 6, 'n_estimators': 30}
33450.458625918094 {'max_features': 6, 'n_estimators': 35}
33921.59288666305 {'max_features': 6, 'n_estimators': 40}
33892.11707697234 {'max_features': 6, 'n_estimators': 45}
33546.85471480323 {'max_features': 6, 'n_estimators': 50}
33395.3350962573 {'max_features': 8, 'n_estimators': 25}
33099.82270829735 {'max_features': 8, 'n_estimators': 30}
32086.11621355113 {'max_features': 8, 'n_estimators': 35}
32616.420373985315 {'max_features': 8, 'n_estimators': 40}
32981.88257731737 {'max_features': 8, 'n_estimators': 45}
32517.668661002073 {'max_features': 8, 'n_estimators': 50}
33450.37010578902 {'max_features': 10, 'n_estimators': 25}
32967.23083167836 {'max_features': 10, 'n_estimators': 30}
32952.7189136646 {'max_features': 10, 'n_estimators': 35}
33151.95486188233 {'max_features': 10, 'n_estimators': 40}
32073.92979185123 {'max_features': 10, 'n_estimators': 45}
32853.