In [39]:
import numpy as np
import sys
import pandas as pd
import matplotlib
from matplotlib import pyplot as plt
import seaborn as sns

%matplotlib inline 

In [40]:
import sklearn
import sklearn.ensemble
import sklearn.feature_selection
import sklearn.linear_model
import sklearn.model_selection
import sklearn.tree
import sklearn.utils

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.model_selection import GridSearchCV


In [41]:
from collections import OrderedDict
import xgboost as xgb

In [42]:
df = pd.read_csv('./data/all_data_CLEAN.csv', index_col=0)

print(df.shape)
#df.head()

data = df.copy()

data['MSSubClass'] = data['MSSubClass'].astype(str)
# data['OverallQual'] = data['OverallQual'].astype(str)
# data['OverallCond'] = data['OverallCond'].astype(str)

(2552, 81)


In [43]:
ord_colnames = ['LotShape', 'LandSlope', 'OverallQual', 'OverallCond',
                'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
                'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
                'KitchenQual', 'Functional', 'FireplaceQu', 'GarageFinish',
                'GarageQual', 'GarageCond', 'PoolQC', 'Fence']
ord_dict = {}
ord_dict['LotShape'] = ['IR3', 'IR2', 'IR1', 'Reg']
ord_dict['LandSlope'] = ['Sev', 'Mod', 'Gtl']
ord_dict['OverallQual'] = [np.nan, 1,2,3,4,5,6,7,8,9,10]
ord_dict['OverallCond'] = [np.nan, 1,2,3,4,5,6,7,8,9,10]
ord_dict['ExterQual'] = [np.nan,'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['ExterCond'] = [np.nan, 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['BsmtQual'] = ['NoneListed', 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['BsmtCond'] = ['NoneListed', 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['BsmtExposure'] = ['NoneListed', 'No', 'Mn', 'Av', 'Gd']
ord_dict['BsmtFinType1'] = ['NoneListed', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
ord_dict['BsmtFinType2'] = ['NoneListed', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']
ord_dict['HeatingQC'] = [np.nan, 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['KitchenQual'] = [np.nan, 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['Functional'] = ['Sal', 'Sev', 'Maj2', 'Maj1', 'Mod', 'Min2', 'Min1', 'Typ']
ord_dict['FireplaceQu'] = ['NoneListed', 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['GarageFinish'] = ['NoneListed', 'Unf', 'RFn', 'Fin']
ord_dict['GarageQual'] = ['NoneListed', 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['GarageCond'] = ['NoneListed', 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['PoolQC'] = ['NoneListed', 'Po', 'Fa','TA', 'Gd', 'Ex']
ord_dict['Fence'] = ['NoneListed', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']

In [44]:
# Trasforms Labels to Numbers
for i in range(len(ord_colnames)):
    data[ord_colnames[i]] = data[ord_colnames[i]].apply(lambda x: ord_dict[ord_colnames[i]].index(x))

# Trasforms Numbers back to Labels
# for i in range(len(ord_colnames)):
#     ames_housing_X[ord_colnames[i]] = ames_housing_X[ord_colnames[i]].apply(lambda x: ord_dict[ord_colnames[i]][x])

In [45]:
nums = ['PID', 'SalePrice','SalePricePerGLA','GrLivArea','LotFrontage', 'LotArea', 
          'YearBuilt', 'YearRemodAdd','MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 
          'BsmtUnfSF', 'TotalBsmtSF','1stFlrSF','2ndFlrSF', 'LowQualFinSF', 
          'BsmtFullBath', 'BsmtHalfBath', 'FullBath','HalfBath', 'BedroomAbvGr', 
          'KitchenAbvGr','TotRmsAbvGrd', 'Fireplaces','GarageYrBlt', 'GarageCars', 
          'GarageArea', 'WoodDeckSF', 'OpenPorchSF','EnclosedPorch', '3SsnPorch', 
          'ScreenPorch', 'PoolArea', 'MiscVal' , 'MoSold', 'YrSold']


In [46]:
cats_ords = list(set(data.columns) - set(data[nums].columns))

cats = list(set(data[cats_ords].columns) - set(data[ord_colnames].columns))
print("Nums len: ", len(nums),
      "\nOrds len: ", len(ord_colnames),
      "\nCats len: ", len(cats),
      "\nOG shape: ", len(df.columns))
enc_cats_df = pd.get_dummies(data[cats],drop_first=True)

nums_df = data[nums]

ords_df = data[ord_colnames]

#enc_cats_df.head(5)


Nums len:  36 
Ords len:  20 
Cats len:  25 
OG shape:  81


In [47]:
print("nums: ", nums_df.shape)
print("ords: ", ords_df.shape)
nums_ords = nums_df.join(ords_df)
print("nums_ords: ", nums_ords.shape)
#nums_ords.head()
print("enc_cats: ", enc_cats_df.shape)
full_df = nums_ords.join(enc_cats_df)
print("full_df: ", full_df.shape)

nums:  (2552, 36)
ords:  (2552, 20)
nums_ords:  (2552, 56)
enc_cats:  (2552, 163)
full_df:  (2552, 219)


In [48]:
y_SP = full_df['SalePrice']
y_SPPGLA = full_df['SalePricePerGLA']
X_GLA = full_df['GrLivArea']

print("With SP, GLA, and SPPGLA: ", full_df.shape)


full_df.drop(['SalePrice'], axis=1, inplace=True)
full_df.drop(['SalePricePerGLA'], axis=1, inplace=True)
full_df.drop(['GrLivArea'], axis=1, inplace=True)

print("Without SP, GLA, or SPPGLA: ", full_df.shape)

# Test data set sizes
print("SP shape: ",y_SP.shape)
print("SP shape: ",y_SPPGLA.shape)

With SP, GLA, and SPPGLA:  (2552, 219)
Without SP, GLA, or SPPGLA:  (2552, 216)
SP shape:  (2552,)
SP shape:  (2552,)


In [49]:
from sklearn.metrics import mean_squared_error

xgbr_1 = xgb.XGBRegressor(objective='reg:squarederror')

In [50]:
# GridSearch tuning
params_1 = {
    'max_depth': [3, 4, 5],
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1],
    'colsample_bytree': [0.5, 0.75, 1]
#    'subsample': [0.4, 0.8, 1]
}

gscv_1 = GridSearchCV(estimator = xgbr_1,
                      param_grid = params_1,
                      scoring = 'neg_mean_squared_error',
                      n_jobs=6, 
                      verbose=10
                  )

gscv_1.fit(full_df, y_SPPGLA)


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 3 folds for each of 81 candidates, totalling 243 fits


[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    1.4s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    2.6s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:    4.7s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    7.3s
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    9.6s
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   12.5s
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:   16.5s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:   20.8s
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:   24.8s
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:   29.3s
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:   35.8s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:   43.0s
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:   50.3s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   59.4s
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.3min
[Parallel(n_jobs=6)]: Do

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=6,
       param_grid={'max_depth': [3, 4, 5], 'n_estimators': [100, 500, 1000], 'learning_rate': [0.01, 0.05, 0.1], 'colsample_bytree': [0.5, 0.75, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=10)

In [51]:
print("Best parameters: ", gscv_1.best_params_)
print("Lowest RMSE: ", (-gscv_1.best_score_)**(1/2.0))


Best parameters:  {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 4, 'n_estimators': 1000}
Lowest RMSE:  12.282375381607208


In [52]:
# using the best params found via GSCV_1
best_xgbr_1 = xgb.XGBRegressor(colsample_bytree = 0.5,
                              learning_rate = 0.05,
                              max_depth = 4,
                              n_estimators = 1000)
# best_xgbr_1.fit(X_train, y_train)

# best_preds_1 = best_xgbr_1.predict(X_test)

In [53]:
### attempting cross validation, 5-fold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

kfold = KFold(n_splits=5, shuffle=True)

scores = cross_val_score(best_xgbr_1, full_df, y_SPPGLA, cv=kfold)
print("Mean cross-validation score: %.5f" % scores.mean())



  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \




  if getattr(data, 'base', None) is not None and \


Mean cross-validation score: 0.84013


In [54]:
# GridSearch tuning part 2
params_2 = {
    'max_depth': [3, 5, 7],
    'n_estimators': [100, 500, 1000],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'colsample_bytree': [0.5, 0.75, 1],
    'gamma': [0.5, 1, 1.5, 2],
    'subsample': [0.4, 0.8, 1]
}

gscv_2 = GridSearchCV(estimator = xgbr_1,
                      param_grid = params_2,
                      scoring = 'neg_mean_squared_error',
                      n_jobs=6, 
                      verbose=10
                  )

gscv_2.fit(full_df, y_SPPGLA)

[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.


Fitting 3 folds for each of 1296 candidates, totalling 3888 fits


[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    0.2s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    0.3s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:    2.0s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:    5.2s
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    7.3s
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   10.0s
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:   16.8s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:   18.8s
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:   29.0s
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:   32.5s
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:   38.7s
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:   41.6s
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:   50.9s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   56.4s
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done 188 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Do

GridSearchCV(cv='warn', error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=6,
       param_grid={'max_depth': [3, 5, 7], 'n_estimators': [100, 500, 1000], 'learning_rate': [0.01, 0.05, 0.1, 0.2], 'colsample_bytree': [0.5, 0.75, 1], 'gamma': [0.5, 1, 1.5, 2], 'subsample': [0.4, 0.8, 1]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=10)

In [55]:
print("Best parameters: ", gscv_2.best_params_)
print("Lowest RMSE: ", (-gscv_2.best_score_)**(1/2.0))

Best parameters:  {'colsample_bytree': 1, 'gamma': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 1000, 'subsample': 0.4}
Lowest RMSE:  12.006054834056126


In [65]:
best_xgbr_2 = xgb.XGBRegressor(objective='reg:squarederror',
                               colsample_bytree = 0.1,
                               gamma = 0.5,
                              learning_rate = 0.05,
                              max_depth = 3,
                              n_estimators = 1000,
                              subsample = 0.4)

In [57]:
kfold = KFold(n_splits=5, shuffle=True)

scores = cross_val_score(best_xgbr_2, full_df, y_SPPGLA, cv=kfold)
print("Mean cross-validation score: %.5f" % scores.mean())

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


Mean cross-validation score: 0.83624


In [60]:
params_3 = {
    "learning_rate": [0.05], #, 0.5, 0.75, 1],
    'n_estimators': [1000], # 100, 500
    'gamma': [0.5], #1, 1.5, 2, 5],
    'max_depth': [3, 6, 9],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.4], #0.75, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'reg_alpha': [1, 2, 3],
    'reg_lambda': [1, 2, 3]
    }

gscv_3 = GridSearchCV(estimator = xgbr_1,
                      param_grid = params_3,
                      scoring = 'neg_mean_squared_error',
                      n_jobs=6, 
                      verbose=10,
                      cv = 10
                  )

gscv_3.fit(full_df, y_SPPGLA)

Fitting 10 folds for each of 243 candidates, totalling 2430 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done   1 tasks      | elapsed:    5.8s
[Parallel(n_jobs=6)]: Done   6 tasks      | elapsed:    6.0s
[Parallel(n_jobs=6)]: Done  13 tasks      | elapsed:   15.2s
[Parallel(n_jobs=6)]: Done  20 tasks      | elapsed:   20.0s
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   25.0s
[Parallel(n_jobs=6)]: Done  38 tasks      | elapsed:   34.4s
[Parallel(n_jobs=6)]: Done  49 tasks      | elapsed:   44.3s
[Parallel(n_jobs=6)]: Done  60 tasks      | elapsed:   49.7s
[Parallel(n_jobs=6)]: Done  73 tasks      | elapsed:  1.1min
[Parallel(n_jobs=6)]: Done  86 tasks      | elapsed:  1.2min
[Parallel(n_jobs=6)]: Done 101 tasks      | elapsed:  1.4min
[Parallel(n_jobs=6)]: Done 116 tasks      | elapsed:  1.6min
[Parallel(n_jobs=6)]: Done 133 tasks      | elapsed:  1.8min
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:  2.0min
[Parallel(n_jobs=6)]: Done 169 tasks      | elapsed:  2.3min
[Parallel(

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0,
       importance_type='gain', learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:squarederror',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=None, subsample=1, verbosity=1),
       fit_params=None, iid='warn', n_jobs=6,
       param_grid={'learning_rate': [0.05], 'n_estimators': [1000], 'gamma': [0.5], 'max_depth': [3, 6, 9], 'min_child_weight': [1, 5, 10], 'subsample': [0.4], 'colsample_bytree': [0.6, 0.8, 1.0], 'reg_alpha': [1, 2, 3], 'reg_lambda': [1, 2, 3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_squared_error', verbose=10)

In [62]:
print("Best parameters: ", gscv_3.best_params_)
print("Lowest RMSE: ", (-gscv_3.best_score_)**(1/2.0))

Best parameters:  {'colsample_bytree': 1.0, 'gamma': 0.5, 'learning_rate': 0.05, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 1000, 'reg_alpha': 1, 'reg_lambda': 3, 'subsample': 0.4}
Lowest RMSE:  11.592833640273456


In [64]:
best_xgbr_3 = xgb.XGBRegressor(objective='reg:squarederror',
                               colsample_bytree = 1,
                               gamma = 0.5,
                               learning_rate = 0.05,
                               max_depth = 3,
                               min_child_weight = 10,
                               n_estimators = 1000,
                               reg_alpha = 1,
                               reg_lambda = 3,
                               subsample = 0.4)

In [67]:
kfold = KFold(n_splits=10, shuffle=True)

scores = cross_val_score(best_xgbr_3, full_df, y_SPPGLA, cv=kfold)
print("Mean cross-validation score: %.5f" % scores.mean())

  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \
  if getattr(data, 'base', None) is not None and \


Mean cross-validation score: 0.85134
