In [17]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from feature_engine import imputation
from feature_engine import encoding

# pipeline utility
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle

# preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures

# hyper-parameter tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# metrics to evaluate models
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# linear models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# tree models
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

# ensemble models
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import GradientBoostingRegressor

# XGBoost
from xgboost import XGBRFRegressor

In [2]:
house_data = pd.read_csv('../data/train.csv', index_col='Id')
X = house_data.drop(labels='SalePrice', axis=1)
Y = house_data['SalePrice']
X.shape, Y.shape

((1460, 79), (1460,))

In [3]:
cv = RepeatedKFold(n_splits=5, n_repeats=10, random_state=64)

In [4]:
base_pipeline = Pipeline(steps=[('most_miss_cat', imputation.CategoricalImputer(variables=['Alley',
                                                                                           'FireplaceQu',
                                                                                           'PoolQC',
                                                                                           'Fence',
                                                                                           'MiscFeature'])),
                                ('least_miss_cat', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['GarageCond',
                                                                                             'GarageType',
                                                                                             'GarageFinish',
                                                                                             'GarageQual',
                                                                                             'BsmtExposure',
                                                                                             'BsmtFinType2',
                                                                                             'BsmtCond',
                                                                                             'BsmtQual',
                                                                                             'BsmtFinType1',
                                                                                             'MasVnrType',
                                                                                             'Electrical'])),
                                ('least_miss_num', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['LotFrontage',
                                                                                             'GarageYrBlt',
                                                                                             'MasVnrArea'])),
                                ('undetected', imputation.RandomSampleImputer(random_state='LotArea',
                                                                              seed='observation')),
                                ('encode', encoding.OrdinalEncoder(
                                    variables=X.select_dtypes(include=object).columns.to_list())),
                                ('scaling', RobustScaler()),
                                ('transform', QuantileTransformer(output_distribution='normal')),
                                ('scale_to_range', MinMaxScaler()),
                                ('feature_selection', SelectKBest(score_func=mutual_info_regression,
                                                                  k=20))]
                         )

In [5]:
X_ = base_pipeline.fit_transform(X, Y)
train_x, test_x, train_y, test_y = train_test_split(X_, Y, test_size=0.2, random_state=48)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1168, 20), (292, 20), (1168,), (292,))

### bagging

In [6]:
L1_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('lasso', Lasso(alpha=12.807147786982478,
                                           max_iter=20687))])
L1_bagging = BaggingRegressor(base_estimator=L1_model)
L1_bagging.get_params()

{'base_estimator__memory': None,
 'base_estimator__steps': [('poly', PolynomialFeatures()),
  ('lasso', Lasso(alpha=12.807147786982478, max_iter=20687))],
 'base_estimator__verbose': False,
 'base_estimator__poly': PolynomialFeatures(),
 'base_estimator__lasso': Lasso(alpha=12.807147786982478, max_iter=20687),
 'base_estimator__poly__degree': 2,
 'base_estimator__poly__include_bias': True,
 'base_estimator__poly__interaction_only': False,
 'base_estimator__poly__order': 'C',
 'base_estimator__lasso__alpha': 12.807147786982478,
 'base_estimator__lasso__copy_X': True,
 'base_estimator__lasso__fit_intercept': True,
 'base_estimator__lasso__max_iter': 20687,
 'base_estimator__lasso__normalize': 'deprecated',
 'base_estimator__lasso__positive': False,
 'base_estimator__lasso__precompute': False,
 'base_estimator__lasso__random_state': None,
 'base_estimator__lasso__selection': 'cyclic',
 'base_estimator__lasso__tol': 0.0001,
 'base_estimator__lasso__warm_start': False,
 'base_estimator': Pi

In [9]:
L1_bagging_GS = RandomizedSearchCV(estimator=L1_bagging,
                                   param_distributions={'n_estimators': np.random.randint(10, 100, 20),
                                                        'max_samples': [0.5, 0.67, 0.75, 0.8, 0.9]},
                                   cv=cv,
                                   scoring={'r2': make_scorer(r2_score),
                                            'me': make_scorer(max_error),
                                            'mse': make_scorer(mean_squared_error)
                                            },
                                   refit='mse',
                                   return_train_score=True,
                                   n_jobs=-1,
                                   n_iter=15)

L1_bagging_GS.fit(train_x, train_y)

RandomizedSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
                   estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                              PolynomialFeatures()),
                                                                             ('lasso',
                                                                              Lasso(alpha=12.807147786982478,
                                                                                    max_iter=20687))])),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'max_samples': [0.5, 0.67, 0.75, 0.8,
                                                        0.9],
                                        'n_estimators': array([30, 45, 18, 27, 39, 26, 45, 79, 66, 33, 53, 65, 90, 84, 18, 38, 18,
       99, 25, 30])},
                   refit='mse', return_train_score=True,
                   scoring={'me': make_s

In [18]:
L1_bagging_GS.best_estimator_, np.mean(L1_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    L1_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                 ('lasso',
                                                  Lasso(alpha=12.807147786982478,
                                                        max_iter=20687))]),
                  max_samples=0.5, n_estimators=18),
 86.15027990703051,
 90.12425790601924)

In [36]:
l1_model = BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                           ('lasso',
                                                            Lasso(alpha=12.807147786982478,
                                                                  max_iter=20687))]),
                            n_estimators=25)

l1_model.fit(train_x, train_y)
r2_score(test_y, l1_model.predict(test_x)), mean_squared_error(test_y, l1_model.predict(test_x), squared=False)

(0.7504144125481214, 39246.38222405792)

In [19]:
pd.to_pickle(pd.DataFrame(L1_bagging_GS.cv_results_),
             '../data/cv_results/ensemble/l1_bagging_gs.pkl')

In [23]:
L2_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('ridge', Ridge(alpha=0.5378845624526254, max_iter=6055))])
L2_bagging = BaggingRegressor(base_estimator=L2_model)
L2_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures()),
  ('ridge', Ridge(alpha=0.5378845624526254, max_iter=6055))],
 'verbose': False,
 'poly': PolynomialFeatures(),
 'ridge': Ridge(alpha=0.5378845624526254, max_iter=6055),
 'poly__degree': 2,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'ridge__alpha': 0.5378845624526254,
 'ridge__copy_X': True,
 'ridge__fit_intercept': True,
 'ridge__max_iter': 6055,
 'ridge__normalize': 'deprecated',
 'ridge__positive': False,
 'ridge__random_state': None,
 'ridge__solver': 'auto',
 'ridge__tol': 0.001}

In [26]:
L2_bagging_GS = RandomizedSearchCV(estimator=L2_bagging,
                                   param_distributions={'n_estimators': np.random.randint(10, 100, 20)},
                                   cv=cv,
                                   scoring={'r2': make_scorer(r2_score),
                                            'me': make_scorer(max_error),
                                            'mse': make_scorer(mean_squared_error)
                                            },
                                   refit='mse',
                                   return_train_score=True,
                                   n_jobs=-1,
                                   n_iter=15)

L2_bagging_GS.fit(train_x, train_y)

RandomizedSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
                   estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                              PolynomialFeatures()),
                                                                             ('ridge',
                                                                              Ridge(alpha=0.5378845624526254,
                                                                                    max_iter=6055))])),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'n_estimators': array([65, 74, 66, 36, 69, 19, 84, 98, 25, 52, 27, 43, 11, 23, 74, 65, 38,
       47, 21, 90])},
                   refit='mse', return_train_score=True,
                   scoring={'me': make_scorer(max_error),
                            'mse': make_scorer(mean_squared_error),
                            'r2': make_scorer(r2_score)

In [27]:
L2_bagging_GS.best_estimator_, np.mean(L2_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    L2_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                 ('ridge',
                                                  Ridge(alpha=0.5378845624526254,
                                                        max_iter=6055))]),
                  n_estimators=11),
 86.09059311980134,
 89.28613115345779)

In [45]:
l2_model = BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                           ('ridge', Ridge(alpha=0.5378845624526254,
                                                                           max_iter=6055))]),
                            n_estimators=11)

l2_model.fit(train_x, train_y)
r2_score(test_y, l2_model.predict(test_x)), mean_squared_error(test_y, l2_model.predict(test_x), squared=False)

(0.7372383025068575, 40269.005765315305)

In [48]:
pd.to_pickle(pd.DataFrame(L2_bagging_GS.cv_results_),
             '../data/cv_results/ensemble/l2_bagging_gs.pkl')

In [6]:
DT_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=4)),
                           ('DT', DecisionTreeRegressor(splitter='best',
                                                        min_samples_split=20,
                                                        min_samples_leaf=21,
                                                        max_features='auto',
                                                        max_depth=36,
                                                        ccp_alpha=6))])
DT_bagging = BaggingRegressor(base_estimator=DT_model)
DT_bagging.get_params()

{'base_estimator__memory': None,
 'base_estimator__steps': [('poly', PolynomialFeatures(degree=4)),
  ('DT',
   DecisionTreeRegressor(criterion='absolute_error', max_depth=13,
                         max_features='auto', max_leaf_nodes=29,
                         min_impurity_decrease=0.01, min_samples_leaf=26,
                         min_samples_split=58))],
 'base_estimator__verbose': False,
 'base_estimator__poly': PolynomialFeatures(degree=4),
 'base_estimator__DT': DecisionTreeRegressor(criterion='absolute_error', max_depth=13,
                       max_features='auto', max_leaf_nodes=29,
                       min_impurity_decrease=0.01, min_samples_leaf=26,
                       min_samples_split=58),
 'base_estimator__poly__degree': 4,
 'base_estimator__poly__include_bias': True,
 'base_estimator__poly__interaction_only': False,
 'base_estimator__poly__order': 'C',
 'base_estimator__DT__ccp_alpha': 0.0,
 'base_estimator__DT__criterion': 'absolute_error',
 'base_estimator__

In [19]:
DT_bagging_GS = GridSearchCV(estimator=DT_bagging,
                             param_grid={'n_estimators': np.arange(10, 20)},
                             cv=cv,
                             scoring={'r2': make_scorer(r2_score),
                                      'me': make_scorer(max_error),
                                      'mse': make_scorer(mean_squared_error)
                                      },
                             refit='mse',
                             return_train_score=True,
                             n_jobs=-1)

DT_bagging_GS.fit(train_x, train_y)

GridSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
             estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                        PolynomialFeatures(degree=4)),
                                                                       ('DT',
                                                                        DecisionTreeRegressor(ccp_alpha=6,
                                                                                              max_depth=36,
                                                                                              max_features='auto',
                                                                                              min_samples_leaf=21,
                                                                                              min_samples_split=20))])),
             n_jobs=-1,
             param_grid={'n_estimators': array([10, 11, 12, 13, 14, 15, 16,

In [20]:
DT_bagging_GS.best_estimator_, np.mean(DT_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    DT_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                  PolynomialFeatures(degree=4)),
                                                 ('DT',
                                                  DecisionTreeRegressor(ccp_alpha=6,
                                                                        max_depth=36,
                                                                        max_features='auto',
                                                                        min_samples_leaf=21,
                                                                        min_samples_split=20))])),
 85.38757435080538,
 91.31868736174906)

In [7]:
dt_model = BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures(degree=4)),
                                                           ('DT', DecisionTreeRegressor(splitter='best',
                                                                                        min_samples_split=58,
                                                                                        min_samples_leaf=26,
                                                                                        min_impurity_decrease=0.01,
                                                                                        max_leaf_nodes=29,
                                                                                        max_features='auto',
                                                                                        max_depth=13,
                                                                                        criterion='absolute_error'))]),
                            n_estimators=100,
                            n_jobs=-1)

dt_model.fit(train_x, train_y)
r2_score(test_y, dt_model.predict(test_x)), mean_squared_error(test_y, dt_model.predict(test_x), squared=False)

(0.8381784546475929, 31601.525643965142)

In [23]:
pd.to_pickle(pd.DataFrame(DT_bagging_GS.cv_results_),
             '../data/cv_results/ensemble/dt_bagging_gs.pkl')

In [13]:
et_model = BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures(degree=4)),
                                                           ('ET', ExtraTreeRegressor(min_samples_split=53,
                                                                                     min_samples_leaf=26,
                                                                                     max_features='auto',
                                                                                     max_depth=66,
                                                                                     ccp_alpha=6))]),
                            n_estimators=120,
                            n_jobs=-1)

et_model.fit(train_x, train_y)
r2_score(test_y, et_model.predict(test_x)), mean_squared_error(test_y, et_model.predict(test_x), squared=False)

(0.8347935384765579, 31930.328922980818)

### Random forest

In [41]:
Forest_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                               ('forest', XGBRFRegressor(n_estimators=100,
                                                         n_jobs=-1))
                               ]
                        )
Forest_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures(degree=3)),
  ('forest',
   XGBRFRegressor(base_score=None, booster=None, colsample_bylevel=None,
                  colsample_bytree=None, enable_categorical=False, gamma=None,
                  gpu_id=None, importance_type=None, interaction_constraints=None,
                  max_delta_step=None, max_depth=None, min_child_weight=None,
                  missing=nan, monotone_constraints=None, n_estimators=100,
                  n_jobs=-1, num_parallel_tree=None, objective='reg:squarederror',
                  predictor=None, random_state=None, reg_alpha=None,
                  scale_pos_weight=None, tree_method=None,
                  validate_parameters=None, verbosity=None))],
 'verbose': False,
 'poly': PolynomialFeatures(degree=3),
 'forest': XGBRFRegressor(base_score=None, booster=None, colsample_bylevel=None,
                colsample_bytree=None, enable_categorical=False, gamma=None,
                gpu_id=None, importance_

In [42]:
forest_grid = {'forest__max_depth': np.random.randint(2, 35, 25)}

forest_GS = RandomizedSearchCV(Forest_model,
                               param_distributions=forest_grid,
                               n_iter=10,
                               scoring={'r2': make_scorer(r2_score),
                                        'mse': make_scorer(mean_squared_error)
                                        },
                               refit='mse',
                               n_jobs=-1,
                               cv=cv,
                               return_train_score=True)

forest_GS.fit(train_x, train_y)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:
temp_RF_depth = pd.DataFrame(forest_GS.cv_results_)
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=[24, 6], dpi=300)
sns.lineplot(x='forest__max_depth', y='mean_train_mse', data=temp_RF_depth, ax=axes[0])
sns.lineplot(x='forest__max_depth', y='mean_train_r2', data=temp_RF_depth, ax=axes[1])
plt.savefig('graphs/random_forest_depth_train.png')
plt.show()

In [None]:
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=[24, 6], dpi=300)
sns.lineplot(x='forest__max_depth', y='mean_test_mse', data=temp_RF_depth, ax=axes[0])
sns.lineplot(x='forest__max_depth', y='mean_test_r2', data=temp_RF_depth, ax=axes[1])
plt.savefig('graphs/random_forest_depth_test.png')
plt.show()

In [52]:
forest_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                               ('forest',
                                XGBRFRegressor(base_score=0.5, booster='gbtree',
                                               colsample_bylevel=1, colsample_bytree=1,
                                               enable_categorical=False, gamma=0, gpu_id=-1,
                                               max_delta_step=0,
                                               max_depth=13, min_child_weight=1,
                                               n_estimators=100,
                                               n_jobs=-1, num_parallel_tree=100,
                                               objective='reg:squarederror',
                                               predictor='auto', random_state=0, reg_alpha=0,
                                               scale_pos_weight=1, tree_method='exact',
                                               validate_parameters=1))])

# (0.8364032786197753, 31774.386618074877)
# (0.8379728446988419, 31621.595670519357)
# (0.837409422097269, 31676.52738924592)
# 13-(0.857763488096932, 29627.52848877842)
# 15-(0.8577252702277984, 29631.50856724573)
forest_model.fit(train_x, train_y)
r2_score(test_y, forest_model.predict(test_x)), mean_squared_error(test_y, forest_model.predict(test_x), squared=False)

(0.857763488096932, 29627.52848877842)

### Gradient boosting tree

In [106]:
GB_model = Pipeline(steps=[('poly', PolynomialFeatures()),
                           ('boosting', GradientBoostingRegressor(n_estimators=200,
                                                                  min_samples_split=46,
                                                                  min_samples_leaf=23,
                                                                  max_depth=80))])
GB_model.fit(train_x, train_y)

Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                ('boosting',
                 GradientBoostingRegressor(max_depth=80, min_samples_leaf=23,
                                           min_samples_split=46,
                                           n_estimators=200))])

In [107]:
r2_score(test_y, GB_model.predict(test_x)) * 100, mean_squared_error(test_y, GB_model.predict(test_x), squared=False)

(70.00713515937946, 43022.817955742175)

### Stacking

In [53]:
Stack_model = StackingRegressor(
    estimators=[('lasso', Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                                          ('lasso', Lasso(alpha=12.807147786982478,
                                                          max_iter=20687))])),
                ('decision_tree',
                 BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures(degree=4)),
                                                                 ('DT',
                                                                  DecisionTreeRegressor(splitter='best',
                                                                                        min_samples_split=58,
                                                                                        min_samples_leaf=26,
                                                                                        min_impurity_decrease=0.01,
                                                                                        max_leaf_nodes=29,
                                                                                        max_features='auto',
                                                                                        max_depth=13,
                                                                                        criterion='absolute_error'))]),
                                  n_estimators=100,
                                  n_jobs=-1)),
                ('forest', Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                                           ('forest',
                                            XGBRFRegressor(base_score=0.5, booster='gbtree',
                                                           colsample_bylevel=1, colsample_bytree=1,
                                                           enable_categorical=False, gamma=0, gpu_id=-1,
                                                           max_delta_step=0,
                                                           max_depth=13, min_child_weight=1,
                                                           n_estimators=100,
                                                           n_jobs=-1, num_parallel_tree=100,
                                                           objective='reg:squarederror',
                                                           predictor='auto', random_state=0, reg_alpha=0,
                                                           scale_pos_weight=1, tree_method='exact',
                                                           validate_parameters=1))]))
                ],
    cv=32,
    n_jobs=-1)
Stack_model.get_params()

{'cv': 32,
 'estimators': [('lasso',
   Pipeline(steps=[('poly', PolynomialFeatures()),
                   ('lasso', Lasso(alpha=12.807147786982478, max_iter=20687))])),
  ('decision_tree',
   BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                    PolynomialFeatures(degree=4)),
                                                   ('DT',
                                                    DecisionTreeRegressor(criterion='absolute_error',
                                                                          max_depth=13,
                                                                          max_features='auto',
                                                                          max_leaf_nodes=29,
                                                                          min_impurity_decrease=0.01,
                                                                          min_samples_leaf=26,
                                  

In [54]:
Stack_model.fit(train_x, train_y)

TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.


In [None]:
r2_score(test_y, Stack_model.predict(test_x)), mean_squared_error(test_y, Stack_model.predict(test_x), squared=False)

## Predictions on Test Data

In [55]:
house_test_data = pd.read_csv('../data/test.csv', index_col='Id')
house_test_data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [56]:
X_t = base_pipeline.transform(house_test_data)
X_t

array([[0.        , 0.47210624, 0.42965303, ..., 0.        , 0.41258743,
        0.61663878],
       [0.        , 0.47210624, 0.49915546, ..., 0.        , 0.41258743,
        0.43137602],
       [0.53092803, 0.52789376, 0.42965303, ..., 1.        , 0.52202823,
        0.50419355],
       ...,
       [0.        , 0.49505129, 0.42965303, ..., 0.        , 0.52202823,
        0.5626834 ],
       [0.59093623, 0.49505129, 0.42965303, ..., 0.52264797, 0.        ,
        0.        ],
       [0.53092803, 0.49505129, 0.56016822, ..., 1.        , 0.64555594,
        0.59182204]])

In [57]:
prediction = pd.DataFrame(data=forest_model.predict(X_t), index=house_test_data.index, columns=['SalePrice'])
prediction

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,115422.648438
1462,154133.515625
1463,178486.187500
1464,185786.062500
1465,194300.468750
...,...
2915,78996.562500
2916,78153.742188
2917,155800.812500
2918,126163.898438


In [58]:
prediction.to_csv('../data/predictions_v_5.csv')