In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from feature_engine import imputation
from feature_engine import encoding

# pipeline utility
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures

# hyper-parameter tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# metrics to evaluate models
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# linear models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# tree models
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

# ensemble models
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
house_data = pd.read_csv('../data/train.csv', index_col='Id')
X = house_data.drop(labels='SalePrice', axis=1)
Y = house_data['SalePrice']
X.shape, Y.shape

((1460, 79), (1460,))

In [4]:
cv = RepeatedKFold(n_splits=6, n_repeats=12, random_state=64)

In [3]:
base_pipeline = Pipeline(steps=[('most_miss_cat', imputation.CategoricalImputer(variables=['Alley',
                                                                                           'FireplaceQu',
                                                                                           'PoolQC',
                                                                                           'Fence',
                                                                                           'MiscFeature'])),
                                ('least_miss_cat', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['GarageCond',
                                                                                             'GarageType',
                                                                                             'GarageFinish',
                                                                                             'GarageQual',
                                                                                             'BsmtExposure',
                                                                                             'BsmtFinType2',
                                                                                             'BsmtCond',
                                                                                             'BsmtQual',
                                                                                             'BsmtFinType1',
                                                                                             'MasVnrType',
                                                                                             'Electrical'])),
                                ('least_miss_num', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['LotFrontage',
                                                                                             'GarageYrBlt',
                                                                                             'MasVnrArea'])),
                                ('undetected', imputation.RandomSampleImputer(random_state='LotArea',
                                                                              seed='observation')),
                                ('encode', encoding.OrdinalEncoder(
                                    variables=X.select_dtypes(include=object).columns.to_list())),
                                ('scaling', RobustScaler()),
                                ('transform', QuantileTransformer(output_distribution='normal')),
                                ('scale_to_range', MinMaxScaler()),
                                ('feature_selection', SelectKBest(score_func=mutual_info_regression,
                                                                  k=20))]
                         )

In [5]:
X_ = base_pipeline.fit_transform(X, Y)
train_x, test_x, train_y, test_y = train_test_split(X_, Y, test_size=0.2, random_state=48)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1168, 20), (292, 20), (1168,), (292,))

### bagging

In [8]:
L1_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('lasso', Lasso(alpha=12.807147786982478,
                                           max_iter=20687))])
L1_bagging = BaggingRegressor(base_estimator=L1_model)
L1_bagging.get_params()

{'base_estimator__memory': None,
 'base_estimator__steps': [('poly', PolynomialFeatures()),
  ('lasso', Lasso(alpha=12.807147786982478, max_iter=20687))],
 'base_estimator__verbose': False,
 'base_estimator__poly': PolynomialFeatures(),
 'base_estimator__lasso': Lasso(alpha=12.807147786982478, max_iter=20687),
 'base_estimator__poly__degree': 2,
 'base_estimator__poly__include_bias': True,
 'base_estimator__poly__interaction_only': False,
 'base_estimator__poly__order': 'C',
 'base_estimator__lasso__alpha': 12.807147786982478,
 'base_estimator__lasso__copy_X': True,
 'base_estimator__lasso__fit_intercept': True,
 'base_estimator__lasso__max_iter': 20687,
 'base_estimator__lasso__normalize': 'deprecated',
 'base_estimator__lasso__positive': False,
 'base_estimator__lasso__precompute': False,
 'base_estimator__lasso__random_state': None,
 'base_estimator__lasso__selection': 'cyclic',
 'base_estimator__lasso__tol': 0.0001,
 'base_estimator__lasso__warm_start': False,
 'base_estimator': Pi

In [9]:
L1_bagging_GS = RandomizedSearchCV(estimator=L1_bagging,
                                   param_distributions={'n_estimators': np.random.randint(10, 100, 20),
                                                        'max_samples': [0.5, 0.67, 0.75, 0.8, 0.9]},
                                   cv=cv,
                                   scoring={'r2': make_scorer(r2_score),
                                            'me': make_scorer(max_error),
                                            'mse': make_scorer(mean_squared_error)
                                            },
                                   refit='mse',
                                   return_train_score=True,
                                   n_jobs=-1,
                                   n_iter=15)

L1_bagging_GS.fit(train_x, train_y)

RandomizedSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
                   estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                              PolynomialFeatures()),
                                                                             ('lasso',
                                                                              Lasso(alpha=12.807147786982478,
                                                                                    max_iter=20687))])),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'max_samples': [0.5, 0.67, 0.75, 0.8,
                                                        0.9],
                                        'n_estimators': array([30, 45, 18, 27, 39, 26, 45, 79, 66, 33, 53, 65, 90, 84, 18, 38, 18,
       99, 25, 30])},
                   refit='mse', return_train_score=True,
                   scoring={'me': make_s

In [18]:
L1_bagging_GS.best_estimator_, np.mean(L1_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    L1_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                 ('lasso',
                                                  Lasso(alpha=12.807147786982478,
                                                        max_iter=20687))]),
                  max_samples=0.5, n_estimators=18),
 86.15027990703051,
 90.12425790601924)

In [36]:
l1_model = BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                           ('lasso',
                                                            Lasso(alpha=12.807147786982478,
                                                                  max_iter=20687))]),
                            n_estimators=25)

l1_model.fit(train_x, train_y)
r2_score(test_y, l1_model.predict(test_x)), mean_squared_error(test_y, l1_model.predict(test_x), squared=False)

(0.7504144125481214, 39246.38222405792)

In [19]:
pd.to_pickle(pd.DataFrame(L1_bagging_GS.cv_results_),
             '../data/cv_results/ensemble/l1_bagging_gs.pkl')

In [23]:
L2_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('ridge', Ridge(alpha=0.5378845624526254, max_iter=6055))])
L2_bagging = BaggingRegressor(base_estimator=L2_model)
L2_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures()),
  ('ridge', Ridge(alpha=0.5378845624526254, max_iter=6055))],
 'verbose': False,
 'poly': PolynomialFeatures(),
 'ridge': Ridge(alpha=0.5378845624526254, max_iter=6055),
 'poly__degree': 2,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'ridge__alpha': 0.5378845624526254,
 'ridge__copy_X': True,
 'ridge__fit_intercept': True,
 'ridge__max_iter': 6055,
 'ridge__normalize': 'deprecated',
 'ridge__positive': False,
 'ridge__random_state': None,
 'ridge__solver': 'auto',
 'ridge__tol': 0.001}

In [26]:
L2_bagging_GS = RandomizedSearchCV(estimator=L2_bagging,
                                   param_distributions={'n_estimators': np.random.randint(10, 100, 20)},
                                   cv=cv,
                                   scoring={'r2': make_scorer(r2_score),
                                            'me': make_scorer(max_error),
                                            'mse': make_scorer(mean_squared_error)
                                            },
                                   refit='mse',
                                   return_train_score=True,
                                   n_jobs=-1,
                                   n_iter=15)

L2_bagging_GS.fit(train_x, train_y)

RandomizedSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
                   estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                              PolynomialFeatures()),
                                                                             ('ridge',
                                                                              Ridge(alpha=0.5378845624526254,
                                                                                    max_iter=6055))])),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'n_estimators': array([65, 74, 66, 36, 69, 19, 84, 98, 25, 52, 27, 43, 11, 23, 74, 65, 38,
       47, 21, 90])},
                   refit='mse', return_train_score=True,
                   scoring={'me': make_scorer(max_error),
                            'mse': make_scorer(mean_squared_error),
                            'r2': make_scorer(r2_score)

In [27]:
L2_bagging_GS.best_estimator_, np.mean(L2_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    L2_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                 ('ridge',
                                                  Ridge(alpha=0.5378845624526254,
                                                        max_iter=6055))]),
                  n_estimators=11),
 86.09059311980134,
 89.28613115345779)

In [45]:
l2_model = BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                           ('ridge', Ridge(alpha=0.5378845624526254,
                                                                           max_iter=6055))]),
                            n_estimators=11)

l2_model.fit(train_x, train_y)
r2_score(test_y, l2_model.predict(test_x)), mean_squared_error(test_y, l2_model.predict(test_x), squared=False)

(0.7372383025068575, 40269.005765315305)

In [48]:
pd.to_pickle(pd.DataFrame(L2_bagging_GS.cv_results_),
             '../data/cv_results/ensemble/l2_bagging_gs.pkl')

In [18]:
DT_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=4)),
                           ('DT', DecisionTreeRegressor(splitter='best',
                                                        min_samples_split=20,
                                                        min_samples_leaf=21,
                                                        max_features='auto',
                                                        max_depth=36,
                                                        ccp_alpha=6))])
DT_bagging = BaggingRegressor(base_estimator=DT_model)
DT_bagging.get_params()

{'base_estimator__memory': None,
 'base_estimator__steps': [('poly', PolynomialFeatures(degree=4)),
  ('DT',
   DecisionTreeRegressor(ccp_alpha=6, max_depth=36, max_features='auto',
                         min_samples_leaf=21, min_samples_split=20))],
 'base_estimator__verbose': False,
 'base_estimator__poly': PolynomialFeatures(degree=4),
 'base_estimator__DT': DecisionTreeRegressor(ccp_alpha=6, max_depth=36, max_features='auto',
                       min_samples_leaf=21, min_samples_split=20),
 'base_estimator__poly__degree': 4,
 'base_estimator__poly__include_bias': True,
 'base_estimator__poly__interaction_only': False,
 'base_estimator__poly__order': 'C',
 'base_estimator__DT__ccp_alpha': 6,
 'base_estimator__DT__criterion': 'squared_error',
 'base_estimator__DT__max_depth': 36,
 'base_estimator__DT__max_features': 'auto',
 'base_estimator__DT__max_leaf_nodes': None,
 'base_estimator__DT__min_impurity_decrease': 0.0,
 'base_estimator__DT__min_samples_leaf': 21,
 'base_estimator_

In [19]:
DT_bagging_GS = GridSearchCV(estimator=DT_bagging,
                             param_grid={'n_estimators': np.arange(10, 20)},
                             cv=cv,
                             scoring={'r2': make_scorer(r2_score),
                                      'me': make_scorer(max_error),
                                      'mse': make_scorer(mean_squared_error)
                                      },
                             refit='mse',
                             return_train_score=True,
                             n_jobs=-1)

DT_bagging_GS.fit(train_x, train_y)

GridSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
             estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                        PolynomialFeatures(degree=4)),
                                                                       ('DT',
                                                                        DecisionTreeRegressor(ccp_alpha=6,
                                                                                              max_depth=36,
                                                                                              max_features='auto',
                                                                                              min_samples_leaf=21,
                                                                                              min_samples_split=20))])),
             n_jobs=-1,
             param_grid={'n_estimators': array([10, 11, 12, 13, 14, 15, 16,

In [20]:
DT_bagging_GS.best_estimator_, np.mean(DT_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    DT_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                  PolynomialFeatures(degree=4)),
                                                 ('DT',
                                                  DecisionTreeRegressor(ccp_alpha=6,
                                                                        max_depth=36,
                                                                        max_features='auto',
                                                                        min_samples_leaf=21,
                                                                        min_samples_split=20))])),
 85.38757435080538,
 91.31868736174906)

In [28]:
dt_model = BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                            PolynomialFeatures(degree=4)),
                                                           ('DT',
                                                            DecisionTreeRegressor(ccp_alpha=6,
                                                                                  max_depth=36,
                                                                                  max_features='auto',
                                                                                  min_samples_leaf=21,
                                                                                  min_samples_split=20))]))

dt_model.fit(train_x, train_y)
r2_score(test_y, dt_model.predict(test_x)), mean_squared_error(test_y, dt_model.predict(test_x), squared=False)

(0.8349538933490716, 31914.82884255128)

In [23]:
pd.to_pickle(pd.DataFrame(DT_bagging_GS.cv_results_),
             '../data/cv_results/ensemble/dt_bagging_gs.pkl')

## Predictions on Test Data

In [52]:
house_test_data = pd.read_csv('../data/test.csv', index_col='Id')
house_test_data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


In [53]:
X_t = base_pipeline.transform(house_test_data)
X_t

array([[0.        , 0.47210624, 0.42965303, ..., 0.        , 0.41258743,
        0.61663878],
       [0.        , 0.47210624, 0.49915546, ..., 0.        , 0.41258743,
        0.43137602],
       [0.53092803, 0.52789376, 0.42965303, ..., 1.        , 0.52202823,
        0.50419355],
       ...,
       [0.        , 0.49505129, 0.42965303, ..., 0.        , 0.52202823,
        0.5626834 ],
       [0.59093623, 0.49505129, 0.42965303, ..., 0.52264797, 0.        ,
        0.        ],
       [0.53092803, 0.49505129, 0.56016822, ..., 1.        , 0.64555594,
        0.59182204]])

In [55]:
prediction = pd.DataFrame(data=dt_model.predict(X_t), index=house_test_data.index, columns=['SalePrice'])
prediction

Unnamed: 0_level_0,SalePrice
Id,Unnamed: 1_level_1
1461,122299.045163
1462,153330.912332
1463,176605.603058
1464,183236.115771
1465,202194.582268
...,...
2915,78068.154066
2916,81561.069130
2917,152985.516352
2918,122618.412468


In [56]:
prediction.to_csv('../data/predictions_v_2.csv')