In [7]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from feature_engine import imputation
from feature_engine import encoding

# pipeline utility
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures

# hyper-parameter tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# metrics to evaluate models
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# linear models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# neighbors models
from sklearn.neighbors import KNeighborsRegressor

# tree models
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

# ensemble models
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor

In [2]:
house_data = pd.read_csv('../data/train.csv', index_col='Id')
X = house_data.drop(labels='SalePrice', axis=1)
Y = house_data['SalePrice']
X.shape, Y.shape

((1460, 79), (1460,))

In [3]:
cv = RepeatedKFold(n_splits=6, n_repeats=12, random_state=64)

In [4]:
base_pipeline = Pipeline(steps=[('most_miss_cat', imputation.CategoricalImputer(variables=['Alley',
                                                                                           'FireplaceQu',
                                                                                           'PoolQC',
                                                                                           'Fence',
                                                                                           'MiscFeature'])),
                                ('least_miss_cat', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['GarageCond',
                                                                                             'GarageType',
                                                                                             'GarageFinish',
                                                                                             'GarageQual',
                                                                                             'BsmtExposure',
                                                                                             'BsmtFinType2',
                                                                                             'BsmtCond',
                                                                                             'BsmtQual',
                                                                                             'BsmtFinType1',
                                                                                             'MasVnrType',
                                                                                             'Electrical'])),
                                ('least_miss_num', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['LotFrontage',
                                                                                             'GarageYrBlt',
                                                                                             'MasVnrArea'])),
                                ('undetected', imputation.RandomSampleImputer(random_state='LotArea',
                                                                              seed='observation')),
                                ('encode', encoding.OrdinalEncoder(
                                    variables=X.select_dtypes(include=object).columns.to_list())),
                                ('scaling', RobustScaler()),
                                ('transform', QuantileTransformer(output_distribution='normal')),
                                ('scale_to_range', MinMaxScaler()),
                                ('feature_selection', SelectKBest(score_func=mutual_info_regression,
                                                                  k=20))]
                         )

In [5]:
X_ = base_pipeline.fit_transform(X, Y)
train_x, test_x, train_y, test_y = train_test_split(X_, Y, test_size=0.2, random_state=48)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1168, 20), (292, 20), (1168,), (292,))

### bagging

In [8]:
L1_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('lasso', Lasso(alpha=12.807147786982478,
                                           max_iter=20687))])
L1_bagging = BaggingRegressor(base_estimator=L1_model)
L1_bagging.get_params()

{'base_estimator__memory': None,
 'base_estimator__steps': [('poly', PolynomialFeatures()),
  ('lasso', Lasso(alpha=12.807147786982478, max_iter=20687))],
 'base_estimator__verbose': False,
 'base_estimator__poly': PolynomialFeatures(),
 'base_estimator__lasso': Lasso(alpha=12.807147786982478, max_iter=20687),
 'base_estimator__poly__degree': 2,
 'base_estimator__poly__include_bias': True,
 'base_estimator__poly__interaction_only': False,
 'base_estimator__poly__order': 'C',
 'base_estimator__lasso__alpha': 12.807147786982478,
 'base_estimator__lasso__copy_X': True,
 'base_estimator__lasso__fit_intercept': True,
 'base_estimator__lasso__max_iter': 20687,
 'base_estimator__lasso__normalize': 'deprecated',
 'base_estimator__lasso__positive': False,
 'base_estimator__lasso__precompute': False,
 'base_estimator__lasso__random_state': None,
 'base_estimator__lasso__selection': 'cyclic',
 'base_estimator__lasso__tol': 0.0001,
 'base_estimator__lasso__warm_start': False,
 'base_estimator': Pi

In [9]:
L1_bagging_GS = RandomizedSearchCV(estimator=L1_bagging,
                                   param_distributions={'n_estimators': np.random.randint(10, 100, 20),
                                                        'max_samples': [0.5, 0.67, 0.75, 0.8, 0.9]},
                                   cv=cv,
                                   scoring={'r2': make_scorer(r2_score),
                                            'me': make_scorer(max_error),
                                            'mse': make_scorer(mean_squared_error)
                                            },
                                   refit='mse',
                                   return_train_score=True,
                                   n_jobs=-1,
                                   n_iter=15)

L1_bagging_GS.fit(train_x, train_y)

RandomizedSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
                   estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                              PolynomialFeatures()),
                                                                             ('lasso',
                                                                              Lasso(alpha=12.807147786982478,
                                                                                    max_iter=20687))])),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'max_samples': [0.5, 0.67, 0.75, 0.8,
                                                        0.9],
                                        'n_estimators': array([30, 45, 18, 27, 39, 26, 45, 79, 66, 33, 53, 65, 90, 84, 18, 38, 18,
       99, 25, 30])},
                   refit='mse', return_train_score=True,
                   scoring={'me': make_s

In [18]:
L1_bagging_GS.best_estimator_, np.mean(L1_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    L1_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                 ('lasso',
                                                  Lasso(alpha=12.807147786982478,
                                                        max_iter=20687))]),
                  max_samples=0.5, n_estimators=18),
 86.15027990703051,
 90.12425790601924)

In [25]:
r2_score(test_y, L1_bagging_GS.predict(test_x)) * 100, mean_squared_error(test_y, L1_bagging_GS.predict(test_x),
                                                                          squared=False)

(74.14627965818734, 39943.986019962256)

In [19]:
pd.to_pickle(pd.DataFrame(L1_bagging_GS.cv_results_),
             '../data/cv_results/ensemble/l1_bagging_gs.pkl')

In [23]:
L2_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('ridge', Ridge(alpha=0.5378845624526254, max_iter=6055))])
L2_bagging = BaggingRegressor(base_estimator=L2_model)
L2_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures()),
  ('ridge', Ridge(alpha=0.5378845624526254, max_iter=6055))],
 'verbose': False,
 'poly': PolynomialFeatures(),
 'ridge': Ridge(alpha=0.5378845624526254, max_iter=6055),
 'poly__degree': 2,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'ridge__alpha': 0.5378845624526254,
 'ridge__copy_X': True,
 'ridge__fit_intercept': True,
 'ridge__max_iter': 6055,
 'ridge__normalize': 'deprecated',
 'ridge__positive': False,
 'ridge__random_state': None,
 'ridge__solver': 'auto',
 'ridge__tol': 0.001}

In [26]:
L2_bagging_GS = RandomizedSearchCV(estimator=L2_bagging,
                                   param_distributions={'n_estimators': np.random.randint(10, 100, 20)},
                                   cv=cv,
                                   scoring={'r2': make_scorer(r2_score),
                                            'me': make_scorer(max_error),
                                            'mse': make_scorer(mean_squared_error)
                                            },
                                   refit='mse',
                                   return_train_score=True,
                                   n_jobs=-1,
                                   n_iter=15)

L2_bagging_GS.fit(train_x, train_y)

RandomizedSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
                   estimator=BaggingRegressor(base_estimator=Pipeline(steps=[('poly',
                                                                              PolynomialFeatures()),
                                                                             ('ridge',
                                                                              Ridge(alpha=0.5378845624526254,
                                                                                    max_iter=6055))])),
                   n_iter=15, n_jobs=-1,
                   param_distributions={'n_estimators': array([65, 74, 66, 36, 69, 19, 84, 98, 25, 52, 27, 43, 11, 23, 74, 65, 38,
       47, 21, 90])},
                   refit='mse', return_train_score=True,
                   scoring={'me': make_scorer(max_error),
                            'mse': make_scorer(mean_squared_error),
                            'r2': make_scorer(r2_score)

In [27]:
L2_bagging_GS.best_estimator_, np.mean(L2_bagging_GS.cv_results_['mean_test_r2']) * 100, np.mean(
    L2_bagging_GS.cv_results_['mean_train_r2']) * 100

(BaggingRegressor(base_estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                                 ('ridge',
                                                  Ridge(alpha=0.5378845624526254,
                                                        max_iter=6055))]),
                  n_estimators=11),
 86.09059311980134,
 89.28613115345779)

In [28]:
r2_score(test_y, L2_bagging_GS.predict(test_x)) * 100, mean_squared_error(test_y, L2_bagging_GS.predict(test_x),
                                                                          squared=False)

(74.12505640031279, 39960.377618516635)