In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from feature_engine import imputation
from feature_engine import encoding

# pipeline utility
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures

# hyper-parameter tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

# metrics to evaluate models
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# linear models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

# neighbors models
from sklearn.neighbors import KNeighborsRegressor

# tree models
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

In [2]:
house_data = pd.read_csv('../data/train.csv', index_col='Id')
X = house_data.drop(labels='SalePrice', axis=1)
Y = house_data['SalePrice']
X.shape, Y.shape

((1460, 79), (1460,))

In [3]:
cv = RepeatedKFold(n_splits=6, n_repeats=12, random_state=64)

In [4]:
base_pipeline = Pipeline(steps=[('most_miss_cat', imputation.CategoricalImputer(variables=['Alley',
                                                                                           'FireplaceQu',
                                                                                           'PoolQC',
                                                                                           'Fence',
                                                                                           'MiscFeature'])),
                                ('least_miss_cat', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['GarageCond',
                                                                                             'GarageType',
                                                                                             'GarageFinish',
                                                                                             'GarageQual',
                                                                                             'BsmtExposure',
                                                                                             'BsmtFinType2',
                                                                                             'BsmtCond',
                                                                                             'BsmtQual',
                                                                                             'BsmtFinType1',
                                                                                             'MasVnrType',
                                                                                             'Electrical'])),
                                ('least_miss_num', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['LotFrontage',
                                                                                             'GarageYrBlt',
                                                                                             'MasVnrArea'])),
                                ('encode', encoding.OrdinalEncoder(
                                    variables=X.select_dtypes(include=object).columns.to_list())),
                                ('scaling', RobustScaler()),
                                ('transform', QuantileTransformer(output_distribution='normal')),
                                ('scale_to_range', MinMaxScaler()),
                                ('feature_selection', SelectKBest(score_func=mutual_info_regression,
                                                                  k=20))]
                         )

In [5]:
X_ = base_pipeline.fit_transform(X, Y)
train_x, test_x, train_y, test_y = train_test_split(X_, Y, test_size=0.2, random_state=48)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1168, 20), (292, 20), (1168,), (292,))

In [6]:
L1_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('lasso', Lasso())])
L1_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures()), ('lasso', Lasso())],
 'verbose': False,
 'poly': PolynomialFeatures(),
 'lasso': Lasso(),
 'poly__degree': 2,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'lasso__alpha': 1.0,
 'lasso__copy_X': True,
 'lasso__fit_intercept': True,
 'lasso__max_iter': 1000,
 'lasso__normalize': 'deprecated',
 'lasso__positive': False,
 'lasso__precompute': False,
 'lasso__random_state': None,
 'lasso__selection': 'cyclic',
 'lasso__tol': 0.0001,
 'lasso__warm_start': False}

In [9]:
l1_grid = {'lasso__alpha': np.exp(np.random.normal(0, 5, 25)),
           'lasso__max_iter': np.random.randint(1000, 6580, 4)
           }

l1_grid_search = GridSearchCV(estimator=L1_model,
                              param_grid=l1_grid,
                              scoring={'r2': make_scorer(r2_score),
                                       'me': make_scorer(max_error)
                                       },
                              refit='r2',
                              n_jobs=-1,
                              cv=cv)

l1_grid_search.fit(train_x, train_y)

  model = cd_fast.enet_coordinate_descent(


GridSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
             estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                       ('lasso', Lasso())]),
             n_jobs=-1,
             param_grid={'lasso__alpha': array([1.36240559e+00, 5.09650935e-04, 1.36518178e-03, 2.34427570e+00,
       3.94148561e-02, 1.19543881e-03, 1.34546427e+03, 1.39763998e-01,
       8.91411678e+00, 3.89664653e+00, 2.22187391e+00, 7.39601408e+02,
       6.53411636e-02, 4.27754032e-03, 4.99826970e-02, 5.16190912e+01,
       2.01002633e-01, 5.62079557e+00, 1.40125439e-02, 3.13203511e-01,
       6.56560299e+00, 9.63509355e+01, 5.52455953e-03, 3.22991033e-03,
       1.28071478e+01]),
                         'lasso__max_iter': array([3411, 5502, 5985, 2687])},
             refit='r2',
             scoring={'me': make_scorer(max_error),
                      'r2': make_scorer(r2_score)})

In [12]:
l1_grid_search.best_params_, l1_grid_search.best_score_

({'lasso__alpha': 12.807147786982478, 'lasso__max_iter': 2687},
 0.8582179183379455)

In [17]:
temp_l1_cv_result = pd.DataFrame(l1_grid_search.cv_results_)
temp_l1_cv_result.to_pickle('../data/cv_results/l1_cv_result.pkl')

In [8]:
L2_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('ridge', Ridge())])
L2_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures()), ('ridge', Ridge())],
 'verbose': False,
 'poly': PolynomialFeatures(),
 'ridge': Ridge(),
 'poly__degree': 2,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'ridge__alpha': 1.0,
 'ridge__copy_X': True,
 'ridge__fit_intercept': True,
 'ridge__max_iter': None,
 'ridge__normalize': 'deprecated',
 'ridge__positive': False,
 'ridge__random_state': None,
 'ridge__solver': 'auto',
 'ridge__tol': 0.001}

In [10]:
l2_grid = {'ridge__alpha': np.exp(np.random.normal(0, 5, 25)),
           'ridge__max_iter': np.random.randint(1000, 6580, 4)
           }

l2_grid_search = GridSearchCV(estimator=L2_model,
                              param_grid=l2_grid,
                              scoring={'r2': make_scorer(r2_score),
                                       'me': make_scorer(max_error)
                                       },
                              refit='r2',
                              n_jobs=-1,
                              cv=cv)

l2_grid_search.fit(train_x, train_y)

GridSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
             estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                       ('ridge', Ridge())]),
             n_jobs=-1,
             param_grid={'ridge__alpha': array([1.31126344e+02, 3.89911045e-01, 1.57619448e+00, 1.25777725e+00,
       3.31674693e-02, 3.75486593e-03, 9.82609112e-03, 6.72825549e+05,
       1.38327667e+03, 1.18260102e+06, 2.23209131e+00, 3.55747948e+00,
       3.69327214e+00, 6.40218447e+01, 4.37709079e-01, 4.89488876e-03,
       1.28600521e-01, 2.07991834e-03, 5.37884562e-01, 1.55284967e+01,
       6.75642940e-03, 2.62140261e+01, 3.40807308e-01, 1.51681554e-02,
       2.47807479e+00]),
                         'ridge__max_iter': array([6055, 5819, 1835, 3678])},
             refit='r2',
             scoring={'me': make_scorer(max_error),
                      'r2': make_scorer(r2_score)})

In [11]:
l2_grid_search.best_params_, l2_grid_search.best_score_

({'ridge__alpha': 0.5378845624526254, 'ridge__max_iter': 6055},
 0.8602809816793933)

In [18]:
temp_l2_cv_result = pd.DataFrame(l2_grid_search.cv_results_)
temp_l2_cv_result.to_pickle('../data/cv_results/l2_cv_result.pkl')

In [14]:
elasticNet_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=3)),
                                   ('elasticNet', ElasticNet())])
elasticNet_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures(degree=3)),
  ('elasticNet', ElasticNet())],
 'verbose': False,
 'poly': PolynomialFeatures(degree=3),
 'elasticNet': ElasticNet(),
 'poly__degree': 3,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'elasticNet__alpha': 1.0,
 'elasticNet__copy_X': True,
 'elasticNet__fit_intercept': True,
 'elasticNet__l1_ratio': 0.5,
 'elasticNet__max_iter': 1000,
 'elasticNet__normalize': 'deprecated',
 'elasticNet__positive': False,
 'elasticNet__precompute': False,
 'elasticNet__random_state': None,
 'elasticNet__selection': 'cyclic',
 'elasticNet__tol': 0.0001,
 'elasticNet__warm_start': False}

In [16]:
elasticNet_grid = {'elasticNet__alpha': np.exp(np.random.normal(0, 5, 25)),
                   'elasticNet__max_iter': np.random.randint(1000, 6580, 4),
                   'elasticNet__l1_ratio': np.linspace(0, 1, 10)
                   }

elasticNet_grid_search = GridSearchCV(estimator=elasticNet_model,
                                      param_grid=elasticNet_grid,
                                      scoring={'r2': make_scorer(r2_score),
                                               'me': make_scorer(max_error)
                                               },
                                      refit='r2',
                                      n_jobs=-1,
                                      cv=cv)

elasticNet_grid_search.fit(train_x, train_y)

KeyboardInterrupt: 

In [None]:
elasticNet_grid_search.best_params_, elasticNet_grid_search.best_score_

In [None]:
temp_elasticNet_cv_result = pd.DataFrame(elasticNet_grid_search.cv_results_)
temp_elasticNet_cv_result.to_pickle('../data/cv_results/elasticNet_cv_result.pkl')
