In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from feature_engine import imputation
from feature_engine import encoding

# pipeline utility
from sklearn.pipeline import Pipeline

# preprocessing
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PolynomialFeatures

# hyper-parameter tuning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV

# metrics to evaluate models
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer

# feature selection
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression

# linear models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import SGDRegressor

# neighbors models
from sklearn.neighbors import KNeighborsRegressor

# tree models
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor

In [2]:
house_data = pd.read_csv('../data/train.csv', index_col='Id')
X = house_data.drop(labels='SalePrice', axis=1)
Y = house_data['SalePrice']
X.shape, Y.shape

((1460, 79), (1460,))

In [3]:
cv = RepeatedKFold(n_splits=6, n_repeats=12, random_state=64)

In [4]:
base_pipeline = Pipeline(steps=[('most_miss_cat', imputation.CategoricalImputer(variables=['Alley',
                                                                                           'FireplaceQu',
                                                                                           'PoolQC',
                                                                                           'Fence',
                                                                                           'MiscFeature'])),
                                ('least_miss_cat', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['GarageCond',
                                                                                             'GarageType',
                                                                                             'GarageFinish',
                                                                                             'GarageQual',
                                                                                             'BsmtExposure',
                                                                                             'BsmtFinType2',
                                                                                             'BsmtCond',
                                                                                             'BsmtQual',
                                                                                             'BsmtFinType1',
                                                                                             'MasVnrType',
                                                                                             'Electrical'])),
                                ('least_miss_num', imputation.RandomSampleImputer(random_state='LotArea',
                                                                                  seed='observation',
                                                                                  variables=['LotFrontage',
                                                                                             'GarageYrBlt',
                                                                                             'MasVnrArea'])),
                                ('encode', encoding.OrdinalEncoder(
                                    variables=X.select_dtypes(include=object).columns.to_list())),
                                ('scaling', RobustScaler()),
                                ('transform', QuantileTransformer(output_distribution='normal')),
                                ('scale_to_range', MinMaxScaler()),
                                ('feature_selection', SelectKBest(score_func=mutual_info_regression,
                                                                  k=20))]
                         )

In [5]:
X_ = base_pipeline.fit_transform(X, Y)
train_x, test_x, train_y, test_y = train_test_split(X_, Y, test_size=0.2, random_state=48)
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((1168, 20), (292, 20), (1168,), (292,))

In [6]:
L1_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('lasso', Lasso())])
L1_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures()), ('lasso', Lasso())],
 'verbose': False,
 'poly': PolynomialFeatures(),
 'lasso': Lasso(),
 'poly__degree': 2,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'lasso__alpha': 1.0,
 'lasso__copy_X': True,
 'lasso__fit_intercept': True,
 'lasso__max_iter': 1000,
 'lasso__normalize': 'deprecated',
 'lasso__positive': False,
 'lasso__precompute': False,
 'lasso__random_state': None,
 'lasso__selection': 'cyclic',
 'lasso__tol': 0.0001,
 'lasso__warm_start': False}

In [19]:
l1_grid = {'lasso__alpha': np.exp(np.random.normal(0, 5, 25)),
           'lasso__max_iter': np.random.randint(1000, 6580, 4)
           }

l1_grid_search = GridSearchCV(estimator=L1_model,
                              param_grid=l1_grid,
                              scoring={'r2': make_scorer(r2_score),
                                       'me': make_scorer(max_error)
                                       },
                              refit='r2',
                              n_jobs=-1,
                              cv=cv)

l1_grid_search.fit(train_x, train_y)

GridSearchCV(cv=RepeatedKFold(n_repeats=12, n_splits=6, random_state=64),
             estimator=Pipeline(steps=[('poly', PolynomialFeatures()),
                                       ('lasso', Lasso())]),
             n_jobs=-1,
             param_grid={'lasso__alpha': array([9.86806311e-04, 3.81430696e+00, 1.53425497e-02, 3.91884000e+00,
       7.96058440e+02, 2.03425451e-02, 6.95414589e-01, 1.25299019e+01,
       9.74090183e-02, 8.71121918e+02, 9.44227064e-01, 1.11936352e+01,
       5.41601376e-02, 1.20546714e+01, 8.57357934e+00, 3.08929287e+02,
       9.12962041e-04, 9.38644213e-03, 2.36389524e+00, 3.70308296e-02,
       7.01168270e-01, 4.82841423e-03, 1.36398950e-05, 7.17478394e+00,
       3.13054565e-03]),
                         'lasso__max_iter': array([4971, 4360, 1132, 6330])})

In [22]:
L2_model = Pipeline(steps=[('poly', PolynomialFeatures(degree=2)),
                           ('ridge', Ridge())])
L2_model.get_params()

{'memory': None,
 'steps': [('poly', PolynomialFeatures()), ('ridge', Ridge())],
 'verbose': False,
 'poly': PolynomialFeatures(),
 'ridge': Ridge(),
 'poly__degree': 2,
 'poly__include_bias': True,
 'poly__interaction_only': False,
 'poly__order': 'C',
 'ridge__alpha': 1.0,
 'ridge__copy_X': True,
 'ridge__fit_intercept': True,
 'ridge__max_iter': None,
 'ridge__normalize': 'deprecated',
 'ridge__positive': False,
 'ridge__random_state': None,
 'ridge__solver': 'auto',
 'ridge__tol': 0.001}

In [None]:
l1_grid = {'ridge__alpha': np.exp(np.random.normal(0, 5, 25)),
           'lasso__max_iter': np.random.randint(1000, 6580, 4)
           }

l1_grid_search = GridSearchCV(estimator=L1_model,
                              param_grid=l1_grid,
                              scoring={'r2': make_scorer(r2_score),
                                       'me': make_scorer(max_error)
                                       },
                              refit='r2',
                              n_jobs=-1,
                              cv=cv)

l1_grid_search.fit(train_x, train_y)

array([4.17862549e-05, 5.19053030e-01, 2.90003534e+02, 4.00091571e-02,
       2.21592788e+00, 4.14039177e+00, 5.12886532e-05, 2.75553689e+01,
       1.75310390e+01, 4.20822692e-02, 3.18512936e+00, 8.60916792e-04,
       4.25265143e+01, 2.31982973e+05, 1.05347118e-02, 3.76209160e-01,
       3.19664350e+00, 4.08110251e+00, 3.09746290e+04, 6.21856298e-06,
       1.10332036e-04, 4.69693245e+00, 2.69553197e-01, 1.03493772e+02,
       3.98737158e-02])