# References
- sklearn pipeline : https://www.kaggle.com/code/alexisbcook/pipelines
- ordinal categorical features : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pandas.api.types import CategoricalDtype

# Load data

In [2]:
from sklearn.model_selection import train_test_split

# Load the competition train/test CSVs, using the 'Id' column as the row index.
# (Categorical handling happens in a later cell.)
X = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
    index_col='Id',
)
X_test = pd.read_csv(
    '/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
    index_col='Id',
)

# Discard training rows that have no target, then pull the target column out
# of the predictors (pop removes 'SalePrice' from X and returns it).
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.pop('SalePrice')

# Report the shape of both predictor frames.
for frame in (X, X_test):
    print(frame.shape)

print("Loaded data")

(1460, 79)
(1459, 79)
Loaded data


In [3]:
# Compare the distinct Exterior2nd values present in the train and test frames.
print(X.Exterior2nd.unique())
print(X_test.Exterior2nd.unique())

['VinylSd' 'MetalSd' 'Wd Shng' 'HdBoard' 'Plywood' 'Wd Sdng' 'CmentBd'
 'BrkFace' 'Stucco' 'AsbShng' 'Brk Cmn' 'ImStucc' 'AsphShn' 'Stone'
 'Other' 'CBlock']
['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'Brk Cmn' 'CmentBd'
 'ImStucc' 'Wd Shng' 'AsbShng' 'Stucco' 'CBlock' 'BrkFace' 'AsphShn' nan
 'Stone']


# Categorical features - special handling
Ref : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [4]:
# X.OverallQual.unique()
# # array([ 7,  6,  8,  5,  9,  4, 10,  3,  1,  2])

# X.OverallCond.unique()
# # array([5, 8, 6, 7, 4, 2, 3, 9, 1])

In [5]:
# The nominal (unordered) categorical features.
# NOTE: "CentralAir" is also a key of ordered_levels; encode() applies the
# ordinal dtype after the nominal one, so the ordinal definition is what sticks.
features_nom = [
    "MSSubClass",
    "MSZoning",
    "Street",
    "Alley",
    "LandContour",
    "LotConfig",
    "Neighborhood",
    "Condition1",
    "Condition2",
    "BldgType",
    "HouseStyle",
    "RoofStyle",
    "RoofMatl",
    "Exterior1st",
    "Exterior2nd",
    "MasVnrType",
    "Foundation",
    "Heating",
    "CentralAir",
    "GarageType",
    "MiscFeature",
    "SaleType",
    "SaleCondition",
]


# The ordinal (ordered) categorical features.
# Each level list runs from the lowest level to the highest.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
# OverallQual / OverallCond are ratings starting at 1, and OverallQual contains
# the value 10 (see the unique() output noted in the cell above). range(10)
# would produce 0..9, so a rating of 10 would not be a known category and
# would become NaN when the column is cast to the categorical dtype.
ten_levels = list(range(1, 11))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# to replace later
# "LotShape": ["IR3", "IR2", "IR1", "Reg"]
# "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],


# Prepend a "None" level to every ordinal feature so missing values have an
# explicit category that sorts below all real levels.
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}
ordered_levels.keys()

# small_cat_categorical_cols = list(set(small_cat_categorical_cols).difference(set(ordered_levels.keys())))
# numerical_cols = list(set(numerical_cols).difference(set(ordered_levels.keys())))

# for name, levels in ordered_levels.items():
#     X[name] = X[name].astype(CategoricalDtype(levels, ordered=True))
#     X[name] = X[name].cat.codes
#     X_test[name] = X_test[name].astype(CategoricalDtype(levels, ordered=True))
#     X_test[name] = X_test[name].cat.codes

# print(len(ordered_levels.keys()))
# print(len(small_cat_categorical_cols))
# print(len(large_cat_categorical_cols))
# print(len(numerical_cols))  


def encode(df):
    """Cast the categorical columns of ``df`` to pandas categorical dtypes.

    Columns named in ``features_nom`` become plain categoricals that are
    guaranteed to include a "None" category. Columns named in
    ``ordered_levels`` become ordered categoricals using the listed levels.

    The frame is modified in place and also returned.
    """
    # Nominal features: infer the categories from the data, then make sure a
    # "None" category exists for missing values.
    for column in features_nom:
        as_category = df[column].astype("category")
        if "None" not in as_category.cat.categories:
            as_category = as_category.cat.add_categories("None")
        df[column] = as_category

    # Ordinal features: use the explicit, ordered level lists.
    for column, levels in ordered_levels.items():
        ordinal_dtype = CategoricalDtype(levels, ordered=True)
        df[column] = df[column].astype(ordinal_dtype)

    return df

# encode() converts the columns in place and returns the same frame it was given.
X = encode(X)
X_test = encode(X_test)

In [6]:
# [(c, X[c].dtype) for c in X.columns]

46 categorical + 33 numerical = 79

As expected, 79 features in total.

# Feature Utility Scores

In [7]:
from sklearn.feature_selection import mutual_info_regression

def make_mi_scores(X, y):
    """Compute mutual-information scores between each column of ``X`` and ``y``.

    Works on a deep copy of ``X``. Missing numeric values are filled with 0,
    missing categorical values with "None", and every categorical column is
    replaced by its integer category codes. Columns with an integer dtype are
    passed to ``mutual_info_regression`` as discrete features.

    Returns a Series named "MI Scores", indexed by column name and sorted
    from highest to lowest score.
    """
    features = X.copy(deep=True)

    numeric_names = list(features.select_dtypes("number").columns)
    categorical_names = list(features.select_dtypes("category").columns)

    for column in numeric_names:
        features[column] = features[column].fillna(0)

    # Filling with "None" presumably needs that category to exist already
    # (encode() adds it to every categorical column it creates).
    for column in categorical_names:
        features[column] = features[column].fillna("None").cat.codes

    # Discrete features are the ones that are not floats: after the encoding
    # above, that is every column with an integer dtype.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in features.dtypes]

    raw_scores = mutual_info_regression(features, y, discrete_features=is_discrete, random_state=0)
    scores = pd.Series(raw_scores, name="MI Scores", index=features.columns)
    return scores.sort_values(ascending=False)

In [8]:
# Score every feature of the encoded training frame against the raw SalePrice target.
mi_scores = make_mi_scores(X, y)
mi_scores

OverallQual     5.786503e-01
Neighborhood    5.253900e-01
GarageArea      4.927946e-01
GrLivArea       4.328080e-01
YearBuilt       4.087151e-01
                    ...     
Utilities       1.887379e-15
PoolArea        2.220446e-16
PoolQC          0.000000e+00
MiscVal         0.000000e+00
MoSold          0.000000e+00
Name: MI Scores, Length: 79, dtype: float64

In [9]:
# Top 20 features: make_mi_scores returns the scores sorted in descending order.
mi_scores[:20]

OverallQual     0.578650
Neighborhood    0.525390
GarageArea      0.492795
GrLivArea       0.432808
YearBuilt       0.408715
TotalBsmtSF     0.399518
LotArea         0.392427
GarageCars      0.360102
BsmtQual        0.331398
ExterQual       0.325150
KitchenQual     0.322472
1stFlrSF        0.283265
MSSubClass      0.278183
YearRemodAdd    0.275659
FullBath        0.273156
GarageFinish    0.262945
GarageYrBlt     0.260881
LotFrontage     0.216092
FireplaceQu     0.209348
TotRmsAbvGrd    0.207962
Name: MI Scores, dtype: float64

In [10]:
# Bottom 20 features by MI score.
mi_scores[-20:]

ExterCond       1.845965e-02
LotConfig       1.593551e-02
BsmtFullBath    1.318343e-02
Heating         1.287198e-02
BsmtHalfBath    1.131273e-02
Functional      1.084641e-02
LowQualFinSF    6.635548e-03
RoofMatl        4.894636e-03
LandSlope       4.280622e-03
YrSold          3.618707e-03
BsmtFinSF2      2.734002e-03
MiscFeature     2.703237e-03
Condition2      2.664787e-03
3SsnPorch       7.615664e-04
Street          3.666896e-04
Utilities       1.887379e-15
PoolArea        2.220446e-16
PoolQC          0.000000e+00
MiscVal         0.000000e+00
MoSold          0.000000e+00
Name: MI Scores, dtype: float64

In [11]:
# MiscVal is among the zero-MI features; look at its distribution.
X.MiscVal.describe()

count     1460.000000
mean        43.489041
std        496.123024
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      15500.000000
Name: MiscVal, dtype: float64

Although PoolQC has MI = 0, it sounds like a possibly useful feature; it might have interaction effects, so we keep it.
Let's remove MoSold, MiscVal and MiscFeature.

In [12]:
# Remove the features judged uninformative from both frames, in place, so
# train and test keep the same set of columns.
features_to_drop = ['MoSold', 'MiscVal', 'MiscFeature']
for frame in (X, X_test):
    frame.drop(columns=features_to_drop, inplace=True)

# Report the resulting shapes (train first, then test).
for frame in (X, X_test):
    print(frame.shape)

(1460, 76)
(1459, 76)


In [13]:
def remove_columns_from_list(orig_list, to_remove):
    """Return the items of ``orig_list`` that are not in ``to_remove``.

    The result is always a new list, in the iteration order of ``orig_list``.
    """
    kept = []
    for item in orig_list:
        if item in to_remove:
            continue
        kept.append(item)
    return kept

In [14]:
# ordinal_categorical_cols = remove_columns_from_list(ordered_levels.keys(), features_to_drop)
# print(len(ordinal_categorical_cols))

# small_cat_categorical_cols = remove_columns_from_list(small_cat_categorical_cols, features_to_drop)
# print(len(small_cat_categorical_cols))

# large_cat_categorical_cols = remove_columns_from_list(large_cat_categorical_cols, features_to_drop)
# print(len(large_cat_categorical_cols))

# numerical_cols = remove_columns_from_list(numerical_cols, features_to_drop)
# print(len(numerical_cols))

In [15]:
# Ordinal features are the keys of ordered_levels, minus any dropped feature.
ordinal_categorical_cols = remove_columns_from_list(ordered_levels.keys(), features_to_drop)

# Sort every column of X into the categorical or numerical group by dtype.
categorical_cols = []
numerical_cols = []
for cname in X.columns:
    column_dtype = X[cname].dtype
    if column_dtype == "category":
        categorical_cols.append(cname)
    elif column_dtype in ['int64', 'float64']:
        numerical_cols.append(cname)

# Split the non-ordinal categoricals by cardinality: fewer than 10 distinct
# values is "small", 10 or more is "large".
small_cat_categorical_cols = []
large_cat_categorical_cols = []
for cname in categorical_cols:
    if cname in ordinal_categorical_cols:
        continue
    if X[cname].nunique() < 10:
        small_cat_categorical_cols.append(cname)
    else:
        large_cat_categorical_cols.append(cname)

# Group sizes, in order: ordinal, small, large, all categorical, numerical.
for group in (
    ordinal_categorical_cols,
    small_cat_categorical_cols,
    large_cat_categorical_cols,
    categorical_cols,
    numerical_cols,
):
    print(len(group))

24
17
4
45
31


# Training pipeline

In [16]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBRegressor

In [17]:
# # Preprocessing for numerical data
# numerical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
# ])

# # Preprocessing for categorical data

# ord_categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='median')),
#     ('scaler', StandardScaler())
# ])
# small_categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
# ])
# large_categorical_transformer = Pipeline(steps=[
#     ('imputer', SimpleImputer(strategy='most_frequent')),
#     ('ord', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
# ])

# # Bundle preprocessing for numerical and categorical data
# preprocessor = ColumnTransformer(
#     transformers=[
#         ('num', numerical_transformer, numerical_cols),
#         ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
#         ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
#         ('large_cat', large_categorical_transformer, large_cat_categorical_cols)
#     ])


# Missing numerical values are replaced with the constant 0.
numerical_imputer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value=0)),
])

# Missing categorical values are replaced with the constant string 'None'.
categorical_imputer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='None')),
])

# Route each column group to its own imputer. The transformers are listed
# numerical first, then categorical.
all_type_imputer = ColumnTransformer([
    ('num', numerical_imputer, numerical_cols),
    ('cat', categorical_imputer, categorical_cols),
])

#all_type_imputer outputs a numpy array. Defining this transform to get Pandas DataFrame
#We need Pandas Dataframe for appending features
class ArrayToDataFrame(BaseEstimator, TransformerMixin):
    """Pipeline step that wraps its input in a DataFrame with fixed column names."""

    def __init__(self, columns):
        # Column labels to apply in transform().
        self.columns = columns

    def fit(self, X, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X`` as a pandas DataFrame whose columns are ``self.columns``."""
        labelled = pd.DataFrame(data=X, columns=self.columns)
        return labelled

In [18]:
# Impute, then relabel the imputer's output as a DataFrame.
# The column list mirrors the order of the transformers in all_type_imputer:
# numerical columns first, then categorical columns.
pipeline_imp = Pipeline(steps=[
    ('imputer', all_type_imputer),
    ('array_to_df', ArrayToDataFrame(columns = numerical_cols+categorical_cols))
])
pipeline_imp.fit_transform(X, np.log(y))

Unnamed: 0,LotFrontage,LotArea,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,...,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,SaleType,SaleCondition
0,65.0,8450.0,2003.0,2003.0,196.0,706.0,0.0,150.0,856.0,856.0,...,,Attchd,RFn,TA,TA,Y,,,WD,Normal
1,80.0,9600.0,1976.0,1976.0,0.0,978.0,0.0,284.0,1262.0,1262.0,...,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal
2,68.0,11250.0,2001.0,2002.0,162.0,486.0,0.0,434.0,920.0,920.0,...,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal
3,60.0,9550.0,1915.0,1970.0,0.0,216.0,0.0,540.0,756.0,961.0,...,Gd,Detchd,Unf,TA,TA,Y,,,WD,Abnorml
4,84.0,14260.0,2000.0,2000.0,350.0,655.0,0.0,490.0,1145.0,1145.0,...,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917.0,1999.0,2000.0,0.0,0.0,0.0,953.0,953.0,953.0,...,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal
1456,85.0,13175.0,1978.0,1988.0,119.0,790.0,163.0,589.0,1542.0,2073.0,...,TA,Attchd,Unf,TA,TA,Y,,MnPrv,WD,Normal
1457,66.0,9042.0,1941.0,2006.0,0.0,275.0,0.0,877.0,1152.0,1188.0,...,Gd,Attchd,RFn,TA,TA,Y,,GdPrv,WD,Normal
1458,68.0,9717.0,1950.0,1996.0,0.0,49.0,1029.0,0.0,1078.0,1078.0,...,,Attchd,Unf,TA,TA,Y,,,WD,Normal


In [19]:
# include features found useful in https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
class AppendFeatures(BaseEstimator, TransformerMixin):
    """Append engineered features to a DataFrame.

    Currently adds one ``Bldg_<type>`` column per building type, holding the
    row's ``GrLivArea`` when the row has that ``BldgType`` and 0 otherwise.

    The set of ``Bldg_*`` columns is recorded in ``fit`` (as
    ``bldg_columns_``) so that ``transform`` emits the same columns, in the
    same order, for any frame it is later given. Without this, a test set or
    CV fold in which a building type is absent (or new) would produce a
    different number of columns than the data the model was fitted on.
    """

    @staticmethod
    def _require_frame(X):
        # Column access by name is needed, so plain arrays are rejected.
        if not isinstance(X, pd.DataFrame):
            raise ValueError("Input should be a pandas DataFrame with column names.")

    def fit(self, X, y=None):
        """Record the ``Bldg_*`` dummy columns present in the fitting data.

        Raises:
            ValueError: if ``X`` is not a pandas DataFrame.
        """
        self._require_frame(X)
        self.bldg_columns_ = list(pd.get_dummies(X.BldgType, prefix="Bldg").columns)
        return self

    def transform(self, X):
        """Return ``X`` joined with the building-type x living-area columns.

        The input frame is not modified; a new frame is returned.

        Raises:
            ValueError: if ``X`` is not a pandas DataFrame.
        """
        self._require_frame(X)

        # X["LivLotRatio"] = X.GrLivArea / X.LotArea
        # X["Spaciousness"] = (X['1stFlrSF'] + X['2ndFlrSF']) / X.TotRmsAbvGrd

        bldg_dummies = pd.get_dummies(X.BldgType, prefix="Bldg")
        # Align to the columns seen in fit: types missing from this frame get
        # an all-zero column, types unseen in fit are dropped. If transform is
        # called on an unfitted instance, keep the data-derived columns.
        fitted_columns = getattr(self, "bldg_columns_", None)
        if fitted_columns is not None:
            bldg_dummies = bldg_dummies.reindex(columns=fitted_columns, fill_value=0)
        X = X.join(bldg_dummies.mul(X.GrLivArea, axis=0))

        # X["PorchTypes"] = X[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]].gt(0.0).sum(axis=1)

        # X["MSClass"] = X.MSSubClass.str.split("_", n=1, expand=True)[0]

        # X["MedNhbdArea"] = X.groupby("Neighborhood")["GrLivArea"].transform("median")

        # #PCA inspired as specified in https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
        # X["Feature1"] = X.GrLivArea + X.TotalBsmtSF
        # X["Feature2"] = X.YearRemodAdd * X.TotalBsmtSF

        return X

In [20]:
# Scratch check of the building-type interaction on the encoded frame:
# dummy columns for BldgType, each multiplied row-wise by GrLivArea.
bldg_dummies = pd.get_dummies(X.BldgType, prefix="Bldg")
bldg_dummies.mul(X.GrLivArea, axis=0)

Unnamed: 0_level_0,Bldg_1Fam,Bldg_2fmCon,Bldg_Duplex,Bldg_Twnhs,Bldg_TwnhsE,Bldg_None
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1710,0,0,0,0,0
2,1262,0,0,0,0,0
3,1786,0,0,0,0,0
4,1717,0,0,0,0,0
5,2198,0,0,0,0,0
...,...,...,...,...,...,...
1456,1647,0,0,0,0,0
1457,2073,0,0,0,0,0
1458,2340,0,0,0,0,0
1459,1078,0,0,0,0,0


In [21]:
# Display the encoded training frame.
X

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,61,0,0,0,0,,,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,0,0,0,,,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,42,0,0,0,0,,,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,35,272,0,0,0,,,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,84,0,0,0,0,,,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,40,0,0,0,0,,,2007,WD,Normal
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,0,0,0,,MnPrv,2010,WD,Normal
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,60,0,0,0,0,,GdPrv,2010,WD,Normal
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,112,0,0,0,,,2010,WD,Normal


In [22]:
# Counts per BldgType category; the added "None" category is listed even though no row uses it.
X.BldgType.value_counts()

BldgType
1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
None         0
Name: count, dtype: int64

In [23]:
# Same counts after running the column through the constant-fill SimpleImputer
# and wrapping the result back into a DataFrame.
pd.DataFrame(SimpleImputer(strategy='constant', fill_value='None').fit_transform(pd.DataFrame(X.BldgType, columns = ['BldgType'])), 
             columns = ['BldgType']).value_counts()

BldgType
1Fam        1220
TwnhsE       114
Duplex        52
Twnhs         43
2fmCon        31
Name: count, dtype: int64

In [25]:
# For comparison: counts after filling missing values directly with pandas fillna.
X.BldgType.fillna("None").value_counts()

BldgType
1Fam      1220
TwnhsE     114
Duplex      52
Twnhs       43
2fmCon      31
None         0
Name: count, dtype: int64

In [117]:
# Imputation -> DataFrame relabelling -> engineered features.
# AppendFeatures requires a DataFrame, which is why array_to_df precedes it.
pipeline_imp = Pipeline(steps=[
    ('imputer', all_type_imputer),
    ('array_to_df', ArrayToDataFrame(columns = numerical_cols+categorical_cols)),
    ('append_features', AppendFeatures())
])
# Show only the last 15 columns, which include the appended Bldg_* features.
pipeline_imp.fit_transform(X, np.log(y)).iloc[:, -15:]

Unnamed: 0,FireplaceQu,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,SaleType,SaleCondition,Bldg_1Fam,Bldg_2fmCon,Bldg_Duplex,Bldg_Twnhs,Bldg_TwnhsE
0,,Attchd,RFn,TA,TA,Y,,,WD,Normal,1710.0,0.0,0.0,0.0,0.0
1,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal,1262.0,0.0,0.0,0.0,0.0
2,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal,1786.0,0.0,0.0,0.0,0.0
3,Gd,Detchd,Unf,TA,TA,Y,,,WD,Abnorml,1717.0,0.0,0.0,0.0,0.0
4,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal,2198.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,TA,Attchd,RFn,TA,TA,Y,,,WD,Normal,1647.0,0.0,0.0,0.0,0.0
1456,TA,Attchd,Unf,TA,TA,Y,,MnPrv,WD,Normal,2073.0,0.0,0.0,0.0,0.0
1457,Gd,Attchd,RFn,TA,TA,Y,,GdPrv,WD,Normal,2340.0,0.0,0.0,0.0,0.0
1458,,Attchd,Unf,TA,TA,Y,,,WD,Normal,1078.0,0.0,0.0,0.0,0.0


In [None]:
# Define model
model = XGBRegressor(random_state = 0)

# NOTE(review): `preprocessor` is only defined inside the commented-out
# ColumnTransformer block in the preprocessing cell above, so running the
# notebook top to bottom raises NameError here. Either re-enable that
# definition or build a preprocessor from pipeline_imp plus an encoding
# step before this cell — TODO confirm which is intended.
#Bundle preprocessing and modeling code in a pipeline
pipeline = Pipeline(steps=[
   ('preprocessor', preprocessor),
   ('model', model)
])

# Hyperparam search using GridSearchCV

Uncomment the code in the cell below to search for hyperparameters with GridSearchCV, then paste the best parameters it reports into the full-data retrain cell.

In [14]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'model__n_estimators' : [400], 
#     'model__learning_rate' : [0.05],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  

# gcv = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
# gcv.fit(X, np.log(y))

# print(gcv.best_estimator_)
# print(gcv.best_score_)
# print(gcv.best_params_)

**Few of the identified hyperparams and associated scores in the hidden cell below**

In [None]:
# param_grid = {
#     'model__n_estimators' : range(50, 450, 50), 
#     'model__learning_rate' : [0.5, 0.1, 0.05, 0.01],
# }  
# gcv = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
# gcv.fit(X, np.log(y))
# -0.1305179327795296
# {'model__learning_rate': 0.1, 'model__n_estimators': 350}
# public score : 0.13731

# param_grid = {
#     'model__n_estimators' : [350], 
#     'model__learning_rate' : [0.1],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  
# -0.12224762414010111
# {'model__alpha': 0, 'model__lambda': 2.0, 'model__learning_rate': 0.1, 'model__max_depth': 3, 
#  'model__n_estimators': 350, 'model__subsample': 0.7}
# public score : 0.13744

# param_grid = {
#     'model__n_estimators' : [350], 
#     'model__learning_rate' : [0.1],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  
# used neg_mean_squared_error and
#     gcv.fit(X, np.log(y))
#     print(-1 * np.sqrt(-1 * gcv.best_score_))
# -0.12280028330394728
# {'model__alpha': 0, 'model__lambda': 2.0, 'model__learning_rate': 0.1, 'model__max_depth': 3, 
#  'model__n_estimators': 350, 'model__subsample': 0.7}


# param_grid = {
#     'model__n_estimators' : range(50, 550, 50), 
#     'model__learning_rate' : [0.5, 0.1, 0.05, 0.01],
#     'model__max_depth' : [3],
#     'model__subsample' : [0.7],
#     'model__lambda' : [2.0],
#     'model__alpha' : [0],
# }  
# -0.12208852576613138
# {'model__alpha': 0, 'model__lambda': 2.0, 'model__learning_rate': 0.1, 
#  'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 0.7}
# public score : 0.13709


# with ord_categorical_transformer
# param_grid = {
#     'model__n_estimators' : [400], 
#     'model__learning_rate' : [0.05],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  
# -0.12115739318576706
# {'model__alpha': 0, 'model__lambda': 1.0, 'model__learning_rate': 0.05, 
#  'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 0.5}
# public score : 0.13436 with y and prediction
# public score : 0.13080 with np.log(y) and np.exp(prediction)

# Train on full data and obtain test predictions

In [16]:
#retrain on full data and obtain test predictions using best model hyperparameter values

# The 'model__' prefix routes each parameter to the pipeline step named 'model'.
best_params = {'model__alpha': 0, 'model__lambda': 1.0, 'model__learning_rate': 0.05, 
               'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 0.5}
pipeline.set_params(**best_params)

# The model is trained on the log of the sale price.
pipeline.fit(X, np.log(y))

# Predict on the test data and undo the log transform to get prices back
pred = np.exp(pipeline.predict(X_test))

print(pred[:10])

[121871.44  158508.14  186366.3   188257.84  188758.7   175689.28
 168794.8   162954.53  182221.1   128713.305]


In [17]:
# Save test predictions to file
# 'Id' comes from the test frame's index (the CSVs were read with index_col='Id').
output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': pred})
# index=False: the Id is already a regular column, so the row index is not written.
output.to_csv('submission.csv', index=False)
print('saved output file')

saved output file


# What next