Hi everyone, 

This is a ready-to-submit notebook that is relatively easy to understand. 

If you'd like to go directly to a more sophisticated, higher-scoring notebook, check out [notebook-part2](https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2).

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pandas.api.types import CategoricalDtype

# Load data

In [6]:
from sklearn.model_selection import train_test_split

# Read the train and test sets with identical options.
# MSSubClass is a categorical variable encoded with arbitrarily ordered numbers,
# so it is read as object dtype to make sure it is treated as categorical.
read_options = dict(index_col='Id', dtype={"MSSubClass": "object"})
X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv',
                **read_options)
X_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv',
                     **read_options)

for frame in (X, X_test):
    print(frame.shape)

# Keep only rows that have a target, then split the target off from the predictors
X = X.dropna(axis=0, subset=['SalePrice'])
y = X['SalePrice']
X = X.drop(columns=['SalePrice'])

# Every object-dtype column is treated as categorical
categorical_cols = [cname for cname, dtype in X.dtypes.items()
                    if dtype == "object"]

# Integer and float columns are treated as numerical
numeric_dtypes = ['int64', 'float64']
numerical_cols = [cname for cname, dtype in X.dtypes.items()
                  if dtype in numeric_dtypes]

# "Cardinality" means the number of unique values in a column.
# Split the categorical columns at a cardinality of 10 (convenient but arbitrary).
cardinality = X[categorical_cols].nunique()
small_cat_categorical_cols = [cname for cname in categorical_cols
                              if cardinality[cname] < 10]
large_cat_categorical_cols = [cname for cname in categorical_cols
                              if cardinality[cname] >= 10]

for column_group in (small_cat_categorical_cols,
                     large_cat_categorical_cols,
                     categorical_cols):
    print(len(column_group))

for frame in (X, X_test):
    print(frame.shape)

print(categorical_cols)
print(numerical_cols)

print("Loaded data")

(1460, 80)
(1459, 79)
40
4
44
(1460, 79)
(1459, 79)
['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']
['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars'

# Ordinal categorical features - special handling
Ref : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [7]:
# large_cat_categorical_cols
# ['MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd']

#small_cat_categorical_cols

# X.LandSlope.unique()
# array(['Gtl', 'Mod', 'Sev'], dtype=object)

# X.ExterQual.unique()
# array(['Gd', 'TA', 'Ex', 'Fa'], dtype=object)

# X.ExterCond.unique()
# array(['TA', 'Gd', 'Fa', 'Po', 'Ex'], dtype=object)

# X.BsmtQual.unique()
# array(['Gd', 'TA', 'Ex', nan, 'Fa'], dtype=object)

# X.BsmtCond.unique()
# array(['TA', 'Gd', nan, 'Fa', 'Po'], dtype=object)

# X.BsmtExposure.unique()
# array(['No', 'Gd', 'Mn', 'Av', nan], dtype=object)

# X.BsmtFinType1.unique()
# array(['GLQ', 'ALQ', 'Unf', 'Rec', 'BLQ', nan, 'LwQ'], dtype=object)

# X.BsmtFinType2.unique()
# array(['Unf', 'BLQ', nan, 'ALQ', 'Rec', 'LwQ', 'GLQ'], dtype=object)

#The above small category columns are clearly ordinal but their default ordering is incorrect. 
# Similar case for HeatingQC, KitchenQual, FireplaceQu, GarageQual, GarageCond, PoolQC
# Use these as ordinal categories

# Quality / condition scale shared by many columns, ordered from worst to best.
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ordered_levels = {
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"]
}
# Prepend a "None" level, so the real levels get codes starting at 1.
# Missing entries are NaN rather than the string "None"; they fall outside the
# declared categories and `.cat.codes` gives them -1, which still sorts below
# every real level.
ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}

# The ordinal columns get their own transformer, so take them out of the
# one-hot group. Filtering with a comprehension (instead of a set difference)
# keeps the original column order, so the feature layout is the same on every
# run regardless of string hash randomisation.
small_cat_categorical_cols = [cname for cname in small_cat_categorical_cols
                              if cname not in ordered_levels]

for name, levels in ordered_levels.items():
    ordinal_dtype = CategoricalDtype(levels, ordered=True)
    for frame in (X, X_test):
        # A column that already holds integer codes was encoded by an earlier
        # run of this cell; casting it again would turn every value into -1.
        if pd.api.types.is_integer_dtype(frame[name]):
            continue
        frame[name] = frame[name].astype(ordinal_dtype).cat.codes

print(len(ordered_levels.keys()))
print(len(small_cat_categorical_cols))
print(len(large_cat_categorical_cols))
print(len(categorical_cols))    

14
26
4
44


# Training pipeline

In [8]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBRegressor

def _impute_median_and_scale():
    """Return a new pipeline that fills gaps with the median and then standardises."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ])


def _impute_mode_then(step_name, encoder):
    """Return a new pipeline that fills gaps with the most frequent value, then encodes."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        (step_name, encoder),
    ])


# Numerical columns: median imputation followed by standard scaling
numerical_transformer = _impute_median_and_scale()

# Ordinal-coded categorical columns receive the same treatment as numerical ones
ord_categorical_transformer = _impute_median_and_scale()

# Low-cardinality categorical columns are one-hot encoded;
# categories unseen during fit are ignored at transform time
small_categorical_transformer = _impute_mode_then(
    'onehot',
    OneHotEncoder(handle_unknown='ignore', sparse_output=False),
)

# High-cardinality categorical columns are integer encoded;
# categories unseen during fit are mapped to -1
large_categorical_transformer = _impute_mode_then(
    'ord',
    OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
)

# Route each group of columns to its transformer
column_routes = [
    ('num', numerical_transformer, numerical_cols),
    ('ord_cat', ord_categorical_transformer, list(ordered_levels)),
    ('small_cat', small_categorical_transformer, small_cat_categorical_cols),
    ('large_cat', large_categorical_transformer, large_cat_categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# Gradient-boosted tree regressor with a fixed seed
model = XGBRegressor(random_state=0)

# Chain preprocessing and the model so they are fitted and applied together
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', model),
])

# Hyperparam search using GridSearchCV

Uncomment the code in the cell below to identify hyperparameters using GridSearchCV, then paste the identified best params into the full-data retrain cell.

In [9]:
# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'model__n_estimators' : [400], 
#     'model__learning_rate' : [0.05],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  

# gcv = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
# gcv.fit(X, np.log(y))

# print(gcv.best_estimator_)
# print(gcv.best_score_)
# print(gcv.best_params_)

**A few of the identified hyperparameters and their associated scores are in the hidden cell below**

In [10]:
# param_grid = {
#     'model__n_estimators' : range(50, 450, 50), 
#     'model__learning_rate' : [0.5, 0.1, 0.05, 0.01],
# }  
# gcv = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error', n_jobs=-1)
# gcv.fit(X, np.log(y))
# -0.1305179327795296
# {'model__learning_rate': 0.1, 'model__n_estimators': 350}
# public score : 0.13731

# param_grid = {
#     'model__n_estimators' : [350], 
#     'model__learning_rate' : [0.1],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  
# -0.12224762414010111
# {'model__alpha': 0, 'model__lambda': 2.0, 'model__learning_rate': 0.1, 'model__max_depth': 3, 
#  'model__n_estimators': 350, 'model__subsample': 0.7}
# public score : 0.13744

# param_grid = {
#     'model__n_estimators' : [350], 
#     'model__learning_rate' : [0.1],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  
# used neg_mean_squared_error and
#     gcv.fit(X, np.log(y))
#     print(-1 * np.sqrt(-1 * gcv.best_score_))
# -0.12280028330394728
# {'model__alpha': 0, 'model__lambda': 2.0, 'model__learning_rate': 0.1, 'model__max_depth': 3, 
#  'model__n_estimators': 350, 'model__subsample': 0.7}


# param_grid = {
#     'model__n_estimators' : range(50, 550, 50), 
#     'model__learning_rate' : [0.5, 0.1, 0.05, 0.01],
#     'model__max_depth' : [3],
#     'model__subsample' : [0.7],
#     'model__lambda' : [2.0],
#     'model__alpha' : [0],
# }  
# -0.12208852576613138
# {'model__alpha': 0, 'model__lambda': 2.0, 'model__learning_rate': 0.1, 
#  'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 0.7}
# public score : 0.13709


# with ord_categorical_transformer
# param_grid = {
#     'model__n_estimators' : [400], 
#     'model__learning_rate' : [0.05],
#     'model__max_depth' : range(3, 10, 1),
#     'model__subsample' : np.arange(0.5, 1.05, 0.1),
#     'model__lambda' : [0, 0.5, 1.0, 1.5, 2.0],
#     'model__alpha' : [0, 0.5, 1.0, 1.5, 2.0],
# }  
# -0.12115739318576706
# {'model__alpha': 0, 'model__lambda': 1.0, 'model__learning_rate': 0.05, 
#  'model__max_depth': 3, 'model__n_estimators': 400, 'model__subsample': 0.5}
# public score : 0.13436 with y and prediction
# public score : 0.13080 with np.log(y) and np.exp(prediction)

# Train on full data and obtain test predictions

In [11]:
# Refit the pipeline on the full training data with the chosen hyperparameter values
best_params = {
    'model__alpha': 0,
    'model__lambda': 1.0,
    'model__learning_rate': 0.05,
    'model__max_depth': 3,
    'model__n_estimators': 400,
    'model__subsample': 0.5,
}
pipeline.set_params(**best_params)

# The model is fitted on the log of the sale price ...
log_y = np.log(y)
pipeline.fit(X, log_y)

# ... so test predictions are mapped back to prices with exp
log_pred = pipeline.predict(X_test)
pred = np.exp(log_pred)

print(pred[:10])

[121871.44 158508.16 186366.31 188257.86 188758.72 175689.28 168794.78
 162954.52 182221.11 128713.3 ]


In [12]:
# Assemble the submission table (one row per test Id) and write it to disk
submission_columns = {
    'Id': X_test.index,
    'SalePrice': pred,
}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print('saved output file')

saved output file


# References
- sklearn pipeline : https://www.kaggle.com/code/alexisbcook/pipelines
- ordinal categorical features : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

# What next

If you have been able to submit this notebook to the competition, I'm sure you would have seen multiple ways to improve on this.

You can find plenty of ways to improve using the ideas in this reference notebook : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices.

If you found this notebook helpful, please upvote 😄

Do check out the [notebook-part2](https://www.kaggle.com/code/abhivij/housing-price-prediction-part-2) which describes the approach to obtain a high score.

