# References
- sklearn pipeline : https://www.kaggle.com/code/alexisbcook/pipelines
- ordinal categorical features : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from pandas.api.types import CategoricalDtype

from xgboost import XGBRegressor

from sklearn.model_selection import KFold, cross_val_score
from sklearn.feature_selection import mutual_info_regression

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.preprocessing import StandardScaler, OneHotEncoder

from functools import reduce

from category_encoders import MEstimateEncoder, cat_boost

from sklearn.compose import ColumnTransformer

# Load data

In [2]:
# Read the data
X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
X_test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
#handling categorical variables is done later

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

print(X.shape)
print(X_test.shape)

print("Loaded data")

(1460, 79)
(1459, 79)
Loaded data


In [3]:
print(X.Exterior2nd.unique())
print(X_test.Exterior2nd.unique())

['VinylSd' 'MetalSd' 'Wd Shng' 'HdBoard' 'Plywood' 'Wd Sdng' 'CmentBd'
 'BrkFace' 'Stucco' 'AsbShng' 'Brk Cmn' 'ImStucc' 'AsphShn' 'Stone'
 'Other' 'CBlock']
['VinylSd' 'Wd Sdng' 'HdBoard' 'Plywood' 'MetalSd' 'Brk Cmn' 'CmentBd'
 'ImStucc' 'Wd Shng' 'AsbShng' 'Stucco' 'CBlock' 'BrkFace' 'AsphShn' nan
 'Stone']


# Categorical features - special handling
Ref : https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices

In [4]:
# X.OverallQual.unique()
# # array([ 7,  6,  8,  5,  9,  4, 10,  3,  1,  2])

# X.OverallCond.unique()
# # array([5, 8, 6, 7, 4, 2, 3, 9, 1])

In [5]:
# The nominative (unordered) categorical features
features_nom = ["MSSubClass", "MSZoning", "Street", "Alley", "LandContour", "LotConfig", "Neighborhood", 
                "Condition1", "Condition2", "BldgType", "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", 
                "MasVnrType", "Foundation", "Heating", "CentralAir", "GarageType", "MiscFeature", "SaleType", "SaleCondition"]


# The ordinal (ordered) categorical features 
five_levels = ["Po", "Fa", "TA", "Gd", "Ex"]
ten_levels = list(range(1, 11))

ordered_levels = {
    "OverallQual": ten_levels,
    "OverallCond": ten_levels,
    "ExterQual": five_levels,
    "ExterCond": five_levels,
    "BsmtQual": five_levels,
    "BsmtCond": five_levels,
    "HeatingQC": five_levels,
    "KitchenQual": five_levels,
    "FireplaceQu": five_levels,
    "GarageQual": five_levels,
    "GarageCond": five_levels,
    "PoolQC": five_levels,
    "LotShape": ["Reg", "IR1", "IR2", "IR3"],
    "LandSlope": ["Sev", "Mod", "Gtl"],
    "BsmtExposure": ["No", "Mn", "Av", "Gd"],
    "BsmtFinType1": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "BsmtFinType2": ["Unf", "LwQ", "Rec", "BLQ", "ALQ", "GLQ"],
    "Functional": ["Sal", "Sev", "Maj1", "Maj2", "Mod", "Min2", "Min1", "Typ"],
    "GarageFinish": ["Unf", "RFn", "Fin"],
    "PavedDrive": ["N", "P", "Y"],
    "Utilities": ["NoSeWa", "NoSewr", "AllPub"],
    "CentralAir": ["N", "Y"],
    "Electrical": ["Mix", "FuseP", "FuseF", "FuseA", "SBrkr"],
    "Fence": ["MnWw", "GdWo", "MnPrv", "GdPrv"],
}

# to replace later
# "LotShape": ["IR3", "IR2", "IR1", "Reg"]
# "Utilities": ["ELO", "NoSeWa", "NoSewr", "AllPub"],


ordered_levels = {key: ["None"] + value for key, value in
                  ordered_levels.items()}
ordered_levels.keys()

dict_keys(['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC', 'LotShape', 'LandSlope', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Functional', 'GarageFinish', 'PavedDrive', 'Utilities', 'CentralAir', 'Electrical', 'Fence'])

In [6]:
def encode(df):
    # Nominal categories
    for name in features_nom:
        df[name] = df[name].astype("category")
        # Add a None category for missing values
        if "None" not in df[name].cat.categories:
            df[name] = df[name].cat.add_categories("None")
    # Ordinal categories
    for name, levels in ordered_levels.items():
        df[name] = df[name].astype(CategoricalDtype(levels,
                                                    ordered=True))
    return df

In [7]:
# X = encode(X)
# X_test = encode(X_test)

In [8]:
# [(c, X[c].dtype) for c in X.columns]

In [9]:
def impute(df):
    for name in df.select_dtypes("number"):
        df[name] = df[name].fillna(0)
    for name in df.select_dtypes("category"):
        df[name] = df[name].fillna("None")
    return df

# Function to load and preprocess data

In [10]:
# we might need to keep testing different steps of preprocessing and adding new features
# obatining a fresh copy of the data after preprocessing will be helpful for this purpose

# create a function to perform data load and preprocessing together
def load_and_preprocess_data(train_data = True):
    if train_data:
        print("Train data")
        X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv', index_col='Id')
        X.dropna(axis=0, subset=['SalePrice'], inplace=True)
        y = X.SalePrice
        X.drop(['SalePrice'], axis=1, inplace=True)
    else:
        print("Test data")
        X = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv', index_col='Id')
        y = None
    print("Loaded data")
    print(X.shape)
    
    X = encode(X)
    X = impute(X)
    
    return (X, y)

In [11]:
X, y = load_and_preprocess_data()
display(X)

X_test, _ = load_and_preprocess_data(train_data = False)
display(X_test)

Train data
Loaded data
(1460, 79)


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,2,2008,WD,Normal
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,0,,,,0,5,2007,WD,Normal
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,9,2008,WD,Normal
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,,0,2,2006,WD,Abnorml
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,0,,,,0,12,2008,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,8,2007,WD,Normal
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,2,2010,WD,Normal
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2010,WD,Normal


Test data
Loaded data
(1459, 79)


Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,Inside,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,Corner,...,0,0,,,Gar2,12500,6,2010,WD,Normal
1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,MnPrv,,0,3,2010,WD,Normal
1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,Inside,...,0,0,,,,0,6,2010,WD,Normal
1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,Inside,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,6,2006,WD,Normal
2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,4,2006,WD,Abnorml
2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,,,0,9,2006,WD,Abnorml
2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,Inside,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal


# Scoring function

In [12]:
def score_dataset(X, y, model=XGBRegressor()):
    # Label encoding for categoricals
    #
    X = X.copy()
    for colname in X.select_dtypes(["category"]):
        X[colname] = X[colname].cat.codes
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_root_mean_squared_error",
    )
    score = -1 * score.mean()
    return score

In [13]:
#baseline score
# score_dataset(X, y)
# # 0.14471175908169623
# # 0.1403798233991327 - after changing 10 levels range to range(1, 11) from the previous value of range(10)

# Feature Utility Scores

In [14]:
def make_mi_scores(X, y):
    X = X.copy(deep = True)
    for name in X.select_dtypes("number"):
        X[name] = X[name].fillna(0)
    for name in X.select_dtypes("category"):
        X[name] = X[name].fillna("None")
        
    for name in X.select_dtypes(["category"]):
        X[name] = X[name].cat.codes
        
    # All discrete features should now have integer dtypes
    # discrete features are features that are not floats
    discrete_features = [pd.api.types.is_integer_dtype(t) for t in X.dtypes]
    
    mi_scores = mutual_info_regression(X, y, discrete_features=discrete_features, random_state=0)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    mi_scores = mi_scores.sort_values(ascending=False)
    return mi_scores

In [15]:
mi_scores = make_mi_scores(X, y)
mi_scores

OverallQual     5.786503e-01
Neighborhood    5.253900e-01
GarageArea      4.927946e-01
GrLivArea       4.328080e-01
YearBuilt       4.087151e-01
                    ...     
Utilities       1.887379e-15
PoolArea        2.220446e-16
PoolQC          0.000000e+00
MiscVal         0.000000e+00
MoSold          0.000000e+00
Name: MI Scores, Length: 79, dtype: float64

In [16]:
mi_scores[:20], mi_scores[-20:]

(OverallQual     0.578650
 Neighborhood    0.525390
 GarageArea      0.492795
 GrLivArea       0.432808
 YearBuilt       0.408715
 TotalBsmtSF     0.399518
 LotArea         0.392427
 GarageCars      0.360102
 BsmtQual        0.331398
 ExterQual       0.325150
 KitchenQual     0.322472
 1stFlrSF        0.283265
 MSSubClass      0.278183
 YearRemodAdd    0.275659
 FullBath        0.273156
 GarageFinish    0.262945
 GarageYrBlt     0.260881
 LotFrontage     0.216092
 FireplaceQu     0.209348
 TotRmsAbvGrd    0.207962
 Name: MI Scores, dtype: float64,
 ExterCond       1.845965e-02
 LotConfig       1.593551e-02
 BsmtFullBath    1.318343e-02
 Heating         1.287198e-02
 BsmtHalfBath    1.131273e-02
 Functional      1.084641e-02
 LowQualFinSF    6.635548e-03
 RoofMatl        4.894636e-03
 LandSlope       4.280622e-03
 YrSold          3.618707e-03
 BsmtFinSF2      2.734002e-03
 MiscFeature     2.703237e-03
 Condition2      2.664787e-03
 3SsnPorch       7.615664e-04
 Street          3.666896e

In [17]:
X.MiscVal.describe()

count     1460.000000
mean        43.489041
std        496.123024
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max      15500.000000
Name: MiscVal, dtype: float64

MiscVal seems to be mostly all 0s.

PoolQC though has MI = 0, sounds like a possibly useful feature. Might have interaction effects with PoolArea.

Lets remove MoSold, MiscVal and Utilities

In [18]:
X, y = load_and_preprocess_data()
X_test, _ = load_and_preprocess_data(train_data = False)
features_to_drop = ['MoSold', 'MiscVal', 'Utilities']
X.drop(columns = features_to_drop, inplace = True)
X_test.drop(columns = features_to_drop, inplace = True)
print(X.shape)
print(X_test.shape)

# score_dataset(X, y)
# # 0.14420647274995163
# # 0.13814891718435593   - After range(1,11)

Train data
Loaded data
(1460, 79)
Test data
Loaded data
(1459, 79)
(1460, 76)
(1459, 76)


That gives a small increase in performance

In [19]:
def remove_columns_from_list(orig_list, to_remove):
    return [f for f in orig_list if f not in to_remove]

In [20]:
ordinal_categorical_cols = remove_columns_from_list(ordered_levels.keys(), features_to_drop)

# Append features

In [21]:
ms_subclass_mapping = {
    20: "1-STORY 1946 & NEWER ALL STYLES",
    30: "1-STORY 1945 & OLDER",
    40: "1-STORY W/FINISHED ATTIC ALL AGES",
    45: "1-1/2 STORY - UNFINISHED ALL AGES",
    50: "1-1/2 STORY FINISHED ALL AGES",
    60: "2-STORY 1946 & NEWER",
    70: "2-STORY 1945 & OLDER",
    75: "2-1/2 STORY ALL AGES",
    80: "SPLIT OR MULTI-LEVEL",
    85: "SPLIT FOYER",
    90: "DUPLEX - ALL STYLES AND AGES",
    120: "1-STORY PUD (Planned Unit Development) - 1946 & NEWER",
    150: "1-1/2 STORY PUD - ALL AGES",
    160: "2-STORY PUD - 1946 & NEWER",
    180: "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER",
    190: "2 FAMILY CONVERSION - ALL STYLES AND AGES"
}

ms_class_mapping = {
    "1-STORY 1946 & NEWER ALL STYLES": "1-Story",
    "1-STORY 1945 & OLDER": "1-Story",
    "1-STORY W/FINISHED ATTIC ALL AGES": "1-Story",
    "1-STORY PUD (Planned Unit Development) - 1946 & NEWER": "1-Story",
    "1-1/2 STORY - UNFINISHED ALL AGES": "1-1/2 Story",
    "1-1/2 STORY FINISHED ALL AGES": "1-1/2 Story",
    "1-1/2 STORY PUD - ALL AGES": "1-1/2 Story",
    "2-STORY 1946 & NEWER": "2-Story",
    "2-STORY 1945 & OLDER": "2-Story",
    "2-STORY PUD - 1946 & NEWER": "2-Story",
    "SPLIT OR MULTI-LEVEL": "Split-Level",
    "SPLIT FOYER": "Split-Level",
    "PUD - MULTILEVEL - INCL SPLIT LEV/FOYER": "Split-Level",
    "DUPLEX - ALL STYLES AND AGES": "Multi-Family/Duplex",
    "2 FAMILY CONVERSION - ALL STYLES AND AGES": "Multi-Family/Duplex",
    "2-1/2 STORY ALL AGES": "2-1/2 Story",
}

In [22]:
X["MSSubClass"].map(ms_subclass_mapping).map(ms_class_mapping).value_counts()

MSSubClass
1-Story                696
2-Story                422
1-1/2 Story            156
Split-Level             88
Multi-Family/Duplex     82
2-1/2 Story             16
Name: count, dtype: int64

In [23]:
(X["MSSubClass"].map(ms_subclass_mapping)
                .map(ms_class_mapping)
                .astype('category')
                .cat.add_categories("None").fillna("None"))

Id
1       2-Story
2       1-Story
3       2-Story
4       2-Story
5       2-Story
         ...   
1456    2-Story
1457    1-Story
1458    2-Story
1459    1-Story
1460    1-Story
Name: MSSubClass, Length: 1460, dtype: category
Categories (7, object): ['1-1/2 Story', '1-Story', '2-1/2 Story', '2-Story', 'Multi-Family/Duplex', 'Split-Level', 'None']

In [24]:
(X["MSSubClass"].map(ms_subclass_mapping)
                .str.contains('PUD')
                .astype('category')
                .cat.add_categories("None").fillna("None").value_counts())

MSSubClass
False    1300
True      160
None        0
Name: count, dtype: int64

In [25]:
(X["MSSubClass"].map(ms_subclass_mapping)
                .str.contains('PUD')
                .astype('category')
                .cat.add_categories("None").fillna("None"))

Id
1       False
2       False
3       False
4       False
5       False
        ...  
1456    False
1457    False
1458    False
1459    False
1460    False
Name: MSSubClass, Length: 1460, dtype: category
Categories (3, object): [False, True, 'None']

In [26]:
X[['GrLivArea', 'LotArea', '1stFlrSF', '2ndFlrSF', 'TotRmsAbvGrd']]

Unnamed: 0_level_0,GrLivArea,LotArea,1stFlrSF,2ndFlrSF,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1710,8450,856,854,8
2,1262,9600,1262,0,6
3,1786,11250,920,866,6
4,1717,9550,961,756,7
5,2198,14260,1145,1053,9
...,...,...,...,...,...
1456,1647,7917,953,694,7
1457,2073,13175,2073,0,7
1458,2340,9042,1188,1152,9
1459,1078,9717,1078,0,5


In [27]:
X.loc[X['1stFlrSF'] + X['2ndFlrSF'] != X['GrLivArea'], ['MSSubClass', 'GrLivArea', 'LotArea', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'TotRmsAbvGrd']]

Unnamed: 0_level_0,MSSubClass,GrLivArea,LotArea,TotalBsmtSF,1stFlrSF,2ndFlrSF,TotRmsAbvGrd
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
52,50,1176,6240,816,816,0,6
89,50,1526,8470,1013,1013,0,6
126,190,754,6780,520,520,0,5
171,50,1382,12358,720,854,0,7
186,75,3608,22950,1107,1518,1518,12
188,50,1656,10410,660,808,704,8
198,75,3112,25419,1360,1360,1360,8
199,75,2229,5520,755,929,929,8
264,50,1316,5500,926,926,0,6
268,75,2192,8400,720,1052,720,8


In [28]:
def append_features(df):
    df = df.copy()

    #The commented features below ended up decreasing the overall score
    
    df["LivLotRatio"] = df.GrLivArea / df.LotArea
    # df["Spaciousness"] = (df['1stFlrSF'] + df['2ndFlrSF']) / df.TotRmsAbvGrd
    # df["Spaciousness"] = df.GrLivArea / df.TotRmsAbvGrd

    # bldg_dummies = pd.get_dummies(df.BldgType, prefix="Bldg")
    # df = df.join(bldg_dummies.mul(df.GrLivArea, axis=0))
    
    # df["PorchTypes"] = df[["WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch"]].gt(0.0).sum(axis=1)

    # df["TotalOutsideSF"] = df.WoodDeckSF + df.OpenPorchSF + df.EnclosedPorch + df["3SsnPorch"] + df.ScreenPorch

    df["MSClass"] = (X["MSSubClass"].map(ms_subclass_mapping)
                                    .map(ms_class_mapping)
                                    .astype('category')
                                    .cat.add_categories("None")
                                    .fillna("None"))
    df["IsPUD"] = (X["MSSubClass"].map(ms_subclass_mapping)
                                  .str.contains('PUD')
                                  .astype('category')
                                  .cat.add_categories("None")
                                  .fillna("None"))
    # df.drop(columns = "MSSubClass", inplace = True)

    # df["MedNhbdArea"] = df.groupby("Neighborhood")["GrLivArea"].transform("median")

    # #PCA inspired as specified in https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices
    # df["Feature1"] = df.GrLivArea + df.TotalBsmtSF
    # df["Feature2"] = df.YearRemodAdd * df.TotalBsmtSF
    
    return df

In [29]:
X = append_features(X)
print(X.shape)
X_test = append_features(X_test)
print(X_test.shape)
# score_dataset(X_app, y)
# # (1460, 79)
# # 0.13445016760533443

(1460, 79)
(1459, 79)


# Append Cluster information as training features

In [30]:
class AppendKMeans(BaseEstimator, TransformerMixin):
    def __init__(self, cluster_columns, n_clusters=20, return_cluster=True, return_distances=False):
        self.cluster_columns = cluster_columns
        self.n_clusters = n_clusters
        self.return_cluster = return_cluster
        self.return_distances = return_distances

    def fit(self, X, y=None):
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X[self.cluster_columns])  # Scale features
        self.kmeans = KMeans(n_clusters=self.n_clusters, n_init=10, random_state=0)
        self.kmeans.fit(X_scaled)  # Fit K-Means on scaled features
        return self

    def transform(self, X):
        X_scaled = self.scaler.transform(X[self.cluster_columns])  # Apply same scaling as training
        result = X.copy()
        if self.return_cluster:
            result["Cluster"] = self.kmeans.predict(X_scaled)  # Get cluster
        if self.return_distances:
            cluster_distances = self.kmeans.transform(X_scaled)
            cluster_distances = pd.DataFrame(
                    cluster_distances, columns=[f"distance_centroid_{i}" for i in range(cluster_distances.shape[1])]
            )
            cluster_distances.set_index(X.index, inplace = True)
            result = result.join(cluster_distances)
        return result

In [31]:
# cluster_features = [
#     "LotArea",
#     "TotalBsmtSF",
#     "1stFlrSF",
#     "2ndFlrSF",
#     "GrLivArea",
# ]

# pipeline = Pipeline([
#     ('append_kmeans', AppendKMeans([cname for cname in X_app.columns if 
#                 X_app[cname].dtype in ['int64', 'float64']], 
#                                    n_clusters = 7,
#                                    return_cluster=False, return_distances=True)),  
#     ('model', XGBRegressor())         
# ])
# score_dataset(X_app, y, pipeline)
# #0.13514428961917468

In [32]:
# cluster_features = [
#     "LotArea",
#     "TotalBsmtSF",
#     "1stFlrSF",
#     "2ndFlrSF",
#     "GrLivArea",
# ]

# pipeline = Pipeline([
#     ('append_kmeans', AppendKMeans(X_app.columns, 
#                                    n_clusters = 7,
#                                    return_cluster=False, return_distances=True)),  
#     ('model', XGBRegressor())         
# ])
# score_dataset(X_app, y, pipeline)
# 0.1357512856677338

In [33]:
class AppendPCA(BaseEstimator, TransformerMixin):
    def __init__(self, pca_columns, n_components=2, pca_col_prefix="PCA"):
        self.pca_columns = pca_columns
        self.n_components = n_components
        self.pca_col_prefix = pca_col_prefix

    def fit(self, X, y=None):
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X[self.pca_columns])  # Scale features
        self.pca = PCA(n_components=self.n_components, random_state=0)
        self.pca.fit(X_scaled)  # Fit PCA on scaled features
        return self

    def transform(self, X):
        X_scaled = self.scaler.transform(X[self.pca_columns])  # Apply same scaling as training
        pca_components = self.pca.transform(X_scaled)  # Apply PCA
        # print(self.pca.explained_variance_ratio_)
        # print(np.cumsum(self.pca.explained_variance_ratio_))
        pca_components = pd.DataFrame(
                    pca_components, columns=[f"{self.pca_col_prefix}_{i}" for i in range(pca_components.shape[1])]
        )
        result = X.copy()
        pca_components.set_index(X.index, inplace = True)
        result = result.join(pca_components)
        return result

In [34]:
# numerical_cols = [cname for cname in X_app.columns if X_app[cname].dtype in ['int64', 'float64']]

# pipeline = Pipeline([
#     ('append_pca', AppendPCA(numerical_cols, n_components = 5)),  
#     ('model', XGBRegressor())         
# ])
# score_dataset(X_app, y, pipeline)
# # 0.13252821087942276

In [35]:
# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X_app.columns, n_components = 5)),  
#     ('model', XGBRegressor())         
# ])
# score_dataset(X_app, y, pipeline)
# 0.13202011491124924

In [36]:
# numerical_cols = [cname for cname in X_app.columns if X_app[cname].dtype in ['int64', 'float64']]

# pipeline = Pipeline([
#     ('append_pca_num', AppendPCA(numerical_cols, n_components = 5, pca_col_prefix = "PCA_NUM")), 
#     ('append_pca_all', AppendPCA(X_app.columns, n_components = 5)), 
#     ('model', XGBRegressor())         
# ])
# score_dataset(X_app, y, pipeline)
# # 0.13242210519993605

In [37]:
# pipeline = Pipeline([
#     ('append_pca', AppendPCA(X_app.columns, n_components = 5)),  
#     ('append_kmeans', AppendKMeans([f"PCA_{i}" for i in range(5)], 
#                                    n_clusters = 5,
#                                    return_cluster=True, return_distances=False)),
#     ('model', XGBRegressor())         
# ])
# score_dataset(X_app, y, pipeline)
# # 0.1324665988050256

Could create an ensemble of XGBoost of all PCA and XGBoost of (PCA + Kmeans)

# Target Encoding

In [38]:
class CrossFoldEncoder(BaseEstimator, TransformerMixin):
    
    #encoder_other_params should be a dict of argument_name and value
    # This is done to ensure it works properly within Pipeline
    # Not passing it as kwargs, because Pipeline uses sklearn.base.clone() and clone does not retain kwargs
    def __init__(self, cols, encoder, encoder_other_params):
        self.cols = cols
        self.encoder = encoder
        self.cv = KFold(n_splits=5)
        self.encoder_other_params = encoder_other_params  
        print(self.encoder_other_params)
        
    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit(self, X, y):
        self.fitted_encoders_ = []
        X_encoded = []
        for idx_encode, _ in self.cv.split(X):
            fitted_encoder = self.encoder(cols=self.cols, **self.encoder_other_params)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            self.fitted_encoders_.append(fitted_encoder)
        return self

    # To transform the data, average the encodings learned from
    # each fold.
    def transform(self, X):
        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        #drop columns for which target encoding has been created and join with target encodings
        return X.drop(columns=self.cols).join(X_encoded)   

In [39]:
# encoder = CrossFoldEncoder(cols=["Neighborhood", "MSSubClass", "Exterior1st", "Exterior2nd"], 
#                            encoder=MEstimateEncoder, 
#                            encoder_other_params={"m":10.0})
# encoder.fit_transform(X, y)

In [40]:
pipeline = Pipeline([
    ('append_pca', AppendPCA(X.columns, n_components = 5)),  
    ('append_encoder', CrossFoldEncoder(cols=["Neighborhood", "MSSubClass", "Exterior1st", "Exterior2nd"], 
                                        encoder=MEstimateEncoder, 
                                        encoder_other_params={"m":10.0})),
    ('scaling', ColumnTransformer(
        transformers=[
            ('num_scaler', StandardScaler(), 
             ["Neighborhood_encoded", "MSSubClass_encoded", 
              "Exterior1st_encoded", "Exterior2nd_encoded"])
        ],
        remainder="passthrough"  # Pass through other columns untransformed
    )),
    ('model', XGBRegressor())         
])
score_dataset(X, y, pipeline)

{'m': 10.0}
{'m': 10.0}
{'m': 10.0}
{'m': 10.0}
{'m': 10.0}
{'m': 10.0}


0.12862897771977228

# Training pipeline

In [41]:
categorical_cols = [cname for cname in X.columns if
                    X[cname].dtype == "category"]

numerical_cols = [cname for cname in X.columns if 
                X[cname].dtype in ['int64', 'float64']]

small_cat_categorical_cols = [cname for cname in categorical_cols if
                             X[cname].nunique() < 10 and cname not in ordinal_categorical_cols]
large_cat_categorical_cols = [cname for cname in categorical_cols if
                             X[cname].nunique() >= 10 and cname not in ordinal_categorical_cols]

print(len(ordinal_categorical_cols))
print(len(small_cat_categorical_cols))
print(len(large_cat_categorical_cols))
print(len(categorical_cols))  
print(len(numerical_cols))

23
20
4
47
32


In [42]:
X[ordinal_categorical_cols]

Unnamed: 0_level_0,OverallQual,OverallCond,ExterQual,ExterCond,BsmtQual,BsmtCond,HeatingQC,KitchenQual,FireplaceQu,GarageQual,...,LandSlope,BsmtExposure,BsmtFinType1,BsmtFinType2,Functional,GarageFinish,PavedDrive,CentralAir,Electrical,Fence
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,5,Gd,TA,Gd,TA,Ex,Gd,,TA,...,Gtl,No,GLQ,Unf,Typ,RFn,Y,Y,SBrkr,
2,6,8,TA,TA,Gd,TA,Ex,TA,TA,TA,...,Gtl,Gd,ALQ,Unf,Typ,RFn,Y,Y,SBrkr,
3,7,5,Gd,TA,Gd,TA,Ex,Gd,TA,TA,...,Gtl,Mn,GLQ,Unf,Typ,RFn,Y,Y,SBrkr,
4,7,5,TA,TA,TA,Gd,Gd,Gd,Gd,TA,...,Gtl,No,ALQ,Unf,Typ,Unf,Y,Y,SBrkr,
5,8,5,Gd,TA,Gd,TA,Ex,Gd,TA,TA,...,Gtl,Av,GLQ,Unf,Typ,RFn,Y,Y,SBrkr,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,6,5,TA,TA,Gd,TA,Ex,TA,TA,TA,...,Gtl,No,Unf,Unf,Typ,RFn,Y,Y,SBrkr,
1457,6,6,TA,TA,Gd,TA,TA,TA,TA,TA,...,Gtl,No,ALQ,Rec,Min1,Unf,Y,Y,SBrkr,MnPrv
1458,7,9,Ex,Gd,TA,Gd,Ex,Gd,Gd,TA,...,Gtl,No,GLQ,Unf,Typ,RFn,Y,Y,SBrkr,GdPrv
1459,5,6,TA,TA,TA,TA,Gd,Gd,,TA,...,Gtl,Mn,GLQ,Rec,Typ,Unf,Y,Y,FuseA,


In [44]:
large_cat_categorical_cols

['MSSubClass', 'Neighborhood', 'Exterior1st', 'Exterior2nd']

In [45]:
[col + "_encoded" for col in large_cat_categorical_cols]

['MSSubClass_encoded',
 'Neighborhood_encoded',
 'Exterior1st_encoded',
 'Exterior2nd_encoded']

Lets define a Transformer to convert categorical columns to their codes

In [43]:
class OrdinalEncoder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        result = X.copy()
        for col in result.columns:
            result[col] = result[col].cat.codes
        return result

In [44]:
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
ord_categorical_transformer = Pipeline(steps=[
    ('catcode', OrdinalEncoder()),
    ('scaler', StandardScaler())
])
small_categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])


pipeline = Pipeline([
    ('append_pca', AppendPCA(X.columns, n_components = 5)),  
    ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
                                        encoder=MEstimateEncoder, 
                                        encoder_other_params={"m":10.0})),
    ('encoder_scaler', ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols+[col + "_encoded" for col in large_cat_categorical_cols]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
        ])
    ),
    ('model', XGBRegressor())         
])

{'m': 10.0}


The pipeline planned will look something like that given above. 

It'll be better to append PCA columns at the end after encoder_scaler step, so that scaling withon PCA can be avoided.
It'll also be good to scale PCA component columns. Lets make those 2 changes.


Decided not to do the above. Since its problematic to do so within pipeline.
ColumnTransformer outputs numpy array, so accessing specific columns for PCA, or adding PCA components with specific column name for later use is not straight forward (or not possible within pipeline ?)

Instead its much easier to convert to cat.codes within PCA fit, transform

In [45]:
class AppendPCA(BaseEstimator, TransformerMixin):
    def __init__(self, pca_columns, n_components=2, pca_col_prefix="PCA"):
        self.pca_columns = pca_columns
        self.n_components = n_components
        self.pca_col_prefix = pca_col_prefix

    def fit(self, X, y=None):
        X = X.copy()
        for colname in X.select_dtypes(["category"]):
            X[colname] = X[colname].cat.codes
        self.scaler = StandardScaler()
        X_scaled = self.scaler.fit_transform(X[self.pca_columns])  # Scale features
        self.pca = PCA(n_components=self.n_components, random_state=0)
        self.pca.fit(X_scaled)  # Fit PCA on scaled features
        return self

    def transform(self, X):
        result = X.copy()
        X = X.copy()
        for colname in X.select_dtypes(["category"]):
            X[colname] = X[colname].cat.codes
        X_scaled = self.scaler.transform(X[self.pca_columns])  # Apply same scaling as training
        pca_components = self.pca.transform(X_scaled)  # Apply PCA
        # print(self.pca.explained_variance_ratio_)
        # print(np.cumsum(self.pca.explained_variance_ratio_))
        pca_components = pd.DataFrame(
                    pca_components, columns=[f"{self.pca_col_prefix}_{i}" for i in range(pca_components.shape[1])]
        )
        pca_components.set_index(X.index, inplace = True)
        result = result.join(pca_components)
        return result

In [76]:
[f"PCA_{i}" for i in range(5)]

['PCA_0', 'PCA_1', 'PCA_2', 'PCA_3', 'PCA_4']

In [46]:
X, y = load_and_preprocess_data()
X_test, _ = load_and_preprocess_data(train_data = False)

print("removing less important features")
features_to_drop = ['MoSold', 'MiscVal', 'Utilities']
X.drop(columns = features_to_drop, inplace = True)
X_test.drop(columns = features_to_drop, inplace = True)
print(X.shape)
print(X_test.shape)

print("appending features")
X = append_features(X)
print(X.shape)
X_test = append_features(X_test)
print(X_test.shape)
ordinal_categorical_cols = remove_columns_from_list(ordered_levels.keys(), features_to_drop)

Train data
Loaded data
(1460, 79)
Test data
Loaded data
(1459, 79)
removing less important features
(1460, 76)
(1459, 76)
appending features
(1460, 79)
(1459, 79)


In [47]:
categorical_cols = [cname for cname in X.columns if
                    X[cname].dtype == "category"]

numerical_cols = [cname for cname in X.columns if 
                X[cname].dtype in ['int64', 'float64']]

small_cat_categorical_cols = [cname for cname in categorical_cols if
                             X[cname].nunique() < 10 and cname not in ordinal_categorical_cols]
large_cat_categorical_cols = [cname for cname in categorical_cols if
                             X[cname].nunique() >= 10 and cname not in ordinal_categorical_cols]

print(len(ordinal_categorical_cols))
print(len(small_cat_categorical_cols))
print(len(large_cat_categorical_cols))
print(len(categorical_cols))  
print(len(numerical_cols))

23
20
4
47
32


In [81]:
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])
ord_categorical_transformer = Pipeline(steps=[
    ('catcode', OrdinalEncoder())
    # ,
    # ('scaler', StandardScaler())
])
small_categorical_transformer = Pipeline(steps=[
    # ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
    ('small_cat_catcode', OrdinalEncoder())
    # ,
    # ('scaler', StandardScaler())
])


pipeline = Pipeline([
    ('append_pca', AppendPCA(X.columns, n_components = 5)),  
    ('append_target_encoder', CrossFoldEncoder(cols=large_cat_categorical_cols, 
                                        encoder=MEstimateEncoder, 
                                        encoder_other_params={"m":10.0})),
    ('encoder_scaler', ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, [col + "_encoded" for col in large_cat_categorical_cols]+[f"PCA_{i}" for i in range(5)]),
            ('ord_cat', ord_categorical_transformer, ordinal_categorical_cols),
            ('small_cat', small_categorical_transformer, small_cat_categorical_cols)
        ],
        remainder="passthrough")
    ),
    ('model', XGBRegressor())         
])

{'m': 10.0}


Lets redefine scoring function without cat.codes conversion

In [74]:
def score_dataset(X, y, model=XGBRegressor()):
    # Metric for Housing competition is RMSLE (Root Mean Squared Log Error)
    log_y = np.log(y)
    score = cross_val_score(
        model, X, log_y, cv=5, scoring="neg_root_mean_squared_error",
    )
    score = -1 * score.mean()
    return score

In [82]:
score_dataset(X, y, pipeline)

{'m': 10.0}
{'m': 10.0}
{'m': 10.0}
{'m': 10.0}
{'m': 10.0}


0.12792979790024622

With this, its a good time to create a fresh notebook with only necessary code, instead of running all exploratory analysis everytime.