# Load Data

In [1]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Make the parent of the notebook's working directory importable.
# The membership check keeps the cell idempotent: re-running it does not
# append the same entry to sys.path again.
_project_root = str(Path().resolve().parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

from src.data_access import load_housing_data

In [2]:
# Load the housing data through the project helper; its two return values are
# bound as `train` and `test` (presumably the train/test splits — confirm in
# src.data_access).
train, test = load_housing_data()

# Insights from EDA

## Feature Engineering Ideas

🏠 Size & Area Features

TotalSF = TotalBsmtSF + 1stFlrSF + 2ndFlrSF → overall living + basement size.

PorchSF = OpenPorchSF + EnclosedPorch + 3SsnPorch + ScreenPorch → one combined porch measure.

TotalBaths = FullBath + HalfBath*0.5 + BsmtFullBath + BsmtHalfBath*0.5 → cleaner bathroom measure.

BasementPct = TotalBsmtSF / TotalSF → proportion of house that’s basement.

LotRatio = GrLivArea / LotArea → density of construction.

AvgRoomSize = GrLivArea / TotRmsAbvGrd (even if you drop TotRmsAbvGrd, this ratio is useful).

⏳ Time-Based Features

Use YrSold as reference:

HouseAge = YrSold – YearBuilt → age of the house at sale.

RemodAge = YrSold – YearRemodAdd → how long since last remodel.

GarageAge = YrSold – GarageYrBlt (clip negatives to 0).

IsRemodeled = binary flag if YearBuilt != YearRemodAdd.

DecadeBuilt = (YearBuilt // 10) * 10 → bin into decades.

🚪 Garage & Basement Features

HasGarage = binary flag: 1 if GarageType is present (not missing) or GarageCars > 0, else 0.

GarageCapacityPerSF = GarageCars / GrLivArea → relative size.

FinishedBsmtPct = (TotalBsmtSF - BsmtUnfSF) / TotalBsmtSF → % finished basement (set to 0 when TotalBsmtSF = 0 to avoid division by zero).

HasBasement = binary if TotalBsmtSF > 0.

🏡 Location & Lot Features

CornerLot = flag from LotConfig == 'Corner'.

LotFrontageRatio = LotFrontage / LotArea → shape measure.

Neighborhood_Tier = group neighborhoods by median SalePrice (high/med/low tier).

MSSubClass_Category = map MSSubClass codes into meaningful categories (e.g., “1-Story,” “2-Story,” “Split Level”).

🔥 Quality / Condition Interactions

OverallQualityIndex = combine OverallQual (numeric) + OverallCond (ordinal) → more stable quality measure.

ExterScore = mean of ExterQual and ExterCond (after ordinal encoding of both).

GarageScore = mean of GarageQual and GarageCond (after ordinal encoding of both).

KitchenScore = just KitchenQual, or combine with Functional.

QualityAgeInteraction = OverallQual * HouseAge → newer but poor-quality vs. older but well-built.

⚡ Utility / Convenience Flags

HasCentralAir = from CentralAir.

Has2ndFlr = binary if 2ndFlrSF > 0.

HasPorch = binary if PorchSF > 0.

HasWoodDeck = binary if WoodDeckSF > 0.

HasMasonryVeneer = binary if MasVnrArea > 0.

🎯 Interaction Ideas (cross-bucket)

Size × Neighborhood: a big house in a low-value neighborhood doesn’t increase price as much as in a high-value one. Could be modeled with interaction terms.

Quality × Area: OverallQual * GrLivArea → large but poor-quality homes might not price the same as smaller but high-quality ones.

Condition × YearBuilt: newer homes in “adjacent to positive feature” conditions might be premium.

## Feature Classification and details

In [12]:
# Feature buckets from the EDA. Each list names the columns that share one
# preprocessing treatment; the bucket name states the intended treatment.

# Categorical columns handled as nominal (no order between levels).
nominal_cols = [
    "MSSubClass", "MoSold", "BldgType", "MasVnrType", "GarageType",
    "SaleType", "Condition1", "Condition2", "SaleCondition", "Neighborhood",
    "Exterior1st", "HouseStyle", "RoofMatl", "BsmtFinType2", "RoofStyle",
    "BsmtFinType1", "Heating", "Foundation", "LotConfig", "MSZoning",
    "Electrical",
]

# Columns earmarked to be collapsed into a binary flag.
collapse_to_binary_cols = [
    "LowQualFinSF", "MiscVal", "3SsnPorch", "PoolArea",
    "BsmtFullBath", "HalfBath", "BsmtHalfBath", "BsmtFinSF2",
    "EnclosedPorch", "ScreenPorch", "Fence",
]

# Columns flagged as right-skewed (includes the SalePrice column).
right_skewed_cols = [
    "LotArea", "GrLivArea", "BsmtUnfSF", "SalePrice",
    "1stFlrSF", "TotalBsmtSF", "LotFrontage",
]

# Columns flagged as both skewed and candidates for a binary flag.
skewed_and_binary = ["2ndFlrSF", "OpenPorchSF", "WoodDeckSF", "MasVnrArea"]

# Columns to remove from the feature set.
drop = [
    "PoolQC", "MiscFeature", "Utilities", "Id",
    "GarageArea", "TotRmsAbvGrd", "Alley", "Exterior2nd",
]

# Object-typed columns whose levels are treated as ordered.
obj_ordinal_cols = [
    "ExterQual", "ExterCond", "BsmtQual", "BsmtCond",
    "HeatingQC", "KitchenQual", "FireplaceQu", "GarageQual",
    "GarageCond", "Functional", "LandContour", "LotShape",
    "BsmtExposure", "LandSlope", "GarageFinish", "PavedDrive",
]

# Object-typed columns that already have two levels.
obj_already_binary_cols = ["Street", "CentralAir"]

# Year-valued columns.
year_cols = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", "YrSold"]

# Free-text notes on how to regroup numeric columns whose value counts are
# heavily imbalanced. Keys are column names; values are reminders for the
# author, not machine-readable rules.
# NOTE(review): 'TotRmsAbvGrd' also appears in the `drop` list, while the
# 'BedroomAbvGr' note suggests keeping TotRmsAbvGrd instead — decide which.
imbalanced_numerical = {
    'KitchenAbvGr': 'collapse to multiple kitches flag.',
    'TotRmsAbvGrd': 'join 2 and 3, and 10, 11, 12, 14',
    'OverallCond': 'collapsed into poor average good',
    'BedroomAbvGr': 'Collapse to 3 bins (Small / Typical / Large) OR drop in favor of TotRmsAbvGrd.',
    'GarageCars': 'collapse 4 into 3'
}

# Backward-compatible alias: the dictionary was originally defined under this
# misspelled name, so keep it bound to the same object.
imabalanced_numerical = imbalanced_numerical

# Free-text notes on how to regroup rare levels of imbalanced categorical
# columns, keyed by column name.
imbalanced_object = dict(
    Condition1='Collapse very rare categories (<= 5 obs) into "other"',
    Condition2='Collapse into "Norm" vs "Other"',
    SaleCondition='collapse Adjland, alloca, and family into other',
    GarageType='collapse less than 10 into other',
    SaleType='Collapse less than 10 into other',
    Exterior1st='combine less than and combine Wd Sdng and Wd Shng',
    RoofMatl='collapse into other all except CompShg and WdShngl',
    RoofStyle='keep Gambrel and Hip, collapse others into other',
    Heating='keep GasA and GasW, collapse others into other',
    Foundation='collapse stone and wood into others',
    Electrical='collapse FusP and Mix',
)

# Registry of the feature buckets, keyed by the bucket's variable name, so
# they can be iterated or looked up by name.
lists_dict = dict(
    nominal_cols=nominal_cols,
    collapse_to_binary_cols=collapse_to_binary_cols,
    right_skewed_cols=right_skewed_cols,
    skewed_and_binary=skewed_and_binary,
    drop=drop,
    obj_ordinal_cols=obj_ordinal_cols,
    obj_already_binary_cols=obj_already_binary_cols,
    year_cols=year_cols,
)

The cleanest approach is to build a pipeline for each feature bucket (e.g., skewed numeric, ordinal, nominal, year, binary), chaining the relevant steps like imputing, encoding, scaling, or transformations. These pipelines are then combined inside a ColumnTransformer, which applies the right sequence of transformations to each group while keeping the whole preprocessing reproducible and consistent.

# Imputation Column Transformer

In [22]:
# Column groups for the imputation stage; the group name states the fill rule.

# Categorical columns whose missing values are filled with a constant label
# (presumably because a missing value means the feature is absent — confirm).
impute_structural_none = [
    'MasVnrType', 'GarageType', 'BsmtFinType1', 'BsmtFinType2',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'FireplaceQu',
    'GarageQual', 'GarageCond', 'GarageFinish',
]

# Columns filled with their most frequent value.
impute_mode_fill = [
    # Nominal
    'MSSubClass', 'MoSold', 'BldgType', 'SaleType',
    'Condition1', 'Condition2', 'SaleCondition', 'Neighborhood',
    'Exterior1st', 'HouseStyle', 'RoofMatl', 'RoofStyle',
    'Heating', 'Foundation', 'LotConfig', 'MSZoning', 'Electrical',
    # Ordinal
    'ExterQual', 'ExterCond', 'HeatingQC', 'KitchenQual', 'Functional',
    'LandContour', 'LotShape', 'LandSlope', 'PavedDrive',
]

# Columns filled with 0.
impute_zero_fill = [
    '2ndFlrSF', 'OpenPorchSF', 'WoodDeckSF', 'MasVnrArea', 'LowQualFinSF',
    'MiscVal', '3SsnPorch', 'PoolArea', 'BsmtFullBath', 'HalfBath',
    'BsmtHalfBath', 'BsmtFinSF2', 'EnclosedPorch', 'ScreenPorch', 'Fence',
]

# Columns filled with their median.
impute_median_fill = [
    'LotArea', 'GrLivArea', 'BsmtUnfSF',
    '1stFlrSF', 'TotalBsmtSF', 'LotFrontage',
]

# Year-valued columns, listed separately from the other numeric columns.
impute_year_cols = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

In [75]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Imputation stage: each column group gets its own SimpleImputer and the
# result is returned as a pandas DataFrame.
#
# verbose_feature_names_out=False keeps the original column names in the
# output. With the default (True) every column would be renamed to
# "<transformer>__<column>" (e.g. "mode_imputer__MSSubClass"), which breaks
# later steps that select columns by their original names. The column groups
# are disjoint, so unprefixed names cannot collide.
#
# NOTE(review): `remainder` is left at its default ('drop'), so any input
# column that is not in one of the four groups below is absent from the
# output. Pass remainder='passthrough' if downstream steps need such columns.
imputer = ColumnTransformer(
    transformers=[
        # Constant string label for the "structural none" categorical group.
        ('none_imputer', SimpleImputer(strategy='constant', fill_value='None'), impute_structural_none),
        # Most frequent value for the remaining categorical columns.
        ('mode_imputer', SimpleImputer(strategy='most_frequent'), impute_mode_fill),
        # Zero for the zero-fill group.
        ('zero_imputer', SimpleImputer(strategy='constant', fill_value=0), impute_zero_fill),
        # Median for the median-fill group and for the year columns.
        ('median_imputer', SimpleImputer(strategy='median'), impute_median_fill + impute_year_cols)
    ],
    verbose_feature_names_out=False,
).set_output(transform='pandas')

# Nominal Pipeline

In [73]:
from sklearn.base import TransformerMixin, BaseEstimator

class CollapseRareCategories(TransformerMixin, BaseEstimator):
    """Replace infrequent category levels with the single label ``"Other"``.

    During ``fit`` the value counts of every column in ``cols`` are computed,
    and the levels occurring fewer than ``threshold`` times are recorded.
    During ``transform`` those levels are overwritten with ``"Other"``. The
    set and order of columns is left unchanged.

    Parameters
    ----------
    cols : list of str, default=nominal_cols
        Names of the columns to inspect. The default is the module-level
        ``nominal_cols`` list as it exists when this class is defined.
    threshold : int, default=10
        Levels with a count strictly below this value are treated as rare.

    Attributes
    ----------
    to_transform_ : dict
        Maps each column in ``cols`` to the list of its rare levels.
        Created by ``fit``.
    feature_names_in_ : pandas.Index or numpy.ndarray
        Column labels of the data seen in ``fit``.
    """

    def __init__(self, cols=nominal_cols, threshold=10):
        # Only hyper-parameters are stored here. Learned state (names ending
        # in "_") is created in fit, so an unfitted instance is recognisable
        # as unfitted instead of silently acting as a no-op.
        self.cols = cols
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn which levels of each column in ``cols`` are rare.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data; must contain every column named in ``cols``.
        y : ignored

        Returns
        -------
        self
        """
        # Record input feature names
        self.feature_names_in_ = X.columns if isinstance(X, pd.DataFrame) else np.arange(X.shape[1])

        # Build the mapping from scratch so that refitting (for example after
        # changing `cols`) never keeps entries from a previous fit.
        rare_levels = {}
        for feature in self.cols:
            counts = X[feature].value_counts()
            rare_levels[feature] = counts[counts < self.threshold].index.tolist()
        self.to_transform_ = rare_levels
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with rare levels replaced by ``"Other"``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``fit`` has not been called.
        """
        from sklearn.utils.validation import check_is_fitted

        check_is_fitted(self, "to_transform_")
        X_copy = X.copy()

        for feature, rare_cats in self.to_transform_.items():
            # Numeric columns are cast to object first so that the string
            # label "Other" can be stored next to the original values.
            if pd.api.types.is_numeric_dtype(X_copy[feature]):
                X_copy[feature] = X_copy[feature].astype("object")
            X_copy.loc[X_copy[feature].isin(rare_cats), feature] = "Other"
        return X_copy

    def get_feature_names_out(self, input_features=None):
        """Return the output feature names, which equal the input names.

        Collapsing levels does not add or remove columns, so the names seen
        in ``fit`` (or ``input_features`` when given) are returned as-is.
        """
        if input_features is None:
            return np.array(self.feature_names_in_)
        return np.array(input_features)

from sklearn.base import TransformerMixin, BaseEstimator

class collapse_rare_categories(TransformerMixin, BaseEstimator):
    """Replace infrequent category levels with the label ``'Other'``.

    Earlier draft of the rare-level collapser, overlapping in purpose with
    ``CollapseRareCategories`` in this notebook. Repaired here so that it
    runs: the previous version referenced undefined names, ignored
    ``threshold``, and returned nothing from ``fit`` and ``transform``.

    Parameters
    ----------
    cols : list of str
        Names of the columns to inspect.
    threshold : int, default=10
        Levels with a count strictly below this value are treated as rare.

    Attributes
    ----------
    counts : dict
        ``{column: {level: count}}`` as observed in the last ``fit``.
    to_transform_ : dict
        ``{column: [rare levels]}``. Created by ``fit``.
    """

    def __init__(self, cols, threshold=10):
        self.threshold = threshold
        self.counts = {}
        self.cols = cols

    def fit(self, X, y=None):
        """Count the levels of each column in ``cols`` and record the rare ones.

        Returns
        -------
        self
        """
        # Rebuild both mappings on every fit so nothing stale survives a refit.
        self.counts = {}
        self.to_transform_ = {}
        for feature in self.cols:
            level_counts = X[feature].value_counts()
            self.counts[feature] = level_counts.to_dict()
            self.to_transform_[feature] = [
                cat for cat, n_rows in self.counts[feature].items()
                if n_rows < self.threshold
            ]
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with rare levels replaced by ``'Other'``."""
        X_out = X.copy()
        for feature, cat_list in self.to_transform_.items():
            # Numeric columns are cast to object so the string label can be
            # stored next to the original values.
            if X_out[feature].dtype.kind in 'biuf':
                X_out[feature] = X_out[feature].astype('object')
            X_out.loc[X_out[feature].isin(cat_list), feature] = 'Other'
        return X_out

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Preprocessing for nominal features, in two steps:
#   1. 'collapse' - CollapseRareCategories with its default arguments;
#   2. 'one_hot'  - one-hot encoding with handle_unknown='ignore', so levels
#      not seen during fit do not raise at transform time.
nominal_pipeline = Pipeline(
    steps=[
        ('collapse', CollapseRareCategories()),
        ('one_hot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

# Binary Pipeline