# Objectives

Prepare the dataset for ML by:
* Encoding categorical variables appropriately for model input.
* Handling skewed or non-linear numeric features.
* Treating outliers to improve model robustness.
* Removing multicollinear or redundant features.
* Saving consistent train/test datasets after transformations.

# Inputs

* outputs/datasets/cleaned/TrainSetCleaned.csv: cleaned training dataset from the Data Cleaning notebook.
* outputs/datasets/cleaned/TestSetCleaned.csv: cleaned test dataset from the Data Cleaning notebook.

# Outputs

* outputs/datasets/engineered/TrainSetEngineered.csv: fully transformed and engineered training dataset.
* outputs/datasets/engineered/TestSetEngineered.csv: fully transformed and engineered test dataset.

Additional Comments
* This notebook focuses only on the cleaned house records.
* The inherited_houses.csv data will be transformed later, using the same pipeline, once the final model is trained.

# Change Working Directory

In [26]:
import os 

project_root = "/workspaces/milestone-project-heritage-housing-issues"
os.chdir(project_root)

print("Current working directory set to:")
print(os.getcwd())

Current working directory set to:
/workspaces/milestone-project-heritage-housing-issues


# Load Cleaned Data

In [27]:
# Load required libraries
import pandas as pd
import os

# Load cleaned data
train_path = "outputs/datasets/cleaned/TrainSetCleaned.csv"
test_path = "outputs/datasets/cleaned/TestSetCleaned.csv"

TrainSet = pd.read_csv(train_path)
TestSet = pd.read_csv(test_path)

print("TrainSet:", TrainSet.shape)
print("TestSet:", TestSet.shape)
TrainSet.head(3)

TrainSet: (1168, 24)
TestSet: (292, 24)


Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,EnclosedPorch,GarageArea,GarageFinish,...,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,SalePrice
0,1314,0.0,3.0,No,922,Rec,392,0.0,294,RFn,...,70.0,0.0,0,6,5,1314,0.0,1957,1957,145000
1,799,772.0,3.0,No,0,Unf,799,0.0,380,RFn,...,59.0,0.0,40,7,6,799,0.0,1993,1994,178000
2,796,0.0,2.0,No,0,Unf,796,0.0,0,Unf,...,67.0,0.0,0,7,5,796,328.0,1910,1950,85000


# Data Exploration

In [28]:
from ydata_profiling import ProfileReport

pandas_report = ProfileReport(df=TrainSet, minimal=True)
pandas_report.to_notebook_iframe()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

# Feature Engineering

In [29]:
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import warnings
from feature_engine import transformation as vt
from feature_engine.outliers import Winsorizer
from feature_engine.encoding import OrdinalEncoder
sns.set(style="whitegrid")
warnings.filterwarnings('ignore')


def FeatureEngineeringAnalysis(df, analysis_type=None):
    """
    - used for quick feature engineering on numerical and categorical variables
    to decide which transformation can better transform the distribution shape
    - Once transformed, use a reporting tool, like ydata-profiling, to evaluate distributions
    """
    check_missing_values(df)
    allowed_types = ['numerical', 'ordinal_encoder', 'outlier_winsorizer']
    check_user_entry_on_analysis_type(analysis_type, allowed_types)
    list_column_transformers = define_list_column_transformers(analysis_type)

    # Loop in each variable and engineer the data according to the analysis type
    df_feat_eng = pd.DataFrame([])
    for column in df.columns:
        # create additional columns (column_method) to apply the methods
        df_feat_eng = pd.concat([df_feat_eng, df[column]], axis=1)
        for method in list_column_transformers:
            df_feat_eng[f"{column}_{method}"] = df[column]

        # Apply transformers in respective column_transformers
        df_feat_eng, list_applied_transformers = apply_transformers(
            analysis_type, df_feat_eng, column)

        # For each variable, assess how the transformations perform
        transformer_evaluation(
            column, list_applied_transformers, analysis_type, df_feat_eng)

    return df_feat_eng


def check_user_entry_on_analysis_type(analysis_type, allowed_types):
    """ Check analysis type """
    if analysis_type is None:
        raise SystemExit(
            f"You should pass analysis_type parameter as one of the following options: {allowed_types}")
    if analysis_type not in allowed_types:
        raise SystemExit(
            f"analysis_type argument should be one of these options: {allowed_types}")


def check_missing_values(df):
    if df.isna().sum().sum() != 0:
        raise SystemExit(
            f"There is a missing value in your dataset. Please handle that before getting into feature engineering.")


def define_list_column_transformers(analysis_type):
    """ Set suffix columns according to analysis_type"""
    if analysis_type == 'numerical':
        list_column_transformers = [
            "log_e", "log_10", "reciprocal", "power", "box_cox", "yeo_johnson"]

    elif analysis_type == 'ordinal_encoder':
        list_column_transformers = ["ordinal_encoder"]

    elif analysis_type == 'outlier_winsorizer':
        list_column_transformers = ['iqr']

    return list_column_transformers


def apply_transformers(analysis_type, df_feat_eng, column):
    for col in df_feat_eng.select_dtypes(include='category').columns:
        df_feat_eng[col] = df_feat_eng[col].astype('object')

    if analysis_type == 'numerical':
        df_feat_eng, list_applied_transformers = FeatEngineering_Numerical(
            df_feat_eng, column)

    elif analysis_type == 'outlier_winsorizer':
        df_feat_eng, list_applied_transformers = FeatEngineering_OutlierWinsorizer(
            df_feat_eng, column)

    elif analysis_type == 'ordinal_encoder':
        df_feat_eng, list_applied_transformers = FeatEngineering_CategoricalEncoder(
            df_feat_eng, column)

    return df_feat_eng, list_applied_transformers


def transformer_evaluation(column, list_applied_transformers, analysis_type, df_feat_eng):
    # For each variable, assess how the transformations perform
    print(f"* Variable Analyzed: {column}")
    print(f"* Applied transformation: {list_applied_transformers} \n")
    for col in [column] + list_applied_transformers:

        if analysis_type != 'ordinal_encoder':
            DiagnosticPlots_Numerical(df_feat_eng, col)

        else:
            if col == column:
                DiagnosticPlots_Categories(df_feat_eng, col)
            else:
                DiagnosticPlots_Numerical(df_feat_eng, col)

        print("\n")


def DiagnosticPlots_Categories(df_feat_eng, col):
    plt.figure(figsize=(4, 3))
    sns.countplot(data=df_feat_eng, x=col, palette=[
                  '#432371'], order=df_feat_eng[col].value_counts().index)
    plt.xticks(rotation=90)
    plt.suptitle(f"{col}", fontsize=30, y=1.05)
    plt.show()
    print("\n")


def DiagnosticPlots_Numerical(df, variable):
    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    sns.histplot(data=df, x=variable, kde=True, element="step", ax=axes[0])
    stats.probplot(df[variable], dist="norm", plot=axes[1])
    sns.boxplot(x=df[variable], ax=axes[2])

    axes[0].set_title('Histogram')
    axes[1].set_title('QQ Plot')
    axes[2].set_title('Boxplot')
    fig.suptitle(f"{variable}", fontsize=30, y=1.05)
    plt.tight_layout()
    plt.show()


def FeatEngineering_CategoricalEncoder(df_feat_eng, column):
    list_methods_worked = []
    try:
        encoder = OrdinalEncoder(encoding_method='arbitrary', variables=[
                                 f"{column}_ordinal_encoder"])
        df_feat_eng = encoder.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_ordinal_encoder")

    except Exception:
        df_feat_eng.drop([f"{column}_ordinal_encoder"], axis=1, inplace=True)

    return df_feat_eng, list_methods_worked


def FeatEngineering_OutlierWinsorizer(df_feat_eng, column):
    list_methods_worked = []

    # Winsorizer iqr
    try:
        disc = Winsorizer(
            capping_method='iqr', tail='both', fold=1.5, variables=[f"{column}_iqr"])
        df_feat_eng = disc.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_iqr")
    except Exception:
        df_feat_eng.drop([f"{column}_iqr"], axis=1, inplace=True)

    return df_feat_eng, list_methods_worked


def FeatEngineering_Numerical(df_feat_eng, column):
    list_methods_worked = []

    # LogTransformer base e
    try:
        lt = vt.LogTransformer(variables=[f"{column}_log_e"])
        df_feat_eng = lt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_log_e")
    except Exception:
        df_feat_eng.drop([f"{column}_log_e"], axis=1, inplace=True)

    # LogTransformer base 10
    try:
        lt = vt.LogTransformer(variables=[f"{column}_log_10"], base='10')
        df_feat_eng = lt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_log_10")
    except Exception:
        df_feat_eng.drop([f"{column}_log_10"], axis=1, inplace=True)

    # ReciprocalTransformer
    try:
        rt = vt.ReciprocalTransformer(variables=[f"{column}_reciprocal"])
        df_feat_eng = rt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_reciprocal")
    except Exception:
        df_feat_eng.drop([f"{column}_reciprocal"], axis=1, inplace=True)

    # PowerTransformer
    try:
        pt = vt.PowerTransformer(variables=[f"{column}_power"])
        df_feat_eng = pt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_power")
    except Exception:
        df_feat_eng.drop([f"{column}_power"], axis=1, inplace=True)

    # BoxCoxTransformer
    try:
        bct = vt.BoxCoxTransformer(variables=[f"{column}_box_cox"])
        df_feat_eng = bct.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_box_cox")
    except Exception:
        df_feat_eng.drop([f"{column}_box_cox"], axis=1, inplace=True)

    # YeoJohnsonTransformer
    try:
        yjt = vt.YeoJohnsonTransformer(variables=[f"{column}_yeo_johnson"])
        df_feat_eng = yjt.fit_transform(df_feat_eng)
        list_methods_worked.append(f"{column}_yeo_johnson")
    except Exception:
        df_feat_eng.drop([f"{column}_yeo_johnson"], axis=1, inplace=True)

    return df_feat_eng, list_methods_worked

# Dealing with Feature Engineering

* Categorical Encoding – Ordinal
* Numerical Transformation
* Date/Year Feature Engineering
* Smart Correlated Feature Selection

# Categorical Encoding - Ordinal

'OverallCond' and 'OverallQual' are ordered levels, so no need for encoding.

Selected Variables:

In [30]:
variables_engineering = ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
variables_engineering

['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']

Create separate DataFrame

In [31]:
df_engineering = TrainSet[variables_engineering].copy()
df_engineering.head(3)

Unnamed: 0,BsmtExposure,BsmtFinType1,GarageFinish,KitchenQual
0,No,Rec,RFn,TA
1,No,Unf,RFn,TA
2,No,Unf,Unf,TA


Apply Transformations

In [32]:
df_engineering = FeatureEngineeringAnalysis(df=df_engineering, analysis_type='ordinal_encoder')

* Variable Analyzed: BsmtExposure
* Applied transformation: ['BsmtExposure_ordinal_encoder'] 







* Variable Analyzed: BsmtFinType1
* Applied transformation: ['BsmtFinType1_ordinal_encoder'] 







* Variable Analyzed: GarageFinish
* Applied transformation: ['GarageFinish_ordinal_encoder'] 







* Variable Analyzed: KitchenQual
* Applied transformation: ['KitchenQual_ordinal_encoder'] 









In [33]:
# Step 1 - Create a transformer
encoder = OrdinalEncoder(encoding_method='arbitrary', variables=variables_engineering)

# Step 2 - Fit and transform on training set
TrainSet = encoder.fit_transform(TrainSet)

# Step 3 - Transform the test set
TestSet = encoder.transform(TestSet)

print("* Categorical encoding - ordinal transformation done!")
TrainSet.head(3)
TestSet.head(3)

* Categorical encoding - ordinal transformation done!


Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,EnclosedPorch,GarageArea,GarageFinish,...,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,SalePrice
0,1068,0.0,3.0,0,663,5,396,0.0,264,0,...,70.0,0.0,0,8,6,1059,0.0,1963,2003,154500
1,1500,0.0,3.0,1,1032,5,431,0.0,712,0,...,98.0,362.0,32,5,8,1463,0.0,1994,1995,325000
2,1028,0.0,2.0,0,0,1,1008,0.0,360,1,...,56.0,0.0,0,6,5,1008,0.0,1927,1950,115000


# Numerical Feature Transformation

In this step, we analyzed all continuous numeric features to assess their distributions and apply necessary transformations.

Why?
Even though tree-based models like XGBoost don't require scaling, features with high skew (like LotArea or GrLivArea) can impact model learning. Transformations like log-scaling help stabilize variance and reduce skew.

Selected Variables:
* Square footage features (e.g., 1stFlrSF, TotalBsmtSF, GrLivArea)
* Lot dimensions (LotArea, LotFrontage)
* Other structural features (GarageArea, MasVnrArea, etc.)
* Year-based features (YearBuilt, GarageYrBlt, etc.)

Transformation Method:
Using FeatureEngineeringAnalysis() with the "numerical" option, we identified and applied transformations where needed.

In [34]:
variables_engineering = ['1stFlrSF', '2ndFlrSF', 'BedroomAbvGr', 'BsmtFinSF1', 'BsmtUnfSF',
    'TotalBsmtSF', 'GarageArea', 'GarageYrBlt', 'GrLivArea', 'LotArea',
    'LotFrontage', 'MasVnrArea', 'OpenPorchSF', 'YearBuilt', 'YearRemodAdd']
variables_engineering

['1stFlrSF',
 '2ndFlrSF',
 'BedroomAbvGr',
 'BsmtFinSF1',
 'BsmtUnfSF',
 'TotalBsmtSF',
 'GarageArea',
 'GarageYrBlt',
 'GrLivArea',
 'LotArea',
 'LotFrontage',
 'MasVnrArea',
 'OpenPorchSF',
 'YearBuilt',
 'YearRemodAdd']

In [35]:
df_engineering = TrainSet[variables_engineering].copy()
df_engineering.head(3)

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtFinSF1,BsmtUnfSF,TotalBsmtSF,GarageArea,GarageYrBlt,GrLivArea,LotArea,LotFrontage,MasVnrArea,OpenPorchSF,YearBuilt,YearRemodAdd
0,1314,0.0,3.0,922,392,1314,294,1957.0,1314,8400,70.0,0.0,0,1957,1957
1,799,772.0,3.0,0,799,799,380,1993.0,1571,7837,59.0,0.0,40,1993,1994
2,796,0.0,2.0,0,796,796,0,1980.0,796,8777,67.0,0.0,0,1910,1950


In [36]:
df_engineering = FeatureEngineeringAnalysis(df=df_engineering, analysis_type='numerical')

df_engineering.head()

* Variable Analyzed: 1stFlrSF
* Applied transformation: ['1stFlrSF_log_e', '1stFlrSF_log_10', '1stFlrSF_reciprocal', '1stFlrSF_power', '1stFlrSF_box_cox', '1stFlrSF_yeo_johnson'] 















* Variable Analyzed: 2ndFlrSF
* Applied transformation: ['2ndFlrSF_power', '2ndFlrSF_yeo_johnson'] 







* Variable Analyzed: BedroomAbvGr
* Applied transformation: ['BedroomAbvGr_power', 'BedroomAbvGr_yeo_johnson'] 







* Variable Analyzed: BsmtFinSF1
* Applied transformation: ['BsmtFinSF1_power', 'BsmtFinSF1_yeo_johnson'] 







* Variable Analyzed: BsmtUnfSF
* Applied transformation: ['BsmtUnfSF_power', 'BsmtUnfSF_yeo_johnson'] 







* Variable Analyzed: TotalBsmtSF
* Applied transformation: ['TotalBsmtSF_power', 'TotalBsmtSF_yeo_johnson'] 







* Variable Analyzed: GarageArea
* Applied transformation: ['GarageArea_power', 'GarageArea_yeo_johnson'] 







* Variable Analyzed: GarageYrBlt
* Applied transformation: ['GarageYrBlt_log_e', 'GarageYrBlt_log_10', 'GarageYrBlt_reciprocal'

Unnamed: 0,1stFlrSF,1stFlrSF_log_e,1stFlrSF_log_10,1stFlrSF_reciprocal,1stFlrSF_power,1stFlrSF_box_cox,1stFlrSF_yeo_johnson,2ndFlrSF,2ndFlrSF_power,2ndFlrSF_yeo_johnson,...,YearBuilt_power,YearBuilt_box_cox,YearBuilt_yeo_johnson,YearRemodAdd,YearRemodAdd_log_e,YearRemodAdd_log_10,YearRemodAdd_reciprocal,YearRemodAdd_power,YearRemodAdd_box_cox,YearRemodAdd_yeo_johnson
0,1314,7.180831,3.118595,0.000761,36.249138,6.571998,6.549951,0.0,0.0,0.0,...,44.237993,5.311234e+71,5.833207e+71,1957,7.579168,3.291591,0.000511,44.237993,5.920889e+133,1.247205e+103
1,799,6.683361,2.902547,0.001252,28.266588,6.15383,6.135072,772.0,27.784888,4.074338,...,44.643029,7.960436e+71,8.742716e+71,1994,7.597898,3.299725,0.000502,44.654227,1.279264e+134,2.2608780000000002e+103
2,796,6.679599,2.900913,0.001256,28.213472,6.150648,6.131915,0.0,0.0,0.0,...,43.703547,3.096175e+71,3.400504e+71,1950,7.575585,3.290035,0.000513,44.158804,5.109497e+133,1.1130490000000001e+103
3,981,6.888572,2.991669,0.001019,31.32092,6.32696,6.30684,787.0,28.05352,4.080885,...,44.011362,4.228235e+71,4.643796e+71,1950,7.575585,3.290035,0.000513,44.158804,5.109497e+133,1.1130490000000001e+103
4,1026,6.933423,3.011147,0.000975,32.031235,6.364681,6.344264,665.0,25.787594,4.022889,...,43.863424,3.641125e+71,3.998999e+71,1950,7.575585,3.290035,0.000513,44.158804,5.109497e+133,1.1130490000000001e+103


In [37]:
# Not going to apply these numerical transformations at all, as we are using 
# tree-based models which do not require normally distributed features, 
# nor are they sensitive to skewness, variance, or magnitude in the same way linear models are.



# Smart Correlation Selection

In [38]:
df_engineering = TrainSet.copy()
df_engineering.head(3)

Unnamed: 0,1stFlrSF,2ndFlrSF,BedroomAbvGr,BsmtExposure,BsmtFinSF1,BsmtFinType1,BsmtUnfSF,EnclosedPorch,GarageArea,GarageFinish,...,LotFrontage,MasVnrArea,OpenPorchSF,OverallCond,OverallQual,TotalBsmtSF,WoodDeckSF,YearBuilt,YearRemodAdd,SalePrice
0,1314,0.0,3.0,0,922,0,392,0.0,294,0,...,70.0,0.0,0,6,5,1314,0.0,1957,1957,145000
1,799,772.0,3.0,0,0,1,799,0.0,380,0,...,59.0,0.0,40,7,6,799,0.0,1993,1994,178000
2,796,0.0,2.0,0,0,1,796,0.0,0,1,...,67.0,0.0,0,7,5,796,328.0,1910,1950,85000


In [39]:
from feature_engine.selection import SmartCorrelatedSelection

corr_sel = SmartCorrelatedSelection(
    variables=None, method="spearman", threshold=0.6, selection_method="variance"
)

df_engineering = corr_sel.fit_transform(df_engineering)

# Show correlated sets and dropped features
print(corr_sel.correlated_feature_sets_)
print("Features dropped:", corr_sel.features_to_drop_)

[{'1stFlrSF', 'TotalBsmtSF'}, {'GrLivArea', '2ndFlrSF'}, {'SalePrice', 'GarageArea'}, {'YearRemodAdd', 'GarageYrBlt', 'YearBuilt'}]
Features dropped: ['1stFlrSF', '2ndFlrSF', 'GarageArea', 'GarageYrBlt', 'YearRemodAdd']


# Conclusion

Feature Engineering Transformers

* Ordinal categorical encoding: ['BsmtExposure', 'BsmtFinType1', 'GarageFinish', 'KitchenQual']
* Smart Correlation Selection: ['1stFlrSF', '2ndFlrSF', 'GarageArea', 'GarageYrBlt', 'YearRemodAdd']