In [1]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


In [2]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import (
    AddMissingIndicator,
    MeanMedianImputer,
    CategoricalImputer,
)

In [3]:
# let's load the dataset with a selected group of variables

# location of the house prices csv, kept in one place so it is easy to change
# NOTE(review): the path sits under /content/drive - presumably a Google Drive
# mounted in Colab; point DATA_PATH at your own copy of the file if it differs
DATA_PATH = "/content/drive/MyDrive/Feature Engineering/Datasets/houseprice.csv"

# two categorical and three numerical predictors, plus the target (SalePrice)
cols_to_use = [
    "BsmtQual",
    "FireplaceQu",
    "LotFrontage",
    "MasVnrArea",
    "GarageYrBlt",
    "SalePrice",
]

# usecols makes pandas read only the selected columns
data = pd.read_csv(DATA_PATH, usecols=cols_to_use)

data.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [4]:
# hold out 30% of the observations as a test set;
# the fixed random_state makes the split reproducible

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["SalePrice"]),  # predictors only
    data["SalePrice"],  # target
    test_size=0.3,
    random_state=0,
)

X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [5]:
# fraction of missing values in each variable of the train set
X_train.isna().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [6]:
# we call the imputer from feature-engine
# the argument missing_only allows us to determine if we want
# to add missing indicators to all variables (False), or only to
# those that show missing data in the train set (True)

imputer = AddMissingIndicator(missing_only=True)

In [7]:
# we fit the imputer on the train set: this is where it finds
# the variables that will receive a missing indicator

imputer.fit(X_train)


In [8]:
# the attribute `variables` shows the variables entered by the user, in this
# case None (we did not pass `variables` when we created the imputer,
# so the cell displays nothing)

imputer.variables

In [9]:
# this attribute stores the variables, numerical and categorical,
# that had missing data in the train set; each of them
# will get its own missing indicator

imputer.variables_

['LotFrontage', 'MasVnrArea', 'BsmtQual', 'FireplaceQu', 'GarageYrBlt']

In [10]:
# feature-engine returns a dataframe
# with the additional features

# no need to concatenate!!

# the imputer fitted on the train set is applied to both data sets
X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

X_train_t.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,LotFrontage_na,MasVnrArea_na,BsmtQual_na,FireplaceQu_na,GarageYrBlt_na
64,,573.0,Gd,,1998.0,1,0,0,1,0
682,,0.0,Gd,Gd,1996.0,1,0,0,0,0
960,50.0,0.0,TA,,,0,0,0,1,1
1384,60.0,0.0,TA,,1939.0,0,0,0,1,0
1100,60.0,0.0,TA,,1930.0,0,0,0,1,0


In [11]:
# fraction of NA per column: adding indicators does not fill in
# the missing values of the original variables

X_train_t.isna().mean()

LotFrontage       0.184932
MasVnrArea        0.004892
BsmtQual          0.023483
FireplaceQu       0.467710
GarageYrBlt       0.052838
LotFrontage_na    0.000000
MasVnrArea_na     0.000000
BsmtQual_na       0.000000
FireplaceQu_na    0.000000
GarageYrBlt_na    0.000000
dtype: float64

In [12]:
# let's add missing indicators again, but this time
# only to a few variables that we select

imputer = AddMissingIndicator(variables=["BsmtQual", "FireplaceQu", "LotFrontage"])

imputer.fit(X_train)

In [13]:
# now the imputer uses only the variables we indicated:
# `variables` holds the list passed by the user

imputer.variables

['BsmtQual', 'FireplaceQu', 'LotFrontage']

In [14]:
# missing indicators will be added for the following variables,
# which may differ from the ones passed by the user

# remember that with the argument missing_only set to True (the default)
# the imputer will learn and store the variables only if they show NA
# in the train dataset

imputer.variables_

['BsmtQual', 'FireplaceQu', 'LotFrontage']

In [15]:
# feature-engine returns a dataframe
# with the additional features

# no need to concatenate!!

# indicators are added only for the variables we selected
X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

X_train_t.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,BsmtQual_na,FireplaceQu_na,LotFrontage_na
64,,573.0,Gd,,1998.0,0,1,1
682,,0.0,Gd,Gd,1996.0,0,0,1
960,50.0,0.0,TA,,,0,1,0
1384,60.0,0.0,TA,,1939.0,0,1,0
1100,60.0,0.0,TA,,1930.0,0,1,0


In [16]:
# imputation pipeline; the steps run in this order:
#   1. add missing indicators, while the NA are still in the data
#   2. replace NA in the categorical variables with the most frequent category
#   3. replace NA in the numerical variables with the median
pipe = Pipeline(
    steps=[
        ("missing_ind", AddMissingIndicator()),
        (
            "imputer_mode",
            CategoricalImputer(
                variables=["FireplaceQu", "BsmtQual"],
                imputation_method="frequent",
            ),
        ),
        (
            "imputer_median",
            MeanMedianImputer(
                variables=["LotFrontage", "MasVnrArea", "GarageYrBlt"],
                imputation_method="median",
            ),
        ),
    ]
)

In [17]:
# fit the pipe on the train set, so that every step
# learns its parameters from X_train
pipe.fit(X_train)

In [18]:
# inspect the separate steps
# first step: variables that will receive a missing indicator
pipe.named_steps["missing_ind"].variables_

['LotFrontage', 'MasVnrArea', 'BsmtQual', 'FireplaceQu', 'GarageYrBlt']

In [19]:
# second step: most frequent category learned for each categorical variable
pipe.named_steps["imputer_mode"].imputer_dict_

{'FireplaceQu': 'Gd', 'BsmtQual': 'TA'}

In [20]:
# third step: median learned for each numerical variable
pipe.named_steps["imputer_median"].imputer_dict_

{'LotFrontage': 69.0, 'MasVnrArea': 0.0, 'GarageYrBlt': 1979.0}

In [21]:
# apply the fitted pipeline to both data sets

# in one pass the pipeline:
# - adds the missing indicators
# - fills NA in the original variables
# so the result is ready to use for ML

X_train_t, X_test_t = pipe.transform(X_train), pipe.transform(X_test)

# the fraction of NA should now be 0 in every column
X_train_t.isna().mean()

LotFrontage       0.0
MasVnrArea        0.0
BsmtQual          0.0
FireplaceQu       0.0
GarageYrBlt       0.0
LotFrontage_na    0.0
MasVnrArea_na     0.0
BsmtQual_na       0.0
FireplaceQu_na    0.0
GarageYrBlt_na    0.0
dtype: float64

In [22]:
# 10 columns: the 5 original variables plus their 5 missing indicators
X_train_t.shape

(1022, 10)