In [1]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


In [2]:
import pandas as pd
import matplotlib.pyplot as plt

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import EndTailImputer

In [3]:
# Read only the columns needed for this demo: a handful of predictors
# plus SalePrice.

cols_to_use = ["BsmtQual", "FireplaceQu", "LotFrontage", "MasVnrArea", "GarageYrBlt", "SalePrice"]

data = pd.read_csv(
    "/content/drive/MyDrive/Feature Engineering/Datasets/houseprice.csv",
    usecols=cols_to_use,
)
data.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [4]:
# fraction of missing values in each column
data.isna().mean()

LotFrontage    0.177397
MasVnrArea     0.005479
BsmtQual       0.025342
FireplaceQu    0.472603
GarageYrBlt    0.055479
SalePrice      0.000000
dtype: float64

In [5]:
# Split into train and test sets: 30% of the rows are held out for
# testing, and the fixed random_state keeps the split reproducible.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["SalePrice"]),
    data["SalePrice"],
    test_size=0.3,
    random_state=0,
)

(X_train.shape, X_test.shape)

((1022, 5), (438, 5))

In [6]:
# Two choices configure the imputer:
#   - imputation_method: the gaussian approximation or the
#     inter-quartile range proximity rule
#   - tail: whether the replacement value sits at the left or the
#     right end of the distribution
#
# fold is the multiplier applied to the spread; 3 is the library
# default and is spelled out here only for readability.

imputer = EndTailImputer(
    imputation_method="gaussian",
    tail="right",
    fold=3,
)

In [7]:
# Fit the imputer on the training set only; the replacement values
# are learned from X_train and later reused for the test set.

imputer.fit(X_train)

In [8]:
# No `variables` argument was given, so the imputer selected the
# numerical variables on its own; variables_ lists the columns it
# will act on.

imputer.variables_

['LotFrontage', 'MasVnrArea', 'GarageYrBlt']

In [9]:
# imputer_dict_ maps each selected variable to the value that
# will be used to replace its NA

imputer.imputer_dict_

{'LotFrontage': 138.9022201686726,
 'MasVnrArea': 648.3947111415165,
 'GarageYrBlt': 2052.9707419772235}

In [10]:
# Reproduce the imputer's values by hand: for the right tail with the
# gaussian method, each value is mean + 3 * standard deviation of the
# training column, as covered in the first notebooks of this section.

X_train[imputer.variables_].pipe(lambda numeric: numeric.mean() + 3 * numeric.std())

LotFrontage     138.902220
MasVnrArea      648.394711
GarageYrBlt    2052.970742
dtype: float64

In [11]:
# transform hands back pandas DataFrames, so the column names and
# the original row index are kept

X_train_t, X_test_t = (imputer.transform(subset) for subset in (X_train, X_test))

X_train_t.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,138.90222,573.0,Gd,,1998.0
682,138.90222,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,2052.970742
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0


In [12]:
# confirm that the imputed numerical variables no longer
# contain any NA

X_train_t[imputer.variables_].isna().mean()

LotFrontage    0.0
MasVnrArea     0.0
GarageYrBlt    0.0
dtype: float64

In [13]:
# This time impute only 2 of the 3 numerical variables, and place
# the replacement value at the left tail using the IQR proximity rule.
# fold=3 is the library default, written out for readability.

imputer = EndTailImputer(
    imputation_method="iqr",
    tail="left",
    fold=3,
    variables=["LotFrontage", "MasVnrArea"],
)

imputer.fit(X_train)

In [14]:
# with an explicit `variables` list, the imputer is restricted to
# the columns we indicated

imputer.variables_

['LotFrontage', 'MasVnrArea']

In [15]:
# the replacement value learned for each variable; note that with
# the left tail both values come out negative

imputer.imputer_dict_

{'LotFrontage': -8.0, 'MasVnrArea': -510.0}

In [16]:
# apply the fitted imputer to both sets; the result is again a DataFrame

X_train_t, X_test_t = (imputer.transform(subset) for subset in (X_train, X_test))

# the two imputed variables should have no NA left
X_train_t[imputer.variables_].isna().mean()

LotFrontage    0.0
MasVnrArea     0.0
dtype: float64

In [17]:
# Chain two end-tail imputers so different variables get different rules:
# the IQR proximity rule for GarageYrBlt and MasVnrArea, and the gaussian
# approximation for LotFrontage. Both place the value at the right tail.
skewed_imputer = EndTailImputer(
    imputation_method="iqr",
    tail="right",
    variables=["GarageYrBlt", "MasVnrArea"],
)
gaussian_imputer = EndTailImputer(
    imputation_method="gaussian",
    tail="right",
    variables=["LotFrontage"],
)

pipe = Pipeline(
    steps=[
        ("imputer_skewed", skewed_imputer),
        ("imputer_gaussian", gaussian_imputer),
    ]
)

In [18]:
# fit every step of the pipeline on the training set
pipe.fit(X_train)

In [19]:
# replacement values learned by the IQR-based step
pipe.named_steps["imputer_skewed"].imputer_dict_

{'GarageYrBlt': 2121.0, 'MasVnrArea': 680.0}

In [20]:
# replacement value learned by the gaussian step
pipe.named_steps["imputer_gaussian"].imputer_dict_

{'LotFrontage': 138.9022201686726}

In [21]:
# run both sets through the fitted pipeline
X_train_t, X_test_t = (pipe.transform(subset) for subset in (X_train, X_test))

# The three numerical variables handled by the pipeline should show no NA.
# The categorical columns (BsmtQual, FireplaceQu) are not part of any step,
# so their missing values remain.
X_test_t.isna().mean()

LotFrontage    0.000000
MasVnrArea     0.000000
BsmtQual       0.029680
FireplaceQu    0.484018
GarageYrBlt    0.000000
dtype: float64