In [17]:
!pip install feature-engine



In [18]:
import pandas as pd

# to split the datasets
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# from feature-engine
from feature_engine.imputation import CategoricalImputer

In [19]:
# Load only the columns needed for this demo: the two categorical
# features, the three numerical features and the target (SalePrice).

cols_to_use = ["BsmtQual", "FireplaceQu", "LotFrontage", "MasVnrArea", "GarageYrBlt", "SalePrice"]

csv_path = "/content/drive/MyDrive/Feature Engineering/Datasets/houseprice.csv"
data = pd.read_csv(csv_path, usecols=cols_to_use)

# preview the first rows
data.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt,SalePrice
0,65.0,196.0,Gd,,2003.0,208500
1,80.0,0.0,Gd,TA,1976.0,181500
2,68.0,162.0,Gd,TA,2001.0,223500
3,60.0,0.0,TA,Gd,1998.0,140000
4,84.0,350.0,Gd,TA,2000.0,250000


In [20]:
# Split into train and test sets: 30% of the observations are held out
# for testing, and the fixed seed keeps the split reproducible.

features = data.drop("SalePrice", axis=1)
target = data["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=0
)

X_train.shape, X_test.shape

((1022, 5), (438, 5))

In [21]:
# fraction of missing values in each column of the train set
X_train.isna().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [22]:
# we call the imputer from feature-engine.
# By default it replaces NA with the string "Missing".

imputer = CategoricalImputer()

In [23]:
# we fit the imputer on the train set

imputer.fit(X_train)


In [24]:
# no variables were passed to the imputer, and after fitting
# we see that it found the categorical variables on its own

imputer.variables_

['BsmtQual', 'FireplaceQu']

In [25]:
# feature-engine returns a dataframe, so the column names and
# the index are preserved after the transformation

X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

X_train_t.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,,573.0,Gd,Missing,1998.0
682,,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,Missing,
1384,60.0,0.0,TA,Missing,1939.0
1100,60.0,0.0,TA,Missing,1930.0


In [26]:
# let's check that the imputed categorical variables don't
# contain NA any more

X_train_t[imputer.variables_].isnull().mean()

BsmtQual       0.0
FireplaceQu    0.0
dtype: float64

In [27]:
# let's impute 1 variable only: we pass it explicitly
# through the variables argument and fit on the train set

imputer = CategoricalImputer(variables=["BsmtQual"])

imputer.fit(X_train)

In [28]:
# the selected variable: only the column we passed to the imputer

imputer.variables_

['BsmtQual']

In [29]:
# transform the train and test sets; only BsmtQual is imputed,
# so the other columns keep their NA

X_train_t = imputer.transform(X_train)
X_test_t = imputer.transform(X_test)

X_train_t.head()

Unnamed: 0,LotFrontage,MasVnrArea,BsmtQual,FireplaceQu,GarageYrBlt
64,,573.0,Gd,,1998.0
682,,0.0,Gd,Gd,1996.0
960,50.0,0.0,TA,,
1384,60.0,0.0,TA,,1939.0
1100,60.0,0.0,TA,,1930.0


In [30]:
# the selected variable does not contain NA after the imputation
X_train_t[imputer.variables_].isnull().mean()


BsmtQual    0.0
dtype: float64

In [31]:
# let's check the percentage of NA in each variable
# (numerical and categorical) of the original train set

X_train.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.023483
FireplaceQu    0.467710
GarageYrBlt    0.052838
dtype: float64

In [32]:
# Chain two imputers, each restricted to a single categorical column:
# BsmtQual uses the "frequent" imputation method, FireplaceQu uses the
# default method, which fills NA with the string "Missing".
mode_imputer = CategoricalImputer(imputation_method="frequent", variables=["BsmtQual"])
missing_imputer = CategoricalImputer(variables=["FireplaceQu"])

pipe = Pipeline(
    [
        ("imputer_mode", mode_imputer),
        ("imputer_missing", missing_imputer),
    ]
)

In [33]:
# fit both imputation steps of the pipeline on the train set
pipe.fit(X_train)

In [34]:
# variables_ (trailing underscore) is the attribute read from the fitted
# imputers earlier in the notebook; use it here too for consistency
pipe.named_steps["imputer_mode"].variables_

['BsmtQual']

In [35]:
# variables_ (trailing underscore) is the attribute read from the fitted
# imputers earlier in the notebook; use it here too for consistency
pipe.named_steps["imputer_missing"].variables_

['FireplaceQu']

In [36]:
# let's transform the data with the pipeline
X_train_t = pipe.transform(X_train)
X_test_t = pipe.transform(X_test)

# let's check null values are gone from the two categorical variables
# handled by the pipeline (the numerical variables are not imputed)
X_train_t.isnull().mean()

LotFrontage    0.184932
MasVnrArea     0.004892
BsmtQual       0.000000
FireplaceQu    0.000000
GarageYrBlt    0.052838
dtype: float64