In [2]:
!pip install feature-engine

Collecting feature-engine
  Downloading feature_engine-1.6.2-py2.py3-none-any.whl (328 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/328.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/328.9 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m328.9/328.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: feature-engine
Successfully installed feature-engine-1.6.2


In [3]:
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

from feature_engine.imputation import DropMissingData

In [4]:
# Download the house prices data from Kaggle and store it at the path
# used below (presumably a folder in a mounted Google Drive — adjust the
# path if the file lives elsewhere, e.g. next to this notebook).

data = pd.read_csv("/content/drive/MyDrive/Feature Engineering/Datasets/houseprice.csv")

# Preview the first rows.
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
# Hold out 30% of the observations as a test set.
# The identifier and the target are excluded from the predictors;
# the fixed seed makes the split repeatable.

X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns=["Id", "SalePrice"]),
    data["SalePrice"],
    random_state=0,
    test_size=0.3,
)

(X_train.shape, X_test.shape)

((1022, 79), (438, 79))

In [6]:
# Drop observations that have NA in any of the indicated variables
# (no threshold is set, so a single NA among them removes the row).

imputer = DropMissingData(
    variables=["Alley", "MasVnrType", "LotFrontage", "MasVnrArea"],
    missing_only=False,
)

In [7]:
# Fit on the train set only; after fitting, the variables that will be
# checked for NA are available in `imputer.variables_`.
imputer.fit(X_train)

In [8]:
# Variables from which observations with NA will be deleted.

imputer.variables_


['Alley', 'MasVnrType', 'LotFrontage', 'MasVnrArea']

In [9]:
# Number of observations with NA before the transformation.
# The fitted attribute `variables_` is used (not the constructor argument
# `variables`) so the cell also works when the variables are auto-detected
# and stays consistent with the other NA-count cells in this notebook.

X_train[imputer.variables_].isna().sum()

Alley          960
MasVnrType       5
LotFrontage    189
MasVnrArea       5
dtype: int64

In [14]:
# After the transformation, the rows with NA values are
# deleted from the dataframe.
# NOTE(review): only the predictors are filtered here; y_train / y_test keep
# all their rows, so realign them (presumably by index — confirm) before
# using them together with train_t / test_t.

train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

In [15]:
# Number of observations with NA after the transformation.
# The fitted attribute `variables_` is used (not the constructor argument
# `variables`) so the cell also works when the variables are auto-detected.

train_t[imputer.variables_].isna().sum()

Alley          0
MasVnrType     0
LotFrontage    0
MasVnrArea     0
dtype: int64

In [16]:
# Shape of the train set before the transformation (rows, columns).

X_train.shape

(1022, 79)

In [17]:
# Shape of the train set after the transformation: only the rows without
# NA in the selected variables remain; the columns are unchanged.

train_t.shape

(59, 79)

In [18]:
# The "return_na_data()" method returns a dataframe that contains
# the observations with NA.

# That is, the portion of the data that is dropped when
# we apply the transform() method.

tmp = imputer.return_na_data(X_train)

tmp.shape

(963, 79)

In [19]:
# total obs - obs with NA = number of rows left
# after the transformation.
# Computed from the dataframes rather than typed in by hand, so the check
# stays correct if the dataset or the split changes.

len(X_train) - len(tmp)

59

In [20]:
# Keep an observation only if at least 50% of the indicated variables
# (2 of the 4) contain data; i.e., drop rows with NA in 3 or 4 of them.
# Rows with NA in exactly 2 of the 4 variables are kept.
# NOTE(review): per the feature-engine docs, missing_only is ignored
# when a threshold is set — confirm.

imputer = DropMissingData(
    variables=["Alley", "MasVnrType", "LotFrontage", "MasVnrArea"],
    missing_only=False,
    threshold=0.5,
)

In [21]:
# Fit the threshold-based imputer on the train set only.
imputer.fit(X_train)

In [22]:
# After the transformation, only the rows that fall below the threshold
# (fewer than 50% of the selected variables populated) are deleted
# from the dataframe; rows with fewer NA are kept.
# NOTE(review): y_train / y_test are not filtered here — realign them
# before using them together with train_t / test_t.

train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

In [23]:
# Number of observations with NA after the transformation.
# NA can remain here: rows are dropped only when they fall below the
# threshold, not whenever a single value is missing.
# The fitted attribute `variables_` is used (not the constructor argument
# `variables`) so the cell also works when the variables are auto-detected.

train_t[imputer.variables_].isna().sum()

Alley          955
MasVnrType       0
LotFrontage    188
MasVnrArea       0
dtype: int64

In [25]:
# Shape of the train set before the transformation (rows, columns).

X_train.shape

(1022, 79)

In [26]:
# Shape of the train set after the transformation: only the few rows
# below the 50% threshold were removed.

train_t.shape

(1017, 79)

In [27]:
# Find variables with NA: with missing_only=True and no explicit variable
# list, fit() selects the variables that show NA in the train set.

imputer = DropMissingData(missing_only=True)

imputer.fit(X_train)

In [28]:
# Variables with NA in the train set, as found during fit().

imputer.variables_

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [29]:
# Number of observations with NA in each selected variable
# before the transformation.

X_train[imputer.variables_].isna().sum()

LotFrontage      189
Alley            960
MasVnrType         5
MasVnrArea         5
BsmtQual          24
BsmtCond          24
BsmtExposure      24
BsmtFinType1      24
BsmtFinType2      25
Electrical         1
FireplaceQu      478
GarageType        54
GarageYrBlt       54
GarageFinish      54
GarageQual        54
GarageCond        54
PoolQC          1019
Fence            831
MiscFeature      978
dtype: int64

In [30]:
# After the transformation, the rows with NA are deleted from the dataframe.
# NOTE(review): y_train / y_test are not filtered here — realign them
# before using them together with train_t / test_t.

train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

In [31]:
# Number of observations with NA after the transformation.
# All counts are zero because no row survives the transformation
# (see the shape displayed in the next cell).

train_t[imputer.variables_].isna().sum()

LotFrontage     0.0
Alley           0.0
MasVnrType      0.0
MasVnrArea      0.0
BsmtQual        0.0
BsmtCond        0.0
BsmtExposure    0.0
BsmtFinType1    0.0
BsmtFinType2    0.0
Electrical      0.0
FireplaceQu     0.0
GarageType      0.0
GarageYrBlt     0.0
GarageFinish    0.0
GarageQual      0.0
GarageCond      0.0
PoolQC          0.0
Fence           0.0
MiscFeature     0.0
dtype: float64

In [32]:
# In this case, all observations are dropped
# because every one of them shows NA in at least 1 variable.

train_t.shape

(0, 79)

In [33]:
# Keep an observation only if at least 75% of the evaluated variables
# contain data.
# NOTE(review): per the feature-engine docs, missing_only is ignored when a
# threshold is set, so presumably all variables are evaluated — confirm.

imputer = DropMissingData(
    missing_only=True,
    threshold=0.75,
)

imputer.fit(X_train)

In [34]:
# After the transformation, rows with less than 75% of the evaluated
# variables populated are deleted from the dataframe.
# NOTE(review): y_train / y_test are not filtered here — realign them
# before using them together with train_t / test_t.

train_t = imputer.transform(X_train)
test_t = imputer.transform(X_test)

In [35]:
# Shape after the transformation: no row falls below the 75% threshold,
# so the train set keeps all of its rows.
train_t.shape

(1022, 79)