In [115]:
#Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.impute import SimpleImputer

In [18]:
#import raw data
house_data = pd.read_csv('./raw_data/train_data.csv')

In [19]:
# first 5 row of the dataset
house_data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


There are missing values in 'Alley', 'PoolQC', Fence, 'MiscFeature' columns in the first 5 rows. 'Id' column is not a predictor in the dataset.

In [20]:
# data types in the data set
house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

There are 37 columns containing numeric observations and 38 objects columns.

In [21]:
# size of the dataset
house_data.shape

(1460, 81)

The dataset has 1460 rows and 81 columns

Count the missing values in each columns and the percentages 

In [22]:
missing = pd.concat([house_data.isnull().sum(), 100 * house_data.isnull().mean()], axis=1)
missing.columns=['Count', '%']
missing.sort_values(by= 'Count', ascending= False)

Unnamed: 0,Count,%
PoolQC,1453,99.520548
MiscFeature,1406,96.301370
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274
...,...,...
ExterQual,0,0.000000
Exterior2nd,0,0.000000
Exterior1st,0,0.000000
RoofMatl,0,0.000000


In [96]:
# features that contains about 50% or higher missing values
missing_50 = missing[round(missing['%'], ndigits = -1) >= 50]
missing_50.sort_values(by= 'Count', ascending= False)

Unnamed: 0,Count,%
PoolQC,1453,99.520548
MiscFeature,1406,96.30137
Alley,1369,93.767123
Fence,1179,80.753425
FireplaceQu,690,47.260274


There are 5 columns of 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu' which are missing too many values (about 50%)

In [24]:
# drop 'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu' and 'Id' columns
house_data.drop(columns= ['Id','PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], inplace= True)

In [25]:
house_data.shape

(1460, 75)

In [26]:
house_data.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,LotShape,LandContour,Utilities,LotConfig,LandSlope,...,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,Reg,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,Reg,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,IR1,Lvl,AllPub,Inside,Gtl,...,0,0,0,0,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,IR1,Lvl,AllPub,Corner,Gtl,...,272,0,0,0,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,IR1,Lvl,AllPub,FR2,Gtl,...,0,0,0,0,0,12,2008,WD,Normal,250000


In [27]:
# missing values in float64 features
house_data.select_dtypes('float64').isnull().sum().sort_values(ascending= False)

LotFrontage    259
GarageYrBlt     81
MasVnrArea       8
dtype: int64

There are some missing values in all three columns that contains float64 data.

In [28]:
# replace missing values with the mean of the remaining observations
house_data['LotFrontage'] = house_data['LotFrontage'].fillna(house_data['LotFrontage'].mean())
house_data['MasVnrArea'] = house_data['MasVnrArea'].fillna(house_data['MasVnrArea'].mean())
house_data['GarageYrBlt'] = house_data['GarageYrBlt'].fillna(house_data['GarageYrBlt'].mean())

In [29]:
# missing values in int64 features
house_data.select_dtypes('int64').isnull().sum().sort_values(ascending= False)

MSSubClass       0
OpenPorchSF      0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageCars       0
GarageArea       0
WoodDeckSF       0
EnclosedPorch    0
LotArea          0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
BedroomAbvGr     0
HalfBath         0
FullBath         0
BsmtHalfBath     0
BsmtFullBath     0
GrLivArea        0
LowQualFinSF     0
2ndFlrSF         0
1stFlrSF         0
TotalBsmtSF      0
BsmtUnfSF        0
BsmtFinSF2       0
BsmtFinSF1       0
YearRemodAdd     0
YearBuilt        0
OverallCond      0
OverallQual      0
SalePrice        0
dtype: int64

No missing value in int64 features

Seperate the dataset into 2 datasets of numerical and categorical features

In [31]:
# numerical dataset
num_df = house_data.select_dtypes(exclude= 'object')

In [36]:
# check if any missing value in the numeric dataset
num_df.isnull().sum().sort_values(ascending= False)

MSSubClass       0
HalfBath         0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
BedroomAbvGr     0
FullBath         0
LotFrontage      0
BsmtHalfBath     0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
SalePrice        0
dtype: int64

As the cleaning steps above, there is no missing values in the numerical dataset.

In [38]:
# categorical dataset
cat_df = house_data.select_dtypes(include= 'object')

In [40]:
cat_df.isnull().sum().sort_values(ascending= False)

GarageCond       81
GarageQual       81
GarageFinish     81
GarageType       81
BsmtExposure     38
BsmtFinType2     38
BsmtCond         37
BsmtFinType1     37
BsmtQual         37
MasVnrType        8
Electrical        1
Heating           0
MSZoning          0
HeatingQC         0
CentralAir        0
Functional        0
PavedDrive        0
SaleType          0
KitchenQual       0
Foundation        0
Street            0
ExterCond         0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
ExterQual         0
SaleCondition     0
dtype: int64

There is 11 columns that contain missing values. Since only 1 missing value in 'Electrical' feature and 8 missing values in 'MasVnrType' feature, I will fill with the most frequent value in these 2 columns.

In [62]:
# replace missing value for Electrical
cat_df['Electrical'].fillna(cat_df['Electrical'].value_counts().index[0], inplace = True)
cat_df['Electrical'].isnull().sum()

0

In [63]:
# replace missing value for MasVnrType
cat_df['MasVnrType'].fillna(cat_df['MasVnrType'].value_counts().index[0], inplace = True)
cat_df['MasVnrType'].isnull().sum()

0

In [121]:
cat_df[cat_df['GarageCond'].isnull()].isnull().sum().sort_values(ascending= False)

GarageCond       81
GarageQual       81
GarageFinish     81
GarageType       81
BsmtQual          7
BsmtCond          7
BsmtExposure      7
BsmtFinType1      7
BsmtFinType2      7
HeatingQC         0
Heating           0
MSZoning          0
CentralAir        0
Electrical        0
Street            0
Functional        0
PavedDrive        0
SaleType          0
KitchenQual       0
Foundation        0
ExterCond         0
ExterQual         0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType        0
SaleCondition     0
dtype: int64

It looks like all of the missing values related to garage in the same houses.

In [123]:
garage_col = ['GarageCond', 'GarageQual', 'GarageFinish', 'GarageType']

In [124]:
cat_df[garage_col] = SimpleImputer(
    strategy= 'most_frequent').fit_transform(cat_df[garage_col])

In [125]:
cat_df[garage_col].isnull().sum()

GarageCond      0
GarageQual      0
GarageFinish    0
GarageType      0
dtype: int64

In [126]:
bsmt_col = ['BsmtExposure', 'BsmtFinType2', 'BsmtCond', 
                   'BsmtFinType1', 'BsmtQual']

In [127]:
missing_cat_df = cat_df[bsmt_col]
missing_cat_df

Unnamed: 0,BsmtExposure,BsmtFinType2,BsmtCond,BsmtFinType1,BsmtQual
0,No,Unf,TA,GLQ,Gd
1,Gd,Unf,TA,ALQ,Gd
2,Mn,Unf,TA,GLQ,Gd
3,No,Unf,Gd,ALQ,TA
4,Av,Unf,TA,GLQ,Gd
...,...,...,...,...,...
1455,No,Unf,TA,Unf,Gd
1456,No,Rec,TA,ALQ,Gd
1457,No,Unf,Gd,GLQ,TA
1458,Mn,Rec,TA,GLQ,TA


In [119]:
missing_cat_df[['GarageCond', 'GarageQual', 'GarageFinish', 'GarageType']] = SimpleImputer(strategy= 'most_frequent').fit_transform(missing_cat_df[['GarageCond', 'GarageQual', 'GarageFinish', 'GarageType']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  missing_cat_df[['GarageCond', 'GarageQual', 'GarageFinish', 'GarageType']] = SimpleImputer(strategy= 'most_frequent').fit_transform(missing_cat_df[['GarageCond', 'GarageQual', 'GarageFinish', 'GarageType']])


In [120]:
missing_cat_df.isnull().sum()

GarageCond       0
GarageQual       0
GarageFinish     0
GarageType       0
BsmtExposure    38
BsmtFinType2    38
BsmtCond        37
BsmtFinType1    37
BsmtQual        37
dtype: int64