In [None]:
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import missingno as mo
from matplotlib import pyplot as plt

In [None]:
# Load the training set and use its `Id` column as the row index.
train = pd.read_csv(
    filepath_or_buffer='../data/train.csv',
    index_col='Id',
)

In [None]:
# Peek at the first five rows.
train.iloc[:5]

### Find missing values

In [None]:
# Nullity matrix of the numeric columns.
# missingno.matrix opens its own figure when no `ax` is passed, so a preceding
# plt.figure(...) call only produces an extra empty figure. Size the plot via
# the `figsize` argument and set the dpi on the figure that matrix() created.
mo.matrix(train.select_dtypes(include=np.number), figsize=(12, 6))
plt.gcf().set_dpi(300)
plt.show()

In [None]:
# Nullity matrix of the object-dtype (categorical/string) columns.
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24, so the builtin
# `object` (which it aliased) is used instead.
# missingno.matrix opens its own figure when no `ax` is passed, so the plot is
# sized via `figsize` rather than a preceding plt.figure(...) call, which only
# produced an extra empty figure.
mo.matrix(train.select_dtypes(include=object), figsize=(12, 6))
plt.gcf().set_dpi(300)
plt.show()

In [None]:
# Training data: number of missing values per column, restricted to columns
# that have at least one, ordered from most to fewest missing.
mask = (
    train.isna()
    .sum()
    .loc[lambda counts: counts > 0]
    .sort_values(ascending=False)
)
mask

In [None]:
# Names of the columns that contain at least one missing value.
mask.index

In [None]:
# Split the columns that contain missing values into numeric and object-dtype
# subsets. These are snapshots of `train` taken at this point in the notebook.
# `np.object` was deprecated in NumPy 1.20 and removed in 1.24 (AttributeError);
# the builtin `object` it aliased selects the same columns.
num_feature_miss = train[mask.index].select_dtypes(include=np.number)
cat_feature_miss = train[mask.index].select_dtypes(include=object)

In [None]:
# Overall mean of LotFrontage (missing values are skipped).
num_feature_miss['LotFrontage'].mean()

In [None]:
# Impute LotFrontage: mean LotFrontage per Street value, as a plain dict
# {street value: mean} for use with Series.map.
impute_with = train.groupby('Street')['LotFrontage'].mean().to_dict()
impute_with

In [None]:
# Per-row replacement value looked up from the row's Street.
replace_with = train['Street'].map(impute_with)
# Keep existing values; substitute the mapped mean only where LotFrontage is missing.
train['LotFrontage'] = train['LotFrontage'].mask(train['LotFrontage'].isna(), replace_with)

In [None]:
# Check the non-null count of LotFrontage after imputation.
train['LotFrontage'].to_frame().info()

In [None]:
# Impute garage build year: first inspect the non-null count of GarageYrBlt
# in the snapshot of columns with missing values.
num_feature_miss['GarageYrBlt'].to_frame().info()

In [None]:
# Replace missing garage build years with the sentinel value 0.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `train['GarageYrBlt']`: that in-place form is a
# chained assignment on an intermediate object, emits a FutureWarning in
# pandas 2.x and leaves `train` unchanged under Copy-on-Write.
train['GarageYrBlt'] = train['GarageYrBlt'].fillna(value=0)

In [None]:
# Count of houses per GarageYrBlt value; x tick labels are rotated to stay readable.
plt.figure(figsize=(24, 6), dpi=300)
g = sns.countplot(data=train, x='GarageYrBlt', palette='viridis')
g.set_xticklabels(labels=g.get_xticklabels(), rotation=90)
plt.show()

In [None]:
# Store the garage build year as an integer column. astype(int) raises if any
# NaN is still present, so this relies on the missing values having been
# filled beforehand.
train['GarageYrBlt'] = train['GarageYrBlt'].astype(int)
# Display the converted column.
train['GarageYrBlt']

In [None]:
# Non-null count of MasVnrArea in the snapshot of columns with missing values.
num_feature_miss['MasVnrArea'].to_frame().info()

In [None]:
# Impute MasVnrArea and MasVnrType: mean MasVnrArea per MasVnrType as a dict.
# NOTE(review): `impute_with` is reassigned by a later cell before this value
# is used, and the same means are recomputed after MasVnrType is imputed.
impute_with = train.groupby('MasVnrType')['MasVnrArea'].mean().to_dict()
impute_with

In [None]:
# Share of all rows falling into each (HouseStyle, MasVnrType) combination.
train.groupby(['HouseStyle', 'MasVnrType'])['MasVnrType'].count() / len(train)

In [None]:
# HouseStyle -> MasVnrType value used for rows whose MasVnrType is missing.
impute_with = {
    '1.5Fin': 'None',
    '1.5Unf': 'None',
    '1Story': 'Stone',
    '2.5Fin': 'None',
    '2.5Unf': 'None',
    '2Story': 'None',
    'SFoyer': 'None',
    'SLvl': 'BrkFace',
}
# Per-row replacement looked up from the row's HouseStyle.
replace_with = train['HouseStyle'].map(impute_with)
# Keep existing values; use the mapped value only where MasVnrType is missing.
train['MasVnrType'] = np.where(train['MasVnrType'].notna(), train['MasVnrType'], replace_with)

In [None]:
# Check the non-null count of MasVnrType after imputation.
train['MasVnrType'].to_frame().info()

In [None]:
# Mean MasVnrArea per MasVnrType, as a dict {veneer type: mean area} for Series.map.
impute_with = train.groupby('MasVnrType')['MasVnrArea'].mean().to_dict()
impute_with

In [None]:
# Count of houses per MasVnrType, split by HouseStyle.
plt.figure(figsize=(12, 6), dpi=300)
g = sns.countplot(data=train, x='MasVnrType', hue='HouseStyle', palette='viridis')
plt.show()

In [None]:
# Per-row replacement area looked up from the row's MasVnrType.
replace_with = train['MasVnrType'].map(impute_with)
# Keep existing areas; use the mapped mean only where MasVnrArea is missing.
train['MasVnrArea'] = train['MasVnrArea'].where(train['MasVnrArea'].notna(), replace_with)

In [None]:
# Check the non-null count of MasVnrArea after imputation.
train['MasVnrArea'].to_frame().info()

In [None]:
# Categorical feature imputation: display the object-dtype columns that
# contained missing values (a snapshot taken before the imputations above).
cat_feature_miss

In [None]:
# Fill missing values in these columns with the explicit category 'None'.
for feature in ('PoolQC', 'MiscFeature', 'Alley', 'Fence'):
    train[feature] = train[feature].fillna('None')

In [None]:
# 2x2 grid of category counts, one panel per column filled with 'None'.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=[12, 8], dpi=300)
ax = axes.ravel()
for panel, column in zip(ax, ['PoolQC', 'MiscFeature', 'Alley', 'Fence']):
    sns.countplot(x=train[column], ax=panel, palette='viridis')
plt.show()

In [None]:
# Fill missing FireplaceQu with the explicit category 'None'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `train['FireplaceQu']`: that in-place form is a
# chained assignment on an intermediate object, emits a FutureWarning in
# pandas 2.x and leaves `train` unchanged under Copy-on-Write.
train['FireplaceQu'] = train['FireplaceQu'].fillna(value='None')

# Category counts after imputation.
plt.figure(figsize=[12,6],dpi=300)
sns.countplot(x=train['FireplaceQu'],palette='viridis')
plt.show()

In [None]:
# Allow up to 81 rows to be displayed before pandas truncates the output.
pd.set_option('display.max_rows', 81)
# Garage attributes of the rows whose GarageType is missing.
train.loc[train['GarageType'].isna(), ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']]

In [None]:
# Fill missing garage attributes with the explicit category 'None'.
for garage_col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    train[garage_col] = train[garage_col].fillna('None')

In [None]:
# Basement attributes of the rows whose BsmtExposure is missing.
train.loc[
    train['BsmtExposure'].isna(),
    ['BsmtExposure', 'BsmtFinType2', 'BsmtQual', 'BsmtCond', 'BsmtFinType1'],
]

In [None]:
# Fill missing values in these basement columns with the explicit category 'None'.
for bsmt_col in ('BsmtQual', 'BsmtCond', 'BsmtFinType1'):
    train[bsmt_col] = train[bsmt_col].fillna('None')

In [None]:
# Count of houses per BsmtCond category.
plt.figure(figsize=(12, 6), dpi=300)
sns.countplot(data=train, x='BsmtCond', palette='viridis')
plt.show()

In [None]:
# Share of all rows falling into each (BsmtCond, BsmtExposure) combination.
train.groupby(['BsmtCond', 'BsmtExposure'])['BsmtExposure'].count() / len(train)

In [None]:
# BsmtCond -> BsmtExposure value used for rows whose BsmtExposure is missing
# (presumably picked from the shares computed in the previous cell).
impute_with = {'Fa':'No',
               'Gd':'No',
               'None':'None',
               'Po':'Gd',
               'TA':'No'}
# Per-row replacement looked up from the row's BsmtCond.
replace_with = train.BsmtCond.map(impute_with)
# Fill from the mapped Series `replace_with`. Passing the dict `impute_with`
# to np.where would broadcast the dict object itself into every missing cell.
train['BsmtExposure'] = np.where(train['BsmtExposure'].isna(),replace_with,train['BsmtExposure'])

In [None]:
# Share of all rows falling into each (BsmtCond, BsmtFinType2) combination.
train.groupby(['BsmtCond', 'BsmtFinType2'])['BsmtFinType2'].count() / len(train)

In [None]:
# BsmtCond -> BsmtFinType2 value used for rows whose BsmtFinType2 is missing
# (presumably picked from the shares computed in the previous cell).
impute_with = {'Fa':'Unf',
               'Gd':'Unf',
               'None':'None',
               'Po':'Unf',
               'TA':'Unf'}
# Per-row replacement looked up from the row's BsmtCond.
replace_with = train.BsmtCond.map(impute_with)
# Fill from the mapped Series `replace_with`. Passing the dict `impute_with`
# to np.where would broadcast the dict object itself into every missing cell.
train['BsmtFinType2'] = np.where(train['BsmtFinType2'].isna(),replace_with,train['BsmtFinType2'])

In [None]:
# Count of houses per Electrical category, split by HouseStyle.
plt.figure(figsize=(12, 6), dpi=300)
sns.countplot(data=train, x='Electrical', hue='HouseStyle', palette='viridis')
plt.show()

In [None]:
# Share of all rows falling into each (HouseStyle, Electrical) combination.
train.groupby(['HouseStyle', 'Electrical'])['Electrical'].count() / len(train)

In [None]:
# HouseStyle of the rows whose Electrical value is missing.
train.loc[train['Electrical'].isna(), 'HouseStyle']

In [None]:
# Fill missing Electrical values with 'SBrkr'.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on `train['Electrical']`: that in-place form is a
# chained assignment on an intermediate object, emits a FutureWarning in
# pandas 2.x and leaves `train` unchanged under Copy-on-Write.
train['Electrical'] = train['Electrical'].fillna(value='SBrkr')

In [None]:
# Check the non-null count of Electrical after imputation.
train['Electrical'].to_frame().info()

In [None]:
# Final overview of the frame: per-column non-null counts and dtypes, to check
# which columns still contain missing values.
train.info()
