In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math

#### Sanity Check

In [2]:
df = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [4]:
for perc, col in zip(df.isnull().sum().index, df.isnull().sum()):
    if(col != 0.0):
        print(f'{perc} --> {round(col*100/len(df), 4)}% nulos')

LotFrontage --> 17.7397% nulos
Alley --> 93.7671% nulos
MasVnrType --> 0.5479% nulos
MasVnrArea --> 0.5479% nulos
BsmtQual --> 2.5342% nulos
BsmtCond --> 2.5342% nulos
BsmtExposure --> 2.6027% nulos
BsmtFinType1 --> 2.5342% nulos
BsmtFinType2 --> 2.6027% nulos
Electrical --> 0.0685% nulos
FireplaceQu --> 47.2603% nulos
GarageType --> 5.5479% nulos
GarageYrBlt --> 5.5479% nulos
GarageFinish --> 5.5479% nulos
GarageQual --> 5.5479% nulos
GarageCond --> 5.5479% nulos
PoolQC --> 99.5205% nulos
Fence --> 80.7534% nulos
MiscFeature --> 96.3014% nulos


In [5]:
#LotFrontage é o único com NaNs em potencial, pois os demais representam alguma informação, como quantia 0.
#incialmente essa coluna será dropada. Outra opção seria estimar o dado através de casas com caracteristicas próximas

In [6]:
df_copy = df.drop(['LotFrontage'], axis=1).copy()
df_copy.dropna(subset=['Electrical'])

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


In [7]:
PoolQC_to_numeric = {'PoolQC': {None:0, 'Fa':1, 'TA':2, 'Gd':3, 'Ex':4}}
Fence_to_numeric = {'Fence': {None:-1, 'MnWw':0, 'GdWo':1, 'MnPrv':0, 'GdPrv':1}}
GarageCond_to_numeric = {'GarageCond': {None:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}}
GarageQual_to_numeric = {'GarageQual': {None:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}}
GarageFinish_to_numeric = { 'GarageFinish' : {None:0, 'Unf':1, 'RFn':2, 'Fin':3}}
FireplaceQu_to_numeric = {'FireplaceQu': {None:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}}
BsmtFinType2_to_numeric = {'BsmtFinType2': {None:0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}}
BsmtFinType1_to_numeric = {'BsmtFinType1': {None:0, 'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}}
BsmtExposure_to_numeric = {'BsmtExposure': {None:0, 'No':0, 'Mn':1, 'Av':2, 'Gd':3}}
BsmtCond_to_numeric = {'BsmtCond': {None:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}}
BsmtQual_to_numeric = {'BsmtQual': {None:0, 'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}}
Alley_to_numeric = {'Alley': {math.nan:0, 'Grvl':1, 'Pave':2}}

In [9]:
df_copy['MiscFeature'] = df['MiscFeature'].map(lambda x: 0 if x==None else 1)
df_copy['GarageType'] = df['GarageType'].map(lambda x: 0 if math.isnan() else (2 if x=='2Types' else 1))
df_copy['GarageYrBlt'] = df['GarageYrBlt'].map(lambda x: 0 if x==math.nan else x)
df_copy['MasVnrArea'] = df['MasVnrArea'].map(lambda x: 0 if x==math.nan else x)
df_copy.replace(PoolQC_to_numeric, inplace=True)
df_copy.replace(Fence_to_numeric, inplace=True)
df_copy.replace(GarageCond_to_numeric, inplace=True)
df_copy.replace(GarageQual_to_numeric, inplace=True)
df_copy.replace(GarageFinish_to_numeric, inplace=True)
df_copy.replace(FireplaceQu_to_numeric, inplace=True)
df_copy.replace(BsmtFinType2_to_numeric, inplace=True)
df_copy.replace(BsmtFinType1_to_numeric, inplace=True)
df_copy.replace(BsmtExposure_to_numeric, inplace=True)
df_copy.replace(BsmtCond_to_numeric, inplace=True)
df_copy.replace(BsmtQual_to_numeric, inplace=True)

In [10]:
for perc, col in zip(df_copy.isnull().sum().index, df_copy.isnull().sum()):
    if(col != 0.0):
        print(f'{perc} --> {round(col*100/len(df_copy), 4)}% nulos')

Alley --> 93.7671% nulos
MasVnrType --> 0.5479% nulos
MasVnrArea --> 0.5479% nulos
Electrical --> 0.0685% nulos
GarageYrBlt --> 5.5479% nulos


In [36]:
np.isnan('')

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''