In [1]:
import numpy as np
import pandas as pd
import scipy.stats as ss
import seaborn as sns
import plotly.express as px
import plotly.graph_objs as go
from plotly.offline import plot,download_plotlyjs
from matplotlib import pyplot as plt

## EDA on house sale prices

In [2]:
# Load the pickled DataFrame (presumably the cleaned training set, judging by
# the file name) and display it as the cell output.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling —
# only load pickle files that come from a trusted source.
data = pd.read_pickle('../data/cleaned_train.pkl')
data

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,8,2007,WD,Normal,175000
1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,...,0,,MnPrv,,0,2,2010,WD,Normal,210000
1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,...,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,4,2010,WD,Normal,142125


### Identify data

- Type of data: quantitative **(discrete, continuous)** or qualitative **(nominal, ordinal)**

In [3]:
# Split the columns by dtype: numeric columns go to `quantitative`,
# object-dtype columns go to `qualitative`.
# `np.object` was only an alias for the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly here.
quantitative = data.select_dtypes(include=np.number)
qualitative = data.select_dtypes(include=object)

In [10]:
# Number of distinct values per qualitative column; columns with exactly two
# values are reported as binary, columns with more than two as multi class.
unq_classes = qualitative.nunique()
print(
    f'binary categorical variables : {unq_classes.loc[unq_classes.eq(2)].index}',
    f'multi class categorical variables : {unq_classes.loc[unq_classes.gt(2)].index}',
    sep='\n',
)

binary categorical variables : Index(['Street', 'Utilities', 'CentralAir'], dtype='object')
multi class categorical variables : Index(['MSZoning', 'Alley', 'LotShape', 'LandContour', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating',
       'HeatingQC', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu',
       'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive',
       'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition'],
      dtype='object')


Note: a two-class (binary) variable is usually called a **dichotomous variable**, and a multi-class variable is called a **polytomous variable**.

- Measurement scales: **nominal, ordinal, interval, ratio**<br/><br/>
Without a clear understanding of each variable's measurement scale, the analysis cannot be done correctly: the scale
determines which statistical measures are appropriate and which visualization methods should be used.

In [12]:
# Hand-picked qualitative columns that the notebook treats as ordinal scale;
# every remaining qualitative column is treated as nominal scale.
ordinal = qualitative.loc[:, [
    'LotShape',
    'ExterQual', 'ExterCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'HeatingQC', 'KitchenQual', 'FireplaceQu',
    'GarageFinish', 'GarageQual', 'GarageCond',
    'PoolQC',
]]
nominal = qualitative.drop(columns=ordinal.columns)

In [14]:
# MSSubClass, OverallQual and OverallCond are stored as numbers but are treated
# here as categorical codes, so they are cast to object dtype and added to the
# nominal / ordinal groups.
# They are read from `data` rather than `quantitative` so that this cell still
# works after those columns have been removed from `quantitative`.
wrong_scale_nominal = data[['MSSubClass']].astype('object')
wrong_scale_ordinal = data[['OverallQual', 'OverallCond']].astype('object')

# Remove any copy appended by an earlier run of this cell before concatenating,
# so re-running the cell does not create duplicate columns.
ordinal = pd.concat(
    [ordinal.drop(columns=wrong_scale_ordinal.columns, errors='ignore'),
     wrong_scale_ordinal],
    axis=1,
)
nominal = pd.concat(
    [nominal.drop(columns=wrong_scale_nominal.columns, errors='ignore'),
     wrong_scale_nominal],
    axis=1,
)

In [28]:
# Remove the numeric-coded categorical columns from the quantitative frame.
# Rebinding the name (instead of `inplace=True`) avoids pandas'
# SettingWithCopyWarning, and `errors='ignore'` keeps the cell re-runnable
# once the columns are already gone.
quantitative = quantitative.drop(
    columns=['MSSubClass', 'OverallQual', 'OverallCond'],
    errors='ignore',
)

# Year / month columns are classified as interval scale; every other
# remaining quantitative column is classified as ratio scale.
interval = quantitative[['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold']]
ratio = quantitative.drop(columns=interval.columns)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
# Report which columns ended up in each measurement-scale group.
print(
    f'Ordinal scale variables: {ordinal.columns}',
    f'nominal scale variables: {nominal.columns}',
    f'interval scale variables: {interval.columns}',
    f'ratio scale variables: {ratio.columns}',
    sep='\n',
)

Ordinal scale variables: Index(['LotShape', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'HeatingQC',
       'KitchenQual', 'FireplaceQu', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PoolQC', 'OverallQual', 'OverallCond'],
      dtype='object')
nominal scale variables: Index(['MSZoning', 'Street', 'Alley', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd',
       'MasVnrType', 'Foundation', 'Heating', 'CentralAir', 'Electrical',
       'Functional', 'GarageType', 'PavedDrive', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition', 'MSSubClass'],
      dtype='object')
interval scale variables: Index(['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold'], dtype='object')
ratio scale variables: Index(['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',