# B''H

## House Prices - EDA

**Step 3: Where applicable, convert categorical text variables to new numerical variables.**

**Key Takeaway:** 
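- The following quality/condition fields have been converted to new numerical ordinal variables named `<field>Recode` (`Po`=1, `Fa`=2, `TA`=3, `Gd`=4, `Ex`=5; any other or missing value becomes `NaN`), and the original text columns are dropped before saving: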
    1. `ExterQual`
    2. `ExterCond`
    3. `BsmtQual`
    4. `BsmtCond`
    5. `HeatingQC`
    6. `KitchenQual`
    7. `FireplaceQu`
    8. `GarageQual`
    9. `GarageCond`
    10. `PoolQC`

In [1]:
import os
import sys

import math

import numpy as np
import pandas as pd

from scipy import stats

import matplotlib.pyplot as plt

import seaborn as sns

---
## Set the plot output sizes

In [2]:
# Look up the current default figure size ([width, height]) and report it
fig_size = plt.rcParams["figure.figsize"]
print ("Prior size:", fig_size)

# Switch the default to width 12 and height 9, then report the new value
fig_size[0], fig_size[1] = 12, 9
plt.rcParams["figure.figsize"] = fig_size
print ("Current size:", fig_size)

Prior size: [6.0, 4.0]
Current size: [12, 9]


---
## Get project info

In [3]:
# Relative path to the directory one level above the working directory
NOTEBOOKS_DIR = os.pardir

# Show the resolved absolute location as a sanity check
print(os.path.abspath(NOTEBOOKS_DIR))

/home/laz/repos/springboard-mini-projects/notebooks


In [4]:
# The project root is one level above NOTEBOOKS_DIR
PROJ_ROOT = os.path.join(NOTEBOOKS_DIR, os.pardir)

# Show the resolved absolute location as a sanity check
print(os.path.abspath(PROJ_ROOT))

/home/laz/repos/springboard-mini-projects


In [5]:
# Add the project's 'src' directory to the module search path so that
# modules stored there can be imported from this notebook.
SRC_DIR = os.path.join(PROJ_ROOT, 'src')

# Only append when absent: re-running this cell would otherwise add the same
# entry to sys.path again on every execution.
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

print(os.path.abspath(SRC_DIR))

/home/laz/repos/springboard-mini-projects/src


In [6]:
# Load the "autoreload" IPython extension
%load_ext autoreload

# Mode 1: always reload modules marked with "%aimport" before running code
%autoreload 1

# Mark the helper_functions module for autoreload, then import it under
# the short alias `hf`
%aimport helper_functions
import helper_functions as hf

---
### Import the data:

In [7]:
# Kaggle "House Prices" training file; the Id column is used as the row index
train_csv = '~/.kaggle/competitions/house-prices-advanced-regression-techniques/train.csv'
df_train = pd.read_csv(train_csv, index_col='Id')

# Preview the first rows of the loaded frame
df_train.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


---

In [8]:
def recode_qual(val):
    """Translate a quality/condition rating code into its ordinal rank.

    The five recognised codes are ranked in this order:
    'Po' -> 1, 'Fa' -> 2, 'TA' -> 3, 'Gd' -> 4, 'Ex' -> 5.

    Parameters
    ----------
    val : object
        Rating code to translate; it is compared with ``==`` against each
        recognised code (the match is case-sensitive).

    Returns
    -------
    int or float
        The rank (1-5) for a recognised code, otherwise ``np.nan``.
    """
    # Codes are listed in rank order, so the 1-based position is the rank.
    ranked_codes = ('Po', 'Fa', 'TA', 'Gd', 'Ex')

    for rank, code in enumerate(ranked_codes, start=1):
        if val == code:
            return rank

    # Anything else (missing values, unrecognised codes) has no rank.
    return np.nan

---
### Create field recodes

In [9]:
# Text rating columns that will each receive a numeric '<name>Recode' column
fields = [
    'ExterQual',
    'ExterCond',
    'BsmtQual',
    'BsmtCond',
    'HeatingQC',
    'KitchenQual',
    'FireplaceQu',
    'GarageQual',
    'GarageCond',
    'PoolQC',
]

In [10]:
# For every rating column: add a numeric '<column>Recode' companion column to
# df_train, then print how many rows fall under each (original code, recoded
# value) pair as a sanity check of the mapping.
for field in fields:
    field_recode = field+'Recode'
    df_train[field_recode] = df_train[field].map(recode_qual)

    # Separator line between the per-column summaries
    print('__________________________________________')

    pair_counts = (
        df_train[[field, field_recode]]
        .groupby([field, field_recode])[field_recode]
        .count()
    )
    print(pair_counts)

__________________________________________
ExterQual  ExterQualRecode
Ex         5                   52
Fa         2                   14
Gd         4                  488
TA         3                  906
Name: ExterQualRecode, dtype: int64
__________________________________________
ExterCond  ExterCondRecode
Ex         5                     3
Fa         2                    28
Gd         4                   146
Po         1                     1
TA         3                  1282
Name: ExterCondRecode, dtype: int64
__________________________________________
BsmtQual  BsmtQualRecode
Ex        5.0               121
Fa        2.0                35
Gd        4.0               618
TA        3.0               649
Name: BsmtQualRecode, dtype: int64
__________________________________________
BsmtCond  BsmtCondRecode
Fa        2.0                 45
Gd        4.0                 65
Po        1.0                  2
TA        3.0               1311
Name: BsmtCondRecode, dtype: int64
___________

---

In [11]:
# Preview the first five rows now that the recode columns have been added
df_train.head(n=5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ExterQualRecode,ExterCondRecode,BsmtQualRecode,BsmtCondRecode,HeatingQCRecode,KitchenQualRecode,FireplaceQuRecode,GarageQualRecode,GarageCondRecode,PoolQCRecode
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,4,3,4.0,3.0,5,4,,3.0,3.0,
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,3,3,4.0,3.0,5,3,3.0,3.0,3.0,
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,4,3,4.0,3.0,5,4,3.0,3.0,3.0,
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,3,3,3.0,4.0,4,4,4.0,3.0,3.0,
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,4,3,4.0,3.0,5,4,3.0,3.0,3.0,


---
### Drop old fields

In [12]:
# Build a new frame without the original text rating columns listed in
# `fields`; only their numeric '<name>Recode' counterparts remain.
df_train_recode = df_train.drop(columns=fields)

# Preview the first five rows of the reduced frame
df_train_recode.head(n=5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ExterQualRecode,ExterCondRecode,BsmtQualRecode,BsmtCondRecode,HeatingQCRecode,KitchenQualRecode,FireplaceQuRecode,GarageQualRecode,GarageCondRecode,PoolQCRecode
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,4,3,4.0,3.0,5,4,,3.0,3.0,
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,3,3,4.0,3.0,5,3,3.0,3.0,3.0,
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,4,3,4.0,3.0,5,4,3.0,3.0,3.0,
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,3,3,3.0,4.0,4,4,4.0,3.0,3.0,
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,4,3,4.0,3.0,5,4,3.0,3.0,3.0,


---
### Save to csv

In [13]:
# Output path for the recoded training frame, inside the directory named by
# hf.DATA_INTERIM_DIR
file = os.path.join(hf.DATA_INTERIM_DIR, 'df_train_step_03.csv')

# Create the target directory when it does not exist yet; without this,
# to_csv raises because the directory is missing. No-op if it already exists.
out_dir = os.path.dirname(file)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Write the frame, including its index column, as CSV
df_train_recode.to_csv(file)