In [10]:
"""
Notes:
- GarageType might be biased. It has a "2types" category, which makes our dummy variables technically incorrect since we don't know which two types it refers to
- PavedDrive: It could be argued that this variable should be encoded as an ordered numeric category rather than turned into dummy variables
"""

'\nNotes:\n- GarageType variable might be biased. Note that it has a 2types variable making our dummy variables technically incorrect since we dont know which 2 types it is\n- PavedDrive: It could be argued that this variable should be numerically categorized rather than turned into dummy variables\n'

In [11]:
import numpy as np
import pandas as pd

import os

In [12]:
# Input file locations (plain module-level settings, resolved relative to this notebook)
data_path = '../../data'
train_csv, test_csv = (
    os.path.join(data_path, file_name) for file_name in ('train.csv', 'test.csv')
)

In [13]:
# Load the raw training and test tables from disk (train first, then test)
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [14]:
# train_df and test_df do not have identical columns, so keep only the columns
# they share. Select with an ordered list, not a set: indexing a DataFrame with
# a set raises TypeError on pandas >= 2.0 (deprecated since 1.5), and a set
# would make the resulting column order vary between runs.
test_df = test_df[[col for col in test_df.columns if col in train_df.columns]]

In [15]:
class HousePreprocessor:
    """Turn a raw house-price DataFrame into an all-numeric feature table.

    `process` runs these steps in order:

    1. Collapse heavily skewed categoricals into 0/1 flags (`disp_cats`).
    2. Map ordered ("relative") categoricals to integer codes, add a
       `<col>_NA` indicator column, and fill missing codes with the column mean.
    3. Replace cyclical features (e.g. month) with sin/cos components.
    4. One-hot encode every remaining text column plus `dummy_vars_to_append`.
    5. Fill any remaining NaN with the column mean.

    NOTE: every statistic (means, dummy levels) is computed from the frame
    passed to `process`. Two frames processed separately (e.g. train and test)
    can therefore end up with different dummy columns and different fill values.

    Parameters
    ----------
    verbose : bool, default True
        When True, print the remaining categorical features and the columns
        whose NaNs are being imputed.
    """

    def __init__(self, verbose=True):
        self.verbose = verbose
        # (column, dominant value) pairs: each column becomes 1 where it equals
        # the dominant value and 0 everywhere else.
        self.disp_cats = [('Electrical', 'SBrkr'), ('BsmtFinType2', 'Unf')]

        # Ordered categoricals -> integer codes (worst = 0). The 'NA' / NaN
        # entries are placeholders marking "missing"; they are not numeric codes.
        rating_order_map = {
            'Po': 0,
            'Fa': 1,
            'TA': 2,
            'Gd': 3,
            'Ex': 4,
            'NA': 'NA',
            np.nan: 'NA'
        }

        basement_exposure_map = {
            'No': 0,
            'Mn': 1,
            'Av': 2,
            'Gd': 3,
            'NA': 'NA',
            np.nan: 'NA'
        }

        basement_finish_map = {
            'Unf': 0,
            'LwQ': 1,
            'Rec': 2,
            'BLQ': 3,
            'ALQ': 4,
            'GLQ': 5,
            'NA': 'NA',
            np.nan: 'NA'
        }

        functional_rating = {
            'Sal': 0,
            'Sev': 1,
            'Maj2': 2,
            'Maj1': 3,
            'Mod': 4,
            'Min2': 5,
            'Min1': 6,
            'Typ': 7,
            'NA': 'NA',
            np.nan: 'NA'
        }

        garage_finish_rating = {
            'Unf': 0,
            'RFn': 1,
            'Fin': 2,
            'NA': 'NA',
            np.nan: 'NA'
        }

        fence_quality_rating = {
            "MnWw": 0,
            "GdWo": 1,
            "MnPrv": 2,
            "GdPrv": 3,
            "NA": "NA",
            np.nan: 'NA'
        }

        bool_map = {
            'N': 0,
            'Y': 1
        }

        # List of (column, mapping) pairs consumed by __cnvrt_relative_cats__.
        self.relative_categories = (
            [(cat, rating_order_map) for cat in ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']] +
            [(cat, basement_exposure_map) for cat in ['BsmtExposure']] +
            [(cat, basement_finish_map) for cat in ['BsmtFinType1']] +
            [(cat, functional_rating) for cat in ['Functional']] +
            [(cat, garage_finish_rating) for cat in ['GarageFinish']] +
            [(cat, fence_quality_rating) for cat in ['Fence']] +
            [(cat, bool_map) for cat in ['CentralAir']]
        )

        # Cyclical features and the length of one full cycle for each of them.
        self.cyclical_features = ['MoSold']
        self.cyclical_periods = {'MoSold': 12}

        # Numeric columns that are really category labels and must be one-hot encoded.
        self.dummy_vars_to_append = ['MSSubClass']

    @staticmethod
    def _is_categorical(series):
        """Return True for text-like columns (object dtype or a dedicated string dtype)."""
        return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)

    def process(self, df):
        """Run the full preprocessing pipeline and return a new DataFrame.

        Parameters
        ----------
        df : pandas.DataFrame
            Raw table containing every column named in `disp_cats`,
            `relative_categories`, `cyclical_features` and `dummy_vars_to_append`.

        Returns
        -------
        pandas.DataFrame
            All-numeric table without NaN values.

        Raises
        ------
        KeyError
            If a required column is missing, an ordered categorical holds a
            value its mapping does not know, or a cyclical feature has no period.
        AssertionError
            If NaN values remain after mean imputation (e.g. an all-NaN column).
        """
        # Work on a private copy so the caller's frame (possibly a slice of
        # another frame) is never modified behind its back.
        df = df.copy()

        df = self.__cnvrt_disp_class_vars__(df)
        df = self.__cnvrt_relative_cats__(df)
        df = self.__cnvrt_cyclical_vars__(df)

        self.__log_remaining_category_vars__(df)

        df = self.__add_dummy_vars__(df)

        df = self.__cnvrt_na_to_avg__(df)

        return df

    def __cnvrt_disp_class_vars__(self, df):
        """Convert disproportionate classification variables into 0/1 flags.

        NOTE: Electrical is disproportionately SBrkr, so it becomes a boolean of
        SBrkr vs. not SBrkr rather than a set of dummy variables.

        SBrkr - 1334 (91.5%)
        FuseA - 1    (0.05%)
        FuseF - 94   (6.5%)
        FuseP - 27   (1.8%)
        Mix   - 3    (0.25%)

        NOTE: BsmtFinType2 is disproportionately Unf, so it becomes a boolean of
        Unf vs. not Unf rather than a set of dummy variables.

        GLQ - 14
        ALQ - 19
        BLQ - 33
        Rec - 54
        LwQ - 46
        Unf - 1256
        NA - 38

        Note: An argument could be made to do the same for BsmtExposure, though
        we have decided against converting it into a boolean variable.

        GD - 134
        Av - 221
        Mn - 114
        No - 953
        NA - 38

        Missing values compare unequal to the dominant value and so become 0.
        Modifies and returns `df`.
        """
        for cat, key in self.disp_cats:
            df[cat] = (df[cat] == key).astype(int)

        return df

    def __cnvrt_relative_cats__(self, df):
        """Convert ordered categoricals into numeric codes.

        For each (column, mapping) pair in `relative_categories`:
        - `<column>_NA` is added: 1 where the value is missing (NaN or the
          literal string 'NA'), else 0;
        - the column is replaced by its numeric code, with missing entries set
          to the mean code of the non-missing rows (IS THIS REALLY THE BEST
          STRATEGY?).

        Raises KeyError if a column contains a value its mapping does not list.
        Modifies and returns `df`.
        """
        for cat, rating_map in self.relative_categories:
            # Keep only the real numeric codes; the 'NA' placeholder entries
            # (including the NaN key) are handled through the mask below, which
            # avoids fragile identity checks and dict lookups on NaN.
            codes = {label: code for label, code in rating_map.items() if not isinstance(code, str)}

            raw = df[cat].astype(object)
            is_na = raw.isna() | (raw == 'NA')

            unknown = ~is_na & ~raw.isin(list(codes))
            if unknown.any():
                raise KeyError(
                    f"Unexpected value(s) in {cat}: {list(raw[unknown].unique())}"
                )

            numeric = raw.map(codes)  # missing entries come out as NaN

            df[cat] = numeric.fillna(numeric.mean())
            df[f'{cat}_NA'] = is_na.astype(int)
        return df

    def __cnvrt_cyclical_vars__(self, df):
        """Replace each cyclical feature with `<f>_sin` / `<f>_cos` components.

        The value is first scaled to an angle (2*pi*value/period) so that the
        end of a cycle lands next to its start (December next to January);
        taking sin/cos of the raw number would not be cyclical. The original
        column is dropped afterwards.

        Raises KeyError if a feature has no entry in `cyclical_periods`.
        Returns the resulting DataFrame.
        """
        for f in self.cyclical_features:
            if f not in self.cyclical_periods:
                raise KeyError(f"No period configured for cyclical feature {f}")
            angle = 2 * np.pi * df[f] / self.cyclical_periods[f]
            df[f'{f}_sin'] = np.sin(angle)
            df[f'{f}_cos'] = np.cos(angle)
            # drop() returns a new frame, so the result has to be kept
            df = df.drop(columns=f)

        return df

    def __log_remaining_category_vars__(self, df):
        """When verbose, print the number of unique values of every text column."""
        if self.verbose:
            # Decide which categorical variables you want to use
            # TODO: Convert to bar chart
            for col_name in df.columns:
                if self._is_categorical(df[col_name]):
                    unique_cat = len(df[col_name].unique())
                    print(f'Feature {col_name} has {unique_cat} unique categories')

    def __add_dummy_vars__(self, df):
        """One-hot encode every text column plus `dummy_vars_to_append`.

        Each encoded column is removed and its indicator columns (including a
        NaN indicator) are appended to the end of the frame, in encoding order.
        Returns the resulting DataFrame.
        """
        dummy_list = [col for col in df.columns if self._is_categorical(df[col])]
        dummy_list += [col for col in self.dummy_vars_to_append if col not in dummy_list]

        if not dummy_list:
            return df

        # One call instead of a drop/concat per column. uint8 pins the indicator
        # dtype so it does not depend on the installed pandas version's default.
        return pd.get_dummies(df, columns=dummy_list, dummy_na=True, dtype=np.uint8)

    def __cnvrt_na_to_avg__(self, df):
        """Fill remaining NaNs in each column with that column's mean.

        NOTE: Since GarageYrBlt has NA values, we deal with them by using the
        average year as the NA value (IS THIS REALLY THE BEST STRATEGY?).

        Raises AssertionError if any NaN survives (e.g. a column that is
        entirely NaN has no mean to fill with). Modifies and returns `df`.
        """
        na_counts = df.isnull().sum().sort_values(ascending=False)
        cols_with_na = list(na_counts[na_counts > 0].keys())

        for col in cols_with_na:
            if self.verbose:
                print(f"Converting nan's in {col} to avg of {col}")

            df[col] = df[col].fillna(df[col].mean())

        still_na = df.columns[df.isnull().any()]
        assert 0 == len(still_na), f"NaN values remain in: {list(still_na)}"

        return df
        

In [16]:
# Preprocess both tables with one HousePreprocessor (verbose by default, so each
# call prints its remaining categorical features and the columns it imputes).
# The dashed line separates the train log from the test log.
# NOTE(review): each frame is processed on its own; the printed category counts
# differ between the two (e.g. Heating), so the resulting dummy columns may not
# line up between train and test - confirm before modelling.
# NOTE(review): train_df/test_df are overwritten, so this cell is not meant to be
# re-run without re-running the load cells first.
hp = HousePreprocessor()
train_df = hp.process(train_df)
print("-------------------------")
test_df = hp.process(test_df)

Feature Foundation has 6 unique categories
Feature Heating has 6 unique categories
Feature GarageType has 7 unique categories
Feature PavedDrive has 3 unique categories
Feature MiscFeature has 5 unique categories
Feature SaleType has 9 unique categories
Feature SaleCondition has 6 unique categories
Converting nan's in GarageYrBlt to avg of GarageYrBlt
-------------------------
Feature Foundation has 6 unique categories
Feature GarageType has 7 unique categories
Feature SaleCondition has 6 unique categories
Feature PavedDrive has 3 unique categories
Feature Heating has 4 unique categories
Feature SaleType has 10 unique categories
Feature MiscFeature has 4 unique categories
Converting nan's in GarageYrBlt to avg of GarageYrBlt
Converting nan's in BsmtFullBath to avg of BsmtFullBath
Converting nan's in BsmtHalfBath to avg of BsmtHalfBath
Converting nan's in BsmtFinSF1 to avg of BsmtFinSF1
Converting nan's in BsmtFinSF2 to avg of BsmtFinSF2
Converting nan's in BsmtUnfSF to avg of BsmtUnfSF

In [17]:
# NOTE: PROBLEM, Functional has no N/A's in train but has it in test o_O 